def __init__(self, N, d, name="PositiveBias"):
    """Builds the PositiveBias network.

    :param N: base number of neurons
    :param d: dimension of input signal
    :param name: name for network
    """

    self.name = name
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    tauPSC = 0.007
    biaslevel = 0.03  # the value to be output for negative inputs

    # threshold the input signal to detect positive values
    nfac = HRLutils.node_fac()
    nfac.setIntercept(IndicatorPDF(0, 0.1))
    neg_thresh = net.make_array("neg_thresh", N, d, encoders=[[1]],
                                node_factory=nfac)
    neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

    # create a population that tries to output biaslevel across
    # all dimensions
    bias_input = net.make_input("bias_input", [biaslevel])
    bias_pop = net.make_array("bias_pop", N, d,
                              node_factory=HRLutils.node_fac(),
                              eval_points=[[x * 0.01]
                                           for x in range(0,
                                                          biaslevel * 200)])
    net.connect(bias_input, bias_pop, pstc=tauPSC)

    # the individual dimensions of bias_pop are then inhibited by the
    # output of neg_thresh (so any positive values don't get the bias)
    net.connect(neg_thresh, bias_pop, pstc=tauPSC,
                func=lambda x: [1.0] if x[0] > 0 else [0.0],
                transform=[[-10 if i == k else 0 for k in range(d)]
                           for i in range(d)
                           for _ in range(bias_pop.getNeurons() / d)])

    # the whole population is inhibited by the learn signal, so that it
    # outputs 0 if the system isn't supposed to be learning
    bias_pop.addTermination("learn",
                            [[-10] for _ in range(bias_pop.getNeurons())],
                            tauPSC, False)

    self.exposeTermination(neg_thresh.getTermination("input"), "input")
    self.exposeTermination(bias_pop.getTermination("learn"), "learn")
    self.exposeOrigin(bias_pop.getOrigin("X"), "X")
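# A minimal usage sketch for PositiveBias (hypothetical wiring, not taken
# from the source): it assumes a 4-D value signal and shows how the exposed
# "input"/"learn" terminations and "X" origin are meant to be hooked up.
# Note that the "learn" termination is inhibitory (weights of -10), so
# driving it with a nonzero signal silences the bias output.
def demo_positivebias():
    net = nef.Network("PositiveBiasDemo", seed=HRLutils.SEED, quick=False)

    posbias = positivebias.PositiveBias(50, 4)  # N=50, d=4
    net.add(posbias)

    # negative dimensions (here the 2nd and 4th) receive the bias output
    vals = net.make_input("vals", [0.5, -0.2, 0.1, -0.4])
    net.connect(vals, posbias.getTermination("input"))

    # 0 leaves the bias active; 1 inhibits it (i.e. "not learning")
    gate = net.make_input("gate", [0])
    net.connect(gate, posbias.getTermination("learn"))

    net.add_to_nengo()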
def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius ** 2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    # state
    state_pop = net.make("state_pop", stateN, stateD,
                         radius=statelength,
                         node_factory=HRLutils.node_fac(),
                         eval_points=[[x / statelength, y / statelength]
                                      for x in range(-int(stateradius),
                                                     int(stateradius))
                                      for y in range(-int(stateradius),
                                                     int(stateradius))])
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC,
                                    False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin(
        "init_decoders", [ConstantFunction(stateD, init_Qs)],
        "AXON").getDecoders()

    actionvals = actionvalues.ActionValues("testActionValues", N, stateN,
                                           actions, learningrate,
                                           Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()
def __init__(self, discount, rewardradius=1.0, Qradius=1.0):
    """Builds the ErrorCalc2 network.

    :param discount: discount factor, controls rate of integration
    :param rewardradius: expected radius of reward value
    :param Qradius: expected radius of Q values
    """

    self.name = "ErrorCalc"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    tauPSC = 0.007
    intPSC = 0.1
    N = 50

    # relay for current Q input
    currQ = net.make("currQ", 1, 1, node_factory=HRLutils.node_fac(),
                     mode="direct", radius=Qradius)
    currQ.fixMode()
    currQ.addDecodedTermination("input", [[1]], 0.001, False)

    # input population for resetting the network
    reset_nodefac = HRLutils.node_fac()
    reset_nodefac.setIntercept(IndicatorPDF(0.3, 1.0))
    reset = net.make("reset", N, 1, encoders=[[1]],
                     node_factory=reset_nodefac)
    reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    # this population will begin outputting a value once the reset signal
    # exceeds the threshold, and that output will then be used to reset
    # the rest of the network
    reset.addDecodedTermination("input", [[1]], tauPSC, False)

    # relay for stored previous value of Q
    storeQ = net.make("storeQ", 1, 1, node_factory=HRLutils.node_fac(),
                      mode="direct", radius=Qradius)
    storeQ.fixMode()
    storeQ.addDecodedTermination("input", [[1]], 0.001, False)

    # calculate "discount" by integrating output of storeQ
    acc_storeQ = memory.Memory("acc_storeQ", N * 8, 1, inputscale=50)
    net.add(acc_storeQ)

    zero_input = net.make_input("zero_input", [0])

    net.connect(zero_input, acc_storeQ.getTermination("target"))
    net.connect(reset, acc_storeQ.getTermination("transfer"))

    # threshold storeQ value so it won't go below zero.  that is, if we
    # have negative Q values, we don't want to have a negative discount,
    # or that will just drive the highest (negative) Q value upwards, and
    # it will always be selected.  negative Q values are instead pushed
    # upwards by the PositiveBias mechanism.
    Qthresh = net.make("Qthresh", N * 2, 1, encoders=[[1]],
                       eval_points=[[x * 0.001] for x in range(1000)],
                       radius=Qradius, intercept=(0, 1))
    net.connect(storeQ, Qthresh, pstc=tauPSC)
    net.connect(Qthresh, acc_storeQ, pstc=intPSC,
                transform=[[discount * intPSC]],
                func=lambda x: max(x[0], 0.0))

    # accumulate reward
    reward = memory.Memory("reward", N * 4, 1, radius=rewardradius,
                           inputscale=50)
    net.add(reward)
    reward.addDecodedTermination("input", [[intPSC]], intPSC, False)
    net.connect(zero_input, reward.getTermination("target"))
    net.connect(reset, reward.getTermination("transfer"))

    # put reward, currQ, storeQ, and discount together to calculate error
    error = net.make("error", N * 2, 1, node_factory=HRLutils.node_fac())
    net.connect(currQ, error, pstc=tauPSC)
    net.connect(reward, error, pstc=tauPSC)
    net.connect(storeQ, error, pstc=tauPSC, transform=[[-1]])
    net.connect(acc_storeQ, error, pstc=tauPSC, transform=[[-1]])

    self.exposeTermination(reward.getTermination("input"), "reward")
    self.exposeTermination(reset.getTermination("input"), "reset")
    self.exposeTermination(currQ.getTermination("input"), "currQ")
    self.exposeTermination(storeQ.getTermination("input"), "storeQ")
    self.exposeOrigin(error.getOrigin("X"), "X")
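# A discrete-time sketch (an assumption for illustration, not code from the
# source) of the quantity ErrorCalc2 computes with its neural integrators.
# Gain factors set by inputscale/intPSC are omitted; the point is the shape
# of the continuous-time TD error:
#
#     error = currQ + acc_reward - storeQ - acc_discount
#
# where both accumulators are cleared by the reset signal at each state
# transition.
def td_error_sketch(currQ, storeQ, rewards, discount, dt=0.001):
    acc_reward = 0.0
    acc_discount = 0.0
    for r in rewards:  # one sample per timestep since the last reset
        acc_reward += r * dt
        # only positive Q values contribute to the discount (see Qthresh)
        acc_discount += discount * max(storeQ, 0.0) * dt
    return currQ + acc_reward - storeQ - acc_discount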
def __init__(self, name, N, d, radius=1.0, inputscale=1.0, recurweight=1.0,
             direct_storage=False):
    """Builds the Memory network.

    :param name: name of network
    :param N: base number of neurons
    :param d: dimension of stored value
    :param radius: radius of stored value
    :param inputscale: controls how fast the stored value moves to the
        target
    :param recurweight: controls the preservation of the stored value
    :param direct_storage: if True, use direct mode for the memory
    """

    self.name = name
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    self.dimension = d
    self.radius = radius

    tauPSC = 0.007
    intPSC = 0.1

    # population that will store the value
    if not direct_storage:
        storage = net.make_array("storage", N, d,
                                 node_factory=HRLutils.node_fac(),
                                 eval_points=[[x * 0.001]
                                              for x in range(-1000, 1000)])
    else:
        storage = net.make("storage", 1, d, mode="direct")
        storage.fixMode()
    net.connect(storage, storage,
                transform=MU.diag([recurweight for _ in range(d)]),
                pstc=intPSC)

    # storageinput will represent (target - stored_value), which when used
    # as input to storage will drive the stored value to target
    storageinput = net.make_array("storageinput", N, d,
                                  node_factory=HRLutils.node_fac())
    storageinput.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    storageinput.addDecodedTermination(
        "target", MU.diag([1.0 / radius for _ in range(d)]), tauPSC, False)
    # note: store everything in the -1 to 1 range by dividing by radius

    # scale storageinput value by inputscale to control the rate at which
    # it moves to the target
    net.connect(storageinput, storage, pstc=intPSC,
                transform=MU.diag([inputscale * intPSC for _ in range(d)]))

    # subtract currently stored value
    net.connect(storage, storageinput, pstc=tauPSC,
                transform=MU.diag([-1 for _ in range(d)]))

    # we want to open the input gate when the transfer signal arrives (to
    # transfer storageinput to storage).  using a double inhibition setup
    # (rather than just feeding it e.g. the inverse of the transfer signal)
    # so that we get a nice clean zero

    # this inhibits the storageinput population (to block input to the
    # storage)
    transferinhib = net.make("transferinhib", N, 1,
                             node_factory=HRLutils.node_fac())
    transferinhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    transferinhib.addTermination(
        "gate", [[-10] for _ in range(transferinhib.getNeurons())],
        tauPSC, False)
    net.connect(transferinhib, storageinput, pstc=tauPSC,
                transform=[[-10] for _ in range(storageinput.getNeurons())])

    # this drives the transferinhib population (so that by default it will
    # block any input).  inhibiting transferinhib will thus remove the
    # inhibition on storageinput, and change the stored value
    biasinput = net.make_input("biasinput", [1])
    net.connect(biasinput, transferinhib, pstc=tauPSC)

    # output population (to undo radius scaling)
    storageoutput = net.make("storageoutput", 1, d, mode="direct")
    storageoutput.fixMode()
    net.connect(storage, storageoutput, pstc=0.001,
                transform=MU.diag([radius for _ in range(d)]))

    self.exposeTermination(transferinhib.getTermination("gate"),
                           "transfer")
    self.exposeTermination(storageinput.getTermination("target"), "target")
    self.exposeOrigin(storageoutput.getOrigin("X"), "X")
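# Minimal usage sketch for Memory (hypothetical, not from the source): hold
# a scalar and load a new target on demand.  The "transfer" termination is
# double-inverted inside the network, so driving it with 1 opens the input
# gate (the stored value chases the target) and 0 holds the current value.
def demo_memory():
    net = nef.Network("MemoryDemo", seed=HRLutils.SEED, quick=False)

    mem = memory.Memory("mem", 50, 1, inputscale=50)
    net.add(mem)

    target = net.make_input("target", [0.8])
    net.connect(target, mem.getTermination("target"))

    transfer = net.make_input("transfer", [0])  # pulse to 1 to load target
    net.connect(transfer, mem.getTermination("transfer"))

    net.add_to_nengo()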
def __init__(self, num_actions, Qradius=1.0, rewardradius=1.0,
             discount=0.3):
    """Builds the ErrorNetwork.

    :param num_actions: the number of actions available to the system
    :param Qradius: expected radius of Q values
    :param rewardradius: expected radius of reward signal
    :param discount: discount factor
    """

    self.name = "ErrorNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    tauPSC = 0.007
    # soft cap on error magnitude (large errors seem to cause problems
    # with overly-generalizing the learning)
    errorcap = 0.1

    # set up relays
    vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
    vals_relay.fixMode()
    vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                     False)

    old_vals_relay = net.make("old_vals_relay", 1, num_actions,
                              mode="direct")
    old_vals_relay.fixMode()
    old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

    curr_bg_relay = net.make("curr_bg_relay", 1, num_actions,
                             mode="direct")
    curr_bg_relay.fixMode()
    curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                        False)

    saved_bg_relay = net.make("saved_bg_relay", 1, num_actions,
                              mode="direct")
    saved_bg_relay.fixMode()
    saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

    # select out only the currently chosen Q value
    gatedQ = net.make_array("gatedQ", N * 2, num_actions,
                            node_factory=HRLutils.node_fac(),
                            radius=Qradius)
    gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(vals_relay, gatedQ, pstc=tauPSC)

    net.connect(curr_bg_relay, gatedQ,
                transform=[[-3 if i != k else 0
                            for k in range(num_actions)]
                           for i in range(num_actions)
                           for _ in range(gatedQ.getNeurons() /
                                          num_actions)],
                pstc=tauPSC)

    currQ = net.make("currQ", 1, 1, mode="direct")
    currQ.fixMode()
    net.connect(gatedQ, currQ, transform=[[1 for _ in range(num_actions)]],
                pstc=0.001)

    # select out only the previously chosen Q value
    gatedstoreQ = net.make_array("gatedstoreQ", N * 2, num_actions,
                                 node_factory=HRLutils.node_fac(),
                                 radius=Qradius)
    gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

    net.connect(saved_bg_relay, gatedstoreQ,
                transform=[[-3 if i != k else 0
                            for k in range(num_actions)]
                           for i in range(num_actions)
                           for _ in range(gatedstoreQ.getNeurons() /
                                          num_actions)],
                pstc=tauPSC)

    storeQ = net.make("storeQ", 1, 1, mode="direct")
    storeQ.fixMode()
    net.connect(gatedstoreQ, storeQ,
                transform=[[1 for _ in range(num_actions)]], pstc=0.001)

    # create error calculation network
    error = errorcalc2.ErrorCalc2(discount, rewardradius=rewardradius,
                                  Qradius=Qradius)
    net.add(error)

    net.connect(currQ, error.getTermination("currQ"))
    net.connect(storeQ, error.getTermination("storeQ"))

    # gate error by learning signal and saved BG output (we only want
    # error when the system is supposed to be learning, and we only want
    # error related to the action that was selected)
    gatederror = net.make_array("gatederror", N * 2, num_actions,
                                radius=errorcap,
                                node_factory=HRLutils.node_fac())
    gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    # scale the error by Qradius, so that we don't get super huge errors
    # (causes problems with the gating)
    net.connect(error, gatederror,
                transform=[[1.0 / Qradius] for _ in range(num_actions)],
                pstc=tauPSC)

    learninggate = net.make("learninggate", N, 1,
                            node_factory=HRLutils.node_fac())
    learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC,
                                False)

    net.connect(learninggate, gatederror, func=lambda x: [1.0],
                transform=[[-12] for _ in range(gatederror.getNeurons())],
                pstc=tauPSC)

    net.connect(saved_bg_relay, gatederror,
                transform=[[-12 if i != k else 0
                            for k in range(num_actions)]
                           for i in range(num_actions)
                           for _ in range(gatederror.getNeurons() /
                                          num_actions)],
                pstc=tauPSC)

    # add a positive bias to the error anywhere the Q values are negative
    # (to stop Q values from getting too negative, which causes problems
    # with the action selection)
    posbias = positivebias.PositiveBias(N, num_actions)
    net.add(posbias)
    net.connect(old_vals_relay, posbias.getTermination("input"))
    net.connect(learninggate, posbias.getTermination("learn"),
                func=lambda x: [1.0])

    biasederror = net.make("biasederror", 1, num_actions, mode="direct")
    biasederror.fixMode()
    net.connect(gatederror, biasederror, pstc=0.001)
    net.connect(posbias, biasederror, pstc=0.001)

    self.exposeTermination(curr_bg_relay.getTermination("input"),
                           "curr_bg_input")
    self.exposeTermination(saved_bg_relay.getTermination("input"),
                           "saved_bg_input")
    self.exposeTermination(vals_relay.getTermination("input"), "vals")
    self.exposeTermination(old_vals_relay.getTermination("input"),
                           "old_vals")
    self.exposeTermination(error.getTermination("reward"), "reward")
    self.exposeTermination(error.getTermination("reset"), "reset")
    self.exposeTermination(learninggate.getTermination("gate"), "learn")
    self.exposeOrigin(biasederror.getOrigin("X"), "error")
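# The inhibitory gating transforms above expand to one row per neuron.  As a
# concrete illustration (num_actions = 2 with two neurons per action -- an
# example, not values from a real run), the curr_bg_relay -> gatedQ
# transform is:
#
#     [[ 0, -3],   # neurons for action 0: inhibited if action 1 is active
#      [ 0, -3],
#      [-3,  0],   # neurons for action 1: inhibited if action 0 is active
#      [-3,  0]]
#
# With a BG selection signal of [1, 0] (action 0 chosen), the action-0
# neurons receive 0 and stay active while the action-1 neurons receive -3
# and are silenced, so gatedQ (and hence currQ) carries only the selected
# action's Q value.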
def __init__(self, actions, Qradius=1, noiselevel=0.03):
    """Builds the BGNetwork.

    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param Qradius: expected radius of Q values
    :param noiselevel: standard deviation of noise added to Q values for
        exploration
    """

    self.name = "BGNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    self.N = 50
    self.d = len(actions)
    self.mut_inhib = 1.0  # mutual inhibition between actions
    self.tauPSC = 0.007

    # make basal ganglia
    netbg = nef.Network("bg")

    bginput = netbg.make("bginput", 1, self.d, mode="direct")
    bginput.fixMode()
    # divide by Q radius to get values back into the 0 -- 1 range
    bginput.addDecodedTermination(
        "input", MU.diag([1.0 / Qradius for _ in range(self.d)]), 0.001,
        False)

    bgoutput = netbg.make("bgoutput", 1, self.d, mode="direct")
    bgoutput.fixMode()

    basalganglia.make_basal_ganglia(netbg, bginput, bgoutput,
                                    dimensions=self.d, neurons=200)
    bg = netbg.network
    net.add(bg)
    bg.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    bg.exposeTermination(bginput.getTermination("input"), "input")
    bg.exposeOrigin(bgoutput.getOrigin("X"), "X")

    # insert noise (used to give some randomness to drive exploration)
    noiselevel = net.make_input("noiselevel", [noiselevel])
    noise = noisenode.NoiseNode(1, dimension=len(actions))
    net.add(noise)
    net.connect(noiselevel, noise.getTermination("scale"))
    net.connect(noise.getOrigin("noise"), "bg.bginput", pstc=0.001)

    # add a bias to shift everything up to the 0.5 -- 1.5 range
    biasinput = net.make_input("biasinput", [0.5])
    net.connect(biasinput, "bg.bginput",
                transform=[[1] for _ in range(self.d)], pstc=0.001)

    # invert BG output (so the "selected" action will have a positive
    # value and the rest zero)
    invert = thalamus.make(net, name="invert", neurons=self.N,
                           dimensions=self.d, useQuick=False)
    invert.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    net.connect(bg, invert.getTermination("bg_input"))

    # add mutual inhibition
    net.connect(invert.getOrigin("xBiased"), invert, pstc=self.tauPSC,
                transform=[[0 if i == j else -self.mut_inhib
                            for j in range(self.d)]
                           for i in range(self.d)])

    # threshold output values so that you get a nice clean 0 for
    # non-selected and 1 for selected
    threshf = HRLutils.node_fac()
    threshold = 0.1
    threshf.setIntercept(IndicatorPDF(threshold, 1.0))
    val_threshold = net.make_array("val_threshold", self.N * 2, self.d,
                                   node_factory=threshf, encoders=[[1]])
    val_threshold.addDecodedOrigin(
        "output",
        [PiecewiseConstantFunction([threshold], [0, 1])
         for _ in range(self.d)], "AXON", True)
    net.connect(invert.getOrigin("xBiased"), val_threshold,
                pstc=self.tauPSC)

    # output action (action vectors weighted by BG output)
    weight_actions = net.make_array("weight_actions", 50,
                                    len(actions[0][1]), intercept=(0, 1))
    net.connect(val_threshold.getOrigin("output"), weight_actions,
                transform=MU.transpose([actions[i][1]
                                        for i in range(self.d)]),
                pstc=0.007)

    # save the BG output (selected action and selected action value)
    save_relay = net.make("save_relay", 1, 1, mode="direct")
    save_relay.fixMode()
    save_relay.addDecodedTermination("input", [[1]], 0.001, False)

    saved_action = memory.Memory("saved_action", self.N * 2,
                                 len(actions[0][1]), inputscale=75)
    net.add(saved_action)
    net.connect(weight_actions, saved_action.getTermination("target"))
    net.connect(save_relay, saved_action.getTermination("transfer"))

    saved_vals = memory.Memory("saved_values", self.N * 2, self.d,
                               inputscale=75)
    net.add(saved_vals)
    net.connect(val_threshold.getOrigin("output"),
                saved_vals.getTermination("target"))
    net.connect(save_relay, saved_vals.getTermination("transfer"))

    # put the saved values through a threshold (we want a nice clean zero
    # for non-selected values)
    nfac = HRLutils.node_fac()
    nfac.setIntercept(IndicatorPDF(0.2, 1))
    saved_vals_threshold = net.make_array("saved_vals_threshold", self.N,
                                          self.d, node_factory=nfac,
                                          encoders=[[1]])
    saved_vals_threshold.addDecodedOrigin(
        "output", [PiecewiseConstantFunction([0.3], [0, 1])
                   for _ in range(self.d)], "AXON", True)
    net.connect(saved_vals, saved_vals_threshold, pstc=self.tauPSC)

    self.exposeTermination(bg.getTermination("input"), "input")
    self.exposeTermination(save_relay.getTermination("input"),
                           "save_output")
    self.exposeOrigin(val_threshold.getOrigin("output"), "curr_vals")
    self.exposeOrigin(weight_actions.getOrigin("X"), "curr_action")
    self.exposeOrigin(saved_vals_threshold.getOrigin("output"),
                      "saved_vals")
    self.exposeOrigin(saved_action.getOrigin("X"), "saved_action")
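# Usage sketch for BGNetwork (hypothetical wiring; assumes the class is in
# scope): feed a vector of Q values and read out the winning action.
def demo_bgnetwork():
    net = nef.Network("BGDemo", seed=HRLutils.SEED, quick=False)

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]
    bg = BGNetwork(actions, Qradius=1.0)
    net.add(bg)

    # "right" has the largest Q value, so curr_vals should settle near
    # [0, 1, 0, 0] and curr_action near its action vector [1, 0]
    qvals = net.make_input("qvals", [0.2, 0.7, 0.1, 0.3])
    net.connect(qvals, bg.getTermination("input"))

    # pulse this to 1 to latch the current selection into the saved_vals /
    # saved_action memories
    save = net.make_input("save", [0])
    net.connect(save, bg.getTermination("save_output"))

    net.add_to_nengo()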
def __init__(self, name, N, stateN, actions, learningrate, Qradius=1.0,
             init_decoders=None):
    """Build ActionValues network.

    :param name: name of Network
    :param N: base number of neurons
    :param stateN: number of neurons in state population
    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param learningrate: learning rate for PES rule
    :param Qradius: expected radius of Q values
    :param init_decoders: if specified, will be used to initialize the
        connection weights to whatever function is specified by the
        decoders
    """

    self.name = name
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    self.N = N
    self.learningrate = learningrate
    self.supervision = 1.0  # don't use the unsupervised stuff at all

    self.tauPSC = 0.007

    modterms = []
    learnterms = []

    # relays
    output = net.make("output", 1, len(actions), mode="direct")
    output.fixMode()

    for i, action in enumerate(actions):
        # create one population corresponding to each action
        act_pop = net.make("action_" + action[0], self.N * 4, 1,
                           node_factory=HRLutils.node_fac())
        act_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

        # add error termination.  the modulatory transform selects one
        # dimension of the error signal
        modterm = act_pop.addDecodedTermination(
            "error", [[0 if j != i else 1 for j in range(len(actions))]],
            0.005, True)

        # create learning termination
        if init_decoders is not None:
            weights = MU.prod(act_pop.getEncoders(),
                              MU.transpose(init_decoders))
        else:
            # note: use throwaway loop variables here so the comprehension
            # doesn't clobber the enclosing action index i in Python 2
            weights = [[random.uniform(-1e-3, 1e-3)
                        for _ in range(stateN)]
                       for _ in range(act_pop.getNeurons())]
        learningterm = act_pop.addHPESTermination("learning", weights,
                                                  0.005, False, None)

        # initialize the learning rule
        net.learn(act_pop, learningterm, modterm, rate=self.learningrate,
                  supervisionRatio=self.supervision)

        # connect each action back to the output relay.  note, we learn
        # all the Q values with radius 1, then just multiply by the
        # desired Q radius here
        net.connect(act_pop.getOrigin("X"), output,
                    transform=[[0] if j != i else [Qradius]
                               for j in range(len(actions))],
                    pstc=0.001)

        modterms += [modterm]
        learnterms += [learningterm]

    # use EnsembleTerminations to group the individual action terminations
    # into one multi-dimensional termination
    self.exposeTermination(EnsembleTermination(self, "state", learnterms),
                           "state")
    self.exposeTermination(EnsembleTermination(self, "error", modterms),
                           "error")
    self.exposeOrigin(output.getOrigin("X"), "X")
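# Note on init_decoders (a sketch of the reasoning, not source code): with
# connection weights w = E * D^T, where E are an action population's
# encoders and D the decoders of a constant-function origin on the state
# population, the neuron-to-neuron connection initially decodes to that
# same constant Q value, matching the decoder-learning setup's starting
# point; the PES rule then adjusts w directly.  See test_actionvalues above
# for a runnable wiring example.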
def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
             stateradius=1.0, Qradius=1.0, load_weights=None,
             state_evals=None, state_threshold=(0.0, 1.0),
             statediff_threshold=0.2, init_Qs=None):
    """Builds the QNetwork.

    :param stateN: number of neurons to use to represent state
    :param stateD: dimension of state vector
    :param state_encoders: encoders to use for neurons in state population
    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param learningrate: learningrate for action value learning rule
    :param stateradius: expected radius of state values
    :param Qradius: expected radius of Q values
    :param load_weights: filename to load Q value weights from
    :param state_evals: evaluation points to use for state population.
        This is used when initializing the Q values (may be necessary if
        the input states don't tend to fall in the hypersphere).
    :param state_threshold: threshold range of state neurons
    :param statediff_threshold: maximum state difference for dual training
    :param init_Qs: initial Q values
    """

    self.name = "QNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    tauPSC = 0.007
    num_actions = len(actions)
    init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs

    # if True, use neuron--neuron weight learning, otherwise, use decoder
    # learning
    self.neuron_learning = False

    # set up relays
    state_relay = net.make("state_relay", 1, stateD, mode="direct")
    state_relay.fixMode()
    state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

    # create state population
    state_fac = HRLutils.node_fac()
    if isinstance(state_threshold, (float, int)):
        state_threshold = (state_threshold, 1.0)
    state_fac.setIntercept(
        IndicatorPDF(state_threshold[0], state_threshold[1]))

    state_pop = net.make("state_pop", stateN, stateD, radius=stateradius,
                         node_factory=state_fac, encoders=state_encoders,
                         eval_points=state_evals)
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(state_relay, state_pop, pstc=tauPSC)

    # store the state value (used to drive population encoding previous
    # state)
    saved_state = memory.Memory("saved_state", N * 4, stateD,
                                inputscale=50, radius=stateradius,
                                direct_storage=True)
    net.add(saved_state)

    net.connect(state_relay, saved_state.getTermination("target"))

    # create population representing previous state
    old_state_pop = net.make("old_state_pop", stateN, stateD,
                             radius=stateradius, node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
    old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(saved_state, old_state_pop, pstc=tauPSC)

    # set up action nodes
    if self.neuron_learning:
        # use ActionValues network to compute Q values

        # current Q values
        decoders = state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        actionvals = actionvalues.ActionValues("actionvals", N, stateN,
                                               actions, learningrate,
                                               Qradius=Qradius,
                                               init_decoders=decoders)
        net.add(actionvals)

        net.connect(state_pop.getOrigin("AXON"),
                    actionvals.getTermination("state"))

        # Q values of previous state
        decoders = old_state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        old_actionvals = actionvalues.ActionValues("old_actionvals", N,
                                                   stateN, actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
        net.add(old_actionvals)

        net.connect(old_state_pop.getOrigin("AXON"),
                    old_actionvals.getTermination("state"))
    else:
        # just use decoder on state population to compute Q values

        # current Q values
        origin = state_pop.addDecodedOrigin(
            "vals", [ConstantFunction(num_actions, init_Qs[i])
                     for i in range(num_actions)], "AXON")
        state_dlnode = decoderlearningnode.DecoderLearningNode(
            state_pop, origin, learningrate, num_actions,
            name="state_learningnode")
        net.add(state_dlnode)

        # just a little relay node, so that things match up for the rest
        # of the script when you have the neuron -- neuron learning
        actionvals = net.make("actionvals", 1, num_actions, mode="direct")
        actionvals.fixMode()
        net.connect(origin, actionvals, pstc=0.001)

        # Q values of previous state
        origin = old_state_pop.addDecodedOrigin(
            "vals", [ConstantFunction(num_actions, init_Qs[i])
                     for i in range(num_actions)], "AXON")
        old_state_dlnode = decoderlearningnode.DecoderLearningNode(
            old_state_pop, origin, learningrate, num_actions,
            name="old_state_learningnode")
        net.add(old_state_dlnode)

        old_actionvals = net.make("old_actionvals", 1, num_actions,
                                  mode="direct")
        old_actionvals.fixMode()
        net.connect(origin, old_actionvals, pstc=0.001)

    if load_weights is not None:
        self.loadParams(load_weights)

    # find error between old_actionvals and actionvals (this will be used
    # to drive learning on the new actionvals)
    valdiff = net.make_array("valdiff", N, num_actions,
                             node_factory=HRLutils.node_fac())
    # doubling the values to get a bigger error signal
    net.connect(old_actionvals, valdiff,
                transform=MU.diag([2] * num_actions), pstc=tauPSC)
    net.connect(actionvals, valdiff,
                transform=MU.diag([-2] * num_actions), pstc=tauPSC)

    # calculate diff between curr_state and saved_state and use that to
    # gate valdiff (we only want to train the curr state based on previous
    # state when the two have similar values)
    # note: threshold > 0 so that there is a deadzone in the middle (when
    # the states are similar) where there will be no output inhibition
    statediff = net.make_array("statediff", N, stateD,
                               intercept=(statediff_threshold, 1))

    net.connect(state_relay, statediff, pstc=tauPSC)
    net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD),
                pstc=tauPSC)

    net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                transform=[[-10] * stateD
                           for _ in range(valdiff.getNeurons())],
                pstc=tauPSC)

    # connect up valdiff to the error signal for current Q values, and
    # expose the error signal for the previous Q values to the external
    # error
    if self.neuron_learning:
        net.connect(valdiff, actionvals.getTermination("error"))
        self.exposeTermination(old_actionvals.getTermination("error"),
                               "error")
    else:
        net.connect(valdiff, state_dlnode.getTermination("error"))
        self.exposeTermination(old_state_dlnode.getTermination("error"),
                               "error")

    self.exposeTermination(state_relay.getTermination("input"), "state")
    self.exposeTermination(saved_state.getTermination("transfer"),
                           "save_state")
    self.exposeOrigin(actionvals.getOrigin("X"), "vals")
    self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
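# End-to-end wiring sketch (hypothetical: class names and parameter values
# are assumptions, and the environment's state/reward/reset/save signals
# are omitted) showing how QNetwork, BGNetwork, and ErrorNetwork are
# intended to close the learning loop.
def demo_agent(state_encoders, state_dim=2):
    net = nef.Network("AgentDemo", seed=HRLutils.SEED, quick=False)

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    q_net = QNetwork(200, state_dim, state_encoders, actions,
                     learningrate=1e-4)
    bg_net = BGNetwork(actions)
    err_net = ErrorNetwork(len(actions))
    for n in [q_net, bg_net, err_net]:
        net.add(n)

    # Q values drive action selection...
    net.connect(q_net.getOrigin("vals"), bg_net.getTermination("input"))

    # ...and the current/saved values and selections feed the error network
    net.connect(q_net.getOrigin("vals"), err_net.getTermination("vals"))
    net.connect(q_net.getOrigin("old_vals"),
                err_net.getTermination("old_vals"))
    net.connect(bg_net.getOrigin("curr_vals"),
                err_net.getTermination("curr_bg_input"))
    net.connect(bg_net.getOrigin("saved_vals"),
                err_net.getTermination("saved_bg_input"))

    # the gated, biased error trains the previous-state Q values
    net.connect(err_net.getOrigin("error"), q_net.getTermination("error"))

    return net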