def addDecodedOrigin(self, name, funcs, origin):
    net = nef.Network(self)

    o = self.getNode("storage").addDecodedOrigin(name, funcs, origin)

    # undo radius scaling
    funcout = net.make(name, 1, self.dimension, mode="direct")
    funcout.fixMode()
    net.connect(o, funcout, pstc=0.001,
                transform=MU.diag([self.radius
                                   for _ in range(self.dimension)]))
    self.exposeOrigin(funcout.getOrigin("X"), name)

    return self.getOrigin(name)
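# A minimal standalone sketch (hypothetical, not part of the model) of the
# radius unscaling performed above: the storage population represents
# x / radius, so a value decoded from it must be multiplied back by radius
# (the diag([radius, ...]) transform on the output connection) to recover
# the original value.
def _radius_unscale_sketch():
    radius = 2.0
    x = [1.2, -0.6, 0.3]  # original value (within radius)

    # what storage represents internally (scaled into the -1 -- 1 range)
    stored = [v / radius for v in x]

    # the diagonal transform applied on the way out
    recovered = [radius * v for v in stored]

    assert all(abs(a - b) < 1e-12 for a, b in zip(x, recovered))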
def __init__(self, name, N, d, radius=1.0, inputscale=1.0, recurweight=1.0,
             direct_storage=False):
    """Builds the Memory network.

    :param name: name of network
    :param N: base number of neurons
    :param d: dimension of stored value
    :param radius: radius of stored value
    :param inputscale: controls how fast the stored value moves to the
        target
    :param recurweight: controls the preservation of the stored value
    :param direct_storage: if True, use direct mode for the memory
    """

    self.name = name
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    self.dimension = d
    self.radius = radius

    tauPSC = 0.007
    intPSC = 0.1

    # population that will store the value
    if not direct_storage:
        storage = net.make_array("storage", N, d,
                                 node_factory=HRLutils.node_fac(),
                                 eval_points=[[x * 0.001]
                                              for x in range(-1000, 1000)])
    else:
        storage = net.make("storage", 1, d, mode="direct")
        storage.fixMode()
    net.connect(storage, storage,
                transform=MU.diag([recurweight for _ in range(d)]),
                pstc=intPSC)

    # storageinput will represent (target - stored_value), which when used
    # as input to storage will drive the stored value to target
    storageinput = net.make_array("storageinput", N, d,
                                  node_factory=HRLutils.node_fac())
    storageinput.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    # note: store everything in the -1 -- 1 range by dividing by radius
    storageinput.addDecodedTermination(
        "target", MU.diag([1.0 / radius for _ in range(d)]), tauPSC, False)

    # scale storageinput value by inputscale to control the rate at which
    # it moves to the target
    net.connect(storageinput, storage, pstc=intPSC,
                transform=MU.diag([inputscale * intPSC for _ in range(d)]))

    # subtract the currently stored value
    net.connect(storage, storageinput, pstc=tauPSC,
                transform=MU.diag([-1 for _ in range(d)]))

    # we want to open the input gate when the transfer signal arrives (to
    # transfer storageinput to storage). using a double inhibition setup
    # (rather than just feeding it e.g. the inverse of the transfer
    # signal) so that we get a nice clean zero

    # this inhibits the storageinput population (to block input to
    # storage)
    transferinhib = net.make("transferinhib", N, 1,
                             node_factory=HRLutils.node_fac())
    transferinhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    transferinhib.addTermination(
        "gate", [[-10] for _ in range(transferinhib.getNeurons())],
        tauPSC, False)

    net.connect(transferinhib, storageinput, pstc=tauPSC,
                transform=[[-10] for _ in
                           range(storageinput.getNeurons())])

    # this drives the transferinhib population (so that by default it will
    # block any input). inhibiting transferinhib will thus remove the
    # inhibition on storageinput, and change the stored value
    biasinput = net.make_input("biasinput", [1])
    net.connect(biasinput, transferinhib, pstc=tauPSC)

    # output population (to undo radius scaling)
    storageoutput = net.make("storageoutput", 1, d, mode="direct")
    storageoutput.fixMode()
    net.connect(storage, storageoutput, pstc=0.001,
                transform=MU.diag([radius for _ in range(d)]))

    self.exposeTermination(transferinhib.getTermination("gate"),
                           "transfer")
    self.exposeTermination(storageinput.getTermination("target"), "target")
    self.exposeOrigin(storageoutput.getOrigin("X"), "X")
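# A minimal standalone sketch (plain Python, hypothetical name) of the gated
# integrator built above: while the gate is open, storage integrates
# inputscale * (target - stored); while the transfer signal keeps
# storageinput inhibited, the stored value just recirculates through the
# recurweight connection.
def _memory_dynamics_sketch(target=0.8, inputscale=1.0, recurweight=1.0,
                            dt=0.001, intPSC=0.1):
    # standard NEF integrator approximation for the recurrent connection:
    #   dx/dt = ((recurweight - 1) * x + inputscale * intPSC * u) / intPSC
    # with u = (target - x) while the gate is open, and u = 0 while closed
    x = 0.0
    for step in range(2000):
        u = (target - x) if step < 1000 else 0.0  # gate closes halfway
        x += dt * ((recurweight - 1.0) * x +
                   inputscale * intPSC * u) / intPSC
    # x has moved toward target at a rate set by inputscale, then held
    return x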
def run_badreenvironment(nav_args, ctrl_args, bias=0.0, seed=None,
                         flat=False, label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (list(MU.diag([3 for _ in range(env.stateD)])) +
             [o + s + c
              for o in orientations for s in shapes for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN, env.stateD, env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc, state_evals=evals,
                                    discount=0.5, **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env,
        name="NavTermNode", state_delay=0.1, reset_delay=0.05,
        reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]),
               ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN, env.stateD, actions,
                                     name="CtrlAgent", state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals, discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env,
        name="CtrlTermNode", state_delay=0.1, reset_delay=0.05,
        reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null),
    # to encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"), reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"), reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay, ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")

    # ideal reward function (for testing)
#     def ctrl_reward_func(x):
#         if abs(x[0]) < 0.5:
#             return 0.0
#
#         if flat:
#             return 1.5 if x[1] + x[2] < 0.5 else -1.5
#         else:
#             if x[1] + x[2] < 0.5:
#                 return -1.5
#             if [round(a) for a in env.state[-2:]] == [round(b)
#                                                       for b in x[1:]]:
#                 return 1.5
#             else:
#                 return -1.5
#     net.connect(reward_relay, ctrl_agent.getTermination("reward"),
#                 func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] ==
                    env.state[:env.num_orientations] else -1.5)
        else:
            return (1.5 if env.action[1] ==
                    env.state[env.num_orientations:-env.num_colours]
                    else -1.5)
    net.connect(reward_relay, nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent (see the inhibition
    # sketch after this function)
    ctrl_state_inhib = net.make_array("ctrl_state_inhib", 50, env.stateD,
                                      radius=2,
                                      mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \
                   [[-5, 0]] * 50 * env.num_shapes + \
                   [[-5, -5]] * 50 * env.num_colours

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"), ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are
    # roughly equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]
    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib, boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"), boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])
    net.connect(boost, nav_agent.getTermination("state_input"),
                func=boost_func)

    # save weights
    # period to save weights (realtime, not simulation time)
    weight_save = 1.0
    threads = [
        HRLutils.WeightSaveThread(
            nav_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (nav_agent.name, seed)),
            weight_save),
        HRLutils.WeightSaveThread(
            ctrl_agent.getNode("QNetwork").saveParams,
            os.path.join("weights", "%s_%s" % (ctrl_agent.name, seed)),
            weight_save)]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(
        period=1,
        filename=HRLutils.datafile("dataoutput_%s.txt" % label),
        header="%s %s %s %s %s" % (nav_args, ctrl_args, bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

#     net.add_to_nengo()
#     net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()
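# A minimal standalone sketch (hypothetical sizes, plain Python) of how the
# inhib_matrix above routes the ctrl action to the nav agent's state: each
# block of 50 neurons per state dimension receives strong inhibition from
# whichever ctrl output dimension should suppress it, so selecting
# "orientation" ([1, 0]) inhibits the shape block (and vice versa), while
# the colour block is inhibited by either rule.
def _inhib_matrix_sketch(num_orientations=2, num_shapes=2, num_colours=2):
    inhib_matrix = ([[0, -5]] * 50 * num_orientations +
                    [[-5, 0]] * 50 * num_shapes +
                    [[-5, -5]] * 50 * num_colours)

    action = [1, 0]  # ctrl selects the "orientation" rule
    drive = [row[0] * action[0] + row[1] * action[1]
             for row in inhib_matrix]

    # orientation block is untouched; shape and colour blocks are inhibited
    assert drive[:50 * num_orientations] == [0] * (50 * num_orientations)
    assert all(v == -5 for v in drive[50 * num_orientations:])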
def __init__(self, actions, Qradius=1, noiselevel=0.03):
    """Builds the BGNetwork.

    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param Qradius: expected radius of Q values
    :param noiselevel: standard deviation of noise added to Q values for
        exploration
    """

    self.name = "BGNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    self.N = 50
    self.d = len(actions)
    self.mut_inhib = 1.0  # mutual inhibition between actions
    self.tauPSC = 0.007

    # make basal ganglia
    netbg = nef.Network("bg")

    bginput = netbg.make("bginput", 1, self.d, mode="direct")
    bginput.fixMode()
    # divide by Q radius to get values back into the 0 -- 1 range
    bginput.addDecodedTermination(
        "input", MU.diag([1.0 / Qradius for _ in range(self.d)]),
        0.001, False)

    bgoutput = netbg.make("bgoutput", 1, self.d, mode="direct")
    bgoutput.fixMode()

    basalganglia.make_basal_ganglia(netbg, bginput, bgoutput,
                                    dimensions=self.d, neurons=200)
    bg = netbg.network
    net.add(bg)
    bg.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    bg.exposeTermination(bginput.getTermination("input"), "input")
    bg.exposeOrigin(bgoutput.getOrigin("X"), "X")

    # insert noise (used to give some randomness to drive exploration)
    noiselevel = net.make_input("noiselevel", [noiselevel])

    noise = noisenode.NoiseNode(1, dimension=len(actions))
    net.add(noise)

    net.connect(noiselevel, noise.getTermination("scale"))
    net.connect(noise.getOrigin("noise"), "bg.bginput", pstc=0.001)

    # add bias to shift everything up to 0.5--1.5
    biasinput = net.make_input("biasinput", [0.5])
    net.connect(biasinput, "bg.bginput",
                transform=[[1] for _ in range(self.d)], pstc=0.001)

    # invert BG output (so the "selected" action will have a positive
    # value and the rest zero)
    invert = thalamus.make(net, name="invert", neurons=self.N,
                           dimensions=self.d, useQuick=False)
    invert.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    net.connect(bg, invert.getTermination("bg_input"))

    # add mutual inhibition
    net.connect(invert.getOrigin("xBiased"), invert, pstc=self.tauPSC,
                transform=[[0 if i == j else -self.mut_inhib
                            for j in range(self.d)]
                           for i in range(self.d)])

    # threshold output values so that you get a nice clean 0 for
    # non-selected and 1 for selected
    threshf = HRLutils.node_fac()
    threshold = 0.1
    threshf.setIntercept(IndicatorPDF(threshold, 1.0))
    val_threshold = net.make_array("val_threshold", self.N * 2, self.d,
                                   node_factory=threshf, encoders=[[1]])
    val_threshold.addDecodedOrigin(
        "output",
        [PiecewiseConstantFunction([threshold], [0, 1])
         for _ in range(self.d)], "AXON", True)
    net.connect(invert.getOrigin("xBiased"), val_threshold,
                pstc=self.tauPSC)

    # output action (action vectors weighted by BG output)
    weight_actions = net.make_array("weight_actions", 50,
                                    len(actions[0][1]), intercept=(0, 1))
    net.connect(val_threshold.getOrigin("output"), weight_actions,
                transform=MU.transpose([actions[i][1]
                                        for i in range(self.d)]),
                pstc=0.007)

    # save the BG output (selected action and selected action value)
    save_relay = net.make("save_relay", 1, 1, mode="direct")
    save_relay.fixMode()
    save_relay.addDecodedTermination("input", [[1]], 0.001, False)

    saved_action = memory.Memory("saved_action", self.N * 2,
                                 len(actions[0][1]), inputscale=75)
    net.add(saved_action)
    net.connect(weight_actions, saved_action.getTermination("target"))
    net.connect(save_relay, saved_action.getTermination("transfer"))

    saved_vals = memory.Memory("saved_values", self.N * 2, self.d,
                               inputscale=75)
    net.add(saved_vals)
    net.connect(val_threshold.getOrigin("output"),
                saved_vals.getTermination("target"))
    net.connect(save_relay, saved_vals.getTermination("transfer"))

    # put the saved values through a threshold (we want a nice clean
    # zero for non-selected values)
    nfac = HRLutils.node_fac()
    nfac.setIntercept(IndicatorPDF(0.2, 1))
    saved_vals_threshold = net.make_array("saved_vals_threshold", self.N,
                                          self.d, node_factory=nfac,
                                          encoders=[[1]])
    saved_vals_threshold.addDecodedOrigin(
        "output", [PiecewiseConstantFunction([0.3], [0, 1])
                   for _ in range(self.d)], "AXON", True)
    net.connect(saved_vals, saved_vals_threshold, pstc=self.tauPSC)

    self.exposeTermination(bg.getTermination("input"), "input")
    self.exposeTermination(save_relay.getTermination("input"),
                           "save_output")
    self.exposeOrigin(val_threshold.getOrigin("output"), "curr_vals")
    self.exposeOrigin(weight_actions.getOrigin("X"), "curr_action")
    self.exposeOrigin(saved_vals_threshold.getOrigin("output"),
                      "saved_vals")
    self.exposeOrigin(saved_action.getOrigin("X"), "saved_action")
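# A minimal standalone sketch (plain Python, hypothetical values) of the
# action clean-up performed above: mutual inhibition pushes all but the
# strongest action value down, and the piecewise threshold at 0.1 then maps
# the result onto a clean 0/1 selection vector.
def _action_cleanup_sketch(vals=(0.9, 0.4, 0.3), mut_inhib=1.0,
                           threshold=0.1):
    d = len(vals)

    # one pass of the off-diagonal inhibition transform
    inhibited = [vals[i] - mut_inhib * sum(vals[j] for j in range(d)
                                           if j != i)
                 for i in range(d)]

    # stand-in for PiecewiseConstantFunction([threshold], [0, 1])
    selected = [1 if v > threshold else 0 for v in inhibited]

    return selected  # [1, 0, 0]: only the strongest action survives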
def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
             stateradius=1.0, Qradius=1.0, load_weights=None,
             state_evals=None, state_threshold=0.0):
    """Builds the QNetwork.

    :param stateN: number of neurons to use to represent state
    :param stateD: dimension of state vector
    :param state_encoders: encoders to use for neurons in state population
    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param learningrate: learningrate for action value learning rule
    :param stateradius: expected radius of state values
    :param Qradius: expected radius of Q values
    :param load_weights: filename to load Q value weights from
    :param state_evals: evaluation points to use for state population.
        This is used when initializing the Q values (may be necessary if
        the input states don't tend to fall in the hypersphere).
    :param state_threshold: threshold of state neurons (minimum intercept)
    """

    self.name = "QNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    tauPSC = 0.007
    num_actions = len(actions)
    init_Qs = 0.2  # initial value for all Q values

    # if True, use neuron--neuron weight learning, otherwise, use decoder
    # learning
    self.neuron_learning = False

    # set up relays
    state_relay = net.make("state_relay", 1, stateD, mode="direct")
    # fix the simulation mode to the current (direct) mode, so that it
    # isn't overridden later by the simulation config
    state_relay.fixMode()
    state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

    # create state population
    state_fac = HRLutils.node_fac()
    state_fac.setIntercept(IndicatorPDF(state_threshold, 1.0))

    state_pop = net.make("state_pop", stateN, stateD,
                         radius=stateradius,
                         node_factory=state_fac,
                         encoders=state_encoders,
                         eval_points=state_evals)
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(state_relay, state_pop, pstc=tauPSC)

    # store the state value (used to drive the population encoding the
    # previous state)
    saved_state = memory.Memory("saved_state", N * 4, stateD,
                                inputscale=50, radius=stateradius,
                                direct_storage=True)
    net.add(saved_state)

    net.connect(state_relay, saved_state.getTermination("target"))

    # create population representing previous state
    old_state_pop = net.make("old_state_pop", stateN, stateD,
                             radius=stateradius,
                             node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
    old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(saved_state, old_state_pop, pstc=tauPSC)

    # set up action nodes
    if self.neuron_learning:
        # use ActionValues network to compute Q values

        # current Q values
        decoders = state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        actionvals = actionvalues.ActionValues("actionvals", N, stateN,
                                               actions, learningrate,
                                               Qradius=Qradius,
                                               init_decoders=decoders)
        net.add(actionvals)

        net.connect(state_pop.getOrigin("AXON"),
                    actionvals.getTermination("state"))

        # Q values of previous state
        decoders = old_state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        old_actionvals = actionvalues.ActionValues("old_actionvals", N,
                                                   stateN, actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
        net.add(old_actionvals)

        net.connect(old_state_pop.getOrigin("AXON"),
                    old_actionvals.getTermination("state"))
    else:
        # just use decoder on state population to compute Q values

        # current Q values
        origin = state_pop.addDecodedOrigin(
            "vals", [ConstantFunction(num_actions, init_Qs)
                     for _ in range(num_actions)], "AXON")
        state_dlnode = decoderlearningnode.DecoderLearningNode(
            state_pop, origin, learningrate, num_actions,
            name="state_learningnode")
        net.add(state_dlnode)

        # just a little relay node, so that things match up for the rest
        # of the script when you have the neuron--neuron learning
        actionvals = net.make("actionvals", 1, num_actions, mode="direct")
        actionvals.fixMode()
        net.connect(origin, actionvals, pstc=0.001)

        # Q values of previous state
        origin = old_state_pop.addDecodedOrigin(
            "vals", [ConstantFunction(num_actions, init_Qs)
                     for _ in range(num_actions)], "AXON")
        old_state_dlnode = decoderlearningnode.DecoderLearningNode(
            old_state_pop, origin, learningrate, num_actions,
            name="old_state_learningnode")
        net.add(old_state_dlnode)

        old_actionvals = net.make("old_actionvals", 1, num_actions,
                                  mode="direct")
        old_actionvals.fixMode()
        net.connect(origin, old_actionvals, pstc=0.001)

    if load_weights is not None:
        self.loadParams(load_weights)

    # find error between old_actionvals and actionvals (this will be used
    # to drive learning on the new actionvals)
    valdiff = net.make_array("valdiff", N, num_actions,
                             node_factory=HRLutils.node_fac())
    # doubling the values to get a bigger error signal
    net.connect(old_actionvals, valdiff,
                transform=MU.diag([2] * num_actions), pstc=tauPSC)
    net.connect(actionvals, valdiff,
                transform=MU.diag([-2] * num_actions), pstc=tauPSC)

    # calculate diff between curr_state and saved_state and use that to
    # gate valdiff (we only want to train the curr state based on the
    # previous state when the two have similar values)
    # note: the intercept is > 0 so that there is a deadzone in the middle
    # (when the states are similar) where there will be no output
    # inhibition
    statediff = net.make_array("statediff", N, stateD, intercept=(0.2, 1))

    net.connect(state_relay, statediff, pstc=tauPSC)
    net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD),
                pstc=tauPSC)

    net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                transform=[[-10] * stateD
                           for _ in range(valdiff.getNeurons())],
                pstc=tauPSC)

    # connect up valdiff to the error signal for current Q values, and
    # expose the error signal for the previous Q values to the external
    # error
    if self.neuron_learning:
        net.connect(valdiff, actionvals.getTermination("error"))
        self.exposeTermination(old_actionvals.getTermination("error"),
                               "error")
    else:
        net.connect(valdiff, state_dlnode.getTermination("error"))
        self.exposeTermination(old_state_dlnode.getTermination("error"),
                               "error")

    self.exposeTermination(state_relay.getTermination("input"), "state")
    self.exposeTermination(saved_state.getTermination("transfer"),
                           "save_state")
    self.exposeOrigin(actionvals.getOrigin("X"), "vals")
    self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
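# A minimal standalone sketch (plain Python, hypothetical values) of the
# error computation above: valdiff decodes 2 * (old_Q - Q), and the
# statediff population inhibits it whenever |state - saved_state| rises
# out of the intercept deadzone, so learning only runs while the current
# and saved states still agree.
def _valdiff_gating_sketch(old_q=0.6, q=0.4, state=0.55, saved_state=0.5,
                           deadzone=0.2):
    error = 2 * (old_q - q)
    if abs(state - saved_state) > deadzone:
        error = 0.0  # strong (-10) inhibition from statediff shuts it off
    return error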
def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
             stateradius=1.0, Qradius=1.0, load_weights=None,
             state_evals=None, state_threshold=(0.0, 1.0),
             statediff_threshold=0.2, init_Qs=None):
    """Builds the QNetwork.

    :param stateN: number of neurons to use to represent state
    :param stateD: dimension of state vector
    :param state_encoders: encoders to use for neurons in state population
    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param learningrate: learningrate for action value learning rule
    :param stateradius: expected radius of state values
    :param Qradius: expected radius of Q values
    :param load_weights: filename to load Q value weights from
    :param state_evals: evaluation points to use for state population.
        This is used when initializing the Q values (may be necessary if
        the input states don't tend to fall in the hypersphere).
    :param state_threshold: threshold range of state neurons
    :param statediff_threshold: maximum state difference for dual training
    :param init_Qs: initial Q values
    """

    self.name = "QNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    tauPSC = 0.007
    num_actions = len(actions)
    init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs

    # if True, use neuron--neuron weight learning, otherwise, use decoder
    # learning
    self.neuron_learning = False

    # set up relays
    state_relay = net.make("state_relay", 1, stateD, mode="direct")
    state_relay.fixMode()
    state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

    # create state population
    state_fac = HRLutils.node_fac()
    if isinstance(state_threshold, (float, int)):
        state_threshold = (state_threshold, 1.0)
    state_fac.setIntercept(
        IndicatorPDF(state_threshold[0], state_threshold[1]))

    state_pop = net.make("state_pop", stateN, stateD,
                         radius=stateradius,
                         node_factory=state_fac,
                         encoders=state_encoders,
                         eval_points=state_evals)
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(state_relay, state_pop, pstc=tauPSC)

    # store the state value (used to drive population encoding previous
    # state)
    saved_state = memory.Memory("saved_state", N * 4, stateD,
                                inputscale=50, radius=stateradius,
                                direct_storage=True)
    net.add(saved_state)

    net.connect(state_relay, saved_state.getTermination("target"))

    # create population representing previous state
    old_state_pop = net.make("old_state_pop", stateN, stateD,
                             radius=stateradius,
                             node_factory=state_fac,
                             encoders=state_encoders,
                             eval_points=state_evals)
    old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(saved_state, old_state_pop, pstc=tauPSC)

    # set up action nodes
    if self.neuron_learning:
        # use ActionValues network to compute Q values

        # current Q values
        decoders = state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        actionvals = actionvalues.ActionValues("actionvals", N, stateN,
                                               actions, learningrate,
                                               Qradius=Qradius,
                                               init_decoders=decoders)
        net.add(actionvals)

        net.connect(state_pop.getOrigin("AXON"),
                    actionvals.getTermination("state"))

        # Q values of previous state
        decoders = old_state_pop.addDecodedOrigin(
            "init_decoders", [ConstantFunction(stateD, init_Qs)],
            "AXON").getDecoders()
        old_actionvals = actionvalues.ActionValues("old_actionvals", N,
                                                   stateN, actions,
                                                   learningrate,
                                                   Qradius=Qradius,
                                                   init_decoders=decoders)
        net.add(old_actionvals)

        net.connect(old_state_pop.getOrigin("AXON"),
                    old_actionvals.getTermination("state"))
    else:
        # just use decoder on state population to compute Q values

        # current Q values
        origin = state_pop.addDecodedOrigin(
            "vals", [ConstantFunction(num_actions, init_Qs[i])
                     for i in range(num_actions)], "AXON")
        state_dlnode = decoderlearningnode.DecoderLearningNode(
            state_pop, origin, learningrate, num_actions,
            name="state_learningnode")
        net.add(state_dlnode)

        # just a little relay node, so that things match up for the rest
        # of the script when you have the neuron--neuron learning
        actionvals = net.make("actionvals", 1, num_actions, mode="direct")
        actionvals.fixMode()
        net.connect(origin, actionvals, pstc=0.001)

        # Q values of previous state
        origin = old_state_pop.addDecodedOrigin(
            "vals", [ConstantFunction(num_actions, init_Qs[i])
                     for i in range(num_actions)], "AXON")
        old_state_dlnode = decoderlearningnode.DecoderLearningNode(
            old_state_pop, origin, learningrate, num_actions,
            name="old_state_learningnode")
        net.add(old_state_dlnode)

        old_actionvals = net.make("old_actionvals", 1, num_actions,
                                  mode="direct")
        old_actionvals.fixMode()
        net.connect(origin, old_actionvals, pstc=0.001)

    if load_weights is not None:
        self.loadParams(load_weights)

    # find error between old_actionvals and actionvals (this will be used
    # to drive learning on the new actionvals)
    valdiff = net.make_array("valdiff", N, num_actions,
                             node_factory=HRLutils.node_fac())
    # doubling the values to get a bigger error signal
    net.connect(old_actionvals, valdiff,
                transform=MU.diag([2] * num_actions), pstc=tauPSC)
    net.connect(actionvals, valdiff,
                transform=MU.diag([-2] * num_actions), pstc=tauPSC)

    # calculate diff between curr_state and saved_state and use that to
    # gate valdiff (we only want to train the curr state based on previous
    # state when the two have similar values)
    # note: threshold > 0 so that there is a deadzone in the middle (when
    # the states are similar) where there will be no output inhibition
    statediff = net.make_array("statediff", N, stateD,
                               intercept=(statediff_threshold, 1))

    net.connect(state_relay, statediff, pstc=tauPSC)
    net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD),
                pstc=tauPSC)

    net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                transform=[[-10] * stateD
                           for _ in range(valdiff.getNeurons())],
                pstc=tauPSC)

    # connect up valdiff to the error signal for current Q values, and
    # expose the error signal for the previous Q values to the external
    # error
    if self.neuron_learning:
        net.connect(valdiff, actionvals.getTermination("error"))
        self.exposeTermination(old_actionvals.getTermination("error"),
                               "error")
    else:
        net.connect(valdiff, state_dlnode.getTermination("error"))
        self.exposeTermination(old_state_dlnode.getTermination("error"),
                               "error")

    self.exposeTermination(state_relay.getTermination("input"), "state")
    self.exposeTermination(saved_state.getTermination("transfer"),
                           "save_state")
    self.exposeOrigin(actionvals.getOrigin("X"), "vals")
    self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
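# A minimal standalone sketch (plain Python, hypothetical values) of the
# per-action Q initialization above: one constant function per action means
# the decoded "vals" origin starts at init_Qs[i] for action i, regardless
# of the input state.
def _init_q_sketch(init_Qs=(0.2, 0.5, 0.0)):
    # ConstantFunction stand-ins (default arg binds each q at definition)
    funcs = [lambda x, q=q: q for q in init_Qs]
    state = [0.3, -0.7]  # any state vector
    return [f(state) for f in funcs]  # == list(init_Qs)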
def __init__(self, stateN, stateD, state_encoders, actions, learningrate,
             stateradius=1.0, Qradius=1.0, load_weights=None):
    NetworkImpl.__init__(self)
    self.name = "QNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    statelength = math.sqrt(2 * stateradius ** 2)
    tauPSC = 0.007
    num_actions = len(actions)
    init_Qs = 0.0
    # period to save weights (realtime, not simulation time)
    weight_save = 600.0

    # set up relays
    state_relay = net.make("state_relay", 1, stateD, mode="direct")
    state_relay.fixMode()
    state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False)

    # create state population
    state_fac = HRLutils.node_fac()
    state_fac.setIntercept(IndicatorPDF(0, 1))

    state_pop = net.make("state_pop", stateN, stateD,
                         radius=statelength,
                         node_factory=state_fac,
                         encoders=state_encoders)
#                         eval_points=MU.I(stateD))
#    state_pop = net.make_array("state_pop", stateN / stateD, stateD,
#                               node_factory=state_fac)
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(state_relay, state_pop, pstc=tauPSC)

    # create population tied to previous state (to be used in learning)
    saved_state = memory.Memory("saved_state", N * 4, stateD,
                                inputscale=50, radius=stateradius,
                                direct_storage=True)
    net.add(saved_state)

    net.connect(state_relay, saved_state.getTermination("target"))

    old_state_pop = net.make("old_state_pop", stateN, stateD,
                             radius=statelength,
                             node_factory=state_fac,
                             encoders=state_encoders)
#                             eval_points=MU.I(stateD))
#    old_state_pop = net.make_array("old_state_pop", stateN / stateD,
#                                   stateD, node_factory=state_fac)
    old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(saved_state, old_state_pop, pstc=tauPSC)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin(
        "init_decoders", [ConstantFunction(stateD, init_Qs)],
        "AXON").getDecoders()
    actionvals = actionvalues.ActionValues("actionvals", N, stateN,
                                           actions, learningrate,
                                           Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    decoders = old_state_pop.addDecodedOrigin(
        "init_decoders", [ConstantFunction(stateD, init_Qs)],
        "AXON").getDecoders()
    old_actionvals = actionvalues.ActionValues("old_actionvals", N,
                                               stateN, actions,
                                               learningrate,
                                               Qradius=Qradius,
                                               init_decoders=decoders)
    net.add(old_actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))
    net.connect(old_state_pop.getOrigin("AXON"),
                old_actionvals.getTermination("state"))

    if load_weights is not None:
        self.loadWeights(load_weights)

    # find error between old_actionvals and actionvals
    valdiff = net.make_array("valdiff", N, num_actions,
                             node_factory=HRLutils.node_fac())
    # doubling values to get a bigger error signal
    net.connect(old_actionvals, valdiff,
                transform=MU.diag([2] * num_actions), pstc=tauPSC)
    net.connect(actionvals, valdiff,
                transform=MU.diag([-2] * num_actions), pstc=tauPSC)

    # calculate diff between curr_state and saved_state and use that to
    # gate valdiff
    statediff = net.make_array("statediff", N, stateD, intercept=(0.2, 1))

    net.connect(state_relay, statediff, pstc=tauPSC)
    net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD),
                pstc=tauPSC)

    net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x],
                transform=[[-10] * stateD
                           for _ in range(valdiff.getNeurons())],
                pstc=tauPSC)

    net.connect(valdiff, actionvals.getTermination("error"))

    # periodically save the weights
    class WeightSaveThread(threading.Thread):
        def __init__(self, func, prefix, period):
            threading.Thread.__init__(self)
            # daemonize so this thread doesn't block simulator exit
            self.daemon = True
            self.func = func
            self.prefix = prefix
            self.period = period

        def run(self):
            while True:
                time.sleep(self.period)
                self.func(self.prefix)

    wsn = WeightSaveThread(self.saveWeights,
                           os.path.join("weights", "tmp"), weight_save)
    wsn.start()

    self.exposeTermination(state_relay.getTermination("input"), "state")
    self.exposeTermination(old_actionvals.getTermination("error"),
                           "error")
    self.exposeTermination(saved_state.getTermination("transfer"),
                           "save_state")
    self.exposeOrigin(actionvals.getOrigin("X"), "vals")
    self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
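# A minimal standalone sketch (hypothetical class, plain Python) of a
# stoppable variant of the WeightSaveThread above, matching the
# start()/stop() usage that run_badreenvironment expects from
# HRLutils.WeightSaveThread: a threading.Event replaces the bare sleep so
# the save loop can be shut down cleanly.
import threading


class _StoppableWeightSaveThread(threading.Thread):
    def __init__(self, func, prefix, period):
        threading.Thread.__init__(self)
        self.func = func
        self.prefix = prefix
        self.period = period
        self._stop_event = threading.Event()

    def run(self):
        # wait() returns False on timeout and True once stop() is called,
        # so the loop saves every period until stopped
        while not self._stop_event.wait(self.period):
            self.func(self.prefix)

    def stop(self):
        self._stop_event.set()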