def combine_files():
    path = os.path.join("..", "..", "data", "delivery", "flat",
                        "dataoutput_2")

    data = []
    for i in range(10):
        try:
            data += [HRLutils.load_data(path + ".%s.txt" % i)]
        except IOError:
            continue

    print "found %s files to combine" % len(data)
    print len(data[0]), "records"

    starttime = 0.0
    newdata = [[] for _ in data[0]]
    for d in data:
        if len(d) != len(newdata):
            print "uh oh, number of records is wrong"
            print len(d), len(newdata)

        for i, record in enumerate(d):
            for entry in record:
                newdata[i] += [[entry[0] + starttime, entry[1]]]

        starttime = newdata[0][-1][0]

    HRLutils.save_data(path + "_combined.txt", newdata)

def gen_encoders(self, N, contextD, context_scale):
    """Generate encoders for state population of learning agent.

    :param N: number of neurons in state population
    :param contextD: dimension of context vector representation
    :param context_scale: weight on context representation relative to state
        (1.0 = equal weighting)
    """

    if contextD > 0:
        contexts = MU.I(contextD)
    else:
        contexts = [[]]

    # neurons each sensitive to different combinations of stimuli
    encs = (list(MU.I(self.stateD)) +
            [o + s + c
             for o in MU.I(self.num_orientations)
             for s in MU.I(self.num_shapes)
             for c in MU.I(self.num_colours)])

    return [HRLutils.normalize(
        HRLutils.normalize(random.choice(encs)) +
        [x * context_scale for x in random.choice(contexts)])
        for _ in range(N)]

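# Illustrative usage sketch (added here for clarity, not part of the original
# module): in run_badreenvironment the encoders generated above are handed
# directly to the agent's state population. Parameter values below are
# examples only.
def _gen_encoders_usage_sketch(env, nav_args):
    stateN = 500  # hypothetical state population size
    enc = env.gen_encoders(stateN, 0, 0.0)  # no context component
    return smdpagent.SMDPAgent(stateN, env.stateD, env.actions,
                               name="NavAgent", state_encoders=enc,
                               **nav_args)
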
def __init__(self, N, d, name="PositiveBias"):
    """Builds the PositiveBias network.

    :param N: base number of neurons
    :param d: dimension of input signal
    :param name: name for network
    """

    self.name = name
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    tauPSC = 0.007
    biaslevel = 0.03  # the value to be output for negative inputs

    # threshold the input signal to detect positive values
    nfac = HRLutils.node_fac()
    nfac.setIntercept(IndicatorPDF(0, 0.1))
    neg_thresh = net.make_array("neg_thresh", N, d, encoders=[[1]],
                                node_factory=nfac)
    neg_thresh.addDecodedTermination("input", MU.I(d), tauPSC, False)

    # create a population that tries to output biaslevel across
    # all dimensions
    bias_input = net.make_input("bias_input", [biaslevel])
    bias_pop = net.make_array("bias_pop", N, d,
                              node_factory=HRLutils.node_fac(),
                              eval_points=[[x * 0.01]
                                           for x in range(0,
                                                          biaslevel * 200)])
    net.connect(bias_input, bias_pop, pstc=tauPSC)

    # the individual dimensions of bias_pop are then inhibited by the
    # output of neg_thresh (so any positive values don't get the bias)
    net.connect(neg_thresh, bias_pop, pstc=tauPSC,
                func=lambda x: [1.0] if x[0] > 0 else [0.0],
                transform=[[-10 if i == k else 0 for k in range(d)]
                           for i in range(d)
                           for _ in range(bias_pop.getNeurons() / d)])

    # the whole population is inhibited by the learn signal, so that it
    # outputs 0 if the system isn't supposed to be learning
    bias_pop.addTermination("learn",
                            [[-10] for _ in range(bias_pop.getNeurons())],
                            tauPSC, False)

    self.exposeTermination(neg_thresh.getTermination("input"), "input")
    self.exposeTermination(bias_pop.getTermination("learn"), "learn")
    self.exposeOrigin(bias_pop.getOrigin("X"), "X")

def optimal_run(seed=None):
    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("optimal_run")

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall",
                  -1: "floor",
                  -256: "a",
                  -2088896: "b"},
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    class ActionRelay(nef.SimpleNode):
        def __init__(self):
            self.action = actions[0]
            nef.SimpleNode.__init__(self, "ActionRelay")

        def tick(self):
            pass

        def termination_action_in(self, x, dimensions=4):
            self.action = actions[x.index(max(x))]

        def origin_action_out(self):
            return self.action[1]

    em = ActionRelay()
    net.add(em)

    net.connect(env.getOrigin("optimal_move"),
                em.getTermination("action_in"))
    net.connect(em.getOrigin("action_out"), env.getTermination("action"))

    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        seed))
    net.add(data)
    data.record(env.getOrigin("reward"))

    # net.add_to_nengo()
    net.run(1000)

def tick(self):
    cond_active = False

    for c in self.conds:
        if isinstance(c, Timer):
            # if it is a timer entry, just update the timer and check if it
            # has expired
            c.tick()

            if c.ring():
                self.reward = self.rewardval
                self.activate()
                c.reset()

                cond_active = True

        elif (self.env.is_in(self.env.state, c) and
              (self.conds[c] is None or
               HRLutils.similarity(HRLutils.normalize(self.context),
                                   self.conds[c]) > 0.3)):
            # if it is a state entry, check if the agent is in the region
            # associated with that state, and check if that region is the
            # one corresponding to the currently selected context
            self.reward = self.rewardval
            self.rewardamount += 1
            if self.rewardamount > self.rewardresetamount:
                self.activate()
                self.rewardamount = 0

            cond_active = True

    # if no termination conditions met, just give default reward
    if not cond_active:
        self.reward = self.defaultreward

    # reset rewardamount when the reset signal is sent (so that there won't
    # be any leftover rewardamount from the agent's previous decision)
    if self.t > self.resettime[0] and self.t < self.resettime[1]:
        self.rewardamount = 0

    # add a penalty if the state hasn't changed (to help prevent agent from
    # getting stuck)
    if sum(self.prev_state) != 0 and \
            HRLutils.similarity(HRLutils.normalize(self.env.state),
                                HRLutils.normalize(self.prev_state)) < 1.0:
        self.state_penalty = 0.0
    else:
        self.state_penalty += 0.0001
    self.prev_state = copy.deepcopy(self.env.state)

    self.reward = self.reward - self.state_penalty

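# Clarifying note (added, not part of the original class): the self.conds dict
# iterated in tick mixes two kinds of keys, as in test_terminationnode below --
# Timer objects (with value None) and region labels whose value is the context
# vector that must match the currently selected context, e.g.
#
#     conds = {"a": [0, 1],
#              "b": [1, 0],
#              terminationnode.Timer((30, 30)): None}
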
def calc_optimal_move(self):
    """Calculate the optimal move for the agent to take in the current
    state/context."""

    # basically the same as PlaceCellEnvironment.calc_optimal_move, except
    # we look at the current context to find the goal
    goal = [c for c in self.contexts
            if self.contexts[c] == self.context][0]

    stepsize = 0.1
    self.optimal_move = None
    for y in [v * stepsize
              for v in range(int(-self.imgsize[1] / (2 * stepsize)) + 1,
                             int(self.imgsize[1] / (2 * stepsize)) - 1)]:
        for x in [v * stepsize
                  for v in range(int(-self.imgsize[0] / (2 * stepsize)) + 1,
                                 int(self.imgsize[0] / (2 * stepsize)) - 1)]:
            if self.is_in((x, y), goal):
                angle = math.atan2(y - self.state[1], x - self.state[0])
                pt = (math.cos(angle), math.sin(angle))
                self.optimal_move = max(
                    self.actions,
                    key=lambda x:
                    -1 if self.is_in((x[1][0] * self.dx + self.state[0],
                                      x[1][1] * self.dx + self.state[1]),
                                     "wall")
                    else HRLutils.similarity(x[1], pt))[0]
                return

def calc_optimal_move(self):
    """Calculate the optimal move for the agent to take in the current
    state/context."""

    # basically the same as PlaceCellEnvironment.calc_optimal_move, except
    # we look at whether or not we have the package to pick a goal state
    stepsize = 0.1
    self.optimal_move = None
    for y in [v * stepsize
              for v in range(int(-self.imgsize[1] / (2 * stepsize)) + 1,
                             int(self.imgsize[1] / (2 * stepsize)) - 1)]:
        for x in [v * stepsize
                  for v in range(int(-self.imgsize[0] / (2 * stepsize)) + 1,
                                 int(self.imgsize[0] / (2 * stepsize)) - 1)]:
            if ((self.is_in((x, y), "a") and not self.in_hand) or
                    (self.is_in((x, y), "b") and self.in_hand)):
                angle = math.atan2(y - self.state[1], x - self.state[0])
                pt = (math.cos(angle), math.sin(angle))
                self.optimal_move = max(
                    self.actions,
                    key=lambda x:
                    -1 if self.is_in((x[1][0] * self.dx + self.state[0],
                                      x[1][1] * self.dx + self.state[1]),
                                     "wall")
                    else HRLutils.similarity(x[1], pt))[0]
                return

def calc_optimal_move(self):
    """Calculates the optimal move for the agent to make in the current
    state. Used for debugging mainly.
    """

    # grid search the image with the given stepsize
    stepsize = 0.1
    self.optimal_move = None
    for y in [v * stepsize
              for v in range(int(-self.imgsize[1] / (2 * stepsize)) + 1,
                             int(self.imgsize[1] / (2 * stepsize)) - 1)]:
        for x in [v * stepsize
                  for v in range(int(-self.imgsize[0] / (2 * stepsize)) + 1,
                                 int(self.imgsize[0] / (2 * stepsize)) - 1)]:
            # if the pt you're looking at is in the region you're looking
            # for
            if self.is_in((x, y), "target"):
                # generate a target point in the direction from current
                # location to target
                angle = math.atan2(y - self.state[1], x - self.state[0])
                pt = (math.cos(angle), math.sin(angle))

                # pick the action that is closest to the target point
                # note: penalize actions that would involve moving through
                # a wall
                self.optimal_move = max(
                    self.actions,
                    key=lambda x:
                    -1 if self.is_in((x[1][0] * self.dx + self.state[0],
                                      x[1][1] * self.dx + self.state[1]),
                                     "wall")
                    else HRLutils.similarity(x[1], pt))[0]
                return

def test_terminationnode():
    net = nef.Network("testTerminationNode")

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    env = deliveryenvironment.DeliveryEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall",
                  -1: "floor",
                  -256: "a",
                  -2088896: "b"},
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    term_node = terminationnode.TerminationNode(
        {"a": [0, 1], "b": [1, 0], terminationnode.Timer((30, 30)): None},
        env, contextD=2, rewardval=1)
    net.add(term_node)

    print term_node.conds

    context_input = net.make_input("contextinput",
                                   {0.0: [0, 0.1],
                                    0.5: [1, 0],
                                    1.0: [0, 1]})
    net.connect(context_input, term_node.getTermination("context"))

    net.add_to_nengo()
    net.view()

def __init__(self, actions, mapname, colormap, name="PlaceCellEnvironment",
             imgsize=(1.0, 1.0), dx=0.01, placedev=0.1, num_places=None):
    """Initialize environment variables.

    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param mapname: name of file describing environment map
    :param colormap: dict mapping pixel colours to labels
    :param name: name for environment
    :param imgsize: width of space represented by the map image
    :param dx: distance agent moves each timestep
    :param placedev: standard deviation of gaussian place cell activations
    :param num_places: number of placecells to use (if None it will attempt
        to fill the space)
    """

    EnvironmentTemplate.__init__(self, name, 2, actions)

    # parameters
    self.colormap = colormap
    self.rewardamount = 0  # number of timesteps spent in reward

    # number of timesteps to spend in reward before agent is reset
    # note: convenient to express this as time_in_reward / dt
    self.rewardresetamount = 0.6 / 0.001

    self.num_actions = len(actions)
    self.imgsize = [float(x) for x in imgsize]
    self.dx = dx
    self.placedev = placedev
    self.num_places = num_places
    self.optimal_move = None
    self.defaultreward = -0.075

    # load environment
    self.map = ImageIO.read(File(HRLutils.datafile(mapname)))

    # generate place cells
    self.gen_placecells(min_spread=1.0 * placedev)

    # initial conditions
    self.state = self.random_location(avoid=["wall", "target"])
    self.place_activations = [0 for _ in self.placecells]

    self.create_origin("place", lambda: self.place_activations)

    # note: making the value small, so that the noise node will give us
    # some random exploration as well
    self.create_origin(
        "optimal_move",
        lambda: [0.1 if self.optimal_move == a[0] else 0.0
                 for a in self.actions])

def run_gridworld(args, seed=None):
    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_gridworld")

    stateN = 400
    stateD = 2
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    agent = smdpagent.SMDPAgent(stateN, stateD, actions, stateradius=3,
                                **args)
    net.add(agent)

    env = gridworldenvironment.GridWorldEnvironment(
        stateD, actions, HRLutils.datafile("potjansgrid.txt"),
        cartesian=True, delay=(0.6, 0.9), datacollection=False)
    net.add(env)

    net.connect(env.getOrigin("state"), agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), agent.getTermination("reward"))
    net.connect(env.getOrigin("reset"), agent.getTermination("reset"))
    net.connect(env.getOrigin("learn"), agent.getTermination("learn"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_state"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_action"))

    net.connect(agent.getOrigin("action_output"),
                env.getTermination("action"))
    net.connect(agent.getOrigin("Qs"), env.getTermination("Qs"))

    # net.add_to_nengo()
    # view = timeview.View(net.network, update_frequency=5)
    # view.add_watch(gridworldwatch.GridWorldWatch())
    # view.restore()

    net.network.simulator.run(0, 1000, 0.001)

    print "latencies"
    print len(env.latencies)
    print env.latencies

def saveParams(self, prefix):
    # save connection weights
    if self.neuron_learning:
        self.getNode("actionvals").saveWeights(prefix)
        self.getNode("old_actionvals").saveWeights(prefix)
    else:
        dec = self.getNode("state_pop").getOrigin("vals").getDecoders()
        with open(HRLutils.datafile(prefix + "_state_decoders.txt"),
                  "w") as f:
            f.write("\n".join([" ".join([str(x) for x in d])
                               for d in dec]))

        dec = self.getNode("old_state_pop").getOrigin("vals").getDecoders()
        with open(HRLutils.datafile(prefix + "_old_state_decoders.txt"),
                  "w") as f:
            f.write("\n".join([" ".join([str(x) for x in d])
                               for d in dec]))

    # save state encoders
    enc = self.getNode("state_pop").getEncoders()
    with open(HRLutils.datafile(prefix + "_state_encoders.txt"), "w") as f:
        f.write("\n".join([" ".join([str(x) for x in e]) for e in enc]))

def loadParams(self, prefix):
    print "loading params: %s" % prefix

    # load connection weights
    if self.neuron_learning:
        self.getNode("actionvals").loadWeights(prefix)
        self.getNode("old_actionvals").loadWeights(prefix)
    else:
        with open(HRLutils.datafile(prefix + "_state_decoders.txt")) as f:
            self.getNode("state_pop").getOrigin("vals").setDecoders(
                [[float(x) for x in d.split(" ")] for d in f.readlines()])

        with open(HRLutils.datafile(prefix +
                                    "_old_state_decoders.txt")) as f:
            self.getNode("old_state_pop").getOrigin("vals").setDecoders(
                [[float(x) for x in d.split(" ")] for d in f.readlines()])

    # load state encoders
    with open(HRLutils.datafile(prefix + "_state_encoders.txt")) as f:
        enc = [[float(x) for x in e.split(" ")] for e in f.readlines()]
    self.getNode("state_pop").setEncoders(enc)
    # note we assume that state_pop and old_state_pop use the same encoders
    self.getNode("old_state_pop").setEncoders(enc)

def test_bmp():
    from javax.imageio import ImageIO
    from java.io import File

    img = ImageIO.read(File(HRLutils.datafile("contextmap.bmp")))

    colours = [int(val) for val in
               img.getRGB(0, 0, img.getWidth(), img.getHeight(), None, 0,
                          img.getWidth())]

    unique_colours = []
    for c in colours:
        if c not in unique_colours:
            unique_colours += [c]

    print unique_colours

def test_actionvalues():
    net = nef.Network("testActionValues")

    stateN = 200
    N = 100
    stateD = 2
    stateradius = 1.0
    statelength = math.sqrt(2 * stateradius ** 2)
    init_Qs = 0.5
    learningrate = 0.0
    Qradius = 1
    tauPSC = 0.007

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    # state
    state_pop = net.make(
        "state_pop", stateN, stateD,
        radius=statelength,
        node_factory=HRLutils.node_fac(),
        eval_points=[[x / statelength, y / statelength]
                     for x in range(-int(stateradius), int(stateradius))
                     for y in range(-int(stateradius), int(stateradius))])
    state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    state_pop.addDecodedTermination("state_input", MU.I(stateD), tauPSC,
                                    False)

    # set up action nodes
    decoders = state_pop.addDecodedOrigin(
        "init_decoders", [ConstantFunction(stateD, init_Qs)],
        "AXON").getDecoders()

    actionvals = actionvalues.ActionValues("testActionValues", N, stateN,
                                           actions, learningrate,
                                           Qradius=Qradius,
                                           init_decoders=decoders)
    net.add(actionvals)

    net.connect(state_pop.getOrigin("AXON"),
                actionvals.getTermination("state"))

    # input
    inp = net.make_input("input", [0, 0])
    net.connect(inp, state_pop.getTermination("state_input"))

    net.add_to_nengo()
    net.view()

def saveWeights(self, prefix):
    """Save the connection weights to file."""

    prefix = prefix + "_" + self.name
    for n in self.getNodes():
        if n.getName().startswith("action"):
            term = n.getTermination("learning")
            weights = [t.getWeights() for t in term.getNodeTerminations()]

            f = open(HRLutils.datafile(prefix + "_" + n.getName() + ".txt"),
                     "w")
            f.write(str(HRLutils.SEED) + "\n")
            for row in weights:
                f.write(" ".join([str(x) for x in row]) + "\n")
            f.close()

def run_gridworld(args, seed=None):
    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_gridworld")

    stateN = 400
    stateD = 2
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    agent = smdpagent.SMDPAgent(stateN, stateD, actions, stateradius=3,
                                **args)
    net.add(agent)

    env = gridworldenvironment.GridWorldEnvironment(
        stateD, actions, HRLutils.datafile("smallgrid.txt"), cartesian=True,
        delay=(0.6, 0.9), datacollection=False)
    net.add(env)

    net.connect(env.getOrigin("state"), agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), agent.getTermination("reward"))
    net.connect(env.getOrigin("reset"), agent.getTermination("reset"))
    net.connect(env.getOrigin("learn"), agent.getTermination("learn"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_state"))
    net.connect(env.getOrigin("reset"), agent.getTermination("save_action"))

    net.connect(agent.getOrigin("action_output"),
                env.getTermination("action"))
    net.connect(agent.getOrigin("Qs"), env.getTermination("Qs"))

    net.add_to_nengo()
    view = timeview.View(net.network, update_frequency=5)
    view.add_watch(gridworldwatch.GridWorldWatch())
    view.restore()

def loadWeights(self, prefix):
    """Load the connection weights from file."""

    prefix = prefix + "_" + self.name
    for n in self.getNodes():
        if n.getName().startswith("action"):
            f = open(HRLutils.datafile(prefix + "_" + n.getName() + ".txt"),
                     "r")

            seed = int(f.readline())
            if seed != HRLutils.SEED:
                print ("Warning, loading weights with a seed (" +
                       str(seed) + ") that doesn't match current (" +
                       str(HRLutils.SEED) + ")")

            weights = []
            for line in f:
                weights += [[float(x) for x in line.split()]]
            f.close()

            term = n.getTermination("learning")
            for i, t in enumerate(term.getNodeTerminations()):
                t.setWeights(weights[i], True)

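# Illustrative sketch (added, not in the original module): the save/load round
# trip for these weights. The prefix convention mirrors how the run scripts
# build their weight-file prefixes; the exact path and node are assumptions.
def _weight_roundtrip_sketch(actionvals_network):
    prefix = os.path.join("weights", "NavAgent_%s" % HRLutils.SEED)
    actionvals_network.saveWeights(prefix)  # one .txt per "action*" node
    # ... later, e.g. when resuming a run with load_weights set ...
    actionvals_network.loadWeights(prefix)  # warns if stored seed differs
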
def test_placecell_bmp():
    net = nef.Network("TestPlacecellBmp")

    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    env = placecell_bmp.PlaceCellEnvironment(
        actions,
        HRLutils.datafile("contextmap.bmp"),
        colormap={-16777216: "wall",
                  -1: "floor",
                  -256: "target",
                  -2088896: "b"},
        imgsize=(5, 5),
        dx=0.001,
        placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    net.add_to_nengo()
    net.view()

def tick(self):
    # check if env is currently giving reward (we want to give pseudoreward
    # at the same time)
    if self.env.reward != 0:
        if self.target_answer is None:
            self.reward = 0
        else:
            # check if the selected action matches the correct action
            self.reward = (self.rewardval
                           if HRLutils.similarity(self.target_answer,
                                                  self.action) > 0.5
                           else -self.rewardval)
    else:
        self.reward = 0

    # update the target_answer (the action the low level should be
    # selecting given the current context)
    if self.context[0] == "orientation":
        self.target_answer = self.env.state[:self.env.num_orientations]
    elif self.context[0] == "shape":
        self.target_answer = self.env.state[self.env.num_orientations:
                                            -self.env.num_colours]
    else:
        self.target_answer = None

def __init__(self, actions, Qradius=1, noiselevel=0.03):
    """Builds the BGNetwork.

    :param actions: actions available to the system
    :type actions: list of tuples (action_name,action_vector)
    :param Qradius: expected radius of Q values
    :param noiselevel: standard deviation of noise added to Q values for
        exploration
    """

    self.name = "BGNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    self.N = 50
    self.d = len(actions)
    self.mut_inhib = 1.0  # mutual inhibition between actions
    self.tauPSC = 0.007

    # make basal ganglia
    netbg = nef.Network("bg")

    bginput = netbg.make("bginput", 1, self.d, mode="direct")
    bginput.fixMode()
    # divide by Q radius to get values back into 0 -- 1 range
    bginput.addDecodedTermination("input",
                                  MU.diag([1.0 / Qradius
                                           for _ in range(self.d)]),
                                  0.001, False)

    bgoutput = netbg.make("bgoutput", 1, self.d, mode="direct")
    bgoutput.fixMode()

    basalganglia.make_basal_ganglia(netbg, bginput, bgoutput,
                                    dimensions=self.d, neurons=200)
    bg = netbg.network
    net.add(bg)
    bg.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    bg.exposeTermination(bginput.getTermination("input"), "input")
    bg.exposeOrigin(bgoutput.getOrigin("X"), "X")

    # insert noise (used to give some randomness to drive exploration)
    noiselevel = net.make_input("noiselevel", [noiselevel])

    noise = noisenode.NoiseNode(1, dimension=len(actions))
    net.add(noise)

    net.connect(noiselevel, noise.getTermination("scale"))
    net.connect(noise.getOrigin("noise"), "bg.bginput", pstc=0.001)

    # add bias to shift everything up to 0.5--1.5
    biasinput = net.make_input("biasinput", [0.5])
    net.connect(biasinput, "bg.bginput",
                transform=[[1] for _ in range(self.d)], pstc=0.001)

    # invert BG output (so the "selected" action will have a positive value
    # and the rest zero)
    invert = thalamus.make(net, name="invert", neurons=self.N,
                           dimensions=self.d, useQuick=False)
    invert.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    net.connect(bg, invert.getTermination("bg_input"))

    # add mutual inhibition
    net.connect(invert.getOrigin("xBiased"), invert, pstc=self.tauPSC,
                transform=[[0 if i == j else -self.mut_inhib
                            for j in range(self.d)]
                           for i in range(self.d)])

    # threshold output values so that you get a nice clean 0 for
    # non-selected and 1 for selected
    threshf = HRLutils.node_fac()
    threshold = 0.1
    threshf.setIntercept(IndicatorPDF(threshold, 1.0))
    val_threshold = net.make_array("val_threshold", self.N * 2, self.d,
                                   node_factory=threshf, encoders=[[1]])
    val_threshold.addDecodedOrigin(
        "output",
        [PiecewiseConstantFunction([threshold], [0, 1])
         for _ in range(self.d)],
        "AXON", True)
    net.connect(invert.getOrigin("xBiased"), val_threshold,
                pstc=self.tauPSC)

    # output action (action vectors weighted by BG output)
    weight_actions = net.make_array("weight_actions", 50,
                                    len(actions[0][1]), intercept=(0, 1))
    net.connect(val_threshold.getOrigin("output"), weight_actions,
                transform=MU.transpose([actions[i][1]
                                        for i in range(self.d)]),
                pstc=0.007)

    # save the BG output (selected action and selected action value)
    save_relay = net.make("save_relay", 1, 1, mode="direct")
    save_relay.fixMode()
    save_relay.addDecodedTermination("input", [[1]], 0.001, False)

    saved_action = memory.Memory("saved_action", self.N * 2,
                                 len(actions[0][1]), inputscale=75)
    net.add(saved_action)
    net.connect(weight_actions, saved_action.getTermination("target"))
    net.connect(save_relay, saved_action.getTermination("transfer"))

    saved_vals = memory.Memory("saved_values", self.N * 2, self.d,
                               inputscale=75)
    net.add(saved_vals)
    net.connect(val_threshold.getOrigin("output"),
                saved_vals.getTermination("target"))
    net.connect(save_relay, saved_vals.getTermination("transfer"))

    # put the saved values through a threshold (we want a nice clean
    # zero for non-selected values)
    nfac = HRLutils.node_fac()
    nfac.setIntercept(IndicatorPDF(0.2, 1))
    saved_vals_threshold = net.make_array("saved_vals_threshold", self.N,
                                          self.d, node_factory=nfac,
                                          encoders=[[1]])
    saved_vals_threshold.addDecodedOrigin(
        "output",
        [PiecewiseConstantFunction([0.3], [0, 1]) for _ in range(self.d)],
        "AXON", True)

    net.connect(saved_vals, saved_vals_threshold, pstc=self.tauPSC)

    self.exposeTermination(bg.getTermination("input"), "input")
    self.exposeTermination(save_relay.getTermination("input"),
                           "save_output")
    self.exposeOrigin(val_threshold.getOrigin("output"), "curr_vals")
    self.exposeOrigin(weight_actions.getOrigin("X"), "curr_action")
    self.exposeOrigin(saved_vals_threshold.getOrigin("output"),
                      "saved_vals")
    self.exposeOrigin(saved_action.getOrigin("X"), "saved_action")

def __init__(self, gamma, rewardradius=1.0):
    """Builds the ErrorCalc network.

    :param gamma: discount factor
    :param rewardradius: expected radius of reward values
    """

    self.name = "ErrorCalc"
    tauPSC = 0.007
    intPSC = 0.1
    N = 50

    ef = HRLutils.defaultEnsembleFactory()

    # current Q input
    currQ = ef.make("currQ", 1, 1)
    currQ.addDecodedTermination("input", [[1]], 0.001, False)
    self.addNode(currQ)
    currQ.setMode(SimulationMode.DIRECT)
    currQ.fixMode()
    self.exposeTermination(currQ.getTermination("input"), "currQ")

    # input population for resetting the network
    resetef = HRLutils.defaultEnsembleFactory()
    resetef.setEncoderFactory(vectorgenerators.DirectedVectorGenerator([1]))
    resetef.getNodeFactory().setIntercept(IndicatorPDF(0.3, 1.0))
    reset = resetef.make("reset", N, 1)
    reset.addDecodedTermination("input", [[1]], tauPSC, False)
    self.addNode(reset)
    reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    self.exposeTermination(reset.getTermination("input"), "reset")

    # store previous value of Q
    storeQ = memory.Memory("storeQ", N * 4, 1, inputscale=50)
    self.addNode(storeQ)
    self.addProjection(reset.getOrigin("X"),
                       storeQ.getTermination("transfer"))
    self.addProjection(currQ.getOrigin("X"),
                       storeQ.getTermination("target"))

    # calculate discount
    biasInput = FunctionInput("biasinput", [ConstantFunction(1, 1)],
                              Units.UNK)
    self.addNode(biasInput)
    discount = memory.Memory("discount", N * 4, 1, inputscale=50,
                             recurweight=gamma)
    self.addNode(discount)
    self.addProjection(biasInput.getOrigin("origin"),
                       discount.getTermination("target"))
    self.addProjection(reset.getOrigin("X"),
                       discount.getTermination("transfer"))

    # accumulate discounted reward
    # do we really need gamma to make this all work? if it proves to be a
    # problem, could try removing it, and just use un-discounted reward. we
    # can just use the fact that the reward integrator will saturate to
    # prevent rewards from going to infinity
    discountreward = eprod.Eprod("discountreward", N * 4, 1,
                                 weights=[[[1.0 / rewardradius]], [[1.0]]],
                                 oneDinput=True)
    self.addNode(discountreward)
    self.exposeTermination(discountreward.getTermination("A"), "reward")
    self.addProjection(discount.getOrigin("X"),
                       discountreward.getTermination("B"))

    reward = ef.make("reward", N * 4, 1)
    reward.addDecodedTermination("input", [[intPSC]], intPSC, False)
    reward.addDecodedTermination("feedback", [[1]], intPSC, False)
    reward.addTermination("gate",
                          [[-8] for _ in range(reward.getNodeCount())],
                          intPSC, False)
    self.addNode(reward)
    reward.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    self.addProjection(reward.getOrigin("X"),
                       reward.getTermination("feedback"))
    self.addProjection(discountreward.getOrigin("X"),
                       reward.getTermination("input"))
    self.addProjection(reset.getOrigin("X"),
                       reward.getTermination("gate"))

    # weight currQ by discount
    discountcurrQ = eprod.Eprod("discountcurrQ", N * 4, 1, oneDinput=True)
    self.addNode(discountcurrQ)
    self.addProjection(currQ.getOrigin("X"),
                       discountcurrQ.getTermination("A"))
    self.addProjection(discount.getOrigin("X"),
                       discountcurrQ.getTermination("B"))

    # error calculation
    # radius of 2 since max error = maxQ + maxreward - 0 (unless we let Q
    # values go negative)
    error = ef.make("error", N * 2, [2])
    error.addDecodedTermination("currQ", [[1]], tauPSC, False)
    error.addDecodedTermination("reward", [[1]], tauPSC, False)
    error.addDecodedTermination("storeQ", [[-1]], tauPSC, False)
    self.addNode(error)
    self.addProjection(discountcurrQ.getOrigin("X"),
                       error.getTermination("currQ"))
    self.addProjection(reward.getOrigin("X"),
                       error.getTermination("reward"))
    self.addProjection(storeQ.getOrigin("X"),
                       error.getTermination("storeQ"))

    self.exposeOrigin(error.getOrigin("X"), "X")

def run_badreenvironment(nav_args, ctrl_args, bias=0.0, seed=None,
                         flat=False, label="tmp"):
    """Runs the model on the Badre et al. (2010) task."""

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("run_badreenvironment")

    env = badreenvironment.BadreEnvironment(flat=flat)
    net.add(env)

    # ##NAV AGENT
    stateN = 500
    max_state_input = 3
    enc = env.gen_encoders(stateN, 0, 0.0)

    # generate evaluation points
    orientations = MU.I(env.num_orientations)
    shapes = MU.I(env.num_shapes)
    colours = MU.I(env.num_colours)
    evals = (list(MU.diag([3 for _ in range(env.stateD)])) +
             [o + s + c
              for o in orientations for s in shapes for c in colours])

    # create lower level
    nav_agent = smdpagent.SMDPAgent(stateN, env.stateD, env.actions,
                                    name="NavAgent",
                                    stateradius=max_state_input,
                                    state_encoders=enc, state_evals=evals,
                                    discount=0.5, **nav_args)
    net.add(nav_agent)

    print "agent neurons:", nav_agent.countNeurons()

    # actions terminate on fixed schedule (aligned with environment)
    nav_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env, name="NavTermNode",
        state_delay=0.1, reset_delay=0.05, reset_interval=0.1)
    net.add(nav_term_node)

    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("reset"))
    net.connect(nav_term_node.getOrigin("learn"),
                nav_agent.getTermination("learn"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_state"))
    net.connect(nav_term_node.getOrigin("reset"),
                nav_agent.getTermination("save_action"))

    net.connect(nav_agent.getOrigin("action_output"),
                env.getTermination("action"))

    # ##CTRL AGENT
    stateN = 500
    enc = RandomHypersphereVG().genVectors(stateN, env.stateD)
    actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])]
    ctrl_agent = smdpagent.SMDPAgent(stateN, env.stateD, actions,
                                     name="CtrlAgent", state_encoders=enc,
                                     stateradius=max_state_input,
                                     state_evals=evals, discount=0.4,
                                     **ctrl_args)
    net.add(ctrl_agent)

    print "agent neurons:", ctrl_agent.countNeurons()

    net.connect(env.getOrigin("state"),
                ctrl_agent.getTermination("state_input"))

    ctrl_term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.6)): None}, env,
        name="CtrlTermNode", state_delay=0.1, reset_delay=0.05,
        reset_interval=0.1)
    net.add(ctrl_term_node)

    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("reset"))
    net.connect(ctrl_term_node.getOrigin("learn"),
                ctrl_agent.getTermination("learn"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_state"))
    net.connect(ctrl_term_node.getOrigin("reset"),
                ctrl_agent.getTermination("save_action"))

    # ctrl gets a slight bonus if it selects a rule (as opposed to null), to
    # encourage it to not just pick null all the time
    reward_relay = net.make("reward_relay", 1, 3, mode="direct")
    reward_relay.fixMode()
    net.connect(env.getOrigin("reward"), reward_relay,
                transform=[[1], [0], [0]])
    net.connect(ctrl_agent.getOrigin("action_output"), reward_relay,
                transform=[[0, 0], [1, 0], [0, 1]])

    net.connect(reward_relay, ctrl_agent.getTermination("reward"),
                func=lambda x: ((x[0] + bias * abs(x[0]))
                                if x[1] + x[2] > 0.5 else x[0]),
                origin_name="ctrl_reward")

    # ideal reward function (for testing)
    # def ctrl_reward_func(x):
    #     if abs(x[0]) < 0.5:
    #         return 0.0
    #
    #     if flat:
    #         return 1.5 if x[1] + x[2] < 0.5 else -1.5
    #     else:
    #         if x[1] + x[2] < 0.5:
    #             return -1.5
    #         if [round(a) for a in env.state[-2:]] == [round(b)
    #                                                   for b in x[1:]]:
    #             return 1.5
    #         else:
    #             return -1.5
    # net.connect(reward_relay, ctrl_agent.getTermination("reward"),
    #             func=ctrl_reward_func)

    # nav rewarded for picking ctrl target
    def nav_reward_func(x):
        if abs(x[0]) < 0.5 or env.action is None:
            return 0.0

        if x[1] + x[2] < 0.5:
            return x[0]

        if x[1] > x[2]:
            return (1.5 if env.action[1] ==
                    env.state[:env.num_orientations] else -1.5)
        else:
            return (1.5 if env.action[1] ==
                    env.state[env.num_orientations:-env.num_colours]
                    else -1.5)
    net.connect(reward_relay, nav_agent.getTermination("reward"),
                func=nav_reward_func)

    # state for navagent controlled by ctrlagent
    ctrl_state_inhib = net.make_array("ctrl_state_inhib", 50, env.stateD,
                                      radius=2,
                                      mode=HRLutils.SIMULATION_MODE)
    ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \
                   [[-5, 0]] * 50 * env.num_shapes + \
                   [[-5, -5]] * 50 * env.num_colours

    # ctrl output inhibits all the non-selected aspects of the state
    net.connect(env.getOrigin("state"), ctrl_state_inhib)
    net.connect(ctrl_agent.getOrigin("action_output"), ctrl_state_inhib,
                transform=inhib_matrix)

    # also give a boost to the selected aspects (so that neurons are roughly
    # equally activated).
    def boost_func(x):
        if x[0] > 0.5:
            return [3 * v for v in x[1:]]
        else:
            return x[1:]
    boost = net.make("boost", 1, 1 + env.stateD, mode="direct")
    boost.fixMode()
    net.connect(ctrl_state_inhib, boost,
                transform=([[0 for _ in range(env.stateD)]] +
                           list(MU.I(env.stateD))))
    net.connect(ctrl_agent.getOrigin("action_output"), boost,
                transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)])

    net.connect(boost, nav_agent.getTermination("state_input"),
                func=boost_func)

    # save weights
    # period to save weights (realtime, not simulation time)
    weight_save = 1.0
    threads = [
        HRLutils.WeightSaveThread(nav_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (nav_agent.name, seed)),
                                  weight_save),
        HRLutils.WeightSaveThread(ctrl_agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (ctrl_agent.name, seed)),
                                  weight_save)]
    for t in threads:
        t.start()

    # data collection node
    data = datanode.DataNode(period=1,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        label),
                             header="%s %s %s %s %s" % (nav_args, ctrl_args,
                                                        bias, seed, flat))
    print "saving data to", data.filename
    print "header", data.header
    net.add(data)
    nav_q = nav_agent.getNode("QNetwork")
    ctrl_q = ctrl_agent.getNode("QNetwork")
    ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions")
    data.record_avg(env.getOrigin("reward"))
    data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X"))
    data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON"))
    data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X"))
    data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error"))
    data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON"))
    data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON"))
    data.record(env.getOrigin("score"))

    # net.add_to_nengo()
    # net.network.simulator.run(0, 300, 0.001)
    net.view()

    for t in threads:
        t.stop()

def run_contextenvironment(args, seed=None):
    """Runs the model on the context task.

    :param args: kwargs for the agent
    :param seed: random seed
    """

    if seed is not None:
        HRLutils.set_seed(seed)
    seed = HRLutils.SEED

    net = nef.Network("runContextEnvironment")

    if "load_weights" in args and args["load_weights"] is not None:
        args["load_weights"] += "_%s" % seed

    stateN = 1200  # number of neurons to use in state population
    contextD = 2  # dimension of context vector
    context_scale = 1.0  # scale of context representation
    max_state_input = 2  # max length of input vector for state population

    # actions (label and vector) available to the system
    actions = [("up", [0, 1]), ("right", [1, 0]),
               ("down", [0, -1]), ("left", [-1, 0])]

    # context labels and rewards for achieving those context goals
    rewards = {"a": 1.5, "b": 1.5}

    env = contextenvironment.ContextEnvironment(
        actions, HRLutils.datafile("contextmap.bmp"), contextD, rewards,
        colormap={-16777216: "wall",
                  -1: "floor",
                  -256: "a",
                  -2088896: "b"},
        imgsize=(5, 5), dx=0.001, placedev=0.5)
    net.add(env)

    print "generated", len(env.placecells), "placecells"

    # termination node for agent (just goes off on some regular interval)
    term_node = terminationnode.TerminationNode(
        {terminationnode.Timer((0.6, 0.9)): 0.0}, env)
    net.add(term_node)

    # generate encoders and divide by max_state_input (so that all inputs
    # will end up being radius 1)
    enc = env.gen_encoders(stateN, contextD, context_scale)
    enc = MU.prod(enc, 1.0 / max_state_input)

    # load eval points from file
    with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed)) as f:
        print "loading contextbmp_evalpoints_%s.txt" % seed
        evals = [[float(x) for x in l.split(" ")] for l in f.readlines()]

    agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD,
                                actions, state_encoders=enc,
                                state_evals=evals, state_threshold=0.8,
                                **args)
    net.add(agent)

    print "agent neurons:", agent.countNeurons()

    # period to save weights (realtime, not simulation time)
    weight_save = 600.0
    t = HRLutils.WeightSaveThread(agent.getNode("QNetwork").saveParams,
                                  os.path.join("weights", "%s_%s" %
                                               (agent.name, seed)),
                                  weight_save)
    t.start()

    # data collection node
    data = datanode.DataNode(period=5,
                             filename=HRLutils.datafile("dataoutput_%s.txt" %
                                                        seed))
    net.add(data)
    q_net = agent.getNode("QNetwork")
    data.record(env.getOrigin("reward"))
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=max)
    data.record(q_net.getNode("actionvals").getOrigin("X"), func=min)
    data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON"))
    data.record_avg(q_net.getNode("valdiff").getOrigin("X"))
    data.record_avg(env.getOrigin("state"))

    net.connect(env.getOrigin("placewcontext"),
                agent.getTermination("state_input"))
    net.connect(env.getOrigin("reward"), agent.getTermination("reward"))
    net.connect(term_node.getOrigin("reset"),
                agent.getTermination("reset"))
    net.connect(term_node.getOrigin("learn"),
                agent.getTermination("learn"))
    net.connect(term_node.getOrigin("reset"),
                agent.getTermination("save_state"))
    net.connect(term_node.getOrigin("reset"),
                agent.getTermination("save_action"))
    net.connect(agent.getOrigin("action_output"),
                env.getTermination("action"))

    # net.add_to_nengo()
    # net.run(2000)
    net.view()

    t.stop()

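# Illustrative invocation (added; an assumption, not taken from the original
# file): the args dict is forwarded to SMDPAgent as keyword arguments, so a
# run might look like the following, where the specific kwarg names and values
# are hypothetical examples.
#
#     run_contextenvironment({"learningrate": 9e-10, "load_weights": None},
#                            seed=1)
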
def __init__(self, discount, rewardradius=1.0, Qradius=1.0):
    """Builds the ErrorCalc2 network.

    :param discount: discount factor, controls rate of integration
    :param rewardradius: expected radius of reward value
    :param Qradius: expected radius of Q values
    """

    self.name = "ErrorCalc"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    tauPSC = 0.007
    intPSC = 0.1
    N = 50

    # relay for current Q input
    currQ = net.make("currQ", 1, 1, node_factory=HRLutils.node_fac(),
                     mode="direct", radius=Qradius)
    currQ.fixMode()
    currQ.addDecodedTermination("input", [[1]], 0.001, False)

    # input population for resetting the network
    reset_nodefac = HRLutils.node_fac()
    reset_nodefac.setIntercept(IndicatorPDF(0.3, 1.0))
    reset = net.make("reset", N, 1, encoders=[[1]],
                     node_factory=reset_nodefac)
    reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    # this population will begin outputting a value once the reset
    # signal exceeds the threshold, and that output will then be
    # used to reset the rest of the network
    reset.addDecodedTermination("input", [[1]], tauPSC, False)

    # relay for stored previous value of Q
    storeQ = net.make("storeQ", 1, 1, node_factory=HRLutils.node_fac(),
                      mode="direct", radius=Qradius)
    storeQ.fixMode()
    storeQ.addDecodedTermination("input", [[1]], 0.001, False)

    # calculate "discount" by integrating output of storeQ
    acc_storeQ = memory.Memory("acc_storeQ", N * 8, 1, inputscale=50)
    net.add(acc_storeQ)

    zero_input = net.make_input("zero_input", [0])

    net.connect(zero_input, acc_storeQ.getTermination("target"))
    net.connect(reset, acc_storeQ.getTermination("transfer"))

    # threshold storeQ value so it won't go below zero. that is, if we
    # have negative Q values, we don't want to have a negative discount,
    # or that will just drive the highest (negative) Q value upwards, and
    # it will always be selected. negative Q values are instead pushed
    # upwards by the PositiveBias mechanism.
    Qthresh = net.make("Qthresh", N * 2, 1, encoders=[[1]],
                       eval_points=[[x * 0.001] for x in range(1000)],
                       radius=Qradius, intercept=(0, 1))
    net.connect(storeQ, Qthresh, pstc=tauPSC)
    net.connect(Qthresh, acc_storeQ, pstc=intPSC,
                transform=[[discount * intPSC]],
                func=lambda x: max(x[0], 0.0))

    # accumulate reward
    reward = memory.Memory("reward", N * 4, 1, radius=rewardradius,
                           inputscale=50)
    net.add(reward)
    reward.addDecodedTermination("input", [[intPSC]], intPSC, False)
    net.connect(zero_input, reward.getTermination("target"))
    net.connect(reset, reward.getTermination("transfer"))

    # put reward, currQ, storeQ, and discount together to calculate error
    error = net.make("error", N * 2, 1, node_factory=HRLutils.node_fac())

    net.connect(currQ, error, pstc=tauPSC)
    net.connect(reward, error, pstc=tauPSC)
    net.connect(storeQ, error, pstc=tauPSC, transform=[[-1]])
    net.connect(acc_storeQ, error, pstc=tauPSC, transform=[[-1]])

    self.exposeTermination(reward.getTermination("input"), "reward")
    self.exposeTermination(reset.getTermination("input"), "reset")
    self.exposeTermination(currQ.getTermination("input"), "currQ")
    self.exposeTermination(storeQ.getTermination("input"), "storeQ")
    self.exposeOrigin(error.getOrigin("X"), "X")

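# Illustrative sketch (added, not part of the original class): reading off the
# four connections into the "error" population above, the decoded error is
# approximately the combination below. Interpreting it as a TD-style error
# with a continuously accumulated discount term is an assumption based on the
# wiring, not a statement from the original source.
def _errorcalc2_error_sketch(curr_q, acc_reward, store_q, acc_store_q):
    # error ~= Q(s') + accumulated reward - Q(s) - integral(discount * Q(s))
    return curr_q + acc_reward - store_q - acc_store_q
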
def __init__(self, num_actions, Qradius=1.0, rewardradius=1.0,
             discount=0.3):
    """Builds the ErrorNetwork.

    :param num_actions: the number of actions available to the system
    :param Qradius: expected radius of Q values
    :param rewardradius: expected radius of reward signal
    :param discount: discount factor
    """

    self.name = "ErrorNetwork"
    net = nef.Network(self, seed=HRLutils.SEED, quick=False)

    N = 50
    tauPSC = 0.007
    # soft cap on error magnitude (large errors seem to cause problems with
    # overly-generalizing the learning)
    errorcap = 0.1

    # set up relays
    vals_relay = net.make("vals_relay", 1, num_actions, mode="direct")
    vals_relay.fixMode()
    vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                     False)

    old_vals_relay = net.make("old_vals_relay", 1, num_actions,
                              mode="direct")
    old_vals_relay.fixMode()
    old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

    curr_bg_relay = net.make("curr_bg_relay", 1, num_actions, mode="direct")
    curr_bg_relay.fixMode()
    curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                        False)

    saved_bg_relay = net.make("saved_bg_relay", 1, num_actions,
                              mode="direct")
    saved_bg_relay.fixMode()
    saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001,
                                         False)

    # select out only the currently chosen Q value
    gatedQ = net.make_array("gatedQ", N, num_actions,
                            node_factory=HRLutils.node_fac(),
                            radius=Qradius)
    gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(vals_relay, gatedQ, pstc=tauPSC)

    net.connect(curr_bg_relay, gatedQ,
                transform=[[-3 if i != k else 0
                            for k in range(num_actions)]
                           for i in range(num_actions)
                           for _ in range(gatedQ.getNeurons() /
                                          num_actions)],
                pstc=tauPSC)

    currQ = net.make("currQ", 1, 1, mode="direct")
    currQ.fixMode()
    net.connect(gatedQ, currQ, transform=[[1 for _ in range(num_actions)]],
                pstc=0.001)

    # select out only the previously chosen Q value
    gatedstoreQ = net.make_array("gatedstoreQ", N, num_actions,
                                 node_factory=HRLutils.node_fac(),
                                 radius=Qradius)
    gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC)

    net.connect(saved_bg_relay, gatedstoreQ,
                transform=[[-3 if i != k else 0
                            for k in range(num_actions)]
                           for i in range(num_actions)
                           for _ in range(gatedstoreQ.getNeurons() /
                                          num_actions)],
                pstc=tauPSC)

    storeQ = net.make("storeQ", 1, 1, mode="direct")
    storeQ.fixMode()
    net.connect(gatedstoreQ, storeQ,
                transform=[[1 for _ in range(num_actions)]], pstc=0.001)

    # create error calculation network
    error = errorcalc2.ErrorCalc2(discount, rewardradius=rewardradius,
                                  Qradius=Qradius)
    net.add(error)

    net.connect(currQ, error.getTermination("currQ"))
    net.connect(storeQ, error.getTermination("storeQ"))

    # gate error by learning signal and saved BG output (we only want error
    # when the system is supposed to be learning, and we only want error
    # related to the action that was selected)
    gatederror = net.make_array("gatederror", N * 2, num_actions,
                                radius=errorcap,
                                node_factory=HRLutils.node_fac())
    gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])

    # scale the error by Qradius, so that we don't get super huge errors
    # (screws up the gating)
    net.connect(error, gatederror,
                transform=[[1.0 / Qradius] for _ in range(num_actions)],
                pstc=tauPSC)

    learninggate = net.make("learninggate", N, 1,
                            node_factory=HRLutils.node_fac())
    learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE])
    learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC,
                                False)

    net.connect(learninggate, gatederror, func=lambda x: [1.0],
                transform=[[-12] for _ in range(gatederror.getNeurons())],
                pstc=tauPSC)

    net.connect(saved_bg_relay, gatederror,
                transform=[[-12 if i != k else 0
                            for k in range(num_actions)]
                           for i in range(num_actions)
                           for _ in range(gatederror.getNeurons() /
                                          num_actions)],
                pstc=tauPSC)

    # add a positive bias to the error anywhere the Q values are negative
    # (to stop Q values from getting too negative, which screws up the
    # action selection)
    posbias = positivebias.PositiveBias(N, num_actions)
    net.add(posbias)
    net.connect(old_vals_relay, posbias.getTermination("input"))
    net.connect(learninggate, posbias.getTermination("learn"),
                func=lambda x: [1.0])

    biasederror = net.make("biasederror", 1, num_actions, mode="direct")
    biasederror.fixMode()
    net.connect(gatederror, biasederror, pstc=0.001)
    net.connect(posbias, biasederror, pstc=0.001)

    self.exposeTermination(curr_bg_relay.getTermination("input"),
                           "curr_bg_input")
    self.exposeTermination(saved_bg_relay.getTermination("input"),
                           "saved_bg_input")
    self.exposeTermination(vals_relay.getTermination("input"), "vals")
    self.exposeTermination(old_vals_relay.getTermination("input"),
                           "old_vals")
    self.exposeTermination(error.getTermination("reward"), "reward")
    self.exposeTermination(error.getTermination("reset"), "reset")
    self.exposeTermination(learninggate.getTermination("gate"), "learn")
    self.exposeOrigin(biasederror.getOrigin("X"), "error")

def run_deliveryenvironment(navargs, ctrlargs, tag=None, seed=None): """Runs the model on the delivery task. :param navargs: kwargs for the nav_agent (see SMDPAgent.__init__) :param ctrlargs: kwargs for the ctrl_agent (see SMDPAgent.__init__) :param tag: string appended to datafiles associated with this run :param seed: random seed used for this run """ if seed is not None: HRLutils.set_seed(seed) seed = HRLutils.SEED if tag is None: tag = str(seed) net = nef.Network("runDeliveryEnvironment", seed=seed) stateN = 1200 # number of neurons to use in state population contextD = 2 # dimension of context vector context_scale = 1.0 # relative scale of context vector vs state vector max_state_input = 2 # maximum length of input vector to state population # labels and vectors corresponding to basic actions available to the system actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])] if "load_weights" in navargs and navargs["load_weights"] is not None: navargs["load_weights"] += "_%s" % tag if "load_weights" in ctrlargs and ctrlargs["load_weights"] is not None: ctrlargs["load_weights"] += "_%s" % tag # ##ENVIRONMENT env = deliveryenvironment.DeliveryEnvironment( actions, HRLutils.datafile("contextmap.bmp"), colormap={ -16777216: "wall", -1: "floor", -256: "a", -2088896: "b" }, imgsize=(5, 5), dx=0.001, placedev=0.5) net.add(env) print "generated", len(env.placecells), "placecells" # ##NAV AGENT # generate encoders and divide them by max_state_input (so that inputs # will be scaled down to radius 1) enc = env.gen_encoders(stateN, contextD, context_scale) enc = MU.prod(enc, 1.0 / max_state_input) # read in eval points from file with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % tag)) as f: evals = [[float(x) for x in l.split(" ")] for l in f.readlines()] nav_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD, actions, name="NavAgent", state_encoders=enc, state_evals=evals, state_threshold=0.8, **navargs) net.add(nav_agent) print "agent neurons:", nav_agent.countNeurons() # output of nav_agent is what goes to the environment net.connect(nav_agent.getOrigin("action_output"), env.getTermination("action")) # termination node for nav_agent (just a timer that goes off regularly) nav_term_node = terminationnode.TerminationNode( {terminationnode.Timer((0.6, 0.9)): None}, env, contextD=2, name="NavTermNode") net.add(nav_term_node) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("reset")) net.connect(nav_term_node.getOrigin("learn"), nav_agent.getTermination("learn")) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("save_state")) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("save_action")) # ##CTRL AGENT # actions corresponding to "go to A" or "go to B" actions = [("a", [0, 1]), ("b", [1, 0])] ctrl_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD, actions, name="CtrlAgent", state_encoders=enc, state_evals=evals, state_threshold=0.8, **ctrlargs) net.add(ctrl_agent) print "agent neurons:", ctrl_agent.countNeurons() # ctrl_agent gets environmental state and reward net.connect(env.getOrigin("placewcontext"), ctrl_agent.getTermination("state_input")) net.connect(env.getOrigin("reward"), ctrl_agent.getTermination("reward")) # termination node for ctrl_agent (terminates whenever the agent is in the # state targeted by the ctrl_agent) # also has a long timer so that ctrl_agent doesn't get permanently stuck # in one action ctrl_term_node = terminationnode.TerminationNode( { 
"a": [0, 1], "b": [1, 0], terminationnode.Timer((30, 30)): None }, env, contextD=2, name="CtrlTermNode", rewardval=1.5) net.add(ctrl_term_node) # reward for nav_agent is the pseudoreward from ctrl_agent termination net.connect(ctrl_term_node.getOrigin("pseudoreward"), nav_agent.getTermination("reward")) net.connect(ctrl_term_node.getOrigin("reset"), ctrl_agent.getTermination("reset")) net.connect(ctrl_term_node.getOrigin("learn"), ctrl_agent.getTermination("learn")) net.connect(ctrl_term_node.getOrigin("reset"), ctrl_agent.getTermination("save_state")) net.connect(ctrl_term_node.getOrigin("reset"), ctrl_agent.getTermination("save_action")) # connect ctrl_agent action to termination context # this is used so that ctrl_term_node knows what the current goal is (to # determine termination and pseudoreward) net.connect(ctrl_agent.getOrigin("action_output"), ctrl_term_node.getTermination("context")) # state input for nav_agent is the environmental state + the output of # ctrl_agent ctrl_output_relay = net.make("ctrl_output_relay", 1, len(env.placecells) + contextD, mode="direct") ctrl_output_relay.fixMode() trans = (list(MU.I(len(env.placecells))) + [[0 for _ in range(len(env.placecells))] for _ in range(contextD)]) net.connect(env.getOrigin("place"), ctrl_output_relay, transform=trans) net.connect(ctrl_agent.getOrigin("action_output"), ctrl_output_relay, transform=([[0 for _ in range(contextD)] for _ in range(len(env.placecells))] + list(MU.I(contextD)))) net.connect(ctrl_output_relay, nav_agent.getTermination("state_input")) # periodically save the weights # period to save weights (realtime, not simulation time) weight_save = 600.0 threads = [ HRLutils.WeightSaveThread( nav_agent.getNode("QNetwork").saveParams, os.path.join("weights", "%s_%s" % (nav_agent.name, tag)), weight_save), HRLutils.WeightSaveThread( ctrl_agent.getNode("QNetwork").saveParams, os.path.join("weights", "%s_%s" % (ctrl_agent.name, tag)), weight_save) ] for t in threads: t.start() # data collection node data = datanode.DataNode(period=5, filename=HRLutils.datafile("dataoutput_%s.txt" % tag)) net.add(data) data.record(env.getOrigin("reward")) q_net = ctrl_agent.getNode("QNetwork") data.record(q_net.getNode("actionvals").getOrigin("X"), func=max) data.record(q_net.getNode("actionvals").getOrigin("X"), func=min) data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON")) data.record_avg(q_net.getNode("valdiff").getOrigin("X")) data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error")) # net.add_to_nengo() # net.run(10000) net.view() for t in threads: t.stop()
def __init__(self, name, N, d, radius=1.0, inputscale=1.0, recurweight=1.0, direct_storage=False): """Builds the Memory network. :param name: name of network :param N: base number of neurons :param d: dimension of stored value :param radius: radius of stored value :param inputscale: controls how fast the stored value moves to the target :param recurweight: controls the preservation of the stored value :param direct_storage: if True, use directmode for the memory """ self.name = name net = nef.Network(self, seed=HRLutils.SEED, quick=False) self.dimension = d self.radius = radius tauPSC = 0.007 intPSC = 0.1 # population that will store the value if not direct_storage: storage = net.make_array("storage", N, d, node_factory=HRLutils.node_fac(), eval_points=[[x * 0.001] for x in range(-1000, 1000)]) else: storage = net.make("storage", 1, d, mode="direct") storage.fixMode() net.connect(storage, storage, transform=MU.diag([recurweight for _ in range(d)]), pstc=intPSC) # storageinput will represent (target - stored_value), which when used # as input to storage will drive the stored value to target storageinput = net.make_array("storageinput", N, d, node_factory=HRLutils.node_fac()) storageinput.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) storageinput.addDecodedTermination("target", MU.diag([1.0 / radius for _ in range(d)]), tauPSC, False) # note: store everything in -1 -- 1 range by dividing by radius # scale storageinput value by inputscale to control rate at which # it moves to the target net.connect(storageinput, storage, pstc=intPSC, transform=MU.diag([inputscale * intPSC for _ in range(d)])) # subtract currently stored value net.connect(storage, storageinput, pstc=tauPSC, transform=MU.diag([-1 for _ in range(d)])) # we want to open the input gate when the transfer signal arrives (to # transfer storageinput to storage). using a double inhibition setup # (rather than just feeding it e.g. the the inverse of the transfer # signal) so that we get a nice clean zero # this inhibits the storageinput population (to block input to the # storage) transferinhib = net.make("transferinhib", N, 1, node_factory=HRLutils.node_fac()) transferinhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) transferinhib.addTermination("gate", [[-10] for _ in range(transferinhib.getNeurons())], tauPSC, False) net.connect(transferinhib, storageinput, pstc=tauPSC, transform=[[-10] for _ in range(storageinput.getNeurons())]) # this drives the transferinhib population (so that by default it will # block any input). inhibiting transferinhib will thus remove the # inhibition on storageinput, and change the stored value biasinput = net.make_input("biasinput", [1]) net.connect(biasinput, transferinhib, pstc=tauPSC) # output population (to undo radius scaling) storageoutput = net.make("storageoutput", 1, d, mode="direct") storageoutput.fixMode() net.connect(storage, storageoutput, pstc=0.001, transform=MU.diag([radius for _ in range(d)])) self.exposeTermination(transferinhib.getTermination("gate"), "transfer") self.exposeTermination(storageinput.getTermination("target"), "target") self.exposeOrigin(storageoutput.getOrigin("X"), "X")
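# Idealized sketch (an assumption, not the neural implementation) of the
# stored value's dynamics in the Memory network. Under the standard NEF
# integrator interpretation, the recurrent connection contributes
# ds/dt = (recurweight - 1) * s / intPSC, and while the transfer gate is open
# the storageinput population adds ds/dt = inputscale * (target/radius - s).
# The helper name memory_trace is hypothetical.
def memory_trace(target, steps, radius=1.0, inputscale=1.0, recurweight=1.0,
                 transfer_open=True, intPSC=0.1, dt=0.001):
    s = 0.0  # radius-normalized stored value
    out = []
    for _ in range(steps):
        # recurrent connection: value decays toward recurweight * value
        ds = (recurweight - 1.0) * s / intPSC
        if transfer_open:
            # storageinput represents (target - stored), scaled by inputscale
            ds += inputscale * (target / radius - s)
        s += ds * dt
        s = max(-1.0, min(1.0, s))  # the storage population saturates at radius
        out.append(s * radius)      # storageoutput undoes the radius scaling
    return out

# with the gate open and inputscale=50 the value converges quickly:
# memory_trace(0.8, 200, inputscale=50)[-1]  # ~0.8
# with the gate closed and recurweight=1.0 the value is simply held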
def run_contextenvironment(args, seed=None): """Runs the model on the context task. :param args: kwargs for the agent :param seed: random seed """ if seed is not None: HRLutils.set_seed(seed) seed = HRLutils.SEED net = nef.Network("runContextEnvironment") if "load_weights" in args and args["load_weights"] is not None: args["load_weights"] += "_%s" % seed stateN = 1200 # number of neurons to use in state population contextD = 2 # dimension of context vector context_scale = 1.0 # scale of context representation max_state_input = 2 # max length of input vector for state population # actions (label and vector) available to the system actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])] # context labels and rewards for achieving those context goals rewards = {"a": 1.5, "b": 1.5} env = contextenvironment.ContextEnvironment( actions, HRLutils.datafile("contextmap.bmp"), contextD, rewards, colormap={ -16777216: "wall", -1: "floor", -256: "a", -2088896: "b" }, imgsize=(5, 5), dx=0.001, placedev=0.5) net.add(env) print "generated", len(env.placecells), "placecells" # termination node for agent (just goes off on some regular interval) term_node = terminationnode.TerminationNode( {terminationnode.Timer((0.6, 0.9)): 0.0}, env) net.add(term_node) # generate encoders and divide by max_state_input (so that all inputs # will end up being radius 1) enc = env.gen_encoders(stateN, contextD, context_scale) enc = MU.prod(enc, 1.0 / max_state_input) # load eval points from file with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed)) as f: print "loading contextbmp_evalpoints_%s.txt" % seed evals = [[float(x) for x in l.split(" ")] for l in f.readlines()] agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD, actions, state_encoders=enc, state_evals=evals, state_threshold=0.8, **args) net.add(agent) print "agent neurons:", agent.countNeurons() # period to save weights (realtime, not simulation time) weight_save = 600.0 t = HRLutils.WeightSaveThread( agent.getNode("QNetwork").saveParams, os.path.join("weights", "%s_%s" % (agent.name, seed)), weight_save) t.start() # data collection node data = datanode.DataNode(period=5, filename=HRLutils.datafile("dataoutput_%s.txt" % seed)) net.add(data) q_net = agent.getNode("QNetwork") data.record(env.getOrigin("reward")) data.record(q_net.getNode("actionvals").getOrigin("X"), func=max) data.record(q_net.getNode("actionvals").getOrigin("X"), func=min) data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON")) data.record_avg(q_net.getNode("valdiff").getOrigin("X")) data.record_avg(env.getOrigin("state")) net.connect(env.getOrigin("placewcontext"), agent.getTermination("state_input")) net.connect(env.getOrigin("reward"), agent.getTermination("reward")) net.connect(term_node.getOrigin("reset"), agent.getTermination("reset")) net.connect(term_node.getOrigin("learn"), agent.getTermination("learn")) net.connect(term_node.getOrigin("reset"), agent.getTermination("save_state")) net.connect(term_node.getOrigin("reset"), agent.getTermination("save_action")) net.connect(agent.getOrigin("action_output"), env.getTermination("action")) # net.add_to_nengo() # net.run(2000) net.view() t.stop()
def __init__(self, gamma, rewardradius=1.0): """Builds the ErrorCalc network. :param gamma: discount factor :param rewardradius: expected radius of reward values """ self.name = "ErrorCalc" tauPSC = 0.007 intPSC = 0.1 N = 50 ef = HRLutils.defaultEnsembleFactory() # current Q input currQ = ef.make("currQ", 1, 1) currQ.addDecodedTermination("input", [[1]], 0.001, False) self.addNode(currQ) currQ.setMode(SimulationMode.DIRECT) currQ.fixMode() self.exposeTermination(currQ.getTermination("input"), "currQ") # input population for resetting the network resetef = HRLutils.defaultEnsembleFactory() resetef.setEncoderFactory(vectorgenerators.DirectedVectorGenerator([1 ])) resetef.getNodeFactory().setIntercept(IndicatorPDF(0.3, 1.0)) reset = resetef.make("reset", N, 1) reset.addDecodedTermination("input", [[1]], tauPSC, False) self.addNode(reset) reset.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) self.exposeTermination(reset.getTermination("input"), "reset") # store previous value of Q storeQ = memory.Memory("storeQ", N * 4, 1, inputscale=50) self.addNode(storeQ) self.addProjection(reset.getOrigin("X"), storeQ.getTermination("transfer")) self.addProjection(currQ.getOrigin("X"), storeQ.getTermination("target")) # calculate discount biasInput = FunctionInput("biasinput", [ConstantFunction(1, 1)], Units.UNK) self.addNode(biasInput) discount = memory.Memory("discount", N * 4, 1, inputscale=50, recurweight=gamma) self.addNode(discount) self.addProjection(biasInput.getOrigin("origin"), discount.getTermination("target")) self.addProjection(reset.getOrigin("X"), discount.getTermination("transfer")) # accumulate discounted reward # do we really need gamma to make this all work? if it proves to be a # problem, could try removing it, and just use un-discounted reward. 
# we can just use the fact that the reward integrator will saturate to # prevent rewards from going to infinity discountreward = eprod.Eprod("discountreward", N * 4, 1, weights=[[[1.0 / rewardradius]], [[1.0]]], oneDinput=True) self.addNode(discountreward) self.exposeTermination(discountreward.getTermination("A"), "reward") self.addProjection(discount.getOrigin("X"), discountreward.getTermination("B")) reward = ef.make("reward", N * 4, 1) reward.addDecodedTermination("input", [[intPSC]], intPSC, False) reward.addDecodedTermination("feedback", [[1]], intPSC, False) reward.addTermination("gate", [[-8] for _ in range(reward.getNodeCount())], intPSC, False) self.addNode(reward) reward.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) self.addProjection(reward.getOrigin("X"), reward.getTermination("feedback")) self.addProjection(discountreward.getOrigin("X"), reward.getTermination("input")) self.addProjection(reset.getOrigin("X"), reward.getTermination("gate")) # weight currQ by discount discountcurrQ = eprod.Eprod("discountcurrQ", N * 4, 1, oneDinput=True) self.addNode(discountcurrQ) self.addProjection(currQ.getOrigin("X"), discountcurrQ.getTermination("A")) self.addProjection(discount.getOrigin("X"), discountcurrQ.getTermination("B")) # error calculation # radius of 2 since max error = maxQ + maxreward - 0 (unless we let Q # values go negative) error = ef.make("error", N * 2, [2]) error.addDecodedTermination("currQ", [[1]], tauPSC, False) error.addDecodedTermination("reward", [[1]], tauPSC, False) error.addDecodedTermination("storeQ", [[-1]], tauPSC, False) self.addNode(error) self.addProjection(discountcurrQ.getOrigin("X"), error.getTermination("currQ")) self.addProjection(reward.getOrigin("X"), error.getTermination("reward")) self.addProjection(storeQ.getOrigin("X"), error.getTermination("storeQ")) self.exposeOrigin(error.getOrigin("X"), "X")
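# Rough plain-Python sketch (illustration only) of what this ErrorCalc
# network computes over one SMDP step: the "discount" memory starts near 1
# after a reset and decays through its recurrent weight (recurweight=gamma),
# the reward memory integrates the discounted reward, and the error is
# discount * currQ + accumulated_reward - storeQ, i.e. roughly the discounted
# SMDP TD error. The rewardradius scaling and neural saturation are ignored,
# and the exponential decay is only an approximation of the recurrence.
def simulate_error_calc(rewards, currQ, storeQ, gamma, intPSC=0.1, dt=0.001):
    discount = 1.0     # reset drives the discount memory to its target of 1
    acc_reward = 0.0
    for r in rewards:
        acc_reward += discount * r * dt      # discountreward -> reward memory
        discount *= gamma ** (dt / intPSC)   # decay from recurweight=gamma
    # error population: discountcurrQ + reward - storeQ
    return discount * currQ + acc_reward - storeQ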
def run_flat_delivery(args, seed=None): """Runs the model on the delivery task with only one hierarchical level.""" if seed is not None: HRLutils.set_seed(seed) seed = HRLutils.SEED net = nef.Network("run_flat_delivery") if "load_weights" in args and args["load_weights"] is not None: args["load_weights"] += "_%s" % seed stateN = 1200 contextD = 2 context_scale = 1.0 max_state_input = 2 actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])] # ##ENVIRONMENT env = deliveryenvironment.DeliveryEnvironment( actions, HRLutils.datafile("contextmap.bmp"), colormap={-16777216: "wall", -1: "floor", -256: "a", -2088896: "b"}, imgsize=(5, 5), dx=0.001, placedev=0.5) net.add(env) print "generated", len(env.placecells), "placecells" # ##NAV AGENT enc = env.gen_encoders(stateN, contextD, context_scale) enc = MU.prod(enc, 1.0 / max_state_input) with open(HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed)) as f: evals = [[float(x) for x in l.split(" ")] for l in f.readlines()] nav_agent = smdpagent.SMDPAgent(stateN, len(env.placecells) + contextD, actions, name="NavAgent", state_encoders=enc, state_evals=evals, state_threshold=0.8, **args) net.add(nav_agent) print "agent neurons:", nav_agent.countNeurons() net.connect(nav_agent.getOrigin("action_output"), env.getTermination("action")) net.connect(env.getOrigin("placewcontext"), nav_agent.getTermination("state_input")) nav_term_node = terminationnode.TerminationNode( {terminationnode.Timer((0.6, 0.9)): None}, env, name="NavTermNode", contextD=2) net.add(nav_term_node) net.connect(env.getOrigin("context"), nav_term_node.getTermination("context")) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("reset")) net.connect(nav_term_node.getOrigin("learn"), nav_agent.getTermination("learn")) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("save_state")) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("save_action")) reward_relay = net.make("reward_relay", 1, 1, mode="direct") reward_relay.fixMode() net.connect(env.getOrigin("reward"), reward_relay) net.connect(nav_term_node.getOrigin("pseudoreward"), reward_relay) net.connect(reward_relay, nav_agent.getTermination("reward")) # period to save weights (realtime, not simulation time) weight_save = 600.0 HRLutils.WeightSaveThread(nav_agent.getNode("QNetwork").saveParams, os.path.join("weights", "%s_%s" % (nav_agent.name, seed)), weight_save).start() # data collection node data = datanode.DataNode(period=5, filename=HRLutils.datafile("dataoutput_%s.txt" % seed)) net.add(data) q_net = nav_agent.getNode("QNetwork") data.record_avg(env.getOrigin("reward")) data.record_avg(q_net.getNode("actionvals").getOrigin("X")) data.record_sparsity(q_net.getNode("state_pop").getOrigin("AXON")) data.record_avg(q_net.getNode("valdiff").getOrigin("X")) data.record_avg(nav_agent.getNode("ErrorNetwork").getOrigin("error")) # net.add_to_nengo() # net.run(10000) net.view()
def __init__(self, num_actions, Qradius=1.0, rewardradius=1.0, discount=0.3): """Builds the ErrorNetwork. :param num_actions: the number of actions available to the system :param Qradius: expected radius of Q values :param rewardradius: expected radius of reward signal :param discount: discount factor """ self.name = "ErrorNetwork" net = nef.Network(self, seed=HRLutils.SEED, quick=False) N = 50 tauPSC = 0.007 errorcap = 0.1 # soft cap on error magnitude (large errors seem to # cause problems with overly-generalizing the learning) # set up relays vals_relay = net.make("vals_relay", 1, num_actions, mode="direct") vals_relay.fixMode() vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False) old_vals_relay = net.make("old_vals_relay", 1, num_actions, mode="direct") old_vals_relay.fixMode() old_vals_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False) curr_bg_relay = net.make("curr_bg_relay", 1, num_actions, mode="direct") curr_bg_relay.fixMode() curr_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False) saved_bg_relay = net.make("saved_bg_relay", 1, num_actions, mode="direct") saved_bg_relay.fixMode() saved_bg_relay.addDecodedTermination("input", MU.I(num_actions), 0.001, False) # select out only the currently chosen Q value gatedQ = net.make_array("gatedQ", N * 2, num_actions, node_factory=HRLutils.node_fac(), radius=Qradius) gatedQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) net.connect(vals_relay, gatedQ, pstc=tauPSC) net.connect( curr_bg_relay, gatedQ, transform=[[-3 if i != k else 0 for k in range(num_actions)] for i in range(num_actions) for _ in range(gatedQ.getNeurons() / num_actions)], pstc=tauPSC) currQ = net.make("currQ", 1, 1, mode="direct") currQ.fixMode() net.connect(gatedQ, currQ, transform=[[1 for _ in range(num_actions)]], pstc=0.001) # select out only the previously chosen Q value gatedstoreQ = net.make_array("gatedstoreQ", N * 2, num_actions, node_factory=HRLutils.node_fac(), radius=Qradius) gatedstoreQ.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) net.connect(old_vals_relay, gatedstoreQ, pstc=tauPSC) net.connect( saved_bg_relay, gatedstoreQ, transform=[[-3 if i != k else 0 for k in range(num_actions)] for i in range(num_actions) for _ in range(gatedstoreQ.getNeurons() / num_actions)], pstc=tauPSC) storeQ = net.make("storeQ", 1, 1, mode="direct") storeQ.fixMode() net.connect(gatedstoreQ, storeQ, transform=[[1 for _ in range(num_actions)]], pstc=0.001) # create error calculation network error = errorcalc2.ErrorCalc2(discount, rewardradius=rewardradius, Qradius=Qradius) net.add(error) net.connect(currQ, error.getTermination("currQ")) net.connect(storeQ, error.getTermination("storeQ")) # gate error by learning signal and saved BG output (we only want error # when the system is supposed to be learning, and we only want error # related to the action that was selected) gatederror = net.make_array("gatederror", N * 2, num_actions, radius=errorcap, node_factory=HRLutils.node_fac()) gatederror.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) net.connect(error, gatederror, transform=[[1.0 / Qradius] for _ in range(num_actions)], pstc=tauPSC) # scale the error by Qradius, so that we don't get super huge errors # (causes problems with the gating) learninggate = net.make("learninggate", N, 1, node_factory=HRLutils.node_fac()) learninggate.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) learninggate.addTermination("gate", [[-10] for _ in range(N)], tauPSC, False) net.connect(learninggate, gatederror, 
func=lambda x: [1.0], transform=[[-12] for _ in range(gatederror.getNeurons())], pstc=tauPSC) net.connect( saved_bg_relay, gatederror, transform=[[-12 if i != k else 0 for k in range(num_actions)] for i in range(num_actions) for _ in range(gatederror.getNeurons() / num_actions)], pstc=tauPSC) # add a positive bias to the error anywhere the Q values are negative # (to stop Q values from getting too negative, which causes problems # with the action selection) posbias = positivebias.PositiveBias(N, num_actions) net.add(posbias) net.connect(old_vals_relay, posbias.getTermination("input")) net.connect(learninggate, posbias.getTermination("learn"), func=lambda x: [1.0]) biasederror = net.make("biasederror", 1, num_actions, mode="direct") biasederror.fixMode() net.connect(gatederror, biasederror, pstc=0.001) net.connect(posbias, biasederror, pstc=0.001) self.exposeTermination(curr_bg_relay.getTermination("input"), "curr_bg_input") self.exposeTermination(saved_bg_relay.getTermination("input"), "saved_bg_input") self.exposeTermination(vals_relay.getTermination("input"), "vals") self.exposeTermination(old_vals_relay.getTermination("input"), "old_vals") self.exposeTermination(error.getTermination("reward"), "reward") self.exposeTermination(error.getTermination("reset"), "reset") self.exposeTermination(learninggate.getTermination("gate"), "learn") self.exposeOrigin(biasederror.getOrigin("X"), "error")
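# The inhibitory transforms built inline above (the "-3 if i != k else 0"
# expressions) give each block of neurons for action i an inhibitory weight
# from every basal-ganglia output dimension except i, so only the selected
# action's value survives the gating. A small helper showing the structure
# (the helper itself is illustrative, not part of the model):
def gating_transform(num_actions, neurons_per_action, weight=-3):
    return [[weight if i != k else 0 for k in range(num_actions)]
            for i in range(num_actions)
            for _ in range(neurons_per_action)]

# e.g. two actions, two neurons per action:
# gating_transform(2, 2) == [[0, -3], [0, -3], [-3, 0], [-3, 0]]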
def __init__(self, name, N, stateN, actions, learningrate, Qradius=1.0, init_decoders=None): """Build ActionValues network. :param name: name of Network :param N: base number of neurons :param stateN: number of neurons in state population :param actions: actions available to the system :type actions: list of tuples (action_name,action_vector) :param learningrate: learning rate for PES rule :param Qradius: expected radius of Q values :param init_decoders: if specified, will be used to initialize the connection weights to whatever function is specified by decoders """ self.name = name net = nef.Network(self, seed=HRLutils.SEED, quick=False) self.N = N self.learningrate = learningrate self.supervision = 1.0 # don't use the unsupervised stuff at all self.tauPSC = 0.007 modterms = [] learnterms = [] # relays output = net.make("output", 1, len(actions), mode="direct") output.fixMode() for i, action in enumerate(actions): # create one population corresponding to each action act_pop = net.make("action_" + action[0], self.N * 4, 1, node_factory=HRLutils.node_fac()) act_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) # add error termination modterm = act_pop.addDecodedTermination( "error", [[0 if j != i else 1 for j in range(len(actions))]], 0.005, True) # set modulatory transform so that it selects one dimension of # the error signal # create learning termination if init_decoders is not None: weights = MU.prod(act_pop.getEncoders(), MU.transpose(init_decoders)) else: weights = [[ random.uniform(-1e-3, 1e-3) for j in range(stateN) ] for i in range(act_pop.getNeurons())] learningterm = act_pop.addHPESTermination("learning", weights, 0.005, False, None) # initialize the learning rule net.learn(act_pop, learningterm, modterm, rate=self.learningrate, supervisionRatio=self.supervision) # connect each action back to output relay net.connect(act_pop.getOrigin("X"), output, transform=[[0] if j != i else [Qradius] for j in range(len(actions))], pstc=0.001) # note, we learn all the Q values with radius 1, then just # multiply by the desired Q radius here modterms += [modterm] learnterms += [learningterm] # use EnsembleTerminations to group the individual action terminations # into one multi-dimensional termination self.exposeTermination(EnsembleTermination(self, "state", learnterms), "state") self.exposeTermination(EnsembleTermination(self, "error", modterms), "error") self.exposeOrigin(output.getOrigin("X"), "X")
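# When init_decoders is given, the initial neuron-to-neuron weights are
# MU.prod(act_pop.getEncoders(), MU.transpose(init_decoders)), i.e. each
# weight is the dot product of a postsynaptic encoder with a presynaptic
# decoder, so the learned connection initially computes the same function as
# the decoded origin the decoders were taken from. A plain-Python equivalent
# (illustrative helper, not part of the model):
def init_weights(encoders, decoders):
    # encoders: N_post x D, decoders: N_pre x D -> weights: N_post x N_pre
    return [[sum(e * d for e, d in zip(enc, dec)) for dec in decoders]
            for enc in encoders]

# init_weights([[1.0], [-1.0]], [[0.2], [0.5]]) == [[0.2, 0.5], [-0.2, -0.5]]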
def __init__(self, stateN, stateD, state_encoders, actions, learningrate, stateradius=1.0, Qradius=1.0, load_weights=None, state_evals=None, state_threshold=(0.0, 1.0), statediff_threshold=0.2, init_Qs=None): """Builds the QNetwork. :param stateN: number of neurons to use to represent state :param stateD: dimension of state vector :param state_encoders: encoders to use for neurons in state population :param actions: actions available to the system :type actions: list of tuples (action_name,action_vector) :param learningrate: learningrate for action value learning rule :param stateradius: expected radius of state values :param Qradius: expected radius of Q values :param load_weights: filename to load Q value weights from :param state_evals: evaluation points to use for state population. This is used when initializing the Q values (may be necessary if the input states don't tend to fall in the hypersphere). :param state_threshold: threshold range of state neurons :param statediff_threshold: maximum state difference for dual training :param init_Qs: initial Q values """ self.name = "QNetwork" net = nef.Network(self, seed=HRLutils.SEED, quick=False) N = 50 tauPSC = 0.007 num_actions = len(actions) init_Qs = [0.2] * num_actions if init_Qs is None else init_Qs # if True, use neuron--neuron weight learning, otherwise, use decoder # learning self.neuron_learning = False # set up relays state_relay = net.make("state_relay", 1, stateD, mode="direct") state_relay.fixMode() state_relay.addDecodedTermination("input", MU.I(stateD), 0.001, False) # create state population state_fac = HRLutils.node_fac() if isinstance(state_threshold, (float, int)): state_threshold = (state_threshold, 1.0) state_fac.setIntercept( IndicatorPDF(state_threshold[0], state_threshold[1])) state_pop = net.make("state_pop", stateN, stateD, radius=stateradius, node_factory=state_fac, encoders=state_encoders, eval_points=state_evals) state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) net.connect(state_relay, state_pop, pstc=tauPSC) # store the state value (used to drive population encoding previous # state) saved_state = memory.Memory("saved_state", N * 4, stateD, inputscale=50, radius=stateradius, direct_storage=True) net.add(saved_state) net.connect(state_relay, saved_state.getTermination("target")) # create population representing previous state old_state_pop = net.make("old_state_pop", stateN, stateD, radius=stateradius, node_factory=state_fac, encoders=state_encoders, eval_points=state_evals) old_state_pop.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) net.connect(saved_state, old_state_pop, pstc=tauPSC) # set up action nodes if self.neuron_learning: # use ActionValues network to compute Q values # current Q values decoders = state_pop.addDecodedOrigin( "init_decoders", [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders() actionvals = actionvalues.ActionValues("actionvals", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders) net.add(actionvals) net.connect(state_pop.getOrigin("AXON"), actionvals.getTermination("state")) # Q values of previous state decoders = old_state_pop.addDecodedOrigin( "init_decoders", [ConstantFunction(stateD, init_Qs)], "AXON").getDecoders() old_actionvals = actionvalues.ActionValues("old_actionvals", N, stateN, actions, learningrate, Qradius=Qradius, init_decoders=decoders) net.add(old_actionvals) net.connect(old_state_pop.getOrigin("AXON"), old_actionvals.getTermination("state")) else: # just use decoder on state population to compute Q values # 
current Q values origin = state_pop.addDecodedOrigin("vals", [ ConstantFunction(num_actions, init_Qs[i]) for i in range(num_actions) ], "AXON") state_dlnode = decoderlearningnode.DecoderLearningNode( state_pop, origin, learningrate, num_actions, name="state_learningnode") net.add(state_dlnode) # just a little relay node, so that things match up for the rest of # the script when you have the neuron -- neuron learning actionvals = net.make("actionvals", 1, num_actions, mode="direct") actionvals.fixMode() net.connect(origin, actionvals, pstc=0.001) # Q values of previous state origin = old_state_pop.addDecodedOrigin("vals", [ ConstantFunction(num_actions, init_Qs[i]) for i in range(num_actions) ], "AXON") old_state_dlnode = decoderlearningnode.DecoderLearningNode( old_state_pop, origin, learningrate, num_actions, name="old_state_learningnode") net.add(old_state_dlnode) old_actionvals = net.make("old_actionvals", 1, num_actions, mode="direct") old_actionvals.fixMode() net.connect(origin, old_actionvals, pstc=0.001) if load_weights is not None: self.loadParams(load_weights) # find error between old_actionvals and actionvals (this will be used # to drive learning on the new actionvals) valdiff = net.make_array("valdiff", N, num_actions, node_factory=HRLutils.node_fac()) # doubling the values to get a bigger error signal net.connect(old_actionvals, valdiff, transform=MU.diag([2] * num_actions), pstc=tauPSC) net.connect(actionvals, valdiff, transform=MU.diag([-2] * num_actions), pstc=tauPSC) # calculate diff between curr_state and saved_state and use that to # gate valdiff (we only want to train the curr state based on previous # state when the two have similar values) # note: threshold > 0 so that there is a deadzone in the middle (when # the states are similar) where there will be no output inhibition statediff = net.make_array("statediff", N, stateD, intercept=(statediff_threshold, 1)) net.connect(state_relay, statediff, pstc=tauPSC) net.connect(saved_state, statediff, transform=MU.diag([-1] * stateD), pstc=tauPSC) net.connect(statediff, valdiff, func=lambda x: [abs(v) for v in x], transform=[[-10] * stateD for _ in range(valdiff.getNeurons())], pstc=tauPSC) # connect up valdiff to the error signal for current Q values, and # expose the error signal for the previous Q values to the external # error if self.neuron_learning: net.connect(valdiff, actionvals.getTermination("error")) self.exposeTermination(old_actionvals.getTermination("error"), "error") else: net.connect(valdiff, state_dlnode.getTermination("error")) self.exposeTermination(old_state_dlnode.getTermination("error"), "error") self.exposeTermination(state_relay.getTermination("input"), "state") self.exposeTermination(saved_state.getTermination("transfer"), "save_state") self.exposeOrigin(actionvals.getOrigin("X"), "vals") self.exposeOrigin(old_actionvals.getOrigin("X"), "old_vals")
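# Idealized summary (sketch only, not the neural computation) of the training
# signal produced by valdiff/statediff above: the error driving the current
# Q values is 2 * (old Q - current Q) per action, and it is inhibited
# whenever the current and saved states differ by more than
# statediff_threshold in any dimension, so learning only transfers between
# similar states. The function name is hypothetical.
def valdiff_signal(old_vals, curr_vals, state, saved_state,
                   statediff_threshold=0.2):
    if any(abs(s - ss) > statediff_threshold
           for s, ss in zip(state, saved_state)):
        return [0.0 for _ in old_vals]
    return [2 * (o - c) for o, c in zip(old_vals, curr_vals)]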
def run_badreenvironment(nav_args, ctrl_args, bias=0.0, seed=None, flat=False, label="tmp"): """Runs the model on the Badre et al. (2010) task.""" if seed is not None: HRLutils.set_seed(seed) seed = HRLutils.SEED net = nef.Network("run_badreenvironment") env = badreenvironment.BadreEnvironment(flat=flat) net.add(env) # ##NAV AGENT stateN = 500 max_state_input = 3 enc = env.gen_encoders(stateN, 0, 0.0) # generate evaluation points orientations = MU.I(env.num_orientations) shapes = MU.I(env.num_shapes) colours = MU.I(env.num_colours) evals = ( list(MU.diag([3 for _ in range(env.stateD)])) + [o + s + c for o in orientations for s in shapes for c in colours]) # create lower level nav_agent = smdpagent.SMDPAgent(stateN, env.stateD, env.actions, name="NavAgent", stateradius=max_state_input, state_encoders=enc, state_evals=evals, discount=0.5, **nav_args) net.add(nav_agent) print "agent neurons:", nav_agent.countNeurons() # actions terminate on fixed schedule (aligned with environment) nav_term_node = terminationnode.TerminationNode( {terminationnode.Timer((0.6, 0.6)): None}, env, name="NavTermNode", state_delay=0.1, reset_delay=0.05, reset_interval=0.1) net.add(nav_term_node) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("reset")) net.connect(nav_term_node.getOrigin("learn"), nav_agent.getTermination("learn")) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("save_state")) net.connect(nav_term_node.getOrigin("reset"), nav_agent.getTermination("save_action")) net.connect(nav_agent.getOrigin("action_output"), env.getTermination("action")) # ##CTRL AGENT stateN = 500 enc = RandomHypersphereVG().genVectors(stateN, env.stateD) actions = [("shape", [0, 1]), ("orientation", [1, 0]), ("null", [0, 0])] ctrl_agent = smdpagent.SMDPAgent(stateN, env.stateD, actions, name="CtrlAgent", state_encoders=enc, stateradius=max_state_input, state_evals=evals, discount=0.4, **ctrl_args) net.add(ctrl_agent) print "agent neurons:", ctrl_agent.countNeurons() net.connect(env.getOrigin("state"), ctrl_agent.getTermination("state_input")) ctrl_term_node = terminationnode.TerminationNode( {terminationnode.Timer((0.6, 0.6)): None}, env, name="CtrlTermNode", state_delay=0.1, reset_delay=0.05, reset_interval=0.1) net.add(ctrl_term_node) net.connect(ctrl_term_node.getOrigin("reset"), ctrl_agent.getTermination("reset")) net.connect(ctrl_term_node.getOrigin("learn"), ctrl_agent.getTermination("learn")) net.connect(ctrl_term_node.getOrigin("reset"), ctrl_agent.getTermination("save_state")) net.connect(ctrl_term_node.getOrigin("reset"), ctrl_agent.getTermination("save_action")) # ctrl gets a slight bonus if it selects a rule (as opposed to null), to # encourage it to not just pick null all the time reward_relay = net.make("reward_relay", 1, 3, mode="direct") reward_relay.fixMode() net.connect(env.getOrigin("reward"), reward_relay, transform=[[1], [0], [0]]) net.connect(ctrl_agent.getOrigin("action_output"), reward_relay, transform=[[0, 0], [1, 0], [0, 1]]) net.connect(reward_relay, ctrl_agent.getTermination("reward"), func=lambda x: ((x[0] + bias * abs(x[0])) if x[1] + x[2] > 0.5 else x[0]), origin_name="ctrl_reward") # ideal reward function (for testing) # def ctrl_reward_func(x): # if abs(x[0]) < 0.5: # return 0.0 # # if flat: # return 1.5 if x[1] + x[2] < 0.5 else -1.5 # else: # if x[1] + x[2] < 0.5: # return -1.5 # if [round(a) for a in env.state[-2:]] == [round(b) # for b in x[1:]]: # return 1.5 # else: # return -1.5 # net.connect(reward_relay, ctrl_agent.getTermination("reward"), 
# func=ctrl_reward_func) # nav rewarded for picking ctrl target def nav_reward_func(x): if abs(x[0]) < 0.5 or env.action is None: return 0.0 if x[1] + x[2] < 0.5: return x[0] if x[1] > x[2]: return (1.5 if env.action[1] == env.state[:env.num_orientations] else -1.5) else: return (1.5 if env.action[1] == env.state[env.num_orientations:-env.num_colours] else -1.5) net.connect(reward_relay, nav_agent.getTermination("reward"), func=nav_reward_func) # state for navagent controlled by ctrlagent ctrl_state_inhib = net.make_array("ctrl_state_inhib", 50, env.stateD, radius=2, mode=HRLutils.SIMULATION_MODE) ctrl_state_inhib.fixMode([SimulationMode.DEFAULT, SimulationMode.RATE]) inhib_matrix = [[0, -5]] * 50 * env.num_orientations + \ [[-5, 0]] * 50 * env.num_shapes + \ [[-5, -5]] * 50 * env.num_colours # ctrl output inhibits all the non-selected aspects of the state net.connect(env.getOrigin("state"), ctrl_state_inhib) net.connect(ctrl_agent.getOrigin("action_output"), ctrl_state_inhib, transform=inhib_matrix) # also give a boost to the selected aspects (so that neurons are roughly # equally activated). def boost_func(x): if x[0] > 0.5: return [3 * v for v in x[1:]] else: return x[1:] boost = net.make("boost", 1, 1 + env.stateD, mode="direct") boost.fixMode() net.connect(ctrl_state_inhib, boost, transform=([[0 for _ in range(env.stateD)]] + list(MU.I(env.stateD)))) net.connect(ctrl_agent.getOrigin("action_output"), boost, transform=[[1, 1]] + [[0, 0] for _ in range(env.stateD)]) net.connect(boost, nav_agent.getTermination("state_input"), func=boost_func) # save weights weight_save = 1.0 # period to save weights (realtime, not simulation time) threads = [ HRLutils.WeightSaveThread( nav_agent.getNode("QNetwork").saveParams, os.path.join("weights", "%s_%s" % (nav_agent.name, seed)), weight_save), HRLutils.WeightSaveThread( ctrl_agent.getNode("QNetwork").saveParams, os.path.join("weights", "%s_%s" % (ctrl_agent.name, seed)), weight_save) ] for t in threads: t.start() # data collection node data = datanode.DataNode( period=1, filename=HRLutils.datafile("dataoutput_%s.txt" % label), header="%s %s %s %s %s" % (nav_args, ctrl_args, bias, seed, flat)) print "saving data to", data.filename print "header", data.header net.add(data) nav_q = nav_agent.getNode("QNetwork") ctrl_q = ctrl_agent.getNode("QNetwork") ctrl_bg = ctrl_agent.getNode("BGNetwork").getNode("weight_actions") data.record_avg(env.getOrigin("reward")) data.record_avg(ctrl_q.getNode("actionvals").getOrigin("X")) data.record_sparsity(ctrl_q.getNode("state_pop").getOrigin("AXON")) data.record_sparsity(nav_q.getNode("state_pop").getOrigin("AXON")) data.record_avg(ctrl_q.getNode("valdiff").getOrigin("X")) data.record_avg(ctrl_agent.getNode("ErrorNetwork").getOrigin("error")) data.record_avg(ctrl_bg.getNode("0").getOrigin("AXON")) data.record_avg(ctrl_bg.getNode("1").getOrigin("AXON")) data.record(env.getOrigin("score")) # net.add_to_nengo() # net.network.simulator.run(0, 300, 0.001) net.view() for t in threads: t.stop()
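# Structure of inhib_matrix above, shown with hypothetical dimension counts
# (the real values come from BadreEnvironment) and 2 neurons per dimension
# instead of 50 to keep it small. With actions = [("shape", [0, 1]),
# ("orientation", [1, 0]), ("null", [0, 0])], orientation-state neurons are
# inhibited when "shape" is selected (second output dimension), shape-state
# neurons when "orientation" is selected (first dimension), and colour-state
# neurons by either rule.
n_ori, n_shape, n_col, n_per_dim = 2, 2, 2, 2
example_inhib = [[0, -5]] * n_per_dim * n_ori + \
                [[-5, 0]] * n_per_dim * n_shape + \
                [[-5, -5]] * n_per_dim * n_col
# example_inhib == [[0, -5], [0, -5], [0, -5], [0, -5],
#                   [-5, 0], [-5, 0], [-5, 0], [-5, 0],
#                   [-5, -5], [-5, -5], [-5, -5], [-5, -5]]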
def gen_evalpoints(filename, seed=None): """Runs an environment for some length of time and records state values, to be used as eval points for agent initialization. :param filename: name of file in which to save eval points :param seed: random seed """ if seed is not None: HRLutils.set_seed(seed) seed = HRLutils.SEED net = nef.Network("gen_evalpoints") contextD = 2 actions = [("up", [0, 1]), ("right", [1, 0]), ("down", [0, -1]), ("left", [-1, 0])] rewards = {"a": 1, "b": 1} env = contextenvironment.ContextEnvironment( actions, HRLutils.datafile("contextmap.bmp"), contextD, rewards, imgsize=(5, 5), dx=0.001, placedev=0.5, colormap={ -16777216: "wall", -1: "floor", -256: "a", -2088896: "b" }) net.add(env) stateD = len(env.placecells) + contextD actions = env.actions actionD = len(actions) class EvalRecorder(nef.SimpleNode): def __init__(self, evalfile): self.action = actions[0] self.evalpoints = [] self.evalfile = evalfile nef.SimpleNode.__init__(self, "EvalRecorder") def tick(self): if self.t % 0.1 < 0.001: self.evalpoints += [self.state] if self.t % 10.0 < 0.001: if len(self.evalpoints) > 10000: self.evalpoints = self.evalpoints[len(self.evalpoints) - 10000:] with open(self.evalfile, "w") as f: f.write("\n".join([ " ".join([str(x) for x in e]) for e in self.evalpoints ])) def termination_state(self, x, dimensions=stateD): self.state = x def termination_action_in(self, x, dimensions=actionD): self.action = actions[x.index(max(x))] def origin_action_out(self): return self.action[1] em = EvalRecorder(HRLutils.datafile("%s_%s.txt" % (filename, seed))) net.add(em) net.connect(em.getOrigin("action_out"), env.getTermination("action")) net.connect(env.getOrigin("optimal_move"), em.getTermination("action_in")) net.connect(env.getOrigin("placewcontext"), em.getTermination("state")) # net.add_to_nengo() net.run(10)
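# Presumed pairing with the run scripts above (an inference from the
# filenames, not something this function enforces): those scripts read
# HRLutils.datafile("contextbmp_evalpoints_%s.txt" % seed), and this function
# writes to "%s_%s.txt" % (filename, seed), so the eval-point file for a run
# would be generated with the same seed, e.g.
#
#     gen_evalpoints("contextbmp_evalpoints", seed=1)
#
# after which each line of the file holds one space-separated state vector,
# matching the "[[float(x) for x in l.split(' ')] for l in f.readlines()]"
# readers used above.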
def __init__(self, name, N, d, scale=1.0, weights=None, maxinput=1.0,
             oneDinput=False):
    # scale is a scale on the output of the multiplication
    #   output = (input1 .* input2) * scale

    # weights are optional matrices applied to each input
    #   output = (C1*input1 .* C2*input2) * scale

    # maxinput is the maximum expected value of any dimension of the
    # inputs. this is used to scale the inputs internally so that the
    # length of the vectors in the intermediate populations is not too
    # small (which results in a lot of noise in the calculations)

    # oneDinput indicates that the second input is one dimensional, and is
    # just a scale on the first input rather than an element-wise product

    self.name = name
    tauPSC = 0.007

    # the size of the intermediate populations
    smallN = int(math.ceil(float(N) / d))

    # the maximum value of the vectors represented by the intermediate
    # populations. the vector is at most [maxinput maxinput], so the
    # length of that is sqrt(maxinput**2 + maxinput**2)
    maxlength = math.sqrt(2 * maxinput ** 2)

    if weights is not None and len(weights) != 2:
        print "Warning, other than 2 matrices given to eprod"

    if weights is None:
        weights = [MU.I(d), MU.I(d)]

    inputd = len(weights[0][0])

    ef = HRLutils.defaultEnsembleFactory()

    # create input populations
    in1 = ef.make("in1", 1, inputd)
    in1.addDecodedTermination("input", MU.I(inputd), 0.001, False)
    self.addNode(in1)
    in1.setMode(SimulationMode.DIRECT)  # since this is just a relay
    in1.fixMode()

    in2 = ef.make("in2", 1, inputd)
    if not oneDinput:
        in2.addDecodedTermination("input", MU.I(inputd), 0.001, False)
    else:
        # if it is a 1-D input we just expand it to a full vector of that
        # value so that we can treat it as an element-wise product
        in2.addDecodedTermination("input", [[1] for i in range(inputd)],
                                  0.001, False)
    self.addNode(in2)
    in2.setMode(SimulationMode.DIRECT)  # since this is just a relay
    in2.fixMode()

    # ensemble factory for the intermediate populations
    multef = NEFEnsembleFactoryImpl()
    multef.nodeFactory.tauRC = 0.05
    multef.nodeFactory.tauRef = 0.002
    multef.nodeFactory.maxRate = IndicatorPDF(200, 500)
    multef.nodeFactory.intercept = IndicatorPDF(-1, 1)
    multef.encoderFactory = vectorgenerators.MultiplicationVectorGenerator()
    multef.beQuiet()

    result = ef.make("result", 1, d)
    result.setMode(SimulationMode.DIRECT)  # since this is just a relay
    result.fixMode()
    self.addNode(result)

    resultTerm = [[0] for _ in range(d)]
    zeros = [0 for _ in range(inputd)]

    for e in range(d):
        # create a 2D population for each output dimension which will
        # combine the components from one dimension of each of the input
        # populations
        mpop = multef.make("mpop_" + str(e), smallN, 2)

        # make two connections that will select one component from each
        # of the input pops. we divide by maxlength to ensure that the
        # maximum length of the 2D vector is 1. remember that (for some
        # reason) the convention in Nengo is that the input matrices are
        # the transpose of what they would be mathematically
        mpop.addDecodedTermination(
            "a", [[(1.0 / maxlength) * weights[0][e][i]
                   for i in range(inputd)], zeros], tauPSC, False)
        mpop.addDecodedTermination(
            "b", [zeros, [(1.0 / maxlength) * weights[1][e][i]
                          for i in range(inputd)]], tauPSC, False)

        # multiply the two selected components together
        mpop.addDecodedOrigin("output", [PostfixFunction("x0*x1", 2)],
                              "AXON")

        self.addNode(mpop)
        self.addProjection(in1.getOrigin("X"), mpop.getTermination("a"))
        self.addProjection(in2.getOrigin("X"), mpop.getTermination("b"))

        # combine the 1D results back into one vector. we scaled each
        # input by 1/maxlength, then multiplied them together for a total
        # scale of 1/maxlength**2, so to undo that we multiply by
        # maxlength**2
        resultTerm[e] = [maxlength ** 2 * scale]
        result.addDecodedTermination("in_" + str(e), resultTerm, 0.001,
                                     False)
        resultTerm[e] = [0]

        self.addProjection(mpop.getOrigin("output"),
                           result.getTermination("in_" + str(e)))

    self.exposeTermination(in1.getTermination("input"), "A")
    self.exposeTermination(in2.getTermination("input"), "B")
    self.exposeOrigin(result.getOrigin("X"), "X")
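# A plain-Python reference for the value this network is meant to approximate
# on its "X" origin (a sketch for sanity checking only; eprod_reference is a
# hypothetical helper, not part of this codebase):
#
#     def eprod_reference(a, b, C1, C2, scale):
#         # weighted inputs: wa = C1*a, wb = C2*b
#         wa = [sum(C1[i][j] * a[j] for j in range(len(a)))
#               for i in range(len(C1))]
#         wb = [sum(C2[i][j] * b[j] for j in range(len(b)))
#               for i in range(len(C2))]
#         # element-wise product of the weighted inputs, scaled
#         return [wa[i] * wb[i] * scale for i in range(len(wa))]
#
# The neural version first divides each selected component by maxlength so the
# 2D vectors represented by the mpop populations stay within radius 1, then
# multiplies by maxlength**2 on the way into "result" to undo that scaling.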
def tick(self):
    if self.t > self.updatetime:
        self.scale = [0.0 for _ in range(self.action_dimension)]

        # the least visited vector could also be found by checking all the
        # tiles and weighting them by their last visit time, instead of
        # just looking for the minimum (a smarter search would only be
        # needed if the grid were too large to iterate over)

        # find the global minimums in the visit times, O(n)
        min_list = []
        min_val = self.state_visited[0][0]
        for i in range(len(self.state_visited)):
            for j in range(len(self.state_visited[i])):
                if self.state_visited[i][j] == min_val:
                    min_list.append([i, j])
                elif self.state_visited[i][j] < min_val:
                    min_list = []
                    min_val = self.state_visited[i][j]
                    min_list.append([i, j])

        # take the average of the list of minimum vectors, O(n)
        total = [0.0 for _ in range(self.grid_dimension)]
        for val in min_list:
            total[0] += val[0] - self.xoffset
            total[1] += val[1] - self.yoffset
        # average over the number of minimum tiles
        least_visited = [total[0] / len(min_list),
                         total[1] / len(min_list)]

        # convert the average minimum vector to a scale on the action
        # dimensions (up, right, down, left), proportional to the time
        # since the tile was last visited versus the current time

        # find the minimum tile closest to the agent, starting from the
        # first minimum tile
        closest_min = min_list[0]
        min_state_dist = HRLutils.distance(self.agent_state, min_list[0])
        for min_loc in range(len(min_list)):
            state_dist = HRLutils.distance(self.agent_state,
                                           min_list[min_loc])
            if state_dist < min_state_dist:
                closest_min = min_list[min_loc]
                min_state_dist = state_dist

        least_visited[0] += self.xoffset
        least_visited[1] += self.yoffset

        # state_diff catches the corner case where the least visited tile
        # is the one the agent is already on (no boost is applied along an
        # axis with zero difference). overall, this noise boost discourages
        # the agent from heading to very distant tiles or to tiles it has
        # visited recently.
        state_diff = HRLutils.difference(self.agent_state, least_visited)

        hor_min_dist = abs(self.agent_state[0] - closest_min[0])
        hor_noise_boost = ((self.t - min_val) * self.time_constant +
                           (1.0 / (1.0 + hor_min_dist)) *
                           self.distance_constant) * (state_diff[0] != 0)
        vert_min_dist = abs(self.agent_state[1] - closest_min[1])
        vert_noise_boost = ((self.t - min_val) * self.time_constant +
                            (1.0 / (1.0 + vert_min_dist)) *
                            self.distance_constant) * (state_diff[1] != 0)

        # map the state difference onto the action dimensions
        if state_diff[1] > 0:
            # go left
            print("boost left")
            self.scale[3] = hor_noise_boost
        elif state_diff[1] < 0:
            # go right
            print("boost right")
            self.scale[1] = hor_noise_boost
        if state_diff[0] < 0:
            # go down
            print("boost down")
            self.scale[2] = vert_noise_boost
        elif state_diff[0] > 0:
            # go up
            print("boost up")
            self.scale[0] = vert_noise_boost

        print("Current state %s" % self.agent_state)
        print("least_visited %s" % least_visited)
        print("state_diff %s" % state_diff)
        print("scale: %s" % self.scale)
        # pdb.set_trace()

        self.state = [self.pdf.sample()[0] * self.scale[i]
                      for i in range(len(self.state))]

        self.updatetime = self.t + self.period
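# Note on the conventions used in tick() above (read off the branches, not an
# authoritative spec): self.scale follows the action ordering
# [up, right, down, left] (indices 0-3); state_diff[1] selects between the
# left/right boosts (scale[3]/scale[1]) and state_diff[0] between the
# down/up boosts (scale[2]/scale[0]). Each boost grows with the time since
# the least visited tile was last seen and shrinks with the distance to the
# closest minimum tile:
#
#     boost = (t - last_visit) * time_constant
#             + 1.0 / (1.0 + dist_to_closest_min) * distance_constant
#
# The final state is that boost applied as a per-action scale on samples from
# self.pdf, so the exploration noise is biased toward under-visited tiles.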