def __init__(self, env, kind="diff", gamma=0.8, weight=0.9, advantage=False): if kind == "qv": qmodel, vmodel = self.create_qv_models( env.observation_space.shape[0], env.action_space.n) model = (qmodel, vmodel) else: model = self.create_model(env.observation_space.shape[0], env.action_space.n) brain = QBrain(model, kind=kind, advantage=advantage, gamma=gamma, v_selectivity=False, qnet_soft_update=0.01, diff_qnet_weight=weight) brain.compile(Adam(lr=1e-3), ["mse"]) MultiDQNAgent.__init__(self, env, brain, train_sample_size=1000, train_batch_size=50)
def __init__(self, env, kind="diff", gamma=0.99, diff_qnet_weight=0.7): model = self.create_model(env.observation_space.shape[0], env.action_space.n) brain = QBrain(model, kind=kind, gamma=gamma, v_selectivity=False, qnet_soft_update=0.01, diff_qnet_weight=diff_qnet_weight) brain.compile(Adam(lr=1e-3), ["mse"]) MultiDQNAgent.__init__(self, env, brain, train_sample_size=1000, train_batch_size=50)
        self.T += 1
        self.R += 1
        return [(agent, {}) for agent, action in actions]

    def feedback(self, agents):
        return [(agent, self.R, {}) for agent in agents]

class SeqAgent(MultiDQNAgent):
    pass

def create_model():
    # tiny 1-input, 1-output network used only to exercise the training pipeline
    inp = Input((1,))
    dense1 = Dense(5, activation="relu", name="dense1")(inp)
    out = Dense(1, activation="linear", name="out_linear")(dense1)
    model = Model(inp, out)
    print("--- model summary ---")
    print(model.summary())
    return model

env = SeqEnv()
model = create_model()
brain = QBrain(model, soft_update=0.0001)
agent = SeqAgent(env, brain)
agents = [agent]

controller = SynchronousMultiAgentController(env, agents)
controller.fit(max_episodes=1)

# dump the transitions accumulated in short-term memory
for tup in sorted(brain.Memory.ShortTermMemory):
    print(tup)
        action_frequencies = self.Actions / np.sum(self.Actions)
        print("Episode end: %d, rounds: %d, rewards: %s, average q: %s, actions: %s" %
              (episode, logs["nrounds"], rewards, avq, self.Actions))

env = TankTargetEnv()
memory = ReplayMemory(100000, v_selectivity=True)

tanks = []
for i in range(3):
    model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1])
    brain = QBrain(model, kind="diff", memory=memory, soft_update=0.01, gamma=0.99)
    brain.compile(Adam(lr=1e-3), ["mse"])
    if i > 0:
        brain.transfer(tanks[0].Brain)      # make all brains the same initially
    tanks.append(TankAgent(env, brain, train_sample_size=1000))

controller = SynchronousMultiAgentController(env, tanks,
                                             rounds_between_train=10000,
                                             episodes_between_train=1)

taus = [0.01, 0.1, 1.0, 2.0]
ntaus = len(taus)
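# Hedged sketch: the single ReplayMemory instance above is passed to every tank's QBrain,
# which suggests the three agents train from one shared experience pool.  The class below
# is a minimal ring-buffer version of that idea; SharedReplayBuffer and its methods are
# hypothetical names used only for illustration, not the library's ReplayMemory API.
import random
from collections import deque

class SharedReplayBuffer(object):
    def __init__(self, capacity):
        self.Buffer = deque(maxlen=capacity)    # oldest transitions are dropped first

    def add(self, observation, action, reward, next_observation, done):
        self.Buffer.append((observation, action, reward, next_observation, done))

    def sample(self, batch_size):
        # uniform sample without replacement, capped at the current buffer size
        return random.sample(list(self.Buffer), min(batch_size, len(self.Buffer)))

# every agent would push transitions into, and draw batches from, the same instance
shared = SharedReplayBuffer(100000)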
        for a, action in logs["actions"]:
            self.Actions[action] += 1

    def on_episode_end(self, episode, logs):
        avq = self.SumQ / self.NSteps if self.NSteps > 0 else 0.0
        rewards = [r for t, r in logs["episode_rewards"]]
        action_frequencies = self.Actions / np.sum(self.Actions)
        print("Episode end: %d, rounds: %d, rewards: %s, average q: %s, actions: %s" %
              (episode, logs["nrounds"], rewards, avq, self.Actions))

env = TankDuelEnv()

tanks = []
for _ in (1, 2):
    model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1])
    brain = QBrain(model, kind="diff", v_selectivity=False, gamma=0.99)
    brain.compile(Adam(lr=1e-3), ["mse"])
    tanks.append(TankAgent(env, brain, train_sample_size=1000))

controller = SynchronousMultiAgentController(env, tanks,
                                             rounds_between_train=10000,
                                             episodes_between_train=1)

taus = [2.0, 1.0, 0.1, 0.01]
ntaus = len(taus)
t = 0
test_policy = BoltzmannQPolicy(0.005)
test_run_logger = RunLogger("run_log.csv")
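# Hedged illustration: BoltzmannQPolicy above draws actions with probability proportional
# to exp(Q / tau), so the small test temperature (0.005) is nearly greedy while the larger
# training temperatures (up to 2.0) are close to uniform.  This standalone numpy sketch
# shows that selection rule; it is not the library's implementation.
import numpy as np

def boltzmann_action(q_values, tau):
    q = np.asarray(q_values, dtype=float)
    logits = (q - q.max()) / tau                    # subtract the max for numerical stability
    probs = np.exp(logits) / np.exp(logits).sum()
    return np.random.choice(len(q), p=probs)

print(boltzmann_action([1.0, 2.0, 1.5], tau=2.0))    # exploratory: any action is likely
print(boltzmann_action([1.0, 2.0, 1.5], tau=0.005))  # effectively argmax, returns 1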
if "-h" in opts or "-?" in opts: print """Usage: python tanks_target.py [-k kind] [-r <run log CVS file>] """ sys.exit(1) env = TankTargetEnv(kind) tanks = [] share_brain = True if share_brain: model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1]) brain = QBrain(model, kind=kind, v_selectivity=False, qnet_hard_update=100000, gamma=gamma) brain.compile(Adam(lr=1e-3), ["mse"]) tanks = [TankAgent(env, brain, train_sample_size=1000) for _ in range(1)] else: for _ in (1, 2, 3): model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1]) brain = QBrain(model, kind=kind, v_selectivity=False, qnet_hard_update=100000, gamma=gamma) brain.compile(Adam(lr=1e-3), ["mse"])
opts, args = getopt.getopt(sys.argv[1:], "k:r:h?")
opts = dict(opts)
kind = opts.get("-k", "diff")
run_log = opts.get("-r", "run_log.csv")

if "-h" in opts or "-?" in opts:
    print("""Usage: python cars2d.py [-k kind] [-r <run log CSV file>]
""")
    sys.exit(1)

env = CarsRadEnv()

# all cars share a single brain; per-car brains are kept below, commented out
model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1])
brain = QBrain(model, kind="diff", gamma=0.99)      #, soft_update=0.01)
brain.compile(Adam(lr=1e-3), ["mse"])
cars = [CarAgent(env, brain, train_sample_size=1000) for _ in range(3)]

#cars = []
#for _ in range(3):
#    model = create_model(env.observation_space.shape[-1], env.action_space.shape[-1])
#    brain = QBrain(model, soft_update=0.01)
#    brain.compile(Adam(lr=1e-3), ["mse"])
#    cars.append(CarAgent(env, brain))

controller = SynchronousMultiAgentController(env, cars,
                                             rounds_between_train=10000,
                                             episodes_between_train=1)