def car_sim(nsteps, simulator, policy, verbose=False):
    rtot = 0.0
    xpos = np.zeros(nsteps)
    vel = np.zeros(nsteps)
    # run the simulation
    input_state = np.zeros((1, 2), dtype=np.float32)
    for i in xrange(nsteps):
        state = simulator.get_screenshot()
        input_state[0] = state
        a = policy.action((input_state, None))
        simulator.act(a)
        r = simulator.reward()
        rtot += r
        xpos[i], vel[i] = state
        if simulator.episode_over():
            break
    return rtot, xpos, vel

mdp = MountainCar()
simulator = MDPSimulator(mdp)

# load the pre-trained network
net = pickle.load(open("../chimp/pre_trained_nets/mountain_car.net", "rb"))

backend = ChainerBackend(settings)
backend.set_net(net)
learner = DQNLearner(settings, backend)
policy = DQNPolicy(learner)

# roll out the policy for 300 steps and plot position and (scaled) velocity
r, xtrace, vtrace = car_sim(300, simulator, policy, verbose=True)

p.plot(xtrace)
p.plot(10.0 * vtrace)
p.show()
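The cell above relies on a few imports that are not shown; a minimal sketch of what they would be (the alias p for matplotlib.pyplot is inferred from the p.plot and p.show calls, while MountainCar, MDPSimulator, and the settings dictionary come from elsewhere in the tutorial):

import pickle
import numpy as np
import matplotlib.pyplot as p  # inferred alias; any pyplot alias works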
def make_batch(n_samples, o_dims, n_actions):
    obs = np.zeros((n_samples,) + o_dims, dtype=np.float32)
    obsp = np.zeros((n_samples,) + o_dims, dtype=np.float32)
    a = np.zeros(n_samples, dtype=np.int32)
    r = np.zeros(n_samples, dtype=np.float32)
    term = np.zeros(n_samples, dtype=np.bool)
    for i in xrange(n_samples):
        obs[i] = np.random.uniform(0.0, 1.0, o_dims)
        a[i] = np.random.randint(n_actions)
        # deterministic toy dynamics: action 1 shifts the observation up,
        # any other action shifts it down, clipped to [0, 1]
        obsp[i] = (obs[i] + 0.25) if a[i] == 1 else (obs[i] - 0.25)
        obsp[i] = np.clip(obsp[i], 0.0, 1.0)
        r[i] = np.sum(obs[i])
    return obs, a, r, obsp, term

net = TestNet()
custom_learner = ChainerBackend(settings)
custom_learner.set_net(net)
learner = DQNLearner(settings, custom_learner)
policy = DQNPolicy(learner)

# sanity-check the untrained policy on a small batch
obst, a, r, obsp, term = make_batch(10, o_dims, n_actions)
for i in xrange(10):
    ohist = (obst[i], None)
    a = policy.action(ohist)
    print "Test: ", i, " ", obst[i], " ", a, " ", learner.forward((obst[i], None))

print "TRAINING"
for i in xrange(3000):
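Since the transitions generated by make_batch are deterministic, they can be verified directly; a minimal standalone sanity check (the choices o_dims = (2,) and n_actions = 2 are illustrative, not from the original):

import numpy as np

# check the toy dynamics and reward defined in make_batch above
o_dims, n_actions = (2,), 2
obs, a, r, obsp, term = make_batch(5, o_dims, n_actions)
for i in xrange(5):
    expected = obs[i] + 0.25 if a[i] == 1 else obs[i] - 0.25
    assert np.allclose(obsp[i], np.clip(expected, 0.0, 1.0))
    assert np.isclose(r[i], np.sum(obs[i]))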
        # initialize avg_var to prevent divide by zero
        self.bn1.avg_var.fill(0.1)
        self.bn2.avg_var.fill(0.1)

    def __call__(self, ohist, ahist):
        h = F.relu(self.l1(ohist))
        h = F.relu(self.l2(h))
        h = self.bn1(h, test=not self.train)
        h = F.relu(self.l3(h))
        h = F.relu(self.l4(h))
        h = self.bn2(h, test=not self.train)
        output = self.lout(h)
        return output

net = CarNet()

# Initialize learner with a Chainer backend
backend = ChainerBackend(settings)
backend.set_net(net)
learner = DQNLearner(settings, backend)

# Initialize replay memory
memory = ReplayMemoryHDF5(settings)

# Initialize the agent framework
agent = DQNAgent(learner, memory, simulator, settings)

# Start training
agent.train(verbose=True)
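The cell above begins mid-class, so for context here is a minimal sketch of the surrounding CarNet skeleton under the Chainer 1.x Chain API; the hidden width of 20 and the 3-action output are illustrative assumptions, not values from the original:

import chainer
import chainer.functions as F
import chainer.links as L

class CarNet(chainer.Chain):
    """Sketch of the assumed skeleton around the __call__ shown above."""

    def __init__(self):
        # 2 inputs for the (position, velocity) state; hidden width 20 and
        # 3 output actions are illustrative assumptions
        super(CarNet, self).__init__(
            l1=L.Linear(2, 20),
            l2=L.Linear(20, 20),
            bn1=L.BatchNormalization(20),
            l3=L.Linear(20, 20),
            l4=L.Linear(20, 20),
            bn2=L.BatchNormalization(20),
            lout=L.Linear(20, 3),
        )
        self.train = True
        # initialize avg_var to prevent divide by zero
        self.bn1.avg_var.fill(0.1)
        self.bn2.avg_var.fill(0.1)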
from chimp.learners.chainer_backend import ChainerBackend

# Agent framework
from chimp.agents import DQNAgent

# Policy class for evaluation
from chimp.utils.policies import DQNPolicy

# initialize our mountain car simulator
simulator = MountainCar()

# initialize the network
net = CarNet()

# Initialize the learner with a Chainer backend and our net
backend = ChainerBackend(settings)       # initialize with the settings dictionary
backend.set_net(net)                     # set the net for our Chainer backend
learner = DQNLearner(settings, backend)  # create the learner

# Initialize replay memory
memory = ReplayMemoryHDF5(settings)

# Initialize the DQNAgent
agent = DQNAgent(learner, memory, simulator, settings)  # pass in all three components and the settings

# Start training
agent.train(verbose=True)

import chainer
import chainer.functions as F
import chainer.links as L
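These trailing imports set up the network definition that follows; as a quick standalone check of the pieces they provide (the shapes are illustrative):

import numpy as np
import chainer.functions as F
import chainer.links as L

# a linear link followed by a ReLU on a dummy (1, 2) state vector,
# mirroring how layers are composed in CarNet's __call__ above
l = L.Linear(2, 3)
x = np.zeros((1, 2), dtype=np.float32)
h = F.relu(l(x))
print h.data.shape  # -> (1, 3)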