def learn(self, ipy_clear=False, max_episodes=100000000, max_pathlength=200):
    start_time = time.time()
    numeptotal = 0
    i = 0

    if self.enable_plots and self.stats is None:
        import matplotlib.pyplot as plt
        self.stats = {
            "tr": statbin(self.stats_rate),     # Total Reward
            "ft": statbin(self.stats_rate),     # Finishing Time
            "minvf": statbin(self.stats_rate),  # Min Value Fn
            "maxvf": statbin(self.stats_rate),  # Max Value Fn
        }

    for e in xrange(max_episodes):
        observation = self.env.reset()
        done = False
        total_reward = 0.0
        t = 0
        maxv = []
        minv = []

        # frame stacks holding the current and next state
        obs = np.zeros([self.nframes] + list(self.env.observation_space.shape))
        new_obs = np.zeros([self.nframes] + list(self.env.observation_space.shape))
        obs[0, :] = observation

        while (not done) and (t < max_pathlength):
            t += 1
            self.env.render()

            # epsilon-greedy action plus the current Q-value estimates
            action, values = self.act(obs)
            maxv.append(max(values.flatten()))
            minv.append(min(values.flatten()))

            new_observation, reward, done, info = self.env.step(action)

            # build the next-state frame stack, newest frame at index 0
            new_obs[1:, :] = obs[-1:, :]
            new_obs[0, :] = new_observation

            # force a terminal flag when the episode is cut off at max_pathlength
            if not done and t == max_pathlength - 1:
                done = True

            # store the transition; run a training update every timesteps_per_batch samples
            do_update = (i % self.timesteps_per_batch == self.timesteps_per_batch - 1)
            self.update_train(obs, action, reward, new_obs, done, do_update)

            obs[:, :] = new_obs[:, :]
            total_reward += reward
            i += 1

        print " * Episode %08d\tFrame %08d\tSamples: %08d\tTerminal: %08d\tReward: %d\tEpsilon: %f" % (
            e, t, len(self.observations), self.nterminal, total_reward, self.epsilon)

        # anneal epsilon according to the user-supplied schedule, if any
        if self.epsilon_schedule is not None:
            self.epsilon = self.epsilon_schedule(e, self.epsilon)

        if self.enable_plots:
            self.stats["tr"].add(total_reward)
            self.stats["ft"].add(t)
            self.stats["maxvf"].add(np.mean(maxv))
            self.stats["minvf"].add(np.mean(minv))

            if e % self.stats_rate == self.stats_rate - 1:
                if ipy_clear:
                    from IPython import display
                    display.clear_output(wait=True)
                fig = plt.figure(1)
                fig.canvas.set_window_title("DDQN Training Stats for %s" % (self.env.__class__.__name__))
                plt.clf()

                plt.subplot(2, 2, 1)
                self.stats["tr"].plot()
                plt.title("Total Reward per Episode")
                plt.xlabel("Episode")
                plt.ylabel("Total Reward")
                plt.legend(loc=2)

                plt.subplot(2, 2, 2)
                self.stats["ft"].plot()
                plt.title("Finishing Time per Episode")
                plt.xlabel("Episode")
                plt.ylabel("Finishing Time")
                plt.legend(loc=2)

                plt.subplot(2, 2, 3)
                self.stats["maxvf"].plot2(fill_col='lightblue', label='Avg Max VF')
                self.stats["minvf"].plot2(fill_col='slategrey', label='Avg Min VF')
                plt.title("Value Function Outputs")
                plt.xlabel("Episode")
                plt.ylabel("Value Fn")
                plt.legend(loc=2)

                ax = plt.subplot(2, 2, 4)
                plt.plot(self.train_costs)
                plt.title("Training Loss")
                plt.xlabel("Training Epoch")
                plt.ylabel("Loss")
                try:
                    ax.set_yscale("log", nonposy='clip')
                    plt.tight_layout()
                except:
                    pass

                plt.show(block=False)
                plt.draw()
                plt.pause(0.001)
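# --- Illustrative sketch (not part of the code above) -----------------------------
# learn() anneals exploration via self.epsilon_schedule(e, self.epsilon) whenever a
# schedule is supplied, but the schedule itself is not shown in this snippet.  The
# callable below is a hypothetical, signature-compatible example (simple linear decay
# with a floor), offered only as an assumption about the expected interface.
def linear_epsilon_schedule(episode, epsilon, decay=1e-3, epsilon_min=0.05):
    """Step epsilon down by a fixed amount each episode, never below epsilon_min."""
    return max(epsilon_min, epsilon - decay)

# Example wiring (attribute names follow the usage in learn() above):
#   agent.epsilon = 1.0
#   agent.epsilon_schedule = linear_epsilon_schedule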
def train(self, ipy_clear=False, max_episodes=100000000, max_pathlength=200):
    rewards = statbin.statbin(10)
    observation = self.env.reset()
    prev_x = None  # used in computing the difference frame
    xs, hs, dlogps, drs = [], [], [], []
    running_reward = None
    reward_sum = 0
    episode_number = 0

    while True:
        if self.render:
            self.env.render()

        # preprocess the observation and set the network input to be a difference image
        if self.preprocessor is not None:
            cur_x = self.preprocessor(observation)
        else:
            cur_x = observation
        x = cur_x - prev_x if prev_x is not None else np.zeros(self.input_dim, dtype='float32')
        x = x.flatten()
        prev_x = cur_x

        # forward the policy network and sample an action from the returned probabilities
        aprob = self.model.predict(x.reshape([1, self.input_dim]), batch_size=1).flatten()
        action = np.random.choice(self.env.action_space.n, 1, p=aprob / np.sum(aprob))[0]

        # record various intermediates (needed later for backprop)
        xs.append(x)  # observation

        # "Harsh" gradient target: one-hot vector for the sampled action
        y = np.zeros([self.env.action_space.n])
        y[action] = 1
        # "Subtle" gradient alternative:
        # y = aprob * 0.9
        # y[action] = aprob[action] * 1.1
        dlogps.append(y)            # grad that encourages the action that was taken
        # dlogps.append(y - aprob)  # grad that encourages the action that was taken

        observation, reward, done, info = self.env.step(action)
        reward_sum += float(reward)
        drs.append(float(reward))  # record reward (must happen after step() so it pairs with the previous action)

        if done:  # an episode finished
            episode_number += 1

            # stack together all inputs, hidden states, action gradients, and rewards for this episode
            epx = np.vstack(xs)
            epdlogp = np.vstack(dlogps)
            epr = np.vstack(drs)
            xs, hs, dlogps, drs = [], [], [], []  # reset array memory

            # compute the discounted reward backwards through time
            discounted_epr = self.discount_rewards(epr)
            # standardize the rewards to be unit normal (helps control the gradient estimator variance)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)

            # modulate the gradient with the advantage (the PG magic happens right here)
            epdlogp *= discounted_epr
            self.model.fit(epx, epdlogp, nb_epoch=1, verbose=2, shuffle=True)

            # boring book-keeping
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            rewards.add(reward_sum)
            print 'resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward)
            if episode_number % 100 == 0:
                self.save()
            reward_sum = 0
            observation = self.env.reset()  # reset env
            prev_x = None

            if self.enable_plots:
                plt.figure(1)
                # plt.plot(rewards)
                rewards.plot()
                plt.show(block=False)
                plt.draw()
                plt.pause(0.001)

        if reward != 0:
            # Pong has either +1 or -1 reward exactly when a game ends.
            print ('ep %d: game finished, reward: %f' % (episode_number, reward)) + ('' if reward == -1 else ' !!!!!!!!')
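# --- Illustrative sketch (not part of the code above) -----------------------------
# train() relies on self.discount_rewards(epr), which is not defined in this snippet.
# A conventional implementation (the standard backwards discounted-return pass, with
# the running sum reset at nonzero rewards as is usual for Pong-style scoring) might
# look like the standalone function below; the gamma default is an assumption.
import numpy as np

def discount_rewards(r, gamma=0.99):
    """Return discounted rewards computed backwards over a column vector r of per-step rewards."""
    discounted_r = np.zeros_like(r)
    running_add = 0.0
    for t in reversed(xrange(r.shape[0])):
        if r[t] != 0:
            running_add = 0.0  # reset the running sum at a game boundary (Pong-specific convention)
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r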
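# --- Illustrative sketch (not part of the code above) -----------------------------
# Both training loops lean on a statbin helper (referenced directly as
# statbin(bin_size) in learn() and via a statbin module in train(), with add(),
# plot() and plot2(fill_col=..., label=...) methods) that is not shown in this
# snippet.  The class below is a hypothetical minimal stand-in that matches that
# usage: it bins scalar samples and plots the per-bin mean, with plot2() adding a
# shaded one-standard-deviation band.  It is an assumption, not the library's own
# implementation.
import numpy as np
import matplotlib.pyplot as plt

class statbin(object):
    def __init__(self, binsize):
        self.binsize = binsize
        self.values = []

    def add(self, value):
        self.values.append(float(value))

    def _binned(self):
        # Average complete bins of binsize samples; fall back to one sample per bin
        # until a full bin has accumulated.
        n = (len(self.values) // self.binsize) * self.binsize
        if n == 0:
            v = np.array(self.values).reshape(-1, 1)
        else:
            v = np.array(self.values[:n]).reshape(-1, self.binsize)
        return v.mean(axis=1), v.std(axis=1)

    def plot(self, label='binned mean'):
        means, _ = self._binned()
        plt.plot(means, label=label)

    def plot2(self, fill_col='lightblue', label='binned mean'):
        means, stds = self._binned()
        x = np.arange(len(means))
        plt.fill_between(x, means - stds, means + stds, color=fill_col, alpha=0.5)
        plt.plot(x, means, label=label)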