def train(self):
    # To store the reward history of each episode
    ep_reward_list = []
    # To store the average reward history of the last few episodes
    avg_reward_list = []

    monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
    q = 0  # last critic loss; initialized in case no training step has run yet

    with Loop_handler() as interruption:  # to save properly even if Ctrl+C is pressed

        for eps in range(self.EPISODES):
            episode_reward = 0
            s = self.env.reset()
            # An env created with "gym.make" terminates after 200 steps.
            for t in range(self.MAX_TIME_STEPS):
                if self.render:
                    self.env.render()

                a = self.policy(s)
                s_, r, done, _ = self.env.step(a)
                self.replay_buffer.add(np.reshape(s, (self.s_dim,)),
                                       np.reshape(a, (self.a_dim,)),
                                       r, done,
                                       np.reshape(s_, (self.s_dim,)))
                episode_reward += r

                # Only start training once the buffer holds more than a minibatch:
                if self.replay_buffer.size() > self.minibatch_size:
                    q = self.train_step()

                s = s_.reshape(1, -1)

            if interruption():
                break

            ep_reward_list.append(episode_reward)
            # Mean of the last 40 episodes
            avg_reward = np.mean(ep_reward_list[-40:])
            print("Episode * {} * Avg Reward is ==> {}".format(eps, avg_reward))
            avg_reward_list.append(avg_reward)
            monitor.add_data(avg_reward, q)

    self.save_weights(save_name=self.save_name)  # if you want to save the weights
    self.plot_results(avg_reward=avg_reward_list, train=True)
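The loop above only relies on `self.replay_buffer` exposing `add(...)` and `size()`, with `train_step()` presumably sampling minibatches from it. A minimal sketch of a compatible buffer, assuming uniform sampling through a `sample_batch` method (the class and method names here are illustrative, not taken from the actual implementation):

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """FIFO experience buffer with the interface assumed by train() above."""

    def __init__(self, max_size=100000):
        self.buffer = deque(maxlen=max_size)

    def add(self, s, a, r, done, s2):
        # Store one transition, matching the argument order used in train().
        self.buffer.append((s, a, r, done, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniform sampling; returns one stacked array per transition element
        # (states, actions, rewards, done flags, next states).
        batch = random.sample(self.buffer, batch_size)
        return [np.array(x) for x in zip(*batch)]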
# Evaluate the policy:
if Q_loss != 0 and n_ep % EVAL_FREQ == 0:

    eval_env.reset(store_data=True)
    stddev_m = 0
    for t in range(EP_LEN):
        a, stddev = sac.best_action(eval_env.get_obs(), return_stddev=True)
        stddev_m += stddev
        _, _, ep_done, _ = eval_env.step(a)
        if ep_done:
            break
    stddev_m /= t + 1  # average over the steps actually taken

    alpha = float(sac.alpha)
    print('It %i | Ep %i | LQ %+7.4f | temp %5.3f | Sd %+5.2f | '
          % (sac.n_iter, n_ep, Q_loss, alpha, stddev_m), end='')
    eval_env.print_eval()
    sys.stdout.flush()
    reward_graph.add_data(n_ep, eval_env.get_Rt(), alpha)

end = time.time()
print('Elapsed time: %.3f' % (end - start))

save_data = True
answer = input('\nSave network parameters in ' + session_dir + '? (y) ')
if answer.strip() != 'y':
    answer = input('Where to store network parameters? (leave empty to discard data) ')
    if answer.strip():
        session_dir = answer
    else:
        save_data = False

if save_data:
    os.makedirs(session_dir, exist_ok=True)
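What `stddev_m` measures depends on how `best_action` is implemented; for a squashed-Gaussian SAC policy it would typically be the mean of the per-dimension standard deviations of the pre-squash Gaussian. A sketch under that assumption (the `actor_net` callable and its two outputs are assumptions for illustration, not the actual API of the `sac` object above):

import numpy as np

def best_action(obs, actor_net, return_stddev=False):
    # Deterministic evaluation action for a squashed-Gaussian policy:
    # take the Gaussian mean and squash it into the action bounds.
    mu, log_std = actor_net(obs)  # assumed network outputs
    a = np.tanh(mu)
    if return_stddev:
        # Average exploration noise across action dimensions,
        # i.e. the quantity that stddev_m accumulates above.
        return a, float(np.exp(log_std).mean())
    return a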
if exploration and TD > 0:
    actor.inc_training(scaling(x_prev), u)

Rt += R
x_data.append(x * 180 / pi)

if t >= Ttrial - step / 2 or restart:
    ntrial += 1
    Qa = actor.end_of_batch()[0]
    Qc = critic.end_of_batch()[0]
    if sys.argv[-1] != 'quick' or ntrial % 20 == 0:
        t, Rt, success_rate, Nt = quick_eval(lambda x: umax * clip(actor.eval(scaling(x)), -1, 1))
        print('Eval: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g'
              % (ntrial, t, Rt, success_rate, Nt, Qa, Qc))
        reward_graph.add_data(Rt, ntrial)
    else:
        Rt /= t / step
        success_rate = sum(1. if abs(a[0]) < 10 else 0. for a in x_data) / len(x_data) * 100
        print('Trial: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g'
              % (ntrial, t, Rt, success_rate, (diff - x0[0]) / (2 * pi), Qa, Qc))

    # Reset for the next trial:
    t = 0.
    x = array(x0)
    Rt = 0
    x_data = [x * 180 / pi]
    diff = 0.
    restart = False

answer = input('\nSave data? (y) ')
if answer == 'y':
Rt += R
x_data.append(x * 180 / pi)

if t >= Ttrial - step / 2 or restart:
    ntrial += 1
    Qa = actor.end_of_batch()[0]
    Qc = critic.end_of_batch()[0]
    if sys.argv[-1] != 'quick' or ntrial % 20 == 0:
        t, Rt, success_rate, Nt = quick_eval(
            lambda x: umax * clip(actor.eval(scaling(x)), -1, 1))
        print('Eval: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g'
              % (ntrial, t, Rt, success_rate, Nt, Qa, Qc))
        reward_graph.add_data(ntrial, Rt)
    else:
        Rt /= t / step
        success_rate = sum(1. if abs(a[0]) < 10 else 0.
                           for a in x_data) / len(x_data) * 100
        print('Trial: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g'
              % (ntrial, t, Rt, success_rate, (diff - x0[0]) / (2 * pi), Qa, Qc))
    sys.stdout.flush()

    # Reset for the next trial:
    t = 0.
    x = array(x0)
    Rt = 0
    x_data = [x * 180 / pi]
    diff = 0.
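Both trial loops pass the state through a `scaling` helper before the actor or critic sees it; a plausible reading is per-component normalization of the pendulum state into roughly [-1, 1]. A sketch under that assumption (the bound values in `x_max` are illustrative guesses, not taken from the source):

from numpy import array, pi

# Illustrative state bounds: angle (rad) and angular velocity (rad/s).
x_max = array([pi, 2 * pi])

def scaling(x):
    # Normalize each state component to roughly [-1, 1] so the
    # actor and critic networks see inputs on a common scale.
    return array(x) / x_max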
Li = ddpg.train(ITER_PER_EP)

# Evaluate the policy (every episode, since n_ep % 1 is always 0):
if n_ep % 1 == 0:
    s = eval_env.reset(store_data=True)
    for t in range(EP_LEN):
        s, _, done, _ = eval_env.step(ddpg.get_action(s))
        if done:
            break

    print('It %i | Ep %i | Li %+8.4f | ' % (ddpg.n_iter, n_ep, Li),
          end='', flush=True)
    eval_env.print_eval()
    sys.stdout.flush()

    ddpg.reward_summary(eval_env.get_Rt())
    reward_graph.add_data(eval_env.get_Rt(), n_ep)

end = time.time()
print('Elapsed time: %.3f' % (end - start))

answer = input('\nSave network parameters in ' + session_dir + '? (y) ')
if answer.strip() == 'y':
    os.makedirs(session_dir, exist_ok=True)
    ddpg.save_model(session_files)
    print('Parameters saved.')
else:
    answer = input('Where to store network parameters? (leave empty to discard data) ')
    if answer.strip():
        # Store in the user-specified directory instead:
        session_dir = answer
        os.makedirs(session_dir, exist_ok=True)
        ddpg.save_model(session_files)
        print('Parameters saved.')