Example #1
    def train(self):
        # To store reward history of each episode
        ep_reward_list = []
        # To store average reward history of last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler(
        ) as interruption:  # to properly save even if ctrl+C is pressed
            for eps in range(self.EPISODES):
                episode_reward = 0
                q = 0  # placeholder so monitor.add_data works even before the first train_step()
                s = self.env.reset()
                """
                if an env is created using the "gym.make" method, it will terminate after 200 steps
                """
                for t in range(self.MAX_TIME_STEPS):
                    # done = False
                    # while not done:
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim, )),
                                           np.reshape(a, (self.a_dim, )),
                                           r, done,
                                           np.reshape(s_, (self.s_dim, )))
                    episode_reward += r
                    # Start updating the networks only once the buffer holds more than one minibatch
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(
                    eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)

            self.save_weights(
                save_name=self.save_name)  # if you want to save weights
            self.plot_results(avg_reward=avg_reward_list, train=True)
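The docstring in this example refers to Gym's built-in time limit: an environment created with gym.make is wrapped in gym.wrappers.TimeLimit, so episodes are truncated after the registered max_episode_steps (200 for Pendulum-v0, for instance) no matter how large MAX_TIME_STEPS is. A minimal sketch of how to inspect or override that limit with the classic gym API; the environment id is only an illustration, since the snippet does not name one:

import gym
from gym.wrappers import TimeLimit

env = gym.make('Pendulum-v0')       # illustrative id; gym.make returns a TimeLimit-wrapped env
print(env.spec.max_episode_steps)   # registered horizon, e.g. 200 for Pendulum-v0
env = TimeLimit(env.unwrapped, max_episode_steps=1000)  # re-wrap with a custom horizon
# env.unwrapped alone gives the raw environment with no step limit at all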
Example #2
			# Evaluate the policy:
			if Q_loss != 0 and n_ep % EVAL_FREQ == 0 :
				eval_env.reset( store_data=True )
				stddev_m = 0
				for t in range( EP_LEN ) :
					a, stddev = sac.best_action( eval_env.get_obs(), return_stddev=True )
					stddev_m += stddev
					_, _, ep_done, _ = eval_env.step( a )
					if ep_done : break
				stddev_m /= EP_LEN
				alpha = float( sac.alpha )
				print( 'It %i | Ep %i | LQ %+7.4f | temp %5.3f | Sd %+5.2f | ' %
				       ( sac.n_iter, n_ep, Q_loss, alpha, stddev_m ), end='' )
				eval_env.print_eval()
				sys.stdout.flush()
				reward_graph.add_data( n_ep, eval_env.get_Rt(), alpha )


	end = time.time()
	print( 'Elapsed time: %.3f' % ( end - start ) )

	save_data = True
	answer = input( '\nSave network parameters in ' + session_dir + '? (y) ' )
	if answer.strip() != 'y' :
		answer = input( 'Where to store network parameters? (leave empty to discard data) ' )
		if answer.strip() :
			session_dir = answer
		else :
			save_data = False
	if save_data :
		os.makedirs( session_dir, exist_ok=True )
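The interactive prompt at the end of this example decides where, or whether, to store the network parameters. The same logic can be factored into a small helper; the sketch below only reorganizes what is shown above, and ask_save_dir and './session' are hypothetical names (the saving call itself depends on the agent's own API):

import os

def ask_save_dir(default_dir):
    # Return the directory to save network parameters in, or None to discard them
    answer = input('\nSave network parameters in ' + default_dir + '? (y) ')
    if answer.strip() == 'y':
        return default_dir
    answer = input('Where to store network parameters? (leave empty to discard data) ')
    return answer.strip() or None

target_dir = ask_save_dir('./session')
if target_dir is not None:
    os.makedirs(target_dir, exist_ok=True)
    # the actual saving is left to the agent, e.g. something like sac.save_model(...)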
Example #3
			if exploration and TD > 0 :
				actor.inc_training( scaling( x_prev ), u )

			Rt += R
			x_data.append( x*180/pi )

			if t >= Ttrial - step/2 or restart :
				ntrial += 1

				Qa = actor.end_of_batch()[0]
				Qc = critic.end_of_batch()[0]

				if sys.argv[-1] != 'quick' or ntrial%20 == 0 :
					t, Rt, success_rate, Nt = quick_eval( lambda x : umax*( clip( actor.eval( scaling( x ) ), -1, 1 ) ) )
					print( 'Eval: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g' % ( ntrial, t, Rt, success_rate, Nt, Qa, Qc ) )
					reward_graph.add_data( Rt, ntrial )
				else :
					Rt /= t/step
					success_rate = sum( [ ( 1. if abs( a[0] ) < 10 else 0. ) for a in x_data ] )/len( x_data )*100
					print( 'Trial: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g' % ( ntrial, t, Rt, success_rate, ( diff - x0[0] )/( 2*pi ), Qa, Qc ) )

				t = 0.
				x = array( x0 )
				Rt = 0
				x_data = [ x*180/pi ]
				diff = 0.
				restart = False

	answer = input( '\nSave data? (y) ' )
	if answer == 'y' :
Example #4
            Rt += R
            x_data.append(x * 180 / pi)

            if t >= Ttrial - step / 2 or restart:
                ntrial += 1

                Qa = actor.end_of_batch()[0]
                Qc = critic.end_of_batch()[0]

                if sys.argv[-1] != 'quick' or ntrial % 20 == 0:
                    t, Rt, success_rate, Nt = quick_eval(
                        lambda x: umax * (clip(actor.eval(scaling(x)), -1, 1)))
                    print(
                        'Eval: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g'
                        % (ntrial, t, Rt, success_rate, Nt, Qa, Qc))
                    reward_graph.add_data(ntrial, Rt)
                else:
                    Rt /= t / step
                    success_rate = sum([(1. if abs(a[0]) < 10 else 0.)
                                        for a in x_data]) / len(x_data) * 100
                    print(
                        'Trial: %i | t: %4.1f | Rt: %+7.4f | Success rate: %5.1f %% | Nt: %+3d | Qa: %7.2g | Qc: %7.2g'
                        % (ntrial, t, Rt, success_rate,
                           (diff - x0[0]) / (2 * pi), Qa, Qc))
                sys.stdout.flush()

                t = 0.
                x = array(x0)
                Rt = 0
                x_data = [x * 180 / pi]
                diff = 0.
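In the non-evaluation branch above, the trial statistics are computed directly from the recorded data: the return is averaged over the number of steps, the success rate is the percentage of samples whose first component (the angle, stored in degrees in x_data) stays within ±10 degrees of the reference position, and the last field counts the net number of full revolutions. A standalone restatement of that bookkeeping, assuming as in the snippet that each entry of x_data is a state vector whose first element is the angle in degrees (the helper name is illustrative):

from math import pi

def trial_stats(x_data, Rt, t, step, diff, x0):
    mean_return = Rt / (t / step)               # average return per step over the trial
    success_rate = sum(1.0 for x in x_data
                       if abs(x[0]) < 10) / len(x_data) * 100  # % of steps within +/-10 deg
    n_turns = (diff - x0[0]) / (2 * pi)         # net number of full revolutions
    return mean_return, success_rate, n_turns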
Example #5
                        Li = ddpg.train(ITER_PER_EP)

                    # Evaluate the policy:
                    if n_ep % 1 == 0:  # evaluation frequency of 1: evaluate after every episode
                        s = eval_env.reset(store_data=True)
                        for t in range(EP_LEN):
                            s, _, done, _ = eval_env.step(ddpg.get_action(s))
                            if done: break
                        print('It %i | Ep %i | Li %+8.4f | ' %
                              (ddpg.n_iter, n_ep, Li),
                              end='',
                              flush=True)
                        eval_env.print_eval()
                        sys.stdout.flush()
                        ddpg.reward_summary(eval_env.get_Rt())
                        reward_graph.add_data(eval_env.get_Rt(), n_ep)

            end = time.time()
            print('Elapsed time: %.3f' % (end - start))

            answer = input('\nSave network parameters in ' + session_dir +
                           '? (y) ')
            if answer.strip() == 'y':
                os.makedirs(session_dir, exist_ok=True)
                ddpg.save_model(session_files)
                print('Parameters saved.')
            else:
                answer = input(
                    'Where to store network parameters? (leave empty to discard data) '
                )
                if answer.strip():