Code example #1
    def train(self):
        # To store reward history of each episode
        ep_reward_list = []
        # To store average reward history of last few episodes
        avg_reward_list = []
        monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2)
        with Loop_handler() as interruption:  # so the run can still be saved properly if Ctrl+C is pressed
            q = 0.0  # so monitor.add_data() has a value even before the first train_step() call
            for eps in range(self.EPISODES):
                episode_reward = 0
                s = self.env.reset()
                """
                if an env is created using the "gym.make" method, it will terminate after 200 steps
                """
                for t in range(self.MAX_TIME_STEPS):
                    # done = False
                    # while not done:
                    if self.render:
                        self.env.render()
                    a = self.policy(s)
                    s_, r, done, _ = self.env.step(a)
                    self.replay_buffer.add(np.reshape(s, (self.s_dim, )),
                                           np.reshape(a, (self.a_dim, )),
                                           r, done,
                                           np.reshape(s_, (self.s_dim, )))
                    episode_reward += r
                    if self.replay_buffer.size() > self.minibatch_size:
                        q = self.train_step()
                    s = s_.reshape(1, -1)
                    if interruption():
                        break
                ep_reward_list.append(episode_reward)
                # Mean of last 40 episodes
                avg_reward = np.mean(ep_reward_list[-40:])
                print("Episode * {} * Avg Reward is ==> {}".format(
                    eps, avg_reward))
                avg_reward_list.append(avg_reward)
                monitor.add_data(avg_reward, q)
                if interruption():
                    # Leave the episode loop as well, so the weights below are saved right away.
                    break

            self.save_weights(
                save_name=self.save_name)  # if you want to save weights
            self.plot_results(avg_reward=avg_reward_list, train=True)
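The method above assumes a replay buffer object exposing add() and size() (and, inside train_step(), some way of sampling minibatches). The buffer implementation is not part of the excerpt; the following is only a minimal sketch of a FIFO buffer matching those calls, where the class name, the sample_batch() method and the deque-based storage are all assumptions rather than the original code.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """Minimal FIFO experience replay buffer (illustrative sketch, not the original implementation)."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, s, a, r, done, s_):
        # Store one transition; the oldest entries are dropped once the buffer is full.
        self.buffer.append((s, a, r, done, s_))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniformly sample a minibatch and stack each field into a NumPy array.
        batch = random.sample(list(self.buffer), batch_size)
        s, a, r, done, s_ = map(np.array, zip(*batch))
        return s, a, r, done, s_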
Code example #2
		if len( sys.argv ) > 2 :
			session_dir = sys.argv[2]
		sac.load( session_dir )
		if not sac.load_replay_buffer( session_dir + '/replay_buffer.pkl' ) :
			print( 'Could not find %s: starting with an empty replay buffer.' % ( session_dir + '/replay_buffer.pkl' ) )


	np.random.seed( hyper_params['seed'] )

	training_env = ENV()
	eval_env = ENV()

	n_ep = 0
	Q_loss = 0

	reward_graph = Monitor( [ 1 , 1 ], titles=[ 'Average reward per trial', 'Temperature' ], xlabel='trials', keep=False )

	import time
	start = time.time()

	with Loop_handler() as interruption :

		while not interruption() and n_ep < EP_MAX :


			# Run a new trial:
			s = training_env.reset()

			for _ in range( EP_LEN ) :

				# Choose a random action and execute the next step:
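Code examples #1 and #2 both wrap their training loops in Loop_handler so that pressing Ctrl+C sets a flag instead of killing the process, which is what lets the weights and replay buffer be saved afterwards. The actual Loop_handler implementation is not shown in these excerpts; the sketch below (the class name GracefulInterrupt is made up) only illustrates how such a context manager can be built on the standard signal module.

import signal


class GracefulInterrupt:
    """Context manager that turns SIGINT into a flag instead of an exception (illustrative sketch only)."""

    def __enter__(self):
        self._interrupted = False
        # Install a SIGINT handler and remember the previous one so it can be restored on exit.
        self._previous_handler = signal.signal(signal.SIGINT, self._handler)
        return self._check

    def _handler(self, signum, frame):
        self._interrupted = True

    def _check(self):
        # Called as interruption() inside the loop.
        return self._interrupted

    def __exit__(self, exc_type, exc_value, traceback):
        signal.signal(signal.SIGINT, self._previous_handler)
        return False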
Code example #3
ppo = PPO(**hyper_params)

if len(sys.argv) == 1 or sys.argv[1] != 'eval':

    if len(sys.argv) > 1 and sys.argv[1] == 'load':
        if len(sys.argv) > 2:
            session_dir = sys.argv[2]
        # Otherwise session_dir presumably keeps a default path defined earlier in the script.
        ppo.load(session_dir + '/session')

    training_env = ENV()
    eval_env = ENV()

    n_ep = 0

    reward_graph = Monitor(titles='Average reward per trial',
                           xlabel='trials',
                           keep=False)

    import time
    start = time.time()

    with Loop_handler() as interruption:

        while not interruption() and n_ep < EP_MAX:

            # Gather new data from the current policy:
            n_samples = 0

            for ep in range(EPISODES_PER_BATCH):

                s = training_env.reset()
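Judging from the sys.argv checks at the top of this excerpt (and the similar ones in code example #2), the script is presumably driven from the command line in one of three ways. The script name below is hypothetical; only the argument handling comes from the excerpt.

    python train_ppo.py                      # hypothetical script name; train from scratch
    python train_ppo.py load <session_dir>   # resume the session saved in <session_dir>
    python train_ppo.py eval                 # skip the training branch entirely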
Code example #4
	#prob_expl = lambda n : exp( -0.0003*n )  # alternative: exploration probability decaying with the trial number
	prob_expl = lambda n : 0.2  # constant 20% chance of exploring

	ntrial = 0
	t = 0.
	x = array( x0 )

	Rt = 0.
	x_data = [ x*180/pi ]

	diff = 0.
	restart = False

	#random.seed( 0 )

	reward_graph = Monitor( titles='Average reward per trial', xlabel='trials', keep=False )

	with Loop_handler() as interruption :

		while not interruption() and ntrial < 20000 :

			# Action selection:
			exploration = random.rand() < prob_expl( ntrial )
			if exploration :
				u = umax*( 2*random.rand() - 1 )
			else :
				u = umax*actor.eval( scaling( x ) )
			u = clip( u, -umax, umax )

			# Simulation step:
			x_prev = array( x )
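The action-selection block above mixes exploration and exploitation: with probability prob_expl(ntrial) a uniform random command in [-umax, umax] is applied, otherwise the actor's output on the scaled state is used, and the result is clipped either way. Purely as an illustration (this helper does not exist in the original script), the same logic can be factored into a standalone function:

from numpy import clip
from numpy.random import rand


def select_action(actor, x, ntrial, umax, prob_expl, scaling):
    # Illustrative helper mirroring the block above: epsilon-style exploration for a continuous action.
    if rand() < prob_expl(ntrial):
        # Explore: sample a uniform random command in [-umax, umax].
        u = umax*(2*rand() - 1)
    else:
        # Exploit: use the actor's output on the scaled state.
        u = umax*actor.eval(scaling(x))
    return clip(u, -umax, umax)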