예제 #1
0
        # initialize pca ?
        if i_episode == 0:
            state = env.reset(Tag=True)
        else:
            state = env.reset(Tag=False)

        # initialize agent's noise
        agent.reset()
        score = 0

        reward_y = []
        episode_x = []
        pbar = tqdm(range(100))

        for i in pbar:
            action = agent.act(state)
            time, accuracy, next_state, reward = env.step(action, i)

            # save accuracy
            start_time += time
            X.append(start_time)
            Y.append(accuracy)

            agent.step(state, action, reward, next_state)
            state = next_state
            score += reward
            pbar.set_description("Epoch: %d Accuracy: %.3f Reward: %.3f" %
                                 (i, accuracy, reward))

            # end?
            if accuracy >= 0.983:
예제 #2
0
def _flatten_history(chunks):
    """Concatenate per-step records into a single flat 1-D array.

    Produces the same result as repeatedly calling ``np.append(arr, chunk)``
    starting from an empty ``np.array([])`` (every chunk is flattened and the
    dtype is promoted against float64), but with one concatenation instead of
    one full copy per record.
    """
    return np.concatenate([np.array([])] + [np.ravel(c) for c in chunks])


def main(env, episodes=500, max_steps=500, eps_decay=.99,
         actor_lr=10**-6, critic_lr=10**-3, gamma=.9,
         base_nodes=64, batch_size=128, theta=.4, sigma=.25):
    """Train a DDPG agent on ``env`` and return the recorded training history.

    Args:
        env: Environment object. It must expose ``state_dim``, ``action_dim``,
            ``action_high`` and ``action_low`` attributes, a ``reset()``
            method returning the initial state, and a ``step(action)`` method
            returning ``(next_state, reward, done)``.
        episodes: Number of episodes to run.
        max_steps: Upper bound on the number of steps in a single episode.
        eps_decay, actor_lr, critic_lr, gamma, base_nodes, batch_size,
        theta, sigma: Forwarded unchanged to the ``Agent`` constructor.

    Returns:
        A tuple ``(rewards, c_losses, actions, Qs)`` of flat 1-D numpy arrays.
        ``rewards`` holds the summed reward of each episode; ``c_losses``,
        ``actions`` and ``Qs`` hold the flattened values recorded at every
        step (the value returned by ``agent.learn``, the action taken, and
        ``agent.critic.predict(state, action)`` respectively).
    """
    with tf.Session() as sess:

        # Initialize environment and constants
        input_dim = env.state_dim
        output_dim = env.action_dim
        action_high = env.action_high
        action_low = env.action_low

        # Create DDPG Agent
        agent = Agent(input_dim, output_dim, action_high, action_low,
                      actor_lr=actor_lr, critic_lr=critic_lr, gamma=gamma,
                      base_nodes=base_nodes, eps_decay=eps_decay,
                      batch_size=batch_size, theta=theta, sigma=sigma,
                      sess=sess)

        sess.run(tf.global_variables_initializer())
        agent.actor.update_target_network()
        agent.critic.update_target_network()

        # History is gathered in plain lists and flattened once at the end:
        # np.append copies the whole array on every call, which made the
        # logging cost grow quadratically with the number of steps.
        loss_log, reward_log, action_log, q_log = [], [], [], []

        for _ in tqdm(range(episodes)):

            # Reset episode
            state = env.reset()
            state = np.reshape(state, (-1, len(state)))
            agent.noise.reset()

            done = False
            step_count = 0
            total_reward = 0

            while not done and step_count < max_steps:

                # Action
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                next_state = np.reshape(next_state, (-1, len(next_state)))

                # Learn
                c_loss = agent.learn(state, action, reward, done, next_state)

                # Save results (the critic is queried after the learning
                # step, as in the original ordering)
                loss_log.append(c_loss)
                action_log.append(action)
                q_log.append(agent.critic.predict(state, action))

                # Loop
                state = next_state
                step_count += 1
                total_reward += reward

            # Reduce exploration, never letting eps drop below its floor.
            if agent.eps > agent.min_eps:
                agent.eps = max(agent.min_eps, agent.eps * agent.eps_decay)

            reward_log.append(total_reward)

        rewards = _flatten_history(reward_log)
        c_losses = _flatten_history(loss_log)
        actions = _flatten_history(action_log)
        Qs = _flatten_history(q_log)

        return rewards, c_losses, actions, Qs