# initialize PCA?
if i_episode == 0:
    state = env.reset(Tag=True)
else:
    state = env.reset(Tag=False)

# initialize agent's noise
agent.reset()

score = 0
reward_y = []
episode_x = []

pbar = tqdm(range(100))
for i in pbar:
    action = agent.act(state)
    time, accuracy, next_state, reward = env.step(action, i)

    # save cumulative training time and accuracy
    start_time += time
    X.append(start_time)
    Y.append(accuracy)

    agent.step(state, action, reward, next_state)
    state = next_state
    score += reward

    pbar.set_description("Epoch: %d Accuracy: %.3f Reward: %.3f" % (i, accuracy, reward))

    # end? (assumption: stop the episode once the target accuracy is reached)
    if accuracy >= 0.983:
        break
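# A minimal, hypothetical sketch of the environment interface the loop above assumes:
# reset(Tag=...) returns an initial state and step(action, i) returns
# (elapsed_time, accuracy, next_state, reward). The class name, state size and
# placeholder dynamics below are illustrative assumptions, not the original implementation.
import numpy as np

class DummyAccuracyEnv:
    def __init__(self, state_dim=4):
        self.state_dim = state_dim

    def reset(self, Tag=True):
        # Tag=True could trigger a full re-initialization (e.g. refitting PCA);
        # with this placeholder it makes no difference.
        return np.zeros(self.state_dim)

    def step(self, action, i):
        elapsed_time = 1.0                      # placeholder wall-clock cost per step
        accuracy = min(0.5 + 0.005 * i, 1.0)    # placeholder accuracy curve
        next_state = np.random.rand(self.state_dim)
        reward = accuracy                       # placeholder reward signal
        return elapsed_time, accuracy, next_state, reward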
import numpy as np
import tensorflow as tf
from tqdm import tqdm


def main(env, episodes=500, max_steps=500, eps_decay=.99, actor_lr=10**-6,
         critic_lr=10**-3, gamma=.9, base_nodes=64, batch_size=128,
         theta=.4, sigma=.25):
    with tf.Session() as sess:
        # Initialize environment and constants
        input_dim = env.state_dim
        output_dim = env.action_dim
        action_high = env.action_high
        action_low = env.action_low

        # Create DDPG agent (Agent is the actor-critic class defined elsewhere)
        agent = Agent(input_dim, output_dim, action_high, action_low,
                      actor_lr=actor_lr, critic_lr=critic_lr, gamma=gamma,
                      base_nodes=base_nodes, eps_decay=eps_decay,
                      batch_size=batch_size, theta=theta, sigma=sigma, sess=sess)
        sess.run(tf.global_variables_initializer())
        agent.actor.update_target_network()
        agent.critic.update_target_network()

        # Prepare for episodes
        c_losses, rewards, actions, Qs, states = [np.array([]) for _ in range(5)]

        for e in tqdm(range(episodes)):
            # Reset episode
            state = env.reset()
            state = np.reshape(state, (-1, len(state)))
            agent.noise.reset()
            done = False
            step_count = 0
            total_reward = 0

            while not done and step_count < max_steps:
                # Action
                action = agent.act(state)
                next_state, reward, done = env.step(action)
                next_state = np.reshape(next_state, (-1, len(next_state)))

                # Learn
                c_loss = agent.learn(state, action, reward, done, next_state)

                # Save results
                c_losses = np.append(c_losses, c_loss)
                actions = np.append(actions, action)
                states = np.append(states, state[0])
                Qs = np.append(Qs, agent.critic.predict(state, action))

                # Loop
                state = next_state
                step_count += 1
                total_reward += reward

            # Reduce exploration
            if agent.eps > agent.min_eps:
                agent.eps *= agent.eps_decay

            rewards = np.append(rewards, total_reward)

        return rewards, c_losses, actions, Qs
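# A hedged usage sketch of how main() might be invoked. "make_env" is a hypothetical
# factory standing in for whatever environment the project provides; it must expose
# state_dim, action_dim, action_high, action_low, reset() -> state and
# step(action) -> (next_state, reward, done).
if __name__ == "__main__":
    import matplotlib.pyplot as plt

    env = make_env()  # hypothetical; substitute the project's environment
    rewards, c_losses, actions, Qs = main(env, episodes=200, max_steps=300)

    # Plot the per-episode return to check that learning progresses.
    plt.plot(rewards)
    plt.xlabel("Episode")
    plt.ylabel("Total reward")
    plt.show()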