# batch train

```python
# NOTE: this block is assumed to run inside an episode loop that defines `ep`,
# with `env`, `actor`, `critic`, and `memory` (already holding transitions from
# earlier episodes) created beforehand; `import logging` and
# `import numpy as np` are assumed at the top of the script.
total_reward = 0
env.reset()

# take one random action to obtain an initial state
action = env.action_space.sample()
state, reward, done, _ = env.step(action)

for _ in range(1000):
    # training: update the critic and actor from a replayed mini-batch
    states, actions, rewards, next_states = memory.sample(20)
    next_actions = actor.get_actions(next_states)
    next_qs = critic.get_qs(next_states, next_actions)
    loss, q = critic.train(states, actions, rewards, next_qs)
    action_gradients = critic.get_action_gradients(states, actions)
    actor.train(states, action_gradients[0])

    # interaction: act in the environment and store the transition
    env.render()
    action = actor.get_action_for_train(state, ep)
    next_state, reward, done, _ = env.step(action)
    memory.add((state, action, reward, next_state))
    # print(state, action, reward, next_state)
    total_reward += reward
    # print(action, reward, total_reward)
    state = next_state
    if done:
        break

# if ep % 10 == 0:
#     critic.update_network_params()
logging.info('Episode: {} Total Reward: {:.4f} Q: {:.4f} loss: {:.4f}'.format(
    ep, total_reward, np.max(q), loss))
```
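The loop relies on a `memory` object exposing `add` and `sample`, whose implementation does not appear in this section. Below is a minimal sketch of a compatible uniform replay buffer; the `ReplayMemory` name, the `capacity` parameter, and its default value are illustrative assumptions, not the author's actual class.

```python
import random
from collections import deque

import numpy as np

class ReplayMemory:
    """Fixed-capacity buffer of (state, action, reward, next_state) tuples.

    Hypothetical stand-in for the `memory` object used in the training loop.
    """

    def __init__(self, capacity=10000):
        # a deque with maxlen silently evicts the oldest transition when full
        self.buffer = deque(maxlen=capacity)

    def add(self, transition):
        # transition: a (state, action, reward, next_state) tuple
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform random mini-batch; each field is stacked into a NumPy array,
        # matching the four-way unpacking of memory.sample(20) in the loop above
        batch = random.sample(list(self.buffer), batch_size)
        states, actions, rewards, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, next_states
```

With a buffer like this, `memory.sample(20)` is only valid once at least 20 transitions have been stored, which is why a guard on the buffer size before the training step is common in practice.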