# Imports assumed by this section; Database, Environment, ActorCritic,
# parse_cmd_args and the global `argus` config come from this repo's own modules.
import numpy as np
import tensorflow as tf
from keras import backend as K


def main():
    # try:
    parse_cmd_args()
    sess = tf.Session()
    K.set_session(sess)
    db = Database()
    env = Environment(db, argus)
    actor_critic = ActorCritic(env, sess,
                               learning_rate=argus['learning_rate'],
                               train_min_size=argus['train_min_size'],
                               size_mem=argus['maxlen_mem'],
                               size_predict_mem=argus['maxlen_predict_mem'])
    num_trials = argus['num_trial']  # ?
    # trial_len = 500  # ?

    # Legacy version of the first iteration and training loop. It unpacks a
    # 4-tuple from env.step(), while the live loop further down expects five
    # values (including score), so it is fenced off here rather than executed.
    '''
    # ntp
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    new_state, reward, done, _ = env.step(action, 0, 1)  # apply the action -> to steady state -> return the reward
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])
    print("0-shape-")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len < 32, useless
    cur_state = new_state

    for i in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()  # to execute
        new_state, reward, done, _ = env.step(action, isPredicted, i + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        reward_np = np.array([reward])
        print("%d-shape-" % i)
        print(new_state.shape)
        actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
        actor_critic.train()
        cur_state = new_state
    '''
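    # Note on the reshape((1, n)) calls used throughout: the Keras model
    # predicts on batches, so each single observation of shape (n,) is lifted
    # to a batch of one with shape (1, n),
    # e.g. np.arange(3).reshape((1, 3)) -> array([[0, 1, 2]]).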
    # A3C-style worker kept for reference only: it assumes a PyTorch ActorCritic
    # (choose_action / remember / calc_loss / parameters / load_state_dict),
    # plus gym, torch.multiprocessing as mp, and EPISODES / T_MAX globals, none
    # of which exist in this TensorFlow project, so it stays fenced off.
    '''
    class Agent(mp.Process):
        def __init__(self, global_actor_critic, optimizer, input_dims, nb_actions,
                     gamma, lr, name, global_ep_index, env_id):
            super(Agent, self).__init__()
            self.local_actor_critic = ActorCritic(input_dims, nb_actions, gamma)
            self.global_actor_critic = global_actor_critic
            self.name = "w%02i" % name
            self.episode_index = global_ep_index
            self.env = gym.make(env_id)
            self.optimizer = optimizer

        def run(self):
            t_step = 1
            while self.episode_index.value < EPISODES:
                done = False
                observation = self.env.reset()
                score = 0
                self.local_actor_critic.clear_memory()
                while not done:
                    action = self.local_actor_critic.choose_action(observation)
                    observation_, reward, done, info = self.env.step(action)
                    score += reward
                    self.local_actor_critic.remember(observation, action, reward)
                    # Every T_MAX steps (or at episode end): push local gradients
                    # into the shared global network, then pull its weights back.
                    if (t_step % T_MAX) == 0 or done:
                        loss = self.local_actor_critic.calc_loss(done)
                        self.optimizer.zero_grad()
                        loss.backward()
                        for local_param, global_param in zip(
                                self.local_actor_critic.parameters(),
                                self.global_actor_critic.parameters()):
                            global_param._grad = local_param.grad
                        self.optimizer.step()
                        self.local_actor_critic.load_state_dict(
                            self.global_actor_critic.state_dict())
                        self.local_actor_critic.clear_memory()
                    t_step += 1
                    observation = observation_
                with self.episode_index.get_lock():
                    self.episode_index.value += 1
                print(self.name, 'episode', self.episode_index.value,
                      'reward %.1f' % score)
    '''
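    # How such a worker is typically launched (a minimal sketch under the same
    # assumptions as the class above; `SharedAdam` is a hypothetical Adam
    # variant whose state lives in shared memory, not something in this repo):
    '''
    global_net = ActorCritic(input_dims, nb_actions, gamma)
    global_net.share_memory()                  # expose weights to all workers
    optimizer = SharedAdam(global_net.parameters(), lr=1e-4)
    global_ep = mp.Value('i', 0)               # shared episode counter

    workers = [Agent(global_net, optimizer, input_dims, nb_actions,
                     gamma, lr=1e-4, name=i, global_ep_index=global_ep,
                     env_id='CartPole-v1')
               for i in range(mp.cpu_count())]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    '''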
    # Live version of the first iteration and training loop (env.step() here
    # returns state, reward, done, score, info).
    # ntp
    env.preheat()

    # First iteration
    cur_state = env._get_obs()  # np.array (inner_metric + sql)
    cur_state = cur_state.reshape((1, env.state.shape[0]))
    # action = env.action_space.sample()
    action = env.fetch_action()  # np.array
    action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
    new_state, reward, done, score, _ = env.step(action, 0, 1)  # apply the action -> to steady state -> return the reward
    new_state = new_state.reshape((1, env.state.shape[0]))
    reward_np = np.array([reward])
    print("0-shape")
    print(new_state.shape)
    actor_critic.remember(cur_state, action_2, reward_np, new_state, done)
    actor_critic.train()  # len < 32, useless
    cur_state = new_state

    predicted_rewardList = []
    for epoch in range(num_trials):
        # env.render()
        cur_state = cur_state.reshape((1, env.state.shape[0]))
        action, isPredicted = actor_critic.act(cur_state)
        print(action)
        action_2 = action.reshape((1, env.action_space.shape[0]))  # for memory
        # action.tolist()  # to execute
        new_state, reward, done, score, _ = env.step(action, isPredicted, epoch + 1)
        new_state = new_state.reshape((1, env.state.shape[0]))
        # Track rewards for the steps whose action came from the predictor
        if isPredicted == 1:
            predicted_rewardList.append([epoch, reward])
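# Assumed script entry point; the fragment defines main() but its call site
# is not shown in the original.
if __name__ == '__main__':
    main()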