def trainD(file_name="Distral_1col", batch_size=128, gamma=0.999, alpha=0.9, beta=5, eps_start=0.9, eps_end=0.05, eps_decay=5, is_plot=False, num_episodes=200, max_num_steps_per_episode=1000, learning_rate=0.001, memory_replay_size=10000, memory_policy_size=1000): num_envs = 4 list_of_envs = [] for i in range(num_envs): list_of_envs.append(gym.make('Breakout-v4')) config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) # pi_0 policy = Policy(list_of_envs[0].action_space.n, num_envs, alpha, beta, sess) # Q value, every environment has one, used to calculate A_i, models = [ DQN(list_of_envs[i], policy, alpha, beta, sess, model_name="model_" + str(i)) for i in range(0, num_envs) ] policy.add_models(models) sess.run(tf.global_variables_initializer()) # info list for each environment episode_durations = [[] for _ in range(num_envs)] # list of local steps episode_rewards = [[] for _ in range(num_envs) ] # list of list of episode reward episodes_done = np.zeros(num_envs) # episode num steps_done = np.zeros(num_envs) # global timesteps for each env current_time = np.zeros(num_envs) # local timesteps for each env episode_total_rewards = np.zeros(num_envs) policy_step = 0 # Initialize environments states = [] for i in range(num_envs): states.append(policy.models[i].env.reset()) while np.min(episodes_done) < num_episodes: # 1. do the step for each env for i in range(num_envs): scale_state = cv2.resize(states[i], (84, 84)) action = policy.models[i].select_action(scale_state) next_state, reward, done, _ = policy.models[i].env.step(action[0, 0]) reward = [reward] episode_total_rewards[i] += reward # if done: # next_state = None steps_done[i] += 1 # global_steps current_time[i] += 1 # local steps time = [current_time[i]] # 把states scale 到[84, 84] policy.models[i].experience(scale_state, action, reward, cv2.resize(next_state, (84, 84)), done, time) states[i] = next_state # move to next state # 2. do one optimization step for each env using "soft-q-learning". # Perform one step of the optimization (on the target network) policy.models[i].optimize_step() # ===========update step info end ======================== # ===========update episode info begin ==================== if done: print( "ENV:", i, "iter:", episodes_done[i], "\treward:", episode_total_rewards[i], "\tit:", current_time[i], "\texp_factor:", eps_end + (eps_start - eps_end) * math.exp(-1. * episodes_done[i] / eps_decay)) states[i] = policy.models[i].env.reset() # reset env episodes_done[i] += 1 # episode steps episode_durations[i].append( current_time[i] ) # append each episode local timesteps list for every env current_time[i] = 0 # reset local timesteps episode_rewards[i].append( episode_total_rewards[i] ) # append total episode_reward to list if is_plot: plot_rewards(episode_rewards, i) # ===========update episode info end ==================== # 3. do one optimization step for the policy # after all envs has performed one step, optimize policy policy_step += 1 l = policy.optimize_step(policy_step) print("Policy steps ", policy_step, ", loss : ", l) print('Complete') if is_plot: plt.ioff() plt.show() np.save(file_name + '-distral-2col-rewards', episode_rewards) np.save(file_name + '-distral-2col-durations', episode_durations) return models, policy, episode_rewards, episode_durations
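

# A minimal usage sketch, assuming Policy, DQN, and plot_rewards are
# importable from this repo; the argument values below are illustrative,
# not tuned.
if __name__ == "__main__":
    models, policy, rewards, durations = trainD(
        file_name="Distral_1col",  # prefix for the saved .npy result files
        num_episodes=200,          # stop once every env has finished this many
        is_plot=False,             # set True to plot rewards live via matplotlib
    )
    # Per-env learning curves are saved as <file_name>-distral-2col-*.npy.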