def train(env, policy, normalizer, hp, parentPipes, args):
    """Train a policy with Augmented Random Search (ARS), optionally in parallel.

    :param env        : gym-style environment (used only for serial rollouts)
    :param policy     : policy object exposing sample_deltas()/update()/theta
    :param normalizer : observation normalizer shipped to the worker processes
    :param hp         : hyperparameter object (nb_steps, nb_directions,
                        nb_best_directions, to_text, ...)
    :param parentPipes: list of parent ends of worker pipes; falsy/empty means
                        run all rollouts serially in this process
    :param args       : extra options; args.logdir is the output directory and
                        args is forwarded to policy.update()
    :returns          : nothing; logs and policy snapshots are written to disk
    """
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    # Experiment directory layout: args.logdir/{iterations,logs}.
    # NOTE: os.chdir makes all later relative paths land inside logdir.
    if os.path.isdir(args.logdir) == False:
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if os.path.isdir('iterations') == False:
        os.mkdir('iterations')
    if os.path.isdir('logs') == False:
        os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):
        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        # (Fixed: the original tested `if parentPipes:` twice in a row; the
        # first test only set process_count. Merged into a single branch.)
        if parentPipes:
            # Parallel rollouts: fan the nb_directions perturbations out over
            # the workers in groups of at most process_count.
            process_count = len(parentPipes)
            p = 0
            while p < hp.nb_directions:
                n_left = hp.nb_directions - p  # directions still to evaluate
                batch = min(process_count, n_left)

                # Positive perturbations: send one direction per worker, then
                # collect the (reward, step_count) replies in the same order.
                for k in range(batch):
                    parentPipes[k].send([
                        _EXPLORE,
                        [normalizer, policy, hp, "positive", deltas[p + k]]
                    ])
                for k in range(batch):
                    positive_rewards[p + k], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count

                # Negative (opposite) perturbations, same fan-out/fan-in.
                for k in range(batch):
                    parentPipes[k].send([
                        _EXPLORE,
                        [normalizer, policy, hp, "negative", deltas[p + k]]
                    ])
                for k in range(batch):
                    negative_rewards[p + k], step_count = parentPipes[k].recv()
                    total_steps = total_steps + step_count

                p = p + process_count
            print('total steps till now: ', total_steps, 'Processes done: ', p)
        else:
            # Serial rollouts.
            # NOTE(review): this explore() signature (no normalizer, scalar
            # return, no step count) differs from the worker protocol above —
            # confirm against the serial explore() definition; total_steps is
            # never incremented on this path.
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive",
                                              deltas[k], hp)
            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative",
                                              deltas[k], hp)

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the
        # best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(
                zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Std-dev over only the best directions' rewards, as ARS prescribes.
        all_rewards = np.array([x[0] for x in rollouts] +
                               [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Evaluate the unperturbed updated policy, log, and checkpoint.
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log, keys=['steps', 'return'],
                             save_loc='logs/')
def train(env, policy, normalizer, hp, job_name="default_exp"):
    """
    Training using Augmented Random Search (serial rollouts only)

    :param env : OpenAI gym environment
    :param policy : Object of class Policy
    :param normalizer : Object of class normalizer
    :param hp : Object of class hp
    :param job_name : Name of the directory where you want to save data
    :returns : Nothing, trains the agent; logs and policy snapshots are
               written under job_name/
    """
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    # Experiment directory layout: job_name/{iterations,logs}.
    # NOTE: os.chdir makes all later relative paths land inside job_name.
    if not os.path.isdir(job_name):
        os.mkdir(job_name)
    previous_dir = os.getcwd()
    os.chdir(job_name)
    if not os.path.isdir('iterations'):
        os.mkdir('iterations')
    if not os.path.isdir('logs'):
        os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):
        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas(hp)
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions

        # Getting the positive rewards in the positive directions.
        # BUGFIX: accumulate the environment steps of EVERY rollout inside the
        # loops; the original summed the step counts only once, after both
        # loops, so just the final direction's rollouts were counted.
        for k in range(hp.nb_directions):
            positive_rewards[k], step_count = explore(
                env, normalizer, policy, "positive", deltas[k], hp)
            total_steps = total_steps + step_count

        # Getting the negative rewards in the negative/opposite directions
        for k in range(hp.nb_directions):
            negative_rewards[k], step_count = explore(
                env, normalizer, policy, "negative", deltas[k], hp)
            total_steps = total_steps + step_count

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the
        # best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (r_pos, r_neg) in enumerate(
                zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:hp.nb_best_directions]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Std-dev over only the best directions' rewards, as ARS prescribes.
        all_rewards = np.array([x[0] for x in rollouts] +
                               [x[1] for x in rollouts])
        sigma_r = all_rewards.std()

        # Updating our policy.
        # BUGFIX: the original passed an undefined name `args` as a third
        # argument (this function's signature has job_name, not args), which
        # raised NameError on the first iteration.
        # NOTE(review): confirm Policy.update accepts the two-argument form.
        policy.update(rollouts, sigma_r)

        # Evaluate the unperturbed updated policy, log, and checkpoint.
        reward_evaluation, _ = explore(env, normalizer, policy, None, None, hp)
        logger.log_kv('steps', total_steps)
        logger.log_kv('return', reward_evaluation)
        if reward_evaluation > best_return:
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log, keys=['steps', 'return'],
                             save_loc='logs/')