Beispiel #1
0
def train(env, policy, hp, parentPipes, args):
    args.logdir = "experiments/" + args.logdir
    logger = DataLog()
    total_steps = 0
    best_return = -99999999

    working_dir = os.getcwd()

    if os.path.isdir(args.logdir) == False:
        os.mkdir(args.logdir)

    previous_dir = os.getcwd()

    os.chdir(args.logdir)
    if os.path.isdir('iterations') == False: os.mkdir('iterations')
    if os.path.isdir('logs') == False: os.mkdir('logs')
    hp.to_text('hyperparameters')

    log_dir = os.getcwd()
    os.chdir(working_dir)

    for step in range(hp.nb_steps):
        if hp.domain_Rand:
            env.Set_Randomization(default=False)
        else:
            env.randomize_only_inclines()
        #Cirriculum learning
        if (step > hp.curilearn):
            avail_deg = [7, 9, 11, 11]
            env.incline_deg = avail_deg[random.randint(0, 3)]
        else:
            avail_deg = [5, 7, 9]
            env.incline_deg = avail_deg[random.randint(0, 2)]

        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions
        if (parentPipes):
            process_count = len(parentPipes)
        if parentPipes:
            p = 0
            while (p < hp.nb_directions):
                temp_p = p
                n_left = hp.nb_directions - p  #Number of processes required to complete the search
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send(
                        [_EXPLORE, [policy, hp, "positive", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send(
                        [_EXPLORE, [policy, hp, "negative", deltas[temp_p]]])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                p = p + process_count
                print('total steps till now: ', total_steps,
                      'Processes done: ', p)

        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive",
                                              deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative",
                                              deltas[k], hp)

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
        all_rewards = np.array([x[0]
                                for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std(
        )  # Standard deviation of only rewards in the best directions is what it should be
        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        #Start evaluating after only second stage
        if step >= hp.curilearn:
            # policy evaluation after specified iterations
            if step % hp.evalstep == 0:
                reward_evaluation = policyevaluation(env, policy, hp)
                logger.log_kv('steps', step)
                logger.log_kv('return', reward_evaluation)
                if (reward_evaluation > best_return):
                    best_policy = policy.theta
                    best_return = reward_evaluation
                    np.save(log_dir + "/iterations/best_policy.npy",
                            best_policy)
                print('Step:', step, 'Reward:', reward_evaluation)
                policy_path = log_dir + "/iterations/" + "policy_" + str(step)
                np.save(policy_path, policy.theta)

                logger.save_log(log_dir + "/logs/")
                make_train_plots_ars(log=logger.log,
                                     keys=['steps', 'return'],
                                     save_loc=log_dir + "/logs/")
Beispiel #2
0
def train(env, policy, normalizer, hp, parentPipes, args):
    logger = DataLog()
    total_steps = 0
    best_return = -99999999
    if os.path.isdir(args.logdir) == False:
        os.mkdir(args.logdir)
    previous_dir = os.getcwd()
    os.chdir(args.logdir)
    if os.path.isdir('iterations') == False: os.mkdir('iterations')
    if os.path.isdir('logs') == False: os.mkdir('logs')
    hp.to_text('hyperparameters')

    for step in range(hp.nb_steps):

        # Initializing the perturbations deltas and the positive/negative rewards
        deltas = policy.sample_deltas()
        positive_rewards = [0] * hp.nb_directions
        negative_rewards = [0] * hp.nb_directions
        if (parentPipes):
            process_count = len(parentPipes)
        if parentPipes:
            p = 0
            while (p < hp.nb_directions):
                temp_p = p
                n_left = hp.nb_directions - p  #Number of processes required to complete the search
                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "positive", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p
                for k in range(min([process_count, n_left])):
                    positive_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    parentPipe = parentPipes[k]
                    parentPipe.send([
                        _EXPLORE,
                        [normalizer, policy, hp, "negative", deltas[temp_p]]
                    ])
                    temp_p = temp_p + 1
                temp_p = p

                for k in range(min([process_count, n_left])):
                    negative_rewards[temp_p], step_count = parentPipes[k].recv(
                    )
                    total_steps = total_steps + step_count
                    temp_p = temp_p + 1
                p = p + process_count

                # print('mp step has worked, ', p)
                print('total steps till now: ', total_steps,
                      'Processes done: ', p)

        else:
            # Getting the positive rewards in the positive directions
            for k in range(hp.nb_directions):
                positive_rewards[k] = explore(env, policy, "positive",
                                              deltas[k], hp)

            # Getting the negative rewards in the negative/opposite directions
            for k in range(hp.nb_directions):
                negative_rewards[k] = explore(env, policy, "negative",
                                              deltas[k], hp)

        # Sorting the rollouts by the max(r_pos, r_neg) and selecting the best directions
        scores = {
            k: max(r_pos, r_neg)
            for k, (
                r_pos,
                r_neg) in enumerate(zip(positive_rewards, negative_rewards))
        }
        order = sorted(scores.keys(),
                       key=lambda x: -scores[x])[:int(hp.nb_best_directions)]
        rollouts = [(positive_rewards[k], negative_rewards[k], deltas[k])
                    for k in order]

        # Gathering all the positive/negative rewards to compute the standard deviation of these rewards
        all_rewards = np.array([x[0]
                                for x in rollouts] + [x[1] for x in rollouts])
        sigma_r = all_rewards.std(
        )  # Standard deviation of only rewards in the best directions is what it should be
        # Updating our policy
        policy.update(rollouts, sigma_r, args)

        # Printing the final reward of the policy after the update
        reward_evaluation = explore(env, policy, None, None, hp)
        logger.log_kv('steps', step)
        logger.log_kv('return', reward_evaluation)
        if (reward_evaluation > best_return):
            best_policy = policy.theta
            best_return = reward_evaluation
            np.save("iterations/best_policy.npy", best_policy)
        print('Step:', step, 'Reward:', reward_evaluation)
        policy_path = "iterations/" + "policy_" + str(step)
        np.save(policy_path, policy.theta)
        logger.save_log('logs/')
        make_train_plots_ars(log=logger.log,
                             keys=['steps', 'return'],
                             save_loc='logs/')