Example 1
def train(N, T, delta, env):
    """
    param N: number of trajectories to sample in each time step
    param T: number of iterations to train the model
    param delta: trust region size
    param env: the environment for the policy to learn
    
    return:
        theta: the trained model parameters
        avg_episodes_rewards: list of average rewards for each time step
    """
    theta = np.random.rand(C.extracted_feature_size, 1)

    episode_rewards = []  # record the average reward of each training iteration

    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        trajectories_grads, trajectories_rewards = sample(theta, env, N)

        # average the per-trajectory reward sums over the N sampled trajectories
        average_reward = np.mean(
            [np.sum(trajectory_reward) for trajectory_reward in trajectories_rewards])
        print('average reward is', average_reward, 'at training epoch',
              iteration_index)

        episode_rewards.append(average_reward)

        #gradient of the value function
        value_function_gradient = utils.compute_value_gradient(
            trajectories_grads, trajectories_rewards)

        #fisher matrix
        fisher_matrix = utils.compute_fisher_matrix(trajectories_grads)

        #step size
        step_size = utils.compute_eta(delta, fisher_matrix,
                                      value_function_gradient)

        # natural gradient update: theta <- theta + eta * F^{-1} g
        theta += step_size * np.linalg.inv(
            fisher_matrix) @ value_function_gradient

        #save the learned parameter theta
        learned_parameter_theta = {}
        learned_parameter_theta['learned_parameter_theta'] = theta
        cwd = os.getcwd()
        #cwd = os.path.join(cwd, 'data_folder')
        parameter_file = 'learned_parameter_theta.json'
        cwd = os.path.join(cwd, parameter_file)
        with open(cwd, 'w') as statusFile:
            statusFile.write(jsonpickle.encode(learned_parameter_theta))

    return theta, episode_rewards
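The update step above forms the explicit inverse of the Fisher matrix. A minimal sketch of an equivalent, numerically safer variant (not part of the original code) solves the linear system F x = g instead of computing inv(F):

import numpy as np

def natural_gradient_step(theta, fisher_matrix, value_function_gradient, step_size):
    """One natural-gradient update: theta <- theta + eta * F^{-1} g.

    Solving F x = g avoids forming the inverse explicitly, which is cheaper
    and more stable when the Fisher matrix is ill-conditioned.
    """
    natural_gradient = np.linalg.solve(fisher_matrix, value_function_gradient)
    return theta + step_size * natural_gradient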
Example 2
def train(N, T, delta):
    """

    :param N: number of trajectories to sample in each time step
    :param T: number of iterations to train the model
    :param delta: trust region size
    :return:
        theta: the trained model parameters
        episode_rewards: list of average rewards, one per training iteration
    """
    theta = np.random.rand(200, 1)

    env = simple_continuous_buy_sell_spy.simple_continuous_buy_sell_spy()

    episode_rewards = []

    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        trajectories_grads, trajectories_reward = sample(theta, env, N)

        # average the per-trajectory reward sums over the N sampled trajectories
        total_reward = 0
        for trajectory_index in range(0, len(trajectories_reward)):
            current_reward = trajectories_reward[trajectory_index]
            total_reward += np.sum(current_reward)
        total_reward = total_reward / len(trajectories_reward)
        print('total_reward.append(', total_reward, ')')

        episode_rewards.append(total_reward)

        #gradient of the value function
        value_function_gradient = utils.compute_value_gradient(
            trajectories_grads, trajectories_reward)

        #fisher matrix
        fisher_matrix = utils.compute_fisher_matrix(trajectories_grads)

        #step size
        step_size = utils.compute_eta(delta, fisher_matrix,
                                      value_function_gradient)

        #update theta, skipping the step if it contains NaN or inf values
        update_theta = step_size * np.linalg.inv(
            fisher_matrix) @ value_function_gradient
        if np.isfinite(update_theta).all():
            theta += update_theta
        # otherwise leave theta unchanged

    return theta, episode_rewards
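utils.compute_eta is not shown in these examples. One common choice for a KL trust region of size delta, sketched below under that assumption, is eta = sqrt(delta / (g^T F^{-1} g + eps)); the small eps term keeps the step finite when the quadratic term is close to zero, which is the failure mode the np.isfinite guard above protects against.

import numpy as np

def compute_eta_sketch(delta, fisher_matrix, value_function_gradient, eps=1e-6):
    """Step size for a KL trust region of size delta.

    A sketch, not the original utils.compute_eta:
    eta = sqrt(delta / (g^T F^{-1} g + eps))
    """
    g = value_function_gradient
    quadratic_term = (g.T @ np.linalg.solve(fisher_matrix, g)).item()
    return np.sqrt(delta / (quadratic_term + eps))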
Example 3
def train(N, T, delta):
    """

    :param N: number of trajectories to sample in each time step
    :param T: number of iterations to train the model
    :param delta: trust region size
    :return:
        theta: the trained model parameters
        episode_rewards: list of average rewards, one per training iteration
    """
    theta = np.random.rand(100, 1)
    env = gym.make('CartPole-v0')
    env.seed(12345)

    episode_rewards = []

    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        trajectories_grads, trajectories_reward = sample(theta, env, N)

        # average the per-trajectory reward sums over the N sampled trajectories
        average_reward = np.mean(
            [np.sum(trajectory_reward) for trajectory_reward in trajectories_reward])
        print('average reward is', average_reward)

        episode_rewards.append(average_reward)

        #gradient of the value function
        value_function_gradient = utils.compute_value_gradient(
            trajectories_grads, trajectories_reward)

        #fisher matrix
        fisher_matrix = utils.compute_fisher_matrix(trajectories_grads)

        #step size
        step_size = utils.compute_eta(delta, fisher_matrix,
                                      value_function_gradient)

        #update theta
        theta += step_size * np.linalg.inv(
            fisher_matrix) @ value_function_gradient

    return theta, episode_rewards
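Assuming sample and the utils helpers are importable, this CartPole version can be called directly; the hyperparameter values below are illustrative, not taken from the original code:

# Illustrative hyperparameters (not fixed anywhere in the snippet above).
N = 100       # trajectories sampled per iteration
T = 20        # training iterations
delta = 1e-2  # trust region size

theta, episode_rewards = train(N, T, delta)
print('final average reward:', episode_rewards[-1])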
Example 4
    soln_fisher = tests_info[i]['fisher']
    fisher = utils.compute_fisher_matrix(total_grads)

    err = np.linalg.norm(soln_fisher - fisher)
    print('test {} for compute_fisher_matrix - error = {}'.format(i, err))

""" ------------- testing compute_value_gradient ----------------"""
print('-' * 10 + ' testing compute_value_gradient ' + '-' * 10)
for i in test_cases:
    total_grads = tests_info[i]['total_grads']
    total_rewards = tests_info[i]['total_rewards']

    soln_v_grad = tests_info[i]['v_grad']
    #print('the solution grad is',soln_v_grad.tolist())
    v_grad = utils.compute_value_gradient(total_grads, total_rewards)
    #print('the computed grad is',v_grad.tolist())

    err = np.linalg.norm(soln_v_grad - v_grad)
    print('test {} for compute_value_gradient - error = {}'.format(i, err))

""" ------------- testing compute_value_gradient ----------------"""
print('-' * 10 + ' testing compute_value_gradient ' + '-' * 10)
for i in test_cases:

    fisher = tests_info[i]['fisher']
    delta = 1e-2
    v_grad = tests_info[i]['v_grad']
    soln_eta = tests_info[i]['eta']

    eta = utils.compute_eta(delta, fisher, v_grad)

    err = np.linalg.norm(soln_eta - eta)
    print('test {} for compute_eta - error = {}'.format(i, err))
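The tests above only compare against precomputed solutions; the helpers themselves are not shown. As a point of reference, a minimal sketch of a Fisher-matrix estimate built from per-trajectory score-function gradients is given below; the ridge term lam is an assumption added to keep the estimate invertible and may differ from what utils.compute_fisher_matrix actually does.

import numpy as np

def compute_fisher_matrix_sketch(total_grads, lam=1e-3):
    """Estimate F = E[grad log pi(a|s) grad log pi(a|s)^T] + lam * I.

    total_grads: list over trajectories, each a list of (d, 1) gradient vectors.
    """
    d = total_grads[0][0].shape[0]
    fisher = np.zeros((d, d))
    for trajectory_grads in total_grads:
        outer_sum = np.zeros((d, d))
        for grad in trajectory_grads:
            outer_sum += grad @ grad.T
        fisher += outer_sum / len(trajectory_grads)  # average over time steps
    fisher /= len(total_grads)                       # average over trajectories
    return fisher + lam * np.eye(d)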
Example 5
def train(N, T, delta, env):
    """
    param N: number of trajectories to sample in each time step
    param T: number of iterations to train the model
    param delta: trust region size
    param env: the environment for the policy to learn
    
    return:
        theta: the trained model parameters
        avg_episodes_rewards: list of average rewards for each time step
    """
    theta = np.random.rand(C.extracted_feature_size, 1)

    #cov matrix for the exploration part of sampling
    variance = torch.full(size=(C.output_dim, ),
                          fill_value=C.variance_for_exploration)
    cov_matrix = torch.diag(variance)

    #inv_cov_matrix for computing log grad of action distribution
    inv_cov_matrix_diag = np.ones(
        C.output_dim) * (1.0 / C.variance_for_exploration)
    inv_cov_matrix = np.diag(inv_cov_matrix_diag)

    replay_buffer = []
    replay_buffer_rewards = []
    optimization_history_list = []

    for iteration_index in range(0, T):
        #first, I sample the grads and the rewards
        replay_buffer, replay_buffer_rewards, current_batch_reward = sample(
            theta, env, N, replay_buffer, replay_buffer_rewards, cov_matrix)

        #record the optimization process
        optimization_history_list.append(current_batch_reward)
        optimization_history = {}
        optimization_history['objective_history'] = optimization_history_list
        cwd = os.getcwd()
        #cwd = os.path.join(cwd, 'data_folder')
        parameter_file = 'optimization_history.json'
        cwd = os.path.join(cwd, parameter_file)
        with open(cwd, 'w') as statusFile:
            statusFile.write(jsonpickle.encode(optimization_history))

        print('this is training epoch', iteration_index)
        print('the current reward is', current_batch_reward)

        for _ in range(0, C.max_offline_training):

            #sample experience from the replay buffer for training
            # new_replay_buffer_rewards = []
            # for entry in replay_buffer_rewards:
            #     new_replay_buffer_rewards.append(np.log(entry*-1)*-1) #because the reward is negative here
            # sample_probability = (np.exp(new_replay_buffer_rewards))/np.sum(np.exp(new_replay_buffer_rewards)) #apply softmax to the total_reward list
            sampled_off_line_data = []
            for sample_counter in range(0, C.batch_size):
                #sampled_index = np.random.choice(np.arange(0, len(replay_buffer)), p=sample_probability.tolist())
                sampled_index = random.randint(0, len(replay_buffer) - 1)
                sampled_off_line_data.append(replay_buffer[sampled_index])

            #update model

            #gradient of the value function
            value_function_gradient, grads_for_fisher_matrix = utils.compute_value_gradient(
                sampled_off_line_data, theta, cov_matrix, inv_cov_matrix)

            #fisher matrix
            fisher_matrix = utils.compute_fisher_matrix(
                grads_for_fisher_matrix)

            #step size
            step_size = utils.compute_eta(delta, fisher_matrix,
                                          value_function_gradient)

            #update theta
            theta += step_size * np.linalg.inv(
                fisher_matrix) @ value_function_gradient

            #save the learned parameter theta
            learned_parameter_theta = {}
            learned_parameter_theta['learned_parameter_theta'] = theta
            cwd = os.getcwd()
            #cwd = os.path.join(cwd, 'data_folder')
            parameter_file = 'learned_parameter_theta.json'
            cwd = os.path.join(cwd, parameter_file)
            with open(cwd, 'w') as statusFile:
                statusFile.write(jsonpickle.encode(learned_parameter_theta))

    return theta
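Example 5 precomputes inv_cov_matrix "for computing log grad of action distribution", which suggests a linear-Gaussian policy with a fixed diagonal covariance. The gradient itself is computed inside helpers that are not shown; a minimal sketch under that assumption (the function name and shapes are hypothetical) is:

import numpy as np

def gaussian_log_prob_grad_sketch(theta, features, action, inv_cov_matrix):
    """Score function of a linear-Gaussian policy pi(a|s) = N(theta^T phi(s), Sigma).

    theta:          (feature_dim, action_dim) policy parameters
    features:       (feature_dim, 1) extracted state features phi(s)
    action:         (action_dim, 1) sampled action
    inv_cov_matrix: (action_dim, action_dim) fixed Sigma^{-1}
    """
    mean = theta.T @ features                      # (action_dim, 1) policy mean
    residual = action - mean                       # (action_dim, 1)
    return features @ residual.T @ inv_cov_matrix  # gradient, shaped like theta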