Example #1
import time

import numpy as np
import progressbar
from numpy import mean, std

# TfEnv, ed_dec_rollout, ed_simpy_dec_rollout, variable_discount_cumsum, and
# test_policy are project-specific helpers assumed to be importable from the
# surrounding codebase.


def path_discounted_returns(env,
                            gamma,
                            num_traj,
                            policy=None,
                            simpy=False,
                            printing=False):
    # Build the default policy lazily; a `policy=test_policy()` default would
    # be evaluated only once, at function-definition time.
    if policy is None:
        policy = test_policy()
    # print('Env is of type ', type(env))
    # print('Policy is of type ', type(policy))
    if printing: print('Simulating %d rollouts...' % num_traj)
    start_time = time.time()

    # Wrap plain environments in the TF environment wrapper if needed.
    if not isinstance(env, TfEnv):
        env = TfEnv(env)

    paths = []
    rollout_times = []

    if printing:
        bar = progressbar.ProgressBar()
        iterator = bar(range(num_traj))
    else:
        iterator = range(num_traj)
    for i in iterator:
        start_time_r = time.time()
        # Collect one decentralized rollout (SimPy-based or standard) and
        # record how long it took.
        if simpy:
            paths.append(ed_simpy_dec_rollout(env, policy))
        else:
            paths.append(ed_dec_rollout(env, policy))
        elapsed_r = time.time() - start_time_r
        rollout_times.append(elapsed_r)

    # Each rollout returns a list of per-agent paths; flatten into one list.
    paths = [item for sublist in paths for item in sublist]

    adr = []

    for path in paths:
        # Continuous-time (SMDP) discounting: each step carries its own
        # factor exp(-gamma * sojourn_time) instead of a fixed gamma.
        t_sojourn = path["offset_t_sojourn"]
        discount_gamma = np.exp(-gamma * t_sojourn)
        path_adr = variable_discount_cumsum(path["rewards"], discount_gamma)
        # The first entry of the reverse cumulative sum is the discounted
        # return of the whole path from t = 0.
        discounted_return = path_adr[0]
        adr.append(discounted_return)

    elapsed = time.time() - start_time
    if printing:
        print('Time Elapsed %.2f, or %.7f +- %.7f per rollout' %
              (elapsed, mean(rollout_times),
               std(rollout_times) / np.sqrt(num_traj)))

    # Mean discounted return, its standard error, and the per-path values.
    return mean(adr), std(adr) / np.sqrt(num_traj), adr
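
Both examples rely on variable_discount_cumsum, which is not shown here. From
how it is used (path_adr[0] is treated as the discounted return of the whole
path), it appears to compute a discounted cumulative sum with a per-step
discount vector. Below is a minimal sketch under that assumption; the name
variable_discount_cumsum_sketch and the exact indexing convention (whether
discounts[t] covers the sojourn into step t or out of it) are guesses, not
the project's actual implementation:

import numpy as np

def variable_discount_cumsum_sketch(rewards, discounts):
    # returns[t] = rewards[t] + discounts[t] * returns[t + 1], computed in
    # reverse, so returns[0] is the discounted return of the full path.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discounts[t] * running
        returns[t] = running
    return returns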
Example #2
        )

        print(obj)

        # Unpack the saved environment and learned policy.
        env = obj['env']
        policy = obj['policy']

        agents = policy
        print('Learned Policy')

        average_discounted_rewards = []

        # With GAMMA = 0 every discount factor is exp(0) = 1, i.e. no
        # discounting; the commented value decays rewards to 90% in 5 seconds.
        GAMMA = 0.  # math.log(0.9) / (-5.)  # decay to 90% in 5 seconds

        for i in range(20):
            paths = ed_dec_rollout(env, agents)
            for path in paths:
                # Per-step SMDP discount factors from the sojourn times.
                t_sojourn = path["offset_t_sojourn"]
                discount_gamma = np.exp(-GAMMA * t_sojourn)
                path["returns"] = variable_discount_cumsum(
                    path["rewards"], discount_gamma)
                # With GAMMA = 0 the discounted return equals the plain
                # reward sum, so the undiscounted sum is recorded here.
                average_discounted_rewards.append(sum(path["rewards"]))

            if i % 10 == 0:
                print('Iteration: ', i)

        # Number of collected paths, then mean and spread of their returns.
        print(len(average_discounted_rewards))
        print(np.mean(average_discounted_rewards),
              np.std(average_discounted_rewards))

        from fire_smdp_params import test_policy_smarter
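
The commented-out decay constant in Example #2 can be sanity-checked directly:
with GAMMA = math.log(0.9) / (-5.), the continuous-time discount
exp(-GAMMA * t) falls to exactly 0.9 at t = 5 seconds. A quick standalone
check of that arithmetic:

import math

GAMMA = math.log(0.9) / (-5.)  # ~0.02107 per second
print(math.exp(-GAMMA * 5.0))  # 0.9 -> rewards decay to 90% after 5 seconds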