Example 1
def action_policy(eval_scenes=None, useParam=True):
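    """Hyperparameter sweep over action-selection policies.

    For each scene in eval_scenes, trains Q-learning (epsilon-greedy and
    Boltzmann), ADFQ (epsilon-greedy and Thompson sampling for each update
    policy) and, when enabled, KTD-Q, with Nrun independent runs per
    hyperparameter setting. Total rewards are tabulated to
    total_rewards_range.txt and per-run test rewards are pickled to
    set_rewards.pkl under args.log_dir. Relies on module-level globals
    (args, scene_set, labels_act, alphas, epsilons, boltz_temp, variances,
    noises, batch_sizes, update_policies, discount, Nrun) and the tq/brl
    modules.
    """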
    if eval_scenes is None:
        eval_scenes = scene_set.keys()
    headers = [
        "ALG. Rew_based", "Tot rewards", "SD", "Elapsed T", "HyperParam1",
        "HyperParam2", "HyperParam3"
    ]
    print("Aciton Policy Learning...")
    init_mean = 3.0
    test_rewards_set = {}
    for scene in eval_scenes:
        v = scene_set[scene]
        alg_num = 0
        f_rew = open(os.path.join(args.log_dir, "total_rewards_range.txt"),
                     "a")
        f_rew.write("\n" + scene + "\n")
        print("Domain:" + scene)
        #f_rew.close()
        result = {'reward': [], 'count': [], 'Q': [], 'tab': []}
        result2 = {'reward': [], 'count': [], 'Q': [], 'tab': []}

        # Q-learning_fixed, egreedy
        print(labels_act[alg_num])
        t_start = time.time()
        models = {}
        tmp = {}
        for alpha in alphas:
            for es in epsilons:
                peq = [
                    tq.Qlearning(scene,
                                 alpha,
                                 discount,
                                 initQ=init_mean,
                                 TH=v[1]) for i in range(Nrun)
                ]
                [peq[i].env.set_slip(args.slip) for i in range(Nrun)]
                [
                    peq[i].learning('egreedy',
                                    es,
                                    eval_greedy=True,
                                    rate_decay=True) for i in range(Nrun)
                ]
                models[(alpha, es)] = peq
                tmp[(alpha, es)] = [x.test_rewards for x in peq]
        test_rewards_set[labels_act[alg_num]] = tmp
        elapsed_t = round((time.time() - t_start) / len(models), 2)
        action_policy_helper(scene, models, result, labels_act[alg_num],
                             elapsed_t)
        action_policy_helper(scene,
                             models,
                             result2,
                             labels_act[alg_num],
                             elapsed_t,
                             reward_based=False)
        print(result['tab'][-1])
        alg_num += 1

        # Q-learning_fixed, boltzmann
        print(labels_act[alg_num])
        t_start = time.time()
        models = {}
        tmp = {}
        for alpha in alphas:
            for tau in boltz_temp:
                peq = [
                    tq.Qlearning(scene,
                                 alpha,
                                 discount,
                                 initQ=init_mean,
                                 TH=v[1]) for i in range(Nrun)
                ]
                [peq[i].env.set_slip(args.slip) for i in range(Nrun)]
                [
                    peq[i].learning('softmax',
                                    tau,
                                    eval_greedy=True,
                                    rate_decay=True) for i in range(Nrun)
                ]
                models[(alpha, tau)] = peq
                tmp[(alpha, tau)] = [x.test_rewards for x in peq]
        test_rewards_set[labels_act[alg_num]] = tmp
        elapsed_t = round((time.time() - t_start) / len(models), 2)
        action_policy_helper(scene, models, result, labels_act[alg_num],
                             elapsed_t)
        action_policy_helper(scene,
                             models,
                             result2,
                             labels_act[alg_num],
                             elapsed_t,
                             reward_based=False)
        print(result['tab'][-1])
        alg_num += 1

        #model_set = {}
        # ADFQs - Egreedy
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            tmp = {}
            if args.slip == 0.0:
                for es in epsilons:
                    for var in variances:
                        adfq = [
                            brl.adfq(scene,
                                     discount,
                                     init_mean=init_mean,
                                     init_var=var,
                                     TH=v[1]) for i in range(Nrun)
                        ]
                        [adfq[i].env.set_slip(args.slip) for i in range(Nrun)]
                        [
                            adfq[i].learning(updatePolicy=policy,
                                             actionPolicy='egreedy',
                                             actionParam=es,
                                             eval_greedy=True)
                            for i in range(Nrun)
                        ]
                        models[(es, var)] = adfq
                        tmp[(es, var)] = [x.test_rewards for x in adfq]
            else:
                for noise in noises:
                    for batch_size in batch_sizes:
                        for es in epsilons:
                            adfq = [
                                brl.adfq(scene,
                                         discount,
                                         init_mean=init_mean,
                                         TH=v[1]) for i in range(Nrun)
                            ]
                            [
                                adfq[i].env.set_slip(args.slip)
                                for i in range(Nrun)
                            ]
                            [
                                adfq[i].learning(updatePolicy=policy,
                                                 actionPolicy='egreedy',
                                                 actionParam=es,
                                                 eval_greedy=True,
                                                 noise=noise,
                                                 batch_size=batch_size)
                                for i in range(Nrun)
                            ]
                            models[(noise, batch_size, es)] = adfq
                            tmp[(noise, batch_size,
                                 es)] = [x.test_rewards for x in adfq]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])
            #model_set[(policy, 'egreedy')] = models
            alg_num += 1

        # ADFQs - Thompson_Sampling
        for policy in update_policies:

            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            tmp = {}
            if args.slip == 0.0:
                for var in variances:
                    adfq = [
                        brl.adfq(scene,
                                 discount,
                                 init_mean=init_mean,
                                 init_var=var,
                                 TH=v[1]) for i in range(Nrun)
                    ]
                    [adfq[i].env.set_slip(args.slip) for i in range(Nrun)]
                    [
                        adfq[i].learning(updatePolicy=policy,
                                         actionPolicy='ts',
                                         actionParam=None,
                                         eval_greedy=True) for i in range(Nrun)
                    ]
                    models[(var, )] = adfq
                    tmp[(var, )] = [x.test_rewards for x in adfq]
            else:
                for noise in noises:
                    for batch_size in batch_sizes:
                        adfq = [
                            brl.adfq(scene,
                                     discount,
                                     init_mean=init_mean,
                                     TH=v[1]) for i in range(Nrun)
                        ]
                        [adfq[i].env.set_slip(args.slip) for i in range(Nrun)]
                        [
                            adfq[i].learning(updatePolicy=policy,
                                             actionPolicy='ts',
                                             actionParam=None,
                                             eval_greedy=True,
                                             noise=noise,
                                             batch_size=batch_size)
                            for i in range(Nrun)
                        ]
                        models[(noise, batch_size)] = adfq
                        tmp[(noise,
                             batch_size)] = [x.test_rewards for x in adfq]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])
            pickle.dump(
                test_rewards_set,
                open(os.path.join(args.log_dir, "set_rewards.pkl"), "wb"))
            alg_num += 1

        if False:  # disabled; originally: not (scene == 'maze')
            # KTD-Q with Egreedy
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            tmp = {}
            for kappa in [1.0, 0.5 * v[2], v[2]]:
                for es in epsilons:
                    ktd = [
                        brl.ktd_Q(scene,
                                  discount,
                                  init_mean=init_mean,
                                  TH=v[1]) for i in range(Nrun)
                    ]
                    [ktd[i].env.set_slip(args.slip) for i in range(Nrun)]
                    [
                        ktd[i].learning(kappa=kappa,
                                        actionPolicy="egreedy",
                                        actionParam=es,
                                        eval_greedy=True) for i in range(Nrun)
                    ]  #bp[scene][labels[4]]['kappa']
                    models[(kappa, es)] = ktd
                    tmp[(kappa, es)] = [x.test_rewards for x in ktd]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

            # KTD-Q with Active Learning
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for kappa in [1.0, 0.5 * v[2], v[2]]:
                ktd = [
                    brl.ktd_Q(scene, discount, init_mean=init_mean, TH=v[1])
                    for i in range(Nrun)
                ]
                [ktd[i].env.set_slip(args.slip) for i in range(Nrun)]
                [
                    ktd[i].learning(kappa=kappa,
                                    actionPolicy="active",
                                    actionParam=None,
                                    eval_greedy=True) for i in range(Nrun)
                ]
                models[(kappa, )] = ktd
                tmp[(kappa, )] = [x.test_rewards for x in ktd]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])

        f_rew.write(tabulate(result['tab'], headers=headers,
                             tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.write(
            tabulate(result2['tab'],
                     headers=["ALG. Eval_based"] + headers[1:],
                     tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.close()
        pickle.dump(test_rewards_set,
                    open(os.path.join(args.log_dir, "set_rewards.pkl"), "wb"))
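For reference, a minimal sketch of the module-level configuration this sweep assumes. Only the variable names come from the code above; every value below is an illustrative placeholder, not taken from the original repository:

import argparse

# Placeholder experiment settings (all values are assumptions).
args = argparse.Namespace(log_dir="./results", slip=0.0)
discount = 0.95
Nrun = 10
alphas = [0.1, 0.3, 0.5]
epsilons = [0.05, 0.1, 0.2]
boltz_temp = [0.1, 1.0]
variances = [1.0, 10.0, 100.0]
noises = [0.0, 0.001]
batch_sizes = [0, 32]
update_policies = ['adfq']  # placeholder update-rule name
# One label per algorithm block, in the order the sweep runs them (assumed).
labels_act = (['Q-learning(egreedy)', 'Q-learning(boltzmann)'] +
              ['ADFQ-%s(egreedy)' % p for p in update_policies] +
              ['ADFQ-%s(ts)' % p for p in update_policies] +
              ['KTD-Q(egreedy)', 'KTD-Q(active)'])
# scene_set maps a scene name to per-domain constants; here v[1] is the
# learning horizon TH and v[2] scales KTD-Q's kappa (assumed meaning).
scene_set = {'loop': (2, 5000, 9)}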
Example 2
def off_policy(eval_scenes=None):
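    """Off-policy learning sweep on a fixed random action sequence.

    For each scene, trains Q-learning, ADFQ (one block per update policy)
    and KTD-Q offline on the same random action sequence, with Nrun
    independent runs per hyperparameter setting. Q-value RMSE results are
    tabulated to rmse.txt and the Q-error traces are saved as
    err_set_<scene>.npy under args.log_dir. Relies on module-level globals
    (args, scene_set, labels_off, alphas, noises, batch_sizes,
    update_policies, discount, Nrun) and the tq/brl/MDPs modules.
    """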
    if eval_scenes is None:
        eval_scenes = scene_set.keys()

    headers = [
        "ALG. Q_err based", "RMSE mean ", "Elapsed T", "HyperParam1",
        "HyperParam2", "HyperParam3"
    ]
    print("Off-Policy Learning...")
    for scene in eval_scenes:
        f_rmse = open(os.path.join(args.log_dir, "rmse.txt"), "a")
        v = scene_set[scene]
        alg_num = 0
        result = {'Q_err': [], 'tab': []}
        print("Domain:%s with slip %.2f" % (scene, args.slip))
        actionSet = [np.random.choice(v[0], v[1]) for i in range(Nrun)]
        if MDPs.model_assign(scene).episodic:
            init_means = np.ones((Nrun, ))
        else:  # Non-episodic: initialize Q-values at 1/(1-gamma)
            init_means = 1.0 / (1 - discount) * np.ones((Nrun, ))

        # Q-learning (fixed learning rate)
        print(labels_off[alg_num])
        t_start = time.time()
        models = {}
        for alpha in alphas:
            algs = [
                tq.Qlearning(scene, alpha, discount, initQ=init_means[i])
                for i in range(Nrun)
            ]
            [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
            [
                algs[i].learning('offline', actionSet[i], rate_decay=True)
                for i in range(Nrun)
            ]
            models[(alpha, )] = algs
        off_policy_helper(scene, models, result, labels_off[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1
        print(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))

        # ADFQ
        useScale = False
        for policy in update_policies:
            print(labels_off[alg_num])
            t_start = time.time()
            models = {}
            if args.slip == 0.0:
                algs = [
                    brl.adfq(scene, discount, init_mean=init_means[i])
                    for i in range(Nrun)
                ]
                [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
                [
                    algs[i].learning(updatePolicy=policy,
                                     actionPolicy='offline',
                                     actionParam=actionSet[i],
                                     useScale=useScale) for i in range(Nrun)
                ]
                models[(-1, )] = algs
            else:
                for batch_size in batch_sizes:
                    for noise in noises:
                        algs = [
                            brl.adfq(scene, discount, init_mean=init_means[i])
                            for i in range(Nrun)
                        ]
                        [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
                        [
                            algs[i].learning(updatePolicy=policy,
                                             actionPolicy='offline',
                                             actionParam=actionSet[i],
                                             batch_size=batch_size,
                                             noise=noise,
                                             useScale=useScale)
                            for i in range(Nrun)
                        ]
                        models[(batch_size, noise)] = algs
            off_policy_helper(scene, models, result, labels_off[alg_num],
                              round((time.time() - t_start) / len(models), 2))
            print(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
            alg_num += 1
        np.save(os.path.join(args.log_dir, "err_set_" + scene),
                result['Q_err'])
        # KTD-Q
        if not (scene == 'maze'):
            print(labels_off[alg_num])
            t_start = time.time()
            models = {}
            for kappa in [1.0, 0.5 * v[2], v[2]]:
                print("kappa %.2f" % kappa)
                algs = [
                    brl.ktd_Q(scene, discount, init_mean=init_means[i])
                    for i in range(Nrun)
                ]
                [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
                [
                    algs[i].learning(kappa=kappa,
                                     actionPolicy="offline",
                                     actionParam=actionSet[i])
                    for i in range(Nrun)
                ]
                models[(kappa, )] = algs
            off_policy_helper(scene, models, result, labels_off[alg_num],
                              round((time.time() - t_start) / len(models), 2))
            alg_num += 1
        print(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        np.save(os.path.join(args.log_dir, "err_set_" + scene),
                result['Q_err'])
        f_rmse.write(scene + "\n")
        f_rmse.write(
            tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        f_rmse.write('\n')
Example 3
def action_policy(eval_scenes=None, useParam=True):
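    """Earlier variant of the action-policy sweep.

    The Q-learning blocks are commented out; the sweep covers ADFQ with
    'egreedy', 'semi-Bayes' and 'Bayes' action selection for each update
    policy and, when enabled, KTD-Q. Total rewards are tabulated to
    total_rewards_range.txt under args.result_dir. Relies on module-level
    globals (args, scene_set, labels_act, variances, epsilons,
    update_policies, discount, Nrun) and the tq/brl modules.
    """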
    if eval_scenes is None:
        eval_scenes = scene_set.keys()
    headers = [
        "ALG. Rew_based", "Tot rewards", "SD", "Elapsed T", "HyperParam1",
        "HyperParam2"
    ]
    print("Aciton Policy Learning...")
    for scene in eval_scenes:
        v = scene_set[scene]
        alg_num = 0
        f_rew = open(os.path.join(args.result_dir, "total_rewards_range.txt"),
                     "a")
        f_rew.write("\n" + scene + "\n")
        print("Domain:" + scene)
        #f_rew.close()
        result = {'reward': [], 'count': [], 'Q': [], 'tab': []}
        result2 = {'reward': [], 'count': [], 'Q': [], 'tab': []}

        # # Q-learning_fixed, egreedy
        # print(labels_act[alg_num])
        # t_start = time.time()
        # models = {}
        # for alpha in alphas:
        # 	for es in epsilons:
        # 		peq = [tq.Qlearning(scene, alpha, discount, 0.0, init_policy=True, TH=v[1]) for i in range(Nrun)]
        # 		[peq[i].learning('egreedy',es, eval_greedy =True, rate_decay=False) for i in range(Nrun)]
        # 		models[(alpha,es)] = peq
        # elapsed_t = round( (time.time()-t_start)/len(models) ,2)
        # action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
        # action_policy_helper(scene, models, result2, labels_act[alg_num], elapsed_t, reward_based=False)
        # print(result['tab'][-1])
        alg_num += 1

        # # Q-learning_fixed, boltzmann
        # print(labels_act[alg_num])
        # t_start = time.time()
        # models = {}
        # for alpha in alphas:
        # 	for tau in boltz_temp:
        # 		peq = [tq.Qlearning(scene, alpha, discount, 0.0, init_policy=True, TH=v[1]) for i in range(Nrun)]
        # 		[peq[i].learning('softmax',tau,eval_greedy =True, rate_decay=False) for i in range(Nrun)]
        # 		models[(alpha,tau,)] = peq
        # elapsed_t = round( (time.time()-t_start)/len(models) ,2)
        # action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
        # action_policy_helper(scene, models, result2, labels_act[alg_num], elapsed_t, reward_based=False)
        # print(result['tab'][-1])
        alg_num += 1

        # ADFQs - Egreedy
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                for es in epsilons:
                    adfq = [
                        brl.adfq(scene,
                                 discount,
                                 0.0,
                                 var,
                                 init_policy=True,
                                 TH=v[1]) for i in range(Nrun)
                    ]
                    [
                        adfq[i].learning(updatePolicy=policy,
                                         actionPolicy='egreedy',
                                         actionParam=es,
                                         eval_greedy=True,
                                         updateParam=0.01) for i in range(Nrun)
                    ]
                    models[(var, es)] = adfq
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

        # ADFQs - Eg+Bayesian
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                for es in epsilons:
                    adfq = [
                        brl.adfq(scene,
                                 discount,
                                 0.0,
                                 var,
                                 init_policy=True,
                                 TH=v[1]) for i in range(Nrun)
                    ]
                    [
                        adfq[i].learning(updatePolicy=policy,
                                         actionPolicy='semi-Bayes',
                                         actionParam=es,
                                         eval_greedy=True,
                                         updateParam=0.01) for i in range(Nrun)
                    ]
                    models[(var, es)] = adfq
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

        # ADFQs - Bayesian
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                adfq = [
                    brl.adfq(scene,
                             discount,
                             0.0,
                             var,
                             init_policy=True,
                             TH=v[1]) for i in range(Nrun)
                ]
                [
                    adfq[i].learning(updatePolicy=policy,
                                     actionPolicy='Bayes',
                                     actionParam=None,
                                     eval_greedy=True,
                                     updateParam=0.01) for i in range(Nrun)
                ]
                models[(var, )] = adfq
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

        if False:  # disabled; originally: not (scene == 'maze')
            # KTD-Q with Egreedy
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in [1.0, 10.0]:
                for es in [0.05, 0.1, 0.15]:
                    ktd = [
                        brl.ktd_Q(scene,
                                  discount,
                                  0.0,
                                  var,
                                  init_policy=True,
                                  TH=v[1]) for i in range(Nrun)
                    ]
                    [
                        ktd[i].learning(1,
                                        actionPolicy="egreedy",
                                        actionParam=es,
                                        eval_greedy=True) for i in range(Nrun)
                    ]  #bp[scene][labels[4]]['kappa']
                    models[(var, es)] = ktd
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

            # KTD-Q with Active Learning
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in [1.0, 10.0]:
                ktd = [
                    brl.ktd_Q(scene,
                              discount,
                              0.0,
                              var,
                              init_policy=True,
                              TH=v[1]) for i in range(Nrun)
                ]
                [
                    ktd[i].learning(1,
                                    actionPolicy="active",
                                    actionParam=None,
                                    eval_greedy=True) for i in range(Nrun)
                ]
                models[(var, )] = ktd
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num],
                                 elapsed_t)
            action_policy_helper(scene,
                                 models,
                                 result2,
                                 labels_act[alg_num],
                                 elapsed_t,
                                 reward_based=False)
            print(result['tab'][-1])

        f_rew.write(tabulate(result['tab'], headers=headers,
                             tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.write(
            tabulate(result2['tab'],
                     headers=["ALG. Eval_based"] + headers[1:],
                     tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.close()
Example 4
def off_policy(eval_scenes=None):
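    """Earlier variant of the off-policy sweep.

    Trains Q-learning, ADFQ (with 'Numeric' and 'Approx' updates) and
    KTD-Q offline on a fixed random action sequence per scene, and logs
    Q-value RMSE to rmse.txt and err_set_<scene>.npy under args.result_dir.
    Relies on module-level globals (args, scene_set, labels, alphas,
    variances, discount, Nrun) and the tq/brl modules.
    """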
    if eval_scenes is None:
        eval_scenes = scene_set.keys()

    print("Off-Policy Learning...")
    for scene in eval_scenes:
        f_rmse = open(os.path.join(args.result_dir, "rmse.txt"), "a")
        v = scene_set[scene]
        alg_num = 0
        result = {'Q_err': [], 'tab': []}
        print("Domain:" + scene)
        actionSet = [np.random.choice(v[0], v[1]) for i in range(Nrun)]
        if scene == 'loop':  # Non-episodic
            init_means = 1.0 / (1 - discount) * np.ones((Nrun, ))
        else:
            init_means = np.ones((Nrun, ))

        # Q-learning (fixed learning rate)
        print(labels[alg_num])
        t_start = time.time()
        models = {}
        for alpha in alphas:
            peq = [
                tq.Qlearning(scene,
                             alpha,
                             discount,
                             init_means[i],
                             init_policy=False) for i in range(Nrun)
            ]
            [
                peq[i].learning('offline', actionSet[i], rate_decay=False)
                for i in range(Nrun)
            ]
            models[(alpha, )] = peq
        off_policy_helper(scene, models, result, labels[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1

        # ADFQ - Numeric
        print(labels[alg_num])
        t_start = time.time()
        models = {}
        for var in variances:
            adfq = [
                brl.adfq(scene,
                         discount,
                         init_means[i],
                         var,
                         init_policy=False) for i in range(Nrun)
            ]
            [
                adfq[i].learning(updatePolicy='Numeric',
                                 actionPolicy='offline',
                                 actionParam=actionSet[i]) for i in range(Nrun)
            ]
            models[(var, )] = adfq
        off_policy_helper(scene, models, result, labels[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1

        # ADFQ - Approx
        print(labels[alg_num])
        t_start = time.time()
        models = {}
        for var in variances:
            adfq = [
                brl.adfq(scene,
                         discount,
                         init_means[i],
                         var,
                         init_policy=False) for i in range(Nrun)
            ]
            [
                adfq[i].learning(updatePolicy='Approx',
                                 actionPolicy='offline',
                                 actionParam=actionSet[i]) for i in range(Nrun)
            ]
            models[(var, )] = adfq
        off_policy_helper(scene, models, result, labels[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1

        # KTD-Q
        if not (scene == 'maze'):
            print(labels[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                for kappa in [1.0, 0.5 * v[2], v[2]]:
                    ktd = [
                        brl.ktd_Q(scene,
                                  discount,
                                  init_means[i],
                                  var,
                                  init_policy=False) for i in range(Nrun)
                    ]
                    [
                        ktd[i].learning(kappa,
                                        actionPolicy="offline",
                                        actionParam=actionSet[i])
                        for i in range(Nrun)
                    ]
                    models[(var, kappa)] = ktd
            off_policy_helper(scene, models, result, labels[alg_num],
                              round((time.time() - t_start) / len(models), 2))
            alg_num += 1

        np.save(os.path.join(args.result_dir, "err_set_" + scene),
                result['Q_err'])
        f_rmse.write(scene + "\n")
        f_rmse.write(
            tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        f_rmse.write('\n')
Example 5
import brl
import numpy as np
import tabularRL
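# Smoke test for the tabular implementations: run ADFQ (deterministic and
# noisy variants) and Q-learning offline on a fixed random action sequence
# in the 'minimaze' domain, and check that ADFQ's greedy policy reaches a
# (near-)optimal return.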

env_name = 'minimaze'
TH = 15000
action_random = np.random.choice(4, TH)
print("Testing the algorithm in the Mini-Maze environment...")

x = brl.adfq(env_name, 0.95, init_mean=0.001, TH=TH)
x.env.set_slip(0.0)
x.learning('offline', action_random, eval_greedy=True)
avg_rew = np.mean(x.test_rewards[-10:])
if avg_rew == 3.0:
    print("ADFQ Deterministic: Reached an optimal policy... Passed the test!")
else:
    print(avg_rew)
    raise ValueError("Failed to reach an optimal policy")

y = brl.adfq(env_name, 0.95, init_mean=0.001, TH=TH)
y.learning('offline', action_random, noise=0.001, eval_greedy=True)
avg_rew = np.mean(y.test_rewards[-10:])
if avg_rew > 2.0:
    print("ADFQ Stochastic: Reached an optimal policy... Passed the test!")
else:
    print(avg_rew)
    raise ValueError("Failed to reach an optimal policy")

q = tabularRL.Qlearning(env_name, 0.5, 0.95, initQ=0.001, TH=TH)
q.env.set_slip(0.0)
q.learning('offline', action_random, eval_greedy=True)
Example 6
    f1, ax1 = plt.subplots()
    f2, ax2 = plt.subplots()
    f3, ax3 = plt.subplots()
    f4, ax4 = plt.subplots()
    """
	# Numeric
	x_num = brl.adfq(scene, discount, init_mean, 100.0, init_policy=init_policy)
	x_num.obj.set_time(T)
	x_num.obj.set_slip(slip_p)
	x_num.learning('Numeric', 'offline', actions, eval_greedy = True, useScale=useScale, batch_size=batch_size, noise = args.noise)
	models[scene].append(x_num)
	"""
    # SoftApprox
    x_softapp = brl.adfq(scene,
                         discount,
                         init_mean,
                         100.0,
                         init_policy=init_policy)
    x_softapp.obj.set_time(T)
    x_softapp.obj.set_slip(slip_p)
    x_softapp.learning('SoftApprox',
                       'offline',
                       actions,
                       eval_greedy=True,
                       useScale=useScale,
                       batch_size=batch_size,
                       noise=args.noise)
    models[scene].append(x_softapp)
    if batch_size == 0:
        # SoftApprox with Asymptotic
        x_softapp_asym = brl.adfq(scene,