def action_policy(eval_scenes=None, useParam=True):
    if eval_scenes is None:
        eval_scenes = scene_set.keys()
    headers = [
        "ALG. Rew_based", "Tot rewards", "SD", "Elapsed T",
        "HyperParam1", "HyperParam2", "HyperParam3"
    ]
    print("Action Policy Learning...")
    init_mean = 3.0
    test_rewards_set = {}
    for scene in eval_scenes:
        v = scene_set[scene]
        alg_num = 0
        f_rew = open(os.path.join(args.log_dir, "total_rewards_range.txt"), "a")
        f_rew.write("\n" + scene + "\n")
        print("Domain:" + scene)
        result = {'reward': [], 'count': [], 'Q': [], 'tab': []}
        result2 = {'reward': [], 'count': [], 'Q': [], 'tab': []}

        # Q-learning (fixed learning rate), epsilon-greedy exploration
        print(labels_act[alg_num])
        t_start = time.time()
        models = {}
        tmp = {}
        for alpha in alphas:
            for es in epsilons:
                peq = [tq.Qlearning(scene, alpha, discount, initQ=init_mean, TH=v[1])
                       for i in range(Nrun)]
                [peq[i].env.set_slip(args.slip) for i in range(Nrun)]
                [peq[i].learning('egreedy', es, eval_greedy=True, rate_decay=True)
                 for i in range(Nrun)]
                models[(alpha, es)] = peq
                tmp[(alpha, es)] = [x.test_rewards for x in peq]
        test_rewards_set[labels_act[alg_num]] = tmp
        elapsed_t = round((time.time() - t_start) / len(models), 2)
        action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
        action_policy_helper(scene, models, result2, labels_act[alg_num],
                             elapsed_t, reward_based=False)
        print(result['tab'][-1])
        alg_num += 1

        # Q-learning (fixed learning rate), Boltzmann (softmax) exploration
        print(labels_act[alg_num])
        t_start = time.time()
        models = {}
        tmp = {}
        for alpha in alphas:
            for tau in boltz_temp:
                peq = [tq.Qlearning(scene, alpha, discount, initQ=init_mean, TH=v[1])
                       for i in range(Nrun)]
                [peq[i].env.set_slip(args.slip) for i in range(Nrun)]
                [peq[i].learning('softmax', tau, eval_greedy=True, rate_decay=True)
                 for i in range(Nrun)]
                models[(alpha, tau)] = peq
                tmp[(alpha, tau)] = [x.test_rewards for x in peq]
        test_rewards_set[labels_act[alg_num]] = tmp
        elapsed_t = round((time.time() - t_start) / len(models), 2)
        action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
        action_policy_helper(scene, models, result2, labels_act[alg_num],
                             elapsed_t, reward_based=False)
        print(result['tab'][-1])
        alg_num += 1

        # ADFQ - Egreedy
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            tmp = {}
            if args.slip == 0.0:
                for es in epsilons:
                    for var in variances:
                        adfq = [brl.adfq(scene, discount, init_mean=init_mean,
                                         init_var=var, TH=v[1]) for i in range(Nrun)]
                        [adfq[i].env.set_slip(args.slip) for i in range(Nrun)]
                        [adfq[i].learning(updatePolicy=policy, actionPolicy='egreedy',
                                          actionParam=es, eval_greedy=True)
                         for i in range(Nrun)]
                        models[(es, var)] = adfq
                        tmp[(es, var)] = [x.test_rewards for x in adfq]
            else:
                for noise in noises:
                    for batch_size in batch_sizes:
                        for es in epsilons:
                            adfq = [brl.adfq(scene, discount, init_mean=init_mean,
                                             TH=v[1]) for i in range(Nrun)]
                            [adfq[i].env.set_slip(args.slip) for i in range(Nrun)]
                            [adfq[i].learning(updatePolicy=policy,
                                              actionPolicy='egreedy',
                                              actionParam=es, eval_greedy=True,
                                              noise=noise, batch_size=batch_size)
                             for i in range(Nrun)]
                            models[(noise, batch_size, es)] = adfq
                            tmp[(noise, batch_size, es)] = [x.test_rewards
                                                            for x in adfq]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])
            alg_num += 1
        # ADFQ - Thompson Sampling
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            tmp = {}
            if args.slip == 0.0:
                for var in variances:
                    adfq = [brl.adfq(scene, discount, init_mean=init_mean,
                                     init_var=var, TH=v[1]) for i in range(Nrun)]
                    [adfq[i].env.set_slip(args.slip) for i in range(Nrun)]
                    [adfq[i].learning(updatePolicy=policy, actionPolicy='ts',
                                      actionParam=None, eval_greedy=True)
                     for i in range(Nrun)]
                    models[(var, )] = adfq
                    tmp[(var, )] = [x.test_rewards for x in adfq]
            else:
                for noise in noises:
                    for batch_size in batch_sizes:
                        adfq = [brl.adfq(scene, discount, init_mean=init_mean,
                                         TH=v[1]) for i in range(Nrun)]
                        [adfq[i].env.set_slip(args.slip) for i in range(Nrun)]
                        [adfq[i].learning(updatePolicy=policy, actionPolicy='ts',
                                          actionParam=None, eval_greedy=True,
                                          noise=noise, batch_size=batch_size)
                         for i in range(Nrun)]
                        models[(noise, batch_size)] = adfq
                        tmp[(noise, batch_size)] = [x.test_rewards for x in adfq]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])
            pickle.dump(test_rewards_set,
                        open(os.path.join(args.log_dir, "set_rewards.pkl"), "wb"))
            alg_num += 1

        if False:  # not(scene == 'maze'): KTD-Q runs are currently disabled.
            # KTD-Q with Egreedy
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            tmp = {}
            for kappa in [1.0, 0.5 * v[2], v[2]]:
                for es in epsilons:
                    ktd = [brl.ktd_Q(scene, discount, init_mean=init_mean, TH=v[1])
                           for i in range(Nrun)]
                    [ktd[i].env.set_slip(args.slip) for i in range(Nrun)]
                    [ktd[i].learning(kappa=kappa, actionPolicy="egreedy",
                                     actionParam=es, eval_greedy=True)
                     for i in range(Nrun)]
                    models[(kappa, es)] = ktd
                    tmp[(kappa, es)] = [x.test_rewards for x in ktd]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

            # KTD-Q with Active Learning
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            tmp = {}  # reset here as well so results from the egreedy block do not leak in
            for kappa in [1.0, 0.5 * v[2], v[2]]:
                ktd = [brl.ktd_Q(scene, discount, init_mean=init_mean, TH=v[1])
                       for i in range(Nrun)]
                [ktd[i].env.set_slip(args.slip) for i in range(Nrun)]
                [ktd[i].learning(kappa=kappa, actionPolicy="active",
                                 actionParam=None, eval_greedy=True)
                 for i in range(Nrun)]
                models[(kappa, )] = ktd
                tmp[(kappa, )] = [x.test_rewards for x in ktd]
            test_rewards_set[labels_act[alg_num]] = tmp
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])

        f_rew.write(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.write(tabulate(result2['tab'],
                             headers=["ALG. Eval_based"] + headers[1:],
                             tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.close()
        pickle.dump(test_rewards_set,
                    open(os.path.join(args.log_dir, "set_rewards.pkl"), "wb"))
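# Usage sketch (an assumption, not part of the original script): action_policy
# relies on module-level globals defined elsewhere in this file -- scene_set,
# labels_act, update_policies, alphas, epsilons, boltz_temp, variances, noises,
# batch_sizes, discount, Nrun, and the parsed args -- so a typical invocation is
# simply:
#
#   action_policy(eval_scenes=['loop'])  # sweep a single domain
#   action_policy()                      # sweep every domain in scene_set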
def off_policy(eval_scenes=None):
    if eval_scenes is None:
        eval_scenes = scene_set.keys()
    headers = [
        "ALG. Q_err based", "RMSE mean", "Elapsed T",
        "HyperParam1", "HyperParam2", "HyperParam3"
    ]
    print("Off-Policy Learning...")
    for scene in eval_scenes:
        f_rmse = open(os.path.join(args.log_dir, "rmse.txt"), "a")
        v = scene_set[scene]
        alg_num = 0
        result = {'Q_err': [], 'tab': []}
        print("Domain:%s with slip %.2f" % (scene, args.slip))
        actionSet = [np.random.choice(v[0], v[1]) for i in range(Nrun)]
        if MDPs.model_assign(scene).episodic:
            init_means = np.ones((Nrun, ))
        else:
            # Non-episodic: initialize at the value upper bound 1 / (1 - gamma)
            init_means = 1.0 / (1 - discount) * np.ones((Nrun, ))

        # Q-learning (fixed learning rate)
        print(labels_off[alg_num])
        t_start = time.time()
        models = {}
        for alpha in alphas:
            algs = [tq.Qlearning(scene, alpha, discount, initQ=init_means[i])
                    for i in range(Nrun)]
            [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
            [algs[i].learning('offline', actionSet[i], rate_decay=True)
             for i in range(Nrun)]
            models[(alpha, )] = algs
        off_policy_helper(scene, models, result, labels_off[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1
        print(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))

        # ADFQ
        useScale = False
        for policy in update_policies:
            print(labels_off[alg_num])
            t_start = time.time()
            models = {}
            if args.slip == 0.0:
                algs = [brl.adfq(scene, discount, init_mean=init_means[i])
                        for i in range(Nrun)]
                [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
                [algs[i].learning(updatePolicy=policy, actionPolicy='offline',
                                  actionParam=actionSet[i], useScale=useScale)
                 for i in range(Nrun)]
                models[(-1, )] = algs
            else:
                for batch_size in batch_sizes:
                    for noise in noises:
                        algs = [brl.adfq(scene, discount, init_mean=init_means[i])
                                for i in range(Nrun)]
                        [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
                        [algs[i].learning(updatePolicy=policy,
                                          actionPolicy='offline',
                                          actionParam=actionSet[i],
                                          batch_size=batch_size, noise=noise,
                                          useScale=useScale)
                         for i in range(Nrun)]
                        models[(batch_size, noise)] = algs
            off_policy_helper(scene, models, result, labels_off[alg_num],
                              round((time.time() - t_start) / len(models), 2))
            print(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
            alg_num += 1
        np.save(os.path.join(args.log_dir, "err_set_" + scene), result['Q_err'])

        # KTD-Q (skipped for the maze domain)
        if not (scene == 'maze'):
            print(labels_off[alg_num])
            t_start = time.time()
            models = {}
            for kappa in [1.0, 0.5 * v[2], v[2]]:
                print("kappa %.2f" % kappa)
                algs = [brl.ktd_Q(scene, discount, init_mean=init_means[i])
                        for i in range(Nrun)]
                [algs[i].env.set_slip(args.slip) for i in range(Nrun)]
                [algs[i].learning(kappa=kappa, actionPolicy="offline",
                                  actionParam=actionSet[i]) for i in range(Nrun)]
                models[(kappa, )] = algs
            off_policy_helper(scene, models, result, labels_off[alg_num],
                              round((time.time() - t_start) / len(models), 2))
            alg_num += 1

        print(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        np.save(os.path.join(args.log_dir, "err_set_" + scene), result['Q_err'])
        f_rmse.write(scene + "\n")
        f_rmse.write(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        f_rmse.write('\n')
        f_rmse.close()
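# A hedged post-processing sketch: the "err_set_<scene>.npy" files written by
# off_policy above can be reloaded for inspection. The exact shape of
# result['Q_err'] is not shown in this file, so the axis chosen below is an
# assumption.
#
#   import matplotlib.pyplot as plt
#   err = np.load(os.path.join(args.log_dir, "err_set_loop.npy"))
#   plt.plot(np.asarray(err).mean(axis=0))  # assumes rows index runs/configurations
#   plt.xlabel("learning step"); plt.ylabel("Q RMSE"); plt.show()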
def action_policy(eval_scenes=None, useParam=True):
    if eval_scenes is None:
        eval_scenes = scene_set.keys()
    headers = [
        "ALG. Rew_based", "Tot rewards", "SD", "Elapsed T",
        "HyperParam1", "HyperParam2"
    ]
    print("Action Policy Learning...")
    for scene in eval_scenes:
        v = scene_set[scene]
        alg_num = 0
        f_rew = open(os.path.join(args.result_dir, "total_rewards_range.txt"), "a")
        f_rew.write("\n" + scene + "\n")
        print("Domain:" + scene)
        result = {'reward': [], 'count': [], 'Q': [], 'tab': []}
        result2 = {'reward': [], 'count': [], 'Q': [], 'tab': []}

        # Q-learning_fixed, egreedy -- disabled below; alg_num still advances
        # so the labels_act indexing stays aligned.
        # print(labels_act[alg_num])
        # t_start = time.time()
        # models = {}
        # for alpha in alphas:
        #     for es in epsilons:
        #         peq = [tq.Qlearning(scene, alpha, discount, 0.0, init_policy=True,
        #                             TH=v[1]) for i in range(Nrun)]
        #         [peq[i].learning('egreedy', es, eval_greedy=True, rate_decay=False)
        #          for i in range(Nrun)]
        #         models[(alpha, es)] = peq
        # elapsed_t = round((time.time() - t_start) / len(models), 2)
        # action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
        # action_policy_helper(scene, models, result2, labels_act[alg_num],
        #                      elapsed_t, reward_based=False)
        # print(result['tab'][-1])
        alg_num += 1

        # Q-learning_fixed, boltzmann -- likewise disabled.
        # print(labels_act[alg_num])
        # t_start = time.time()
        # models = {}
        # for alpha in alphas:
        #     for tau in boltz_temp:
        #         peq = [tq.Qlearning(scene, alpha, discount, 0.0, init_policy=True,
        #                             TH=v[1]) for i in range(Nrun)]
        #         [peq[i].learning('softmax', tau, eval_greedy=True, rate_decay=False)
        #          for i in range(Nrun)]
        #         models[(alpha, tau)] = peq
        # elapsed_t = round((time.time() - t_start) / len(models), 2)
        # action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
        # action_policy_helper(scene, models, result2, labels_act[alg_num],
        #                      elapsed_t, reward_based=False)
        # print(result['tab'][-1])
        alg_num += 1

        # ADFQ - Egreedy
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                for es in epsilons:
                    adfq = [brl.adfq(scene, discount, 0.0, var, init_policy=True,
                                     TH=v[1]) for i in range(Nrun)]
                    [adfq[i].learning(updatePolicy=policy, actionPolicy='egreedy',
                                      actionParam=es, eval_greedy=True,
                                      updateParam=0.01) for i in range(Nrun)]
                    models[(var, es)] = adfq
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

        # ADFQ - Eg+Bayesian (semi-Bayes)
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                for es in epsilons:
                    adfq = [brl.adfq(scene, discount, 0.0, var, init_policy=True,
                                     TH=v[1]) for i in range(Nrun)]
                    [adfq[i].learning(updatePolicy=policy, actionPolicy='semi-Bayes',
                                      actionParam=es, eval_greedy=True,
                                      updateParam=0.01) for i in range(Nrun)]
                    models[(var, es)] = adfq
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

        # ADFQ - Bayesian
        for policy in update_policies:
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                adfq = [brl.adfq(scene, discount, 0.0, var, init_policy=True,
                                 TH=v[1]) for i in range(Nrun)]
                [adfq[i].learning(updatePolicy=policy, actionPolicy='Bayes',
                                  actionParam=None, eval_greedy=True,
                                  updateParam=0.01) for i in range(Nrun)]
                models[(var, )] = adfq
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

        if False:  # not(scene == 'maze'): KTD-Q runs are currently disabled.
            # KTD-Q with Egreedy
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in [1.0, 10.0]:
                for es in [0.05, 0.1, 0.15]:
                    ktd = [brl.ktd_Q(scene, discount, 0.0, var, init_policy=True,
                                     TH=v[1]) for i in range(Nrun)]
                    [ktd[i].learning(1, actionPolicy="egreedy", actionParam=es,
                                     eval_greedy=True) for i in range(Nrun)]
                    models[(var, es)] = ktd
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])
            alg_num += 1

            # KTD-Q with Active Learning
            print(labels_act[alg_num])
            t_start = time.time()
            models = {}
            for var in [1.0, 10.0]:
                ktd = [brl.ktd_Q(scene, discount, 0.0, var, init_policy=True,
                                 TH=v[1]) for i in range(Nrun)]
                [ktd[i].learning(1, actionPolicy="active", actionParam=None,
                                 eval_greedy=True) for i in range(Nrun)]
                models[(var, )] = ktd
            elapsed_t = round((time.time() - t_start) / len(models), 2)
            action_policy_helper(scene, models, result, labels_act[alg_num], elapsed_t)
            action_policy_helper(scene, models, result2, labels_act[alg_num],
                                 elapsed_t, reward_based=False)
            print(result['tab'][-1])

        f_rew.write(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.write(tabulate(result2['tab'],
                             headers=["ALG. Eval_based"] + headers[1:],
                             tablefmt='orgtbl'))
        f_rew.write('\n')
        f_rew.close()
def off_policy(eval_scenes=None):
    if eval_scenes is None:
        eval_scenes = scene_set.keys()
    print("Off-Policy Learning...")
    for scene in eval_scenes:
        f_rmse = open(os.path.join(args.result_dir, "rmse.txt"), "a")
        v = scene_set[scene]
        alg_num = 0
        result = {'Q_err': [], 'tab': []}
        print("Domain:" + scene)
        actionSet = [np.random.choice(v[0], v[1]) for i in range(Nrun)]
        if scene == 'loop':
            # Non-episodic: initialize at the value upper bound 1 / (1 - gamma)
            init_means = 1.0 / (1 - discount) * np.ones((Nrun, ))
        else:
            init_means = np.ones((Nrun, ))

        # Q-learning (fixed learning rate)
        print(labels[alg_num])
        t_start = time.time()
        models = {}
        for alpha in alphas:
            peq = [tq.Qlearning(scene, alpha, discount, init_means[i],
                                init_policy=False) for i in range(Nrun)]
            [peq[i].learning('offline', actionSet[i], rate_decay=False)
             for i in range(Nrun)]
            models[(alpha, )] = peq
        off_policy_helper(scene, models, result, labels[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1

        # ADFQ - Numeric
        print(labels[alg_num])
        t_start = time.time()
        models = {}
        for var in variances:
            adfq = [brl.adfq(scene, discount, init_means[i], var,
                             init_policy=False) for i in range(Nrun)]
            [adfq[i].learning(updatePolicy='Numeric', actionPolicy='offline',
                              actionParam=actionSet[i]) for i in range(Nrun)]
            models[(var, )] = adfq
        off_policy_helper(scene, models, result, labels[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1

        # ADFQ - Approx
        print(labels[alg_num])
        t_start = time.time()
        models = {}
        for var in variances:
            adfq = [brl.adfq(scene, discount, init_means[i], var,
                             init_policy=False) for i in range(Nrun)]
            [adfq[i].learning(updatePolicy='Approx', actionPolicy='offline',
                              actionParam=actionSet[i]) for i in range(Nrun)]
            models[(var, )] = adfq
        off_policy_helper(scene, models, result, labels[alg_num],
                          round((time.time() - t_start) / len(models), 2))
        alg_num += 1

        # KTD-Q (skipped for the maze domain)
        if not (scene == 'maze'):
            print(labels[alg_num])
            t_start = time.time()
            models = {}
            for var in variances:
                for kappa in [1.0, 0.5 * v[2], v[2]]:
                    ktd = [brl.ktd_Q(scene, discount, init_means[i], var,
                                     init_policy=False) for i in range(Nrun)]
                    [ktd[i].learning(kappa, actionPolicy="offline",
                                     actionParam=actionSet[i]) for i in range(Nrun)]
                    models[(var, kappa)] = ktd
            off_policy_helper(scene, models, result, labels[alg_num],
                              round((time.time() - t_start) / len(models), 2))
            alg_num += 1

        np.save(os.path.join(args.result_dir, "err_set_" + scene), result['Q_err'])
        f_rmse.write(scene + "\n")
        # Note: `labels` and `headers` are assumed to be module-level globals here.
        f_rmse.write(tabulate(result['tab'], headers=headers, tablefmt='orgtbl'))
        f_rmse.write('\n')
        f_rmse.close()
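# For reference, a minimal sketch of the RMSE metric this function tabulates,
# assuming a learned Q-table and the domain's true Q-values are both available
# as arrays (the argument names below are illustrative, not from this repo):
#
#   def q_rmse(Q_learned, Q_true):
#       return np.sqrt(np.mean((np.asarray(Q_learned) - np.asarray(Q_true)) ** 2))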
import brl
import numpy as np
import tabularRL

env_name = 'minimaze'
TH = 15000
action_random = np.random.choice(4, TH)
print("Testing the algorithm in the Mini-Maze environment...")

# ADFQ on the deterministic Mini-Maze (slip probability 0.0).
x = brl.adfq(env_name, 0.95, init_mean=0.001, TH=TH)
x.env.set_slip(0.0)
x.learning('offline', action_random, eval_greedy=True)
avg_rew = np.mean(x.test_rewards[-10:])
if avg_rew == 3.0:
    print("ADFQ Deterministic: Reached an optimal policy... Passed the test!")
else:
    print(avg_rew)
    raise ValueError("ADFQ Deterministic was unable to reach an optimal policy")

# ADFQ on the stochastic Mini-Maze (default slip) with a small update noise.
y = brl.adfq(env_name, 0.95, init_mean=0.001, TH=TH)
y.learning('offline', action_random, noise=0.001, eval_greedy=True)
avg_rew = np.mean(y.test_rewards[-10:])
if avg_rew > 2.0:
    print("ADFQ Stochastic: Reached an optimal policy... Passed the test!")
else:
    print(avg_rew)
    raise ValueError("ADFQ Stochastic was unable to reach an optimal policy")

# Baseline Q-learning run on the deterministic Mini-Maze.
q = tabularRL.Qlearning(env_name, 0.5, 0.95, initQ=0.001, TH=TH)
q.env.set_slip(0.0)
q.learning('offline', action_random, eval_greedy=True)
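# The Q-learning baseline above is never asserted on; a hedged follow-up check
# mirroring the ADFQ assertions. It assumes Qlearning also records test_rewards
# when eval_greedy=True, and the 3.0 optimal-return threshold is copied from
# the deterministic ADFQ test.
avg_rew = np.mean(q.test_rewards[-10:])
if avg_rew == 3.0:
    print("Q-learning: Reached an optimal policy... Passed the test!")
else:
    print(avg_rew)
    raise ValueError("Q-learning was unable to reach an optimal policy")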
f1, ax1 = plt.subplots()
f2, ax2 = plt.subplots()
f3, ax3 = plt.subplots()
f4, ax4 = plt.subplots()

"""
# Numeric
x_num = brl.adfq(scene, discount, init_mean, 100.0, init_policy=init_policy)
x_num.obj.set_time(T)
x_num.obj.set_slip(slip_p)
x_num.learning('Numeric', 'offline', actions, eval_greedy=True,
               useScale=useScale, batch_size=batch_size, noise=args.noise)
models[scene].append(x_num)
"""

# SoftApprox
x_softapp = brl.adfq(scene, discount, init_mean, 100.0, init_policy=init_policy)
x_softapp.obj.set_time(T)
x_softapp.obj.set_slip(slip_p)
x_softapp.learning('SoftApprox', 'offline', actions, eval_greedy=True,
                   useScale=useScale, batch_size=batch_size, noise=args.noise)
models[scene].append(x_softapp)

if batch_size == 0:
    # SoftApprox with Asymptotic
    x_softapp_asym = brl.adfq(scene,