                single_sample.append([s[0], prev_action])
            else:
                index = np.argmin(
                    [abs(act - prev_action) for act in abs_opt_policy[mcrst]])
                single_sample.append([s[0], abs_opt_policy[mcrst][index]])
        fictitious_samples.append(single_sample)
    return fictitious_samples


for i in range(0, N_ITERATION):
    determin_samples = sampling_from_det_pol(env, N_EPISODES, N_STEPS,
                                             det_param_a, det_param_b)
    abstraction.divide_samples(determin_samples, problem)
    abstraction.compute_abstract_tf(optA, ENV_NOISE)
    abs_opt_pol = abs_updater.solve_mdp(abstraction.get_container())
    fictitious_samples = sampling_abstract_optimal_pol(
        abs_opt_pol, determin_samples, det_param_a, det_param_b)
    det_param_a = det_upd.batch_gradient_update_a(det_param_a, det_param_b,
                                                  fictitious_samples)
    det_param_b = det_upd.batch_gradient_update_b(det_param_a, det_param_b,
                                                  fictitious_samples)
    estj = helper.estimate_J_from_samples(determin_samples, GAMMA)
    print("Updated deterministic policy parameter A: {}".format(det_param_a))
    print("Updated deterministic policy parameter B: {}".format(det_param_b))
    print("Updated estimated performance measure: {}\n".format(estj))
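
# Hedged sketch (not in the original listing): one plausible shape for the
# sampling_from_det_pol call used above, assuming the deterministic linear
# policy a(s) = det_param_a * s + det_param_b. The *_sketch name, the env API
# usage, and the [state, action, reward, next_state] layout are assumptions.
def sampling_from_det_pol_sketch(env, n_episodes, n_steps,
                                 det_param_a, det_param_b):
    samples_list = []
    for _ in range(n_episodes):
        state = env.reset()
        single_sample = []
        for _ in range(n_steps):
            # deterministic linear policy (assumed parametrization)
            action = det_param_a * state[0] + det_param_b
            new_state, reward, done, _ = env.step(action)
            single_sample.append([state[0], action, reward, new_state[0]])
            state = new_state
            if done:
                break
        samples_list.append(single_sample)
    return samples_list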
            else:
                index = np.argmin(
                    [abs(act - prev_action) for act in abs_opt_policy[mcrst]])
                single_sample.append([s[0], abs_opt_policy[mcrst][index]])
        fictitious_samples.append(single_sample)
    return fictitious_samples


for i in range(0, N_ITERATION):
    deterministic_samples = sampling_from_det_pol(env, N_EPISODES, N_STEPS,
                                                  det_param)
    abstraction.divide_samples(deterministic_samples, problem)
    abstraction.compute_abstract_tf()
    abstract_optimal_policy = abs_updater.solve_mdp(abstraction.get_container())
    fictitious_samples = sampling_abstract_optimal_pol(
        abstract_optimal_policy, deterministic_samples, det_param)
    det_param = det_upd.batch_gradient_update(det_param, fictitious_samples)
    # j = env.computeJ(det_param, ENV_NOISE, N_EPISODES)
    absj = helper.estimate_J_from_samples(deterministic_samples, GAMMA)
    print("Updated deterministic policy parameter: {}".format(det_param))
    print("Updated estimated performance measure: {}\n".format(absj))
    visualizer.show_values(det_param, absj, absj)

visualizer.save_image()
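
# Hedged sketch (not in the original listing) of the interval lookup implied by
# the `mcrst` index inside sampling_abstract_optimal_pol: a state belongs to
# the macrostate whose [low, high) interval contains it. get_mcrst_sketch is a
# hypothetical stand-in for the helper this codebase actually uses.
def get_mcrst_sketch(state, intervals, sink):
    for idx, (low, high) in enumerate(intervals):
        if low <= state < high:
            return idx
    # out-of-range states map to an extra sink macrostate, if one is enabled
    # (assumption about how SINK is handled)
    return len(intervals) if sink else None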
def main(seed=None):
    help = Helper(seed)

    # load and configure the environment.
    env = gym.make('LQG1D-v0')
    env.sigma_noise = ENV_NOISE
    env.A = np.array([A]).reshape((1, 1))
    env.B = np.array([B]).reshape((1, 1))
    env.gamma = GAMMA
    env.seed(help.getSeed())

    INTERVALS = helper.get_constant_intervals(MIN_SPACE_VAL, MAX_SPACE_VAL,
                                              N_MCRST_DYN)
    print("INTERVALS: {}\n{}\n".format(N_MCRST_DYN, INTERVALS))

    # calculate the optimal values of the problem.
    opt_par4vis = round(env.computeOptimalK()[0][0], 3)
    det_param = INIT_DETERMINISTIC_PARAM
    optJ4vis = round(env.computeJ(env.computeOptimalK(), 0, N_EPISODES), 3)
    # logging.basicConfig(level=logging.DEBUG, filename='../test.log', filemode='w', format='%(message)s')

    filename = "../csv/lqg1d/DPO/data{}.csv".format(help.getSeed())
    data_file = open(filename, mode='w')
    file_writer = csv.writer(data_file, delimiter=',', quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)

    # instantiate the components of the algorithm.
    # abstraction = LqgFKnown(A, B, GAMMA, SINK, INTERVALS)
    abstraction = LipschitzDeltaS(GAMMA, SINK, INTERVALS, A, B) if not STOCH else \
        MaxLikelihoodAbstraction(GAMMA, SINK, INTERVALS, B * STOCH_L_MULTIPLIER)
    if not STOCH:
        abs_updater = AbsUpdater(GAMMA, SINK, INTERVALS) if ds0 else \
            IVI(GAMMA, SINK, True, INTERVALS)
    else:
        abs_updater = AbsUpdater(GAMMA, SINK, INTERVALS)
    det_upd = Updater(help.getSeed(), UPD_LAM)

    title = "A={}, B={}, Opt par={}, Opt J={}, Noise std dev={}".format(
        A, B, opt_par4vis, optJ4vis, ENV_NOISE)
    key = "{}_{}_{}_{}_{}".format(A, B, ENV_NOISE, det_param, help.getSeed())
    key = key.replace('.', ',')
    key = key + ".jpg"
    initJ = env.computeJ(det_param, 0, N_EPISODES)
    visualizer = Lqg1dVisualizer(title, key, det_param, opt_par4vis, initJ,
                                 optJ4vis)
    visualizer.clean_panels()

    # PLOTTER INFO
    stats = {}
    stats['param'] = []
    stats['j'] = []
    stats['sampleJ'] = []
    stats['abstractJ'] = []
    stats['param'].append(det_param)
    stats['j'].append(initJ)
    # ------------

    writer_min = SummaryWriter('runs/min')
    writer_max = SummaryWriter('runs/max')
    writer_opt = SummaryWriter('runs/opt')

    for i in range(0, N_ITERATION):
        determin_samples = sampling_from_det_pol(env, N_EPISODES, N_STEPS,
                                                 det_param)
        # dyn_intervals = helper.build_mcrst_from_samples(determin_samples, N_MCRST_DYN, MIN_SPACE_VAL, MAX_SPACE_VAL)
        dyn_intervals = None
        abstraction.divide_samples(determin_samples, problem, help.getSeed(),
                                   intervals=dyn_intervals)
        abstraction.compute_abstract_tf(ds0, ENV_NOISE)
        abs_opt_pol = abs_updater.solve_mdp(abstraction.get_container(),
                                            intervals=dyn_intervals)

        # tensorboard
        for mcrst, ap in enumerate(abs_opt_pol):
            if len(ap) > 1:
                ap = ap[0]
            writer_opt.add_scalar('mcrst{}'.format(mcrst), ap, i)
        for mcrst, cont in enumerate(abstraction.get_container()):
            writer_min.add_scalar('mcrst{}'.format(mcrst), min(cont.keys()), i)
            writer_max.add_scalar('mcrst{}'.format(mcrst), max(cont.keys()), i)

        # ---- performance abstract policy ---
        first_states_ep = [d[0][0] for d in determin_samples]
        absJ = estimate_performance_abstract_policy(env, N_EPISODES, N_STEPS,
                                                    abs_opt_pol,
                                                    first_states_ep,
                                                    dyn_intervals, INTERVALS)
        # ------------------------------------

        fictitious_samples = sampling_abstract_optimal_pol(
            abs_opt_pol, determin_samples, det_param, dyn_intervals, INTERVALS)
        det_param = det_upd.batch_gradient_update(det_param, fictitious_samples)

        j = env.computeJ(det_param, 0, N_EPISODES)
        estj = helper.estimate_J_from_samples(determin_samples, GAMMA)
        print("{} - Updated deterministic policy parameter: {}".format(
            i, det_param))
        print("Updated performance measure: {}".format(j))
        print("Updated estimated performance measure: {}\n".format(estj))
        visualizer.show_values(det_param, j, estj, absJ)
        file_writer.writerow([det_param, j, estj, absJ])

        # PLOTTER INFO
        stats['param'].append(det_param)
        stats['j'].append(j)
        stats['sampleJ'].append(estj)
        stats['abstractJ'].append(absJ)
        # ------------

    visualizer.save_image()
    writer_min.close()
    writer_max.close()
    writer_opt.close()
    data_file.close()

    return stats, opt_par4vis, optJ4vis
def main(seed=None, alpha=0.001, lam=0.0005):
    help = Helper(seed)

    # load and configure the environment.
    env = gym.make('ComplexMiniGolf-v0')
    env.sigma_noise = ENV_NOISE
    env.gamma = GAMMA
    env.seed(help.getSeed())
    # logging.basicConfig(level=logging.DEBUG, filename='../../test.log', filemode='w', format='%(message)s')

    cumulative_fail = 0
    cumulative_j = 0

    filename = "../csv/minigolf/friction1.9/DPO/ALPHA={}/LAM={}/data{}.csv".format(
        alpha, lam, help.getSeed())
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    data_file = open(filename, mode='w')
    file_writer = csv.writer(data_file, delimiter=',', quotechar='"',
                             quoting=csv.QUOTE_MINIMAL)

    rbf = RBFNet(CENTERS, STD_DEV, INIT_W, help.getSeed(), alpha, lam)
    # rbf = RBFNet([3, 6, 10, 14, 17], [0.1, 0.3, 0.5, 0.7, 1])
    # rbf = RBFNet([3, 6, 10, 14, 17], [0.49, 0.63, 0.79, 0.95, 1.33], help.getSeed())

    visualizer = MGVisualizer(
        "MG visualizer", "minigolf/DPO/ALPHA={}/LAM={}/test{}.png".format(
            alpha, lam, help.getSeed()))
    visualizer.clean_panels()

    # PLOTTER INFO
    stats = {}
    stats['w1'] = []
    stats['w2'] = []
    stats['w3'] = []
    stats['w4'] = []
    stats['j'] = []
    stats['fail'] = []
    # ------------

    for i in range(0, N_ITERATION):
        determin_samples = sampling_from_det_pol(env, N_EPISODES, N_STEPS, rbf)
        INTERVALS = helper.get_constant_intervals([MIN_SPACE_VAL],
                                                  [MAX_SPACE_VAL],
                                                  [N_MCRST_DYN])[0]
        # dyn_intervals = helper.build_mcrst_from_samples(determin_samples, N_MCRST_DYN, MIN_SPACE_VAL, MAX_SPACE_VAL)
        INTERVALS = [[-4, 0]] + INTERVALS
        dyn_intervals = None

        if i == 0:
            abstraction = LipschitzDeltaS(GAMMA, SINK, INTERVALS) if ds0 else \
                LipschitzDeltaS(GAMMA, SINK, INTERVALS, 1.3, 0.9)
            # abstraction = MaxLikelihoodAbstraction(GAMMA, SINK, INTERVALS, 5.5)
            abs_updater = AbsUpdater(GAMMA, SINK, INTERVALS, -100) if ds0 and LDELTAS == 0 else \
                IVI(GAMMA, SINK, True, INTERVALS)
            # abs_updater = AbsUpdater(GAMMA, SINK, INTERVALS, 0)

        abstraction.divide_samples(determin_samples, problem, help.getSeed(),
                                   intervals=dyn_intervals)
        abstraction.compute_abstract_tf(ds0, LDELTAS)
        abs_opt_pol = abs_updater.solve_mdp(abstraction.get_container(),
                                            intervals=dyn_intervals)

        fictitious_samples = sampling_abstract_optimal_pol(
            abs_opt_pol, determin_samples, rbf, INTERVALS)
        fictitious_samples = helper.flat_listoflists(fictitious_samples)
        X = [f[0] for f in fictitious_samples]
        y = [f[1] for f in fictitious_samples]
        # X = np.reshape([f[0] for f in fictitious_samples], (len(fictitious_samples),))
        # y = np.reshape([f[1] for f in fictitious_samples], (len(fictitious_samples),))
        rbf.fit(X, y)

        estj = helper.estimate_J_from_samples(determin_samples, GAMMA)
        cumulative_j += estj
        print("Iteration n.{}".format(i))
        print("W: {}".format(rbf.w))
        print("Updated estimated performance measure: {}".format(estj))
        zeros, hundred, failing_states = helper.minigolf_reward_counter(
            determin_samples)
        print("Number of zeroes: {} - Number of big penalties: {}".format(
            zeros, hundred))
        print("Failing states: {}".format(failing_states))
        cumulative_fail += hundred
        print("Cumulative fails: {}\n".format(cumulative_fail))

        # actions = [m.keys() for m in abstraction.get_container()]
        # action_range = [max(a) - min(a) if len(a) > 0 else 0 for a in actions]
        # intervals = dyn_intervals if dyn_intervals is not None else INTERVALS
        # [print("Mcrst = {}, diameter = {}, action range = {}".format(dyn, dyn[1] - dyn[0], ran)) for dyn, ran in
        #  zip(intervals, action_range)]
        # print("\n")

        w = rbf.w
        visualizer.show_values(w, estj, cumulative_fail)
        file_writer.writerow([w[0], w[1], w[2], w[3], cumulative_fail, estj])

        # --- APPENDIX E ---
        if i in (0, 99, 199, 299, 399, 499):
            filename2 = "../csv/minigolf/appendix/ALPHA={}/LAM={}/it{}/data{}.csv".format(
                alpha, lam, i, help.getSeed())
            os.makedirs(os.path.dirname(filename2), exist_ok=True)
            data_file2 = open(filename2, mode='w')
            file_writer2 = csv.writer(data_file2, delimiter=',', quotechar='"',
                                      quoting=csv.QUOTE_MINIMAL)
            file_writer2.writerow([
                'mcrst', 'min_a', 'max_a', 'min_opt_a', 'max_opt_a',
                'w1', 'w2', 'w3', 'w4'
            ])
            for j in range(1, len(abstraction.get_container()) - 1):
                actions = abstraction.get_container()[j].keys()
                file_writer2.writerow([
                    j, min(actions), max(actions),
                    min(abs_opt_pol[j]), max(abs_opt_pol[j]),
                    w[0], w[1], w[2], w[3]
                ])
            data_file2.close()
        # ------------------

        # PLOTTER INFO
        # if i % 10 == 0:
        stats['w1'].append(w[0])
        stats['w2'].append(w[1])
        stats['w3'].append(w[2])
        stats['w4'].append(w[3])
        stats['j'].append(estj)
        stats['fail'].append(cumulative_fail)
        # ------------

    visualizer.save_image()
    data_file.close()
    return stats, cumulative_j
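
# Hedged usage sketch (not in the original listing): sweeping the RBF fit's
# learning rate and regularization. The grid values below are illustrative
# assumptions, not the settings reported by the authors.
if __name__ == '__main__':
    for a in [0.001, 0.005]:        # hypothetical alpha grid
        for l in [0.0005, 0.001]:   # hypothetical lambda grid
            stats, cumulative_j = main(seed=0, alpha=a, lam=l)
            print("alpha={}, lam={}, cumulative J={}".format(a, l,
                                                             cumulative_j))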