def NetworkBase_timeStep(self, time, supportDepressionImpact,
                         concealDiscriminateImpact, discriminateConcealImpact,
                         discriminateDepressionImpact, concealDepressionImpact,
                         support=None, conceal=None, discrimination=None,
                         attitude=None, depression=None, policyScore=None,
                         bias=0):
    """Advance the network by one time step.

    First a policy is either enforced (when ``policyScore`` or ``bias`` is
    given and ``time`` falls on the enforcement gap) or proposed via
    ``Policy_considerPolicy``; then the network's policy score is refreshed
    and every agent in ``self.Agents`` is updated.

    The impact arguments and the optional per-attribute values are forwarded
    unchanged to ``Agent_updateAgent``.
    """
    # Numerical bias codes: 1 = only non-discriminatory, 2 = only discriminatory
    ONLY_NON_DISCRIMINATORY = 1

    # "Natural gap" between passing of enforced policies
    TIME_GAP = 5

    # True when the type of policy is externally enforced rather than
    # proposed at random by the simulation.
    externally_enforced = (policyScore or bias) and time % TIME_GAP == 0

    if not externally_enforced:
        proposed = Policy(time)
        proposed.Policy_considerPolicy(self, time, self.policyCap)
    else:
        # Constructed exactly as before although the instance is not used
        # afterwards -- presumably harmless, but the constructor may have
        # side effects, so it is kept (TODO confirm and remove).
        newPolicy = Policy(time, score=policyScore, biasPass=bias)

        # Convert the numerical bias to a boolean: anything other than
        # "only non-discriminatory" counts as discriminatory-only.
        onlyDisc = bias != ONLY_NON_DISCRIMINATORY
        self.NetworkBase_enforcePolicy(time, score=policyScore,
                                       onlyDisc=onlyDisc)

    self.NetworkBase_updatePolicyScore(time)

    for key in self.Agents:
        self.Agents[key].Agent_updateAgent(
            time, supportDepressionImpact, concealDiscriminateImpact,
            discriminateConcealImpact, discriminateDepressionImpact,
            concealDepressionImpact, support, conceal, discrimination,
            attitude, depression)
def get_soft_policy_from_qf_dict(
    qf_dict: SAf,
    softmax: bool,
    epsilon: float
) -> Policy:
    """Build a soft (stochastic) policy from a state -> action-value mapping.

    With ``softmax`` set, each state's action values are turned into
    probabilities by ``find_softmax_action_probs``; otherwise
    ``find_epsilon_action_probs`` is used with the given ``epsilon``.
    """
    if softmax:
        return Policy({
            state: find_softmax_action_probs(action_values)
            for state, action_values in qf_dict.items()
        })
    return Policy({
        state: find_epsilon_action_probs(action_values, epsilon)
        for state, action_values in qf_dict.items()
    })
def NetworkBase_enforcePolicy(self, time, score=None, onlyDisc=False):
    """Create an externally enforced policy and add it to the network.

    Args:
        time: Simulation time at which the policy is enforced.
        score: Explicit policy score. When falsy (``None`` or ``0``) a
            biased policy is generated instead.
        onlyDisc: Used only when no score is given; selects the bias code
            passed to ``Policy`` (``False`` -> 1, non-discriminatory only;
            ``True`` -> 2, discriminatory only).

    Returns:
        None. Nothing is added when an explicit ``score`` would push
        ``self.policyScore`` above ``self.policyCap``.
    """
    # The cap can only be checked when an explicit score is known. The
    # previous unconditional ``self.policyScore + score`` raised TypeError
    # for the default ``score=None`` (the bias-only path).
    if score is not None and self.policyScore + score > self.policyCap:
        return

    if score:
        enforcedPolicy = Policy(time=time, score=score)
    else:
        # Maps the boolean to the integer bias codes described above
        biasType = int(onlyDisc) + 1
        enforcedPolicy = Policy(time=time, biasPass=biasType)

    self.NetworkBase_addToPolicies(enforcedPolicy, time)
def one_step_lookahead(self, V, pe=0):
    """Return the policy that is greedy with respect to the value table ``V``.

    For every state the expected value of each action in ``action_space`` is
    the sum over adjacent states of transition probability times
    ``V[dir][x][y]``; the first action with the strictly largest value wins.
    ``pe`` is forwarded to ``get_trans_prob``.
    """
    greedy_actions = [[[None for _y in range(self.length)]
                       for _x in range(self.width)]
                      for _d in range(self.num_dirs)]

    for state in self.env.states:
        neighbours = self.env.getAdjStates(state)
        top_value = float("-inf")
        top_action = None

        for move, rotate in action_space:
            candidate = Action(move, rotate)
            expected = 0
            for successor in neighbours:
                s_x, s_y, s_dir = successor.getState()
                prob = self.get_trans_prob(pe, state, candidate, successor)
                expected += prob * V[s_dir][s_x][s_y]
            if expected > top_value:
                top_value, top_action = expected, candidate

        x, y, heading = state.getState()
        greedy_actions[heading][x][y] = top_action

    return Policy(greedy_actions)
def main():
    """Train the policy on CartPole-v1 for 10000 episodes.

    Each step samples an action from the categorical distribution produced
    by the policy, stores ``(reward, probability of the chosen action)`` and
    the policy is trained after every episode. The average score is printed
    every 20 episodes.
    """
    env = gym.make('CartPole-v1')
    pi = Policy(LEARNING_RATE, GAMMA)
    running_score = 0.0
    report_every = 20

    for episode in range(10000):
        observation = env.reset()
        finished = False
        while not finished:
            action_probs = pi(torch.from_numpy(observation).float())
            action = Categorical(action_probs).sample()
            next_observation, reward, finished, _info = env.step(
                action.item())
            pi.put_data((reward, action_probs[action]))
            observation = next_observation
            running_score += reward

        pi.train_net()

        if episode != 0 and episode % report_every == 0:
            print("# of episode :{}, avg score: {}".format(
                episode, running_score / report_every))
            running_score = 0.0

    env.close()
def __init__(self, eps, lr, gamma, batch_size, tau, max_memory, lambda_1,
             lambda_2, lambda_3, n_steps, l_margin):
    """Store the hyper-parameters and build the replay memory, policy and
    optimizer.

    ``lr`` and ``lambda_3`` configure the Adam optimizer over
    ``self.policy.predictNet`` (``lambda_3`` is also its weight decay).
    """
    # --- exploration / return hyper-parameters ---
    self.eps = eps  # eps-greedy
    self.gamma = gamma  # discount factor
    self.batch_size = batch_size
    self.tau = tau  # frequency of target replacement
    self.n_steps = n_steps
    self.l_margin = l_margin

    # --- demonstration bonuses ---
    # TODO: neither of these two is used
    self.ed = 0.005  # bonus for demonstration
    self.ea = 0.001

    # --- loss weights ---
    self.lambda1 = lambda_1  # n-step return
    self.lambda2 = lambda_2  # supervised loss
    self.lambda3 = lambda_3  # L2

    # target replacement counter
    # TODO: rename to iter_counter
    self.counter = 0

    # --- learning components ---
    self.replay = Memory(capacity=max_memory)
    self.loss = nn.MSELoss()
    # TODO: change so the architecture does not have to be passed
    self.policy = Policy()
    predict_net = self.policy.predictNet
    self.opt = optim.Adam(predict_net.parameters(), lr=lr,
                          weight_decay=lambda_3)

    self.replay.e = 0
    self.demoReplay = ddict(list)
    # The network counts as "noisy" when it exposes a ``sample`` attribute
    self.noisy = hasattr(self.policy.predictNet, "sample")
def _init_model(self):
    """Initialise environment, networks, collector and optimizers.

    Seeds torch and the environment with ``self.seed``, builds fresh policy
    and value networks plus a running-state filter, optionally replaces all
    three with a saved checkpoint from ``self.model_path``, and finally
    creates the memory collector and one Adam optimizer per network.
    """
    self.env, _env_continuous, self.num_states, self.num_actions = \
        get_env_info(self.env_id)

    # seeding
    torch.manual_seed(self.seed)
    self.env.seed(self.seed)

    self.policy_net = Policy(self.num_states, self.num_actions).to(device)
    self.value_net = Value(self.num_states).to(device)
    self.running_state = ZFilter((self.num_states, ), clip=5)

    if self.model_path:
        print("Loading Saved Model {}_ppo.p from {}/{}_ppo.p".format(
            self.env_id, self.model_path, self.env_id))
        checkpoint_path = '{}/{}_ppo.p'.format(self.model_path, self.env_id)
        # Context manager closes the file even if unpickling fails (the
        # handle was previously never closed).
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only load checkpoints from a trusted location.
        with open(checkpoint_path, "rb") as checkpoint_file:
            data = pickle.load(checkpoint_file)
        self.policy_net, self.value_net, self.running_state = \
            data.policy_net, data.value_net, data.running_state

    self.collector = MemoryCollector(self.env,
                                     self.policy_net,
                                     render=self.render,
                                     running_state=self.running_state,
                                     num_process=self.num_process)

    self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                  lr=self.lr_p)
    self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                  lr=self.lr_v)
def create_greedy_worker(networks, counter, config):
    """Build (without starting) the process that runs the greedy worker.

    The worker gets its own logger (named after ``config['experiment']``),
    an HFO environment already connected to the server, the learning
    network and a ``Policy`` that writes to that logger.
    """
    log_name = "output_{0}_greedy.out".format(config['experiment'])
    logger = Logger(log_name)

    environment = HFOEnv(port=6321, seed=86868686, numOpponents=1)
    environment.connectToServer()

    learning_net = networks["learning"]
    greedy_policy = Policy(logger=logger)

    return mp.Process(
        target=policy_worker.run,
        args=(100000, learning_net, environment, greedy_policy, logger,
              counter),
    )
def eval(model_type=model_type, model_path=model_path):
    """Run 50 evaluation episodes and print statistics over the won ones.

    Args:
        model_type: ``'policy'`` or ``'dqn'``; selects the network class.
        model_path: Path of the saved state dict to load.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values
            (previously this surfaced as an UnboundLocalError).
    """
    if model_type not in ('policy', 'dqn'):
        raise ValueError(
            "model_type must be 'policy' or 'dqn', got {!r}".format(
                model_type))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    env = LunarLander()
    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    else:
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (and vice versa).
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    episodes = 50
    wins = 0
    # Both lists only receive entries for episodes that were won
    frames = []
    fuel_left = []

    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0
        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1
            # Pure inference: no need to track gradients
            with torch.no_grad():
                action = model(
                    torch.tensor(state, dtype=torch.float32,
                                 device=device).unsqueeze(0)).argmax()
            state, reward, done = env.step(action)
            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
    env.close()

    if wins > 0:
        # NOTE: with exactly one win the sample std (ddof=1) is undefined
        # and numpy reports nan.
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")
        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
    """Load a saved policy and prepare the loss and optimizer.

    Args:
        learning_rate: Learning rate for the Adam optimizer.
        FILE: Path of the saved policy state dict.
    """
    self.FILE = FILE
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')

    self.policy = Policy().to(self.device)
    # map_location makes the checkpoint load onto the device selected above,
    # so a GPU-saved file also loads on a CPU-only machine.
    self.policy.load_state_dict(
        torch.load(self.FILE, map_location=self.device))
    self.policy.eval()

    self.criterion = nn.CrossEntropyLoss()
    self.learning_rate = learning_rate
    self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                      lr=self.learning_rate)
def __init__(self, params):
    """Build local/target critic and actor networks from ``params``.

    ``params`` must provide: state_dim, action_dim, buffer_size, batch_size,
    gamma, tau, lr, update_every, eps, eps_decay, min_eps, seed,
    arch_params_critic, arch_params_actor and noise_type.
    Both optimizers use the same learning rate ``params['lr']``.
    """
    self.params = params
    self.__state_dim = params['state_dim']
    self.__action_dim = params['action_dim']
    self.__buffer_size = params['buffer_size']
    self.__batch_size = params['batch_size']
    self.__gamma = params['gamma']
    self.__tau = params['tau']
    self.__lr = params['lr']
    self.__update_every = params['update_every']

    eps = params['eps']
    eps_decay = params['eps_decay']
    min_eps = params['min_eps']
    seed = params['seed']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Q-Network (critic)
    critic_params = {
        'seed': seed,
        'arch_params': params['arch_params_critic'],
    }
    self.critic_local = QNetwork(critic_params).to(device)
    self.critic_target = QNetwork(critic_params).to(device)
    self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                       lr=self.__lr)

    # Policy (actor)
    actor_params = {
        'seed': seed,
        'arch_params': params['arch_params_actor'],
        'noise_type': params['noise_type'],
        'eps': eps,
        'eps_decay': eps_decay,
        'min_eps': min_eps,
    }
    self.actor_local = Policy(actor_params).to(device)
    self.actor_target = Policy(actor_params).to(device)
    self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                      lr=self.__lr)

    self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
    self.__t_step = 0
def load_weights(self, load_from):
    """Rebuild the local actor and critic from a checkpoint file.

    The checkpoint is expected to hold two entries, ``'actor_params'`` and
    ``'critic_params'``; each carries the constructor parameters (under the
    same key again) and a ``'state_dict'``.

    Returns:
        ``self``, so calls can be chained.
    """
    checkpoint = torch.load(load_from)
    critic_entry = checkpoint['critic_params']
    actor_entry = checkpoint['actor_params']

    actor = Policy(actor_entry['actor_params'])
    self.actor_local = actor
    actor.load_state_dict(actor_entry['state_dict'])

    critic = QNetwork(critic_entry['critic_params'])
    self.critic_local = critic
    critic.load_state_dict(critic_entry['state_dict'])

    return self
def __init__(self, env="LunarLanderContinuous-v2", gamma=0.99):
    """Create the policy, gym environment, run storage and plotter for
    ``env`` and pick the compute device (CUDA when available).
    """
    self.policy = Policy(env_id=env)
    self.env = gym.make(env)
    self.runs = Runs(gamma=gamma)
    self.plotter = VisdomPlotter(env_name=env)
    self.device_cpu = torch.device("cpu")

    cuda_ok = torch.cuda.is_available()
    self.use_gpu = True if cuda_ok else False
    self.device = torch.device("cuda" if cuda_ok else "cpu")
def __evaluate_general_award(self, battle_info, card):
    """Try ``card`` on every empty own-side position and keep the best one.

    Returns a ``Policy(card, position, award)``; position and award are both
    ``-1`` when no position scores above ``-1``.
    """
    candidates = battle_info.field.get_empty_pos(self_side_flag=True)
    top_pos, top_award = -1, -1

    for slot in candidates:
        slot_award = self.get_policy_award(battle_info, card, slot)
        print("Testing Card {} in pos{} AWard:{}".format(
            str(card.id), str(slot), str(slot_award)))
        # Strict comparison: on ties the earliest position is kept
        if slot_award > top_award:
            top_pos, top_award = slot, slot_award

    return Policy(card, top_pos, top_award)
def policy_iteration(self, tol=1e-4) -> DetPolicy:
    '''
    Find the optimal policy using policy iteration.

    Starts from the uniform policy over ``self.state_action_dict`` and keeps
    improving it until the largest change of any state value between two
    consecutive evaluations drops below ``tol``.
    '''
    uniform_probs = {}
    for state, actions in self.state_action_dict.items():
        uniform_probs[state] = {
            action: 1. / len(actions) for action in actions
        }
    current = Policy(uniform_probs)
    values = self.find_value_func_dict(current)

    # Seeded well above the tolerance so the loop body is entered
    max_change = tol * 1e4
    while max_change >= tol:
        current = self.find_improved_policy(current)
        updated_values = self.find_value_func_dict(current)
        max_change = max(
            abs(updated_values[state] - old)
            for state, old in values.items())
        values = updated_values

    return current
def parsePolicy(self, fname):
    """Read a policy grid from a text file, one grid row per line.

    Each line is handed to ``self.parseLine`` and the resulting actions are
    concatenated row by row. The grid width is the number of characters in a
    row (excluding the line terminator) and must be the same for all rows.

    Args:
        fname: Path of the policy file.

    Returns:
        A ``Policy`` with width and height set. For an empty file the width
        is ``None`` and the height ``0``.

    Raises:
        ValueError: If the rows do not all have the same width.
    """
    width = None
    height = 0
    actions = []

    with open(fname) as file:
        for line in file:
            # Measure the row without its newline. The former
            # ``len(line) - 1`` was off by one for a final row that has no
            # trailing newline and rejected such files.
            row_width = len(line.rstrip("\n"))
            if width is not None and row_width != width:
                raise ValueError("Input width inconsistent")
            width = row_width
            height += 1
            actions.extend(self.parseLine(line))

    policy = Policy(actions)
    policy.setWidth(width)
    policy.setHeight(height)
    return policy
def create_workers(config, logger):
    """Create and start all training workers plus one greedy worker.

    Args:
        config: Mapping with ``num_layers``, ``hidden_size``,
            ``learning_rate``, ``n_workers``, ``startingEpsilons``,
            ``minEpsilons`` and ``numPolicyUpdates`` (further keys are read
            by the workers themselves).
        logger: Logger shared by the training workers.

    Returns:
        ``(workers, target_network)`` where ``workers`` lists every started
        process, the greedy worker last.
    """
    counter = mp.Value('i', 0)

    learning_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    target_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    # Both networks start from identical weights
    learning_network.load_state_dict(target_network.state_dict())

    optimizer = SharedAdam(learning_network.parameters(),
                           lr=config["learning_rate"])
    optimizer.share_memory()

    # Built before the loop: it used to be bound only inside the loop, so
    # ``n_workers == 0`` caused a NameError when creating the greedy worker.
    networks = {"learning": learning_network, "target": target_network}

    workers = []
    for idx in range(config["n_workers"]):
        policy = Policy(epsilon=config["startingEpsilons"][idx],
                        numUpdates=config["numPolicyUpdates"],
                        minEpsilon=config["minEpsilons"][idx],
                        logger=logger)
        # Each worker still receives its own dict object, as before
        trainingArgs = (idx, dict(networks), optimizer, counter, policy,
                        config, logger)
        p = mp.Process(target=Worker.train, args=trainingArgs)
        logger.log("Starting process: {0}".format(idx))
        p.start()
        logger.log("Process started: {0}".format(idx))
        workers.append(p)
        logger.log("Worker Appended: {0}".format(idx))

    logger.log("Creating the greedy worker")
    p = create_greedy_worker(networks, counter, config)
    p.start()
    workers.append(p)
    logger.log("Greedy worker started and appended")

    return workers, target_network
def value_iter(self, discount, pe=0, tol=None):
    """Run value iteration and return the greedy policy and value table.

    Args:
        discount: Discount factor applied to the successor state's value.
        pe: Forwarded to ``get_trans_prob``.
        tol: Optional convergence tolerance. With the default ``None`` the
            iteration stops only when two consecutive value tables are
            exactly equal (the original behaviour). Otherwise it stops as
            soon as the summed absolute change is ``<= tol``, which avoids
            very long runs caused by tiny floating-point changes.

    Returns:
        ``(policy, values)``: a ``Policy`` wrapping the
        ``[dir][x][y]``-indexed matrix of best actions, and the value array
        of shape ``(num_dirs, width, length)``.
    """
    shape = (self.num_dirs, self.width, self.length)
    prev_value = np.zeros(shape)
    new_policy_matrix = [[[None for _y in range(self.length)]
                          for _x in range(self.width)]
                         for _d in range(self.num_dirs)]

    while True:
        new_value = np.zeros(shape)

        for cur_state in self.env.states:
            cur_x, cur_y, cur_dir = cur_state.getState()
            adj_states = self.env.getAdjStates(cur_state)

            best_action = None
            max_action_value = float("-inf")
            for move, rotate in action_space:
                action = Action(move, rotate)
                action_value = 0
                for nxt_state in adj_states:
                    x, y, nxt_dir = nxt_state.getState()
                    action_value += self.get_trans_prob(
                        pe, cur_state, action,
                        nxt_state) * (self.get_reward(cur_state) +
                                      discount * prev_value[nxt_dir][x][y])
                if action_value > max_action_value:
                    best_action = action
                    max_action_value = action_value

            new_policy_matrix[cur_dir][cur_x][cur_y] = best_action
            new_value[cur_dir][cur_x][cur_y] = max_action_value

        if tol is None:
            converged = np.array_equal(new_value, prev_value)
        else:
            diff = np.sum(np.abs(new_value - prev_value))
            converged = bool(diff <= tol)

        prev_value = new_value
        if converged:
            break

    return Policy(new_policy_matrix), new_value
def __init__(
        self,
        _NAUDIO_COMMANDS,  # scalar, number of possible audio commands
        _EEG_INPUT_SHAPE,  # shape, (ntimepoints, nchan, nfreqs)
        _LOGDIR,  # directory to write summaries and whatnot
        _POLICY_LR=1e-4,  # scalar, policy learning rate
        _VALUE_LR=1e-3,  # scalar, value learning rate
        _REWARD_MA_LEN=100,  # scalar
        _LSTM_CELLS=[
            30, 30, 30
        ]  # lstm dimensions, (cell0_size, cell1_size, ...) where total length is number of cells
):
    """Create the policy, value, reward and shared sub-models, allocate the
    local hidden state and build the model, summaries, feed dicts and
    session.

    NOTE(review): ``_LSTM_CELLS`` has a mutable list default that is passed
    on to ``Shared`` as-is (only ``self._LSTMCells`` is a copy) -- confirm
    that ``Shared`` never mutates it.
    """
    # These should not be changed by the user but may change later in the
    # architecture
    self._InputShape = list(_EEG_INPUT_SHAPE)
    self._LSTMCells = list(_LSTM_CELLS)
    self._LSTMUnrollLength = 1
    self._ValueDiscount = 1.0
    self._Policy = Policy(_LEARNING_RATE=_POLICY_LR,
                          _ACTIONSPACE_SIZE=_NAUDIO_COMMANDS)
    self._Value = Value(_LEARNING_RATE=_VALUE_LR,
                        _DISCOUNT_RATE=self._ValueDiscount)
    self._Reward = Reward(_INPUT_SHAPE=_EEG_INPUT_SHAPE,
                          _MA_LENGTH=_REWARD_MA_LEN)
    self._Shared = Shared(_CELLS=_LSTM_CELLS,
                          _UNROLL_LENGTH=self._LSTMUnrollLength)
    # We store a version of the hidden state which we pass in every
    # iteration. Its shape is (number of cells, 2, unroll length, size of
    # the last cell).
    self._HiddenStateShape = (len(_LSTM_CELLS), 2, self._LSTMUnrollLength,
                              _LSTM_CELLS[-1])
    self._LocalHiddenState = np.zeros(self._HiddenStateShape)
    # Save the logdir
    self.mLogdir = _LOGDIR
    self._buildModel()
    self._buildSummaries()
    self._buildFeedDicts()
    self._initSession()
def train_policies(self, load_best_policy=False, load_reinforcement=False):
    """Fill ``self.policies`` with ``[policy, info, 0, 0]`` entries.

    * ``load_reinforcement``: load one reinforcement model per index and
      return immediately; ``info`` is the loaded file name.
    * otherwise: optionally load the best model as the first entry, then
      train the remaining policies on a decreasing number of cases;
      ``info`` is the number of training cases actually used.
    """
    if load_reinforcement:
        # NOTE(review): this reads ``game_setting.K`` from a module-level
        # name while the rest of the method uses ``self.K`` /
        # ``self.game_setting`` -- confirm this is intended.
        for i in range(0, game_setting.K):
            policy = Policy(self.game_setting)
            file_name = policy.load_reinforcement_model(i)
            self.policies.append([policy, file_name, 0, 0])
        return
    if load_best_policy:
        # Slot 0 is taken by the best saved model; training starts at 1
        start = 1
        policy = Policy(self.game_setting)
        nr_of_training_cases = policy.load_best_model()
        self.policies.append([policy, nr_of_training_cases, 0, 0])
    else:
        start = 0
    # Probe call (test_nr_of_cases=True); its result is capped at
    # self.max_cases and used as the upper bound below.
    policy = Policy(self.game_setting)
    max_cases = min(
        policy.import_data_and_train(max_cases=self.max_cases,
                                     test_nr_of_cases=True),
        self.max_cases)
    if self.negative_training_power > 0:
        # Case count shrinks as max_cases // (i + 1) ** power
        for i in range(start, self.K):
            nr_of_cases = max(
                0, max_cases // ((i + 1)**self.negative_training_power))
            if nr_of_cases > 0:
                policy = Policy(self.game_setting)
                actual_nr_of_cases = policy.import_data_and_train(
                    max_cases=nr_of_cases)
            else:
                # Nothing to train on: create the policy without a model
                policy = Policy(self.game_setting, no_model=True)
                actual_nr_of_cases = 0
            self.policies.append([policy, actual_nr_of_cases, 0, 0])
    else:
        # Case count falls linearly from max_cases (i = 0) to 0 (i = K - 1).
        # NOTE(review): divides by (self.K - 1); K == 1 would raise
        # ZeroDivisionError -- confirm K > 1 is guaranteed.
        for i in range(start, self.K):
            policy = Policy(self.game_setting)
            nr_of_cases = max(
                int(max_cases * (self.K - i - 1) / (self.K - 1)), 0)
            if nr_of_cases > 0:
                actual_nr_of_cases = policy.import_data_and_train(
                    max_cases=nr_of_cases)
                self.policies.append([policy, actual_nr_of_cases, 0, 0])
            else:
                self.policies.append([policy, 0, 0, 0])
def setUp(self):
    """Create the world space and the two policies shared by the tests.

    ``self.policy`` is built from the four base keyword arguments;
    ``self.tab_pol`` additionally receives ``value_type`` and
    ``init_variance``.
    """
    self.ss = (7, 9)
    self.a_map = OrderedDict([
        ('U', (1, 1)),
        ('D', (-1, -1)),
        ('R', (1, 0)),
        ('L', (1, -2)),
    ])
    self.ws = WorldSpace(self.ss, self.a_map)

    self.p_kw = {
        'discount_factor': 1,
        'exploration_factor': 0.95,
        'is_static': False,
        'learn_rate': 0.001,
    }
    self.policy = Policy(self.ws, **self.p_kw)

    # Extra settings used by the tabular policy only
    self.p_kw.update(value_type=Policy.STATE_VALUES, init_variance=0.01)
    self.tab_pol = TabularPolicy(self.ws, **self.p_kw)
import ConfigParser
import sys
import traceback
import IRecv_Module as IM
import logger
from BaseThread import BaseThread
from MPI_Wrapper import Client
from MPI_Wrapper import Tags
from Policy import Policy
from Task import SampleTask
from Task import TaskStatus
from WorkerRegistry import WorkerStatus

# Module-level policy object and logger for the worker agent
policy = Policy()
log = logger.getLogger('WorkerAgent')
wlog = None


def MSG_wrapper(**kwd):
    """Serialise the given keyword arguments to a JSON string."""
    # NOTE(review): ``json`` is not among the imports visible here --
    # confirm it is imported elsewhere in the file.
    return json.dumps(kwd)


class HeartbeatThread(BaseThread):
    """
    ping to master to update status
    """
    def __init__(self, client, worker_agent):
        # Register the thread with the base class under a fixed name
        BaseThread.__init__(self, name='HeartbeatThread')
testStickyWall = False toImprovePolicy = False selectPolicy = False # run: optimalPolicyThroughImprovement = None optimalPolicyThroughValueIteration = None if beliefTracking: bel = Belief(gridMap) #bel.explore(randomActionSelection) bel.explore(QMDP) if testStickyWall: config = PolicyConfig(setStickyWalls=True) emptyPolicy = Policy([]) emptyPolicy.setConfig(config) emptyPolicy.valueIteration(gridMap) print(emptyPolicy) emptyPolicy.resetValues() print("sticky policy:") print(emptyPolicy) for pSticky in [0.25, 0.5, 0.75, 0.9]: print("pSticky: " + str(pSticky)) config.setStickyWallConfig(StickyWallConfig(pSticky)) emptyPolicy.valueIteration(gridMap) emptyPolicy.resetValues() print(emptyPolicy) if toImprovePolicy: # load default policy
# 'policy' or 'dqn' to choose which type of model to evaluate model_type = 'policy' # model_type = 'dqn' model_path = "policies/22-1-2021_13-44/policy0.tar" env = LunarLander() env.reset() exit_program = False if torch.cuda.is_available(): device = 'cuda' else: device = 'cpu' if model_type == 'policy': model = Policy(env.observation_dim, env.action_dim) elif model_type == 'dqn': model = Network(env.observation_dim, env.action_dim) model.to(device) model.load_state_dict(torch.load(model_path)) model.eval() state = env.get_state() while not exit_program: env.render() action = model( torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)).argmax() state, reward, done = env.step(action)
def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy:
    """Return the policy that picks every available action of a state with
    equal probability (``1 / number of actions`` in that state).
    """
    uniform = {}
    for state, actions in state_action_dict.items():
        uniform[state] = {
            action: 1. / len(actions) for action in actions
        }
    return Policy(uniform)
def initializeEnv(self): """ Initializes the actor and critic neural networks and variables related to training Can be called to reinitialize the network to it's original state """ # Set random seed self.statusBox.setText('Creating environment...') s = self.parameters['Learning']['random_seed'] from random import seed if s != 0: seed(s) tf.random.set_random_seed(s) # Create environment envName = self.envSelectionDropdown.currentText().strip() try: self.env = gym.make(envName) except: import rl from rl.baselines import get_parameters config = get_parameters(envName) self.env = getattr(rl.environments, envName)(config=config) # Show screen try: self.env.render(mode="human") except: pass self.env.reset() self.done = False self.gamma = self.parameters['Learning']['gamma'] self.lam = self.parameters['Learning']['lambda'] self.policy_logvar = self.parameters['Learning']['log_variance'] self.trajectories = [] self.obs = self.env.observation_space.shape[0] try: self.actions = self.env.action_space.shape[0] self.actionWidget.setYRange(self.env.action_space.low[0] - .4, self.env.action_space.high[0] + .4) except: self.actions = self.env.action_space.n self.discrete = True # Create the list of deques that is used for averaging out the outputs of the actor network # during training of the network self.testAction = [deque(maxlen=5) for _ in range(self.actions)] self.valueFunction = NNValueFunction(self.obs, self.actions, self.parameters['Learning'], self.parameters['Networks']) self.policy = Policy(self.obs, self.actions, self.parameters['Learning'], self.parameters['Networks'], self.policy_logvar) self.policyLoss = [0] self.episode = 0 self.mean_reward = [] self.sums = 0.0 self.mean_actions = np.zeros( [self.parameters['Learning']['batch_size'], 3]) self.scaler = Scaler(self.env.observation_space.shape[0]) self.observes, self.rewards, self.unscaled_obs = None, None, None self.step = 0 self.statusBox.setText('Created {} environment.'.format(envName)) 
self.buttonStatus('initialized')
feature_vector = convertFeatureVectorToFormat(rootstate.board.flatten('F'), rootstate.toplay) training_data_file.write(",".join( str(int(input)) for input in feature_vector) + "|" + ",".join(str(target) for target in target) + "|" + "\n") game_setting = GameSetting() file_path = training_data_file_path = DATA_DIR + 'n'.join( str(dim) for dim in game_setting.network_dimensions ) + "-" + str(time.time() + datetime.now().microsecond) + "-" + ''.join( random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(5)) training_data_file = open(file_path, "w+") """ state = HexState1(game_setting) print(state) print(state.place_white((1,1))) print(state.place_black((0,0))) print(state.place_white((1,0))) print(state.place_black((0,1))) print(state) print(state.winner()) """ play_game(game_setting) policy = Policy(game_setting) policy.import_all_data_and_train() play_game(game_setting, policy=policy) training_data_file.close()
def __init__(self, world):
    """Build the GridWorld window: the world canvas on the left and a
    control panel with view buttons and computation settings next to it.
    """
    self.w = world
    self.p = Policy(world)
    self.master = Tk()
    self.master.title("MDP Example: GridWorld")
    # Canvas showing the world
    self.c = self.w.newCanvasToDraw(self.master)
    self.c.pack(side=LEFT, padx=10, pady=10)
    self.p.world.draw(self.c)
    # Control panel
    self.frame = Frame(self.master, relief=RAISED, borderwidth=1)
    self.frame.pack(fill=BOTH, side=LEFT, expand=1)
    self.whatToShow = None
    self.turboMode = BooleanVar()
    # Selected algorithm: "vi" or "pi" (see the radio buttons below)
    self.algorithm = StringVar()
    self.algorithm.set("vi")
    # View buttons (packed at the end of this method)
    self.bShowMap = Button(self.frame,
                           text="Show Map",
                           command=self.cbShowMap)
    self.bUtilities = Button(self.frame,
                             text="Show Utilities",
                             command=self.cbShowUtilities)
    self.bShowQvalues = Button(self.frame,
                               text="Show Q-Values",
                               command=self.cbShowQValues)
    self.bShowPolicy = Button(self.frame,
                              text="Show Policy",
                              command=self.cbShowPolicy)
    # The map view is the initial view callback
    self.whatToShow = self.cbShowMap
    # COMPUTATION------------------------------------------
    self.frameComputation = Frame(self.frame)
    self.computationStarted = False
    self.bComputation = Button(self.frameComputation,
                               text="Start Computation")
    self.bComputation.config(command=self.toggleComputation)
    self.bComputation.pack(side=TOP, padx=10, pady=5)
    # One radio button per algorithm, all bound to self.algorithm
    self.radioBAlgorithms = []
    for text, mode in (("Value iteration", "vi"), ("Policy iteration",
                                                   "pi")):
        b = Radiobutton(self.frameComputation,
                        text=text,
                        variable=self.algorithm,
                        value=mode)
        b.pack(anchor=W, padx=10, pady=5)
        self.radioBAlgorithms.append(b)
    # Sleep setting: label plus a spin box ranging from 0 to 10
    self.frameSleep = Frame(self.frameComputation)
    self.tSleep = Label(self.frameSleep, text="Sleep (sec): ")
    self.eSleep = Spinbox(self.frameSleep, from_=0, to=10, width=5)
    self.tSleep.pack(side=LEFT)
    self.eSleep.pack(side=LEFT)
    self.frameSleep.pack(side=TOP, padx=10, pady=5)
    self.turboModeCheck = Checkbutton(self.frameComputation,
                                      text="Turbo fix point",
                                      variable=self.turboMode)
    self.turboModeCheck.pack(side=TOP)
    self.tDebugModeIterations = Label(self.frameComputation,
                                      text="Iterations: 0")
    self.tDebugModeIterations.pack(side=TOP, padx=10, pady=5)
    self.bResetResults = Button(self.frameComputation,
                                text="Reset Results")
    self.bResetResults.config(command=self.resetResults)
    self.bResetResults.pack(side=TOP, padx=10, pady=5)
    # Lay out the view buttons, then the computation frame at the bottom
    self.bShowMap.pack(side=TOP, padx=10, pady=5)
    self.bUtilities.pack(side=TOP, pady=5)
    self.bShowQvalues.pack(side=TOP, padx=10, pady=5)
    self.bShowPolicy.pack(side=TOP, pady=5)
    self.frameComputation.pack(side=BOTTOM, pady=20)
print("----------------")
print("This is the Policy")
# Stochastic policy: state -> {action: probability}; each state's
# probabilities sum to 1
policy_data = {
    1: {
        'a': 0.4,
        'b': 0.6
    },
    2: {
        'a': 0.7,
        'c': 0.3
    },
    3: {
        'b': 1.0
    }
}
pol_obj = Policy(policy_data)
print(pol_obj.policy_data)
print("----------------")
print("This is MRPRefined")
# Obtain the refined MRP from the refined MDP for the policy above
# (``mdp_refined_obj`` is defined earlier in the script)
mrp_refined_obj = mdp_refined_obj.get_mrp_refined(pol_obj)
print("Transitions")
print(mrp_refined_obj.mpgraph)
print("Rewards Refined")
print(mrp_refined_obj.rewards_refined)
print("-----------------")
print("This is MDP")
print("Rewards")
print(mdp_refined_obj.rewards)
'Study': 0.5, 'FB': 0.5 }, 'C2': { 'Study': 0.5, 'SLP': 0.5 }, 'C3': { 'Study': 0.5, 'Pub': 0.5 }, 'Facebook': { 'FB': 0.5, 'Quit': 0.5 }, 'Sleep': { 'SLP': 1 } } mdp_obj = MDP(student, 0.999999) pol_obj = Policy(policy) mrp_obj = mdp_obj.find_mrp(pol_obj) #print('The sink states are: \n',mdp_obj.find_sink_states(), "\n") #print('The terminal states are: \n',mdp_obj.find_terminal_states(), "\n") print('The value obtained from pol evaluation is: \n', mdp_obj.policy_evaluation(pol_obj), "\n") print('The value obtained from pol evaluation is: \n', mdp_obj.find_value_func_dict(pol_obj), "\n") pol_obj_vi = print(mdp_obj.policy_iteration())