Example #1
def buildTwoMDPs():
    n = 10
    rewards = np.zeros(4)
    probabilities = np.zeros(2)
    myMDPs = []

    rewards[1] = penalty_handicap = -10
    rewards[2] = penalty_collision = -100
    probabilities[0] = prob_avail_handicap = 0.9
    probabilities[1] = prob_T = 5.0  # probability temperature: lower -> spots mostly occupied, higher -> mostly available
    """ 1st set of parameter values for less cost of driving and high reward for closest parking spot"""
    myMDP1 = mdp.MDP(n)
    MDPname = "myMDP1.txt"
    rewards[0] = penalty_driving = -1
    rewards[3] = best_reward = 100
    myMDP1.make_MDP(MDPname, rewards, probabilities)
    myMDPs.append(myMDP1)
    """ 2nd set of parameter values for high cost of driving and less reward for closest parking spot"""
    myMDP2 = mdp.MDP(n)
    MDPname = "myMDP2.txt"
    rewards[0] = penalty_driving = -10
    rewards[3] = best_reward = 10
    myMDP2.make_MDP(MDPname, rewards, probabilities)
    myMDPs.append(myMDP2)

    return myMDPs
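
For quick reference, a minimal NumPy-only sketch of the two parameter sets the function builds; the index layout of rewards and probabilities follows the assignments above.

import numpy as np

# rewards = [penalty_driving, penalty_handicap, penalty_collision, best_reward]
rewards_mdp1 = np.array([-1.0, -10.0, -100.0, 100.0])   # cheap driving, big reward for the best spot
rewards_mdp2 = np.array([-10.0, -10.0, -100.0, 10.0])   # costly driving, small reward for the best spot

# probabilities = [prob_avail_handicap, prob_T (availability "temperature")]
probabilities = np.array([0.9, 5.0])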
Example #2
def main(symbol, numTrial):
    # Usage note: the current implementation cannot handle too many features.
    # If run directly, only 'volatility' and 'order_flow' are used as features.
    totalVol = 50
    voLevel = 5
    timeGap = 10
    priceFlex = 4
    timeLevel = 5

    # First timeLevel * timeGap steps: time for limit orders
    # Last 1 step: time for the market order
    total_time = timeLevel * timeGap + 1
    train_df, test_df, quantile = data_prepare(symbol, total_time)
    FeatureNum = len(quantile.columns)

    # Initialize MDP
    mdp = MDP.TradeMDP(totalVol=totalVol, voLevel=voLevel, timeLevel=timeLevel, priceFlex=priceFlex, timeGap=timeGap,
                       FeatureLevel=3, FeatureNum=FeatureNum)
    q_algo = MDP.QLearningAlgorithm(mdp, exploreStep=2000, init_prob=0.8, final_prob=0.1)
    RL_ontrain_reward, SL_ontrain_reward = df_simulate(numTrial, q_algo, train_df, total_time, quantile, train=True)
    RL_train_reward, SL_train_reward = df_simulate(1, q_algo, train_df, total_time, quantile, train=False)
    RL_test_reward, SL_test_reward = df_simulate(1, q_algo, test_df, total_time, quantile, train=False)

    return RL_ontrain_reward, RL_train_reward, RL_test_reward, \
           SL_ontrain_reward, SL_train_reward, SL_test_reward, q_algo
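
The time budget in main works out as follows; a small arithmetic check using the constants defined above.

timeLevel, timeGap = 5, 10
total_time = timeLevel * timeGap + 1
# 5 levels of 10-step gaps = 50 steps for limit orders, plus 1 final step for the market order
assert total_time == 51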
Example #3
    def __init__(self,
                 network_file='network_topology',
                 initial=0,
                 targets=[],
                 num_obstacles=0,
                 T=1,
                 task_type='sequential',
                 visualization=False,
                 decoys_set=[]):
        self.dg = nx.read_gml(network_file)

        edge_number = []
        dead_end = set()
        for index in range(self.dg.number_of_nodes()):
            if self.dg.out_degree(str(index)) == 0:
                dead_end.add(index)
            edge_number.append(len(list(self.dg.neighbors(str(index)))))
        self.action_number = max(edge_number)

        self.current = initial
        self.nstates = self.dg.number_of_nodes()
        self.actlist = ["a" + str(e) for e in range(self.action_number)]
        self.T = T
        self.num_obstacles = num_obstacles
        self.task_type = task_type
        self.visualization = visualization
        self.decoys_set = decoys_set

        self.obstacles_combo = []
        self.obstacles = np.asarray([])
        self.sample_obstacles()

        self.horizon = self.nstates - 1
        self.target_index = 0
        self.time_index = 0
        self.configuration_index = 0
        self.p = 0.5

        self.targets = np.asarray(targets)
        if self.task_type == 'sequential':
            self.dead_end = dead_end.difference(
                [self.targets[self.target_index]])
            acc = np.full(self.horizon,
                          self.targets[self.target_index],
                          dtype=int)
        else:
            self.dead_end = dead_end.difference(self.targets)
            acc = np.ones(
                (self.horizon, self.targets.size), dtype=int) * self.targets

        self.mdp = MDP(initial,
                       self.actlist,
                       range(self.nstates + 1),
                       acc=acc,
                       obstacles=np.ones(
                           (self.horizon, self.obstacles.size), dtype=int) *
                       self.obstacles,
                       horizon=self.horizon)

        self.mdp.prob = self.getProbs()
Example #4
def policy_improvement_forest_management(
        mdp: MDP.ForestManagement, epsilon=0.001):
    "Solve an MDP by policy iteration [Fig. 17.7]"
    pi = np.zeros(mdp.number_of_states)
    U = np.zeros(mdp.number_of_states + 1)
    U[-1] = mdp.reward_wait / mdp.gamma  # equivalent to giving the final state value 0 plus the reward for reaching it
    action = {'cut': 0, 'wait': 1}
    while True:
        U = policy_evaluation_forest_management(pi, U, mdp, epsilon)
        unchanged = True
        for s in range(mdp.number_of_states):
            equivalent = False
            value_cut = mdp.reward_cut
            value_wait = (1 - mdp.fire_prob) * mdp.gamma * U[s + 1]
            if value_wait > value_cut:
                best_action = action['wait']
            elif value_wait < value_cut:
                best_action = action['cut']
            else:
                best_action = rd.randint(0, 1)
                equivalent = True

            if best_action != pi[s]:
                pi[s] = best_action
                if not equivalent:
                    unchanged = False
        if unchanged:
            return U, pi
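
The improvement step boils down to comparing the immediate cut reward with the discounted, fire-adjusted value of waiting; a standalone illustration with hypothetical numbers.

fire_prob, gamma = 0.05, 0.9          # hypothetical parameters
reward_cut, U_next = 1.0, 3.0         # U_next: current utility estimate of the next forest age
value_cut = reward_cut
value_wait = (1 - fire_prob) * gamma * U_next   # survive the fire, then discount
print(value_cut, round(value_wait, 3), 'wait' if value_wait > value_cut else 'cut')   # 1.0 2.565 wait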
Example #5
def generate_random_MDP(X,
                        U,
                        B,
                        std=0,
                        random_state=np.random.RandomState(0),
                        R_mode=DEPEND_ONLY_ON_START_STATE):
    '''
    :param X: state space size
    :param U: action space size
    :param B: branching factor
    :param std: standard deviation of the reward noise
    :param random_state: NumPy RandomState used for sampling
    :param R_mode: how the reward depends on (state, action, next state), e.g. DEPEND_ONLY_ON_START_STATE
    '''
    P = np.zeros(shape=(U, X, X))
    R = np.zeros(shape=(U, X, X))
    R_std = std * np.ones(shape=(U, X, X))

    for x in range(X):
        for u in range(U):
            P[u, x], R[u, x] = get_random_sparse_vector(X, B, random_state)
        if R_mode == DEPEND_ONLY_ON_START_STATE:
            # reward depends only on the start state: same value for every action and next state
            R[:, x, :] = R[0, x, 0]

    mdp = MDP.MDP(P=P, R=R, R_std=R_std)
    return mdp
Example #6
    def recording(self):
        S, A = 5, 6
        exp = MDP.Experience(S, A)

        s, s1, a = 3, 4, 5
        rew, negrew, zerorew = 7.4, -4.2, 0.0

        self.assertEqual(exp.getVisits(s, a, s1), 0)

        exp.record(s, a, s1, rew)

        self.assertEqual(exp.getVisits(s, a, s1), 1)
        self.assertEqual(exp.getReward(s, a, s1), rew)

        exp.reset()

        self.assertEqual(exp.getVisits(s, a, s1), 0)

        exp.record(s, a, s1, negrew)

        self.assertEqual(exp.getVisits(s, a, s1), 1)
        self.assertEqual(exp.getReward(s, a, s1), negrew)

        exp.record(s, a, s1, zerorew)

        self.assertEqual(exp.getVisits(s, a, s1), 2)
        self.assertEqual(exp.getReward(s, a, s1), negrew)

        self.assertEqual(exp.getVisitsSum(s, a), 2)
Example #7
    def compatibility(self):
        S, A = 4, 3
        exp = MDP.Experience(S, A)

        visits = []
        rewards = []
        for s in xrange(0, S):
            visits.append([])
            rewards.append([])
            for a in xrange(0, A):
                visits[s].append([])
                rewards[s].append([])
                for s1 in xrange(0, S):
                    visits[s][a].append(generator())
                    rewards[s][a].append(generator())

        exp.setVisits(visits)
        exp.setRewards(rewards)

        for s in xrange(0, S):
            for a in xrange(0, A):
                visitsSum, rewardSum = 0, 0
                for s1 in xrange(0, S):
                    self.assertEqual(exp.getVisits(s, a, s1), visits[s][a][s1])
                    self.assertEqual(exp.getReward(s, a, s1),
                                     rewards[s][a][s1])
                    visitsSum += visits[s][a][s1]
                    rewardSum += rewards[s][a][s1]

                self.assertEqual(exp.getVisitsSum(s, a), visitsSum)
                self.assertEqual(exp.getRewardSum(s, a), rewardSum)
Example #8
    def __init__(self,
                 initial,
                 nrows=8,
                 ncols=8,
                 robotmdp=MDP(),
                 targets_path=[[]],
                 obstacles=[],
                 size=16,
                 task_type='sequential'):
        super(GridworldGui, self).__init__(initial, nrows, ncols, robotmdp,
                                           targets_path, obstacles, task_type)
        # compute the appropriate height and width (with room for cell borders)

        self.height = nrows * size + nrows + 1
        self.width = ncols * size + ncols + 1
        self.size = size

        # initialize pygame ( SDL extensions )
        pygame.init()
        pygame.display.set_mode((self.width, self.height))
        pygame.display.set_caption('Gridworld')
        self.screen = pygame.display.get_surface()
        self.surface = pygame.Surface(self.screen.get_size())
        self.bg = pygame.Surface(self.screen.get_size())
        self.bg_rendered = False  # optimize background render

        self.background()
        self.screen.blit(self.surface, (0, 0))
        pygame.display.flip()

        self.build_templates()
        self.updategui = True  # switch to stop updating gui if you want to collect a trace quickly

        self.current = self.mdp.init  # at start, the current state is the initial state
        self.state2circle(self.current)
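
The window sizing leaves a one-pixel border line around every cell; a quick check of the formula with the default arguments (nrows = ncols = 8, size = 16).

nrows, ncols, size = 8, 8, 16
height = nrows * size + nrows + 1   # 8 cells of 16 px plus 9 one-pixel border lines = 137
width = ncols * size + ncols + 1
assert (width, height) == (137, 137)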
Example #9
    def MDP_standard(self, pathway_count=100, years=100, start_ID=0, policy=None, supp_var_cost=0, supp_fixed_cost=0):
        """Creates and returns a set of MDP pathways which were each generated with a given policy (default=coin-toss)"""

        pathways = [None]*pathway_count

        #set up coin-toss policy if one isn't passed in
        if policy is None:
            policy = FireGirlPolicy()
            policy.b = [0,0,0,0,0,0,0,0,0,0,0]

        for i in range(pathway_count):
            pathways[i] = FireGirlPathway(i+start_ID)
            
            #setting suppression costs
            pathways[i].fire_suppression_cost_per_day = supp_fixed_cost
            pathways[i].fire_suppression_cost_per_cell = supp_var_cost

            #assigning the provided (or coin-toss) policy to the pathway
            pathways[i].Policy = policy

            #generating landscape, running pathway simulations, and converting to MDP_pathway object
            pathways[i].generateNewLandscape()
            pathways[i].doYears(years)
            pathways[i].updateNetValue()
            pathways[i] = MDP.convert_firegirl_pathway_to_MDP_pathway(pathways[i])

        #return the pathways
        return pathways
Example #10
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):
        '''Model-based Reinforcement Learning with epsilon greedy 
        exploration.  This function should use value iteration,
        policy iteration or modified policy iteration to update the policy at each step

        Inputs:
        s0 -- initial state
        defaultT -- default transition function when a state-action pair has not been visited
        initialR -- initial estimate of the reward function
        nEpisodes -- # of episodes (one episode consists of a trajectory of nSteps that starts in s0)
        nSteps -- # of steps per episode
        epsilon -- probability with which an action is chosen at random

        Outputs: 
        V -- final value function
        policy -- final policy
        '''

        model = MDP.MDP(defaultT, initialR, self.mdp.discount)
        V = np.zeros(model.nStates)
        policy = np.zeros(model.nStates, int)

        count_sa = np.zeros((model.nStates, model.nActions)).astype(float)
        count_sas = np.zeros(
            (model.nStates, model.nActions, model.nStates)).astype(float)

        c_reward = np.zeros(nEpisodes)
        for i in range(nEpisodes):
            state = s0
            for j in range(nSteps):
                action = policy[state]
                if random.uniform(0, 1) < epsilon:
                    action = random.randint(0, model.nActions - 1)

                reward, nextState = self.sampleRewardAndNextState(
                    state, action)
                c_reward[i] += reward * (model.discount**j)

                count_sa[state, action] += 1.0
                count_sas[state, action, nextState] += 1.0

                model.T[action,
                        state, :] = np.divide(count_sas[state, action, :],
                                              count_sa[state, action])
                model.R[action,
                        state] = (reward + (count_sa[state, action] - 1.0) *
                                  model.R[action, state]) / count_sa[state,
                                                                     action]
                policy, V, _ = model.policyIteration(policy)

                state = nextState

        return [V, policy, c_reward]
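
The reward update in the inner loop is an incremental running mean over visits to a (state, action) pair; a standalone check of that recurrence.

# Running mean: R_n = (r_n + (n - 1) * R_{n-1}) / n
samples = [2.0, 4.0, 9.0]
R, n = 0.0, 0
for r in samples:
    n += 1
    R = (r + (n - 1) * R) / n
assert abs(R - sum(samples) / len(samples)) < 1e-12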
Example #11
 def get_MDP_sample(self):
     mdp = MDP.DiscreteMDP(self.n_states, self.n_actions)
     for s in range(self.n_states):
         for a in range(self.n_actions):
             ## Sample transitions from the Dirichlet
             mdp.P[s,a] = np.random.dirichlet(self.alpha[s, a])
             mdp.R[s,a] = np.random.beta(self.reward_alpha[s,a], self.reward_beta[s,a])
     return mdp
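
The posterior sampling relies on NumPy's Dirichlet and Beta generators; a minimal sketch of a single (s, a) draw, with hypothetical prior parameters.

import numpy as np

alpha_sa = np.ones(4)                    # hypothetical symmetric Dirichlet prior over 4 next states
p_sa = np.random.dirichlet(alpha_sa)     # one transition distribution P(. | s, a); sums to 1
r_sa = np.random.beta(2.0, 5.0)          # one mean reward in [0, 1] from a Beta prior
assert abs(p_sa.sum() - 1.0) < 1e-9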
Example #12
 def get_mean_MDP(self):
     mdp = MDP.DiscreteMDP(self.n_states, self.n_actions)
     for s in range(self.n_states):
         for a in range(self.n_actions):
             ## Use the marginal (mean) transition probabilities and expected rewards
             mdp.P[s,a] = self.get_marginal_transition_probabilities(s, a)
             mdp.R[s,a] = self.get_expected_reward(s,a)
     return mdp
Example #13
    def doOfflineComputation(self, type):

        if type == 'value':

            cost, action = MDP.mdp_value_iteration(self.problem,
                                                   self.valueTable,
                                                   self.policyTable,
                                                   self.epsilon)

        elif type == 'policy':

            cost, action = MDP.mdp_policy_iteration(self.problem,
                                                    self.valueTable,
                                                    self.policyTable,
                                                    self.epsilon)

        self.valueTable = cost
        self.policyTable = action
Example #14
    def calc_policy(self, main_index, other_policy=None):
        if other_policy is None:
            other_policy = np.ones((self.mdp_a[1 - main_index], self.s))
            other_policy /= np.sum(other_policy, axis=0)
        t, r = self.get_tr_with_others_policy(main_index, other_policy)

        mdp = MDP.MDP(t.shape[1], t.shape[0], self.d)
        mdp.t = t
        mdp.r = r
        return self.solver.get_greedy_policy(mdp)
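
When no other_policy is given, the construction above is a uniform policy with one probability column per state; a quick check with hypothetical sizes standing in for self.mdp_a[1 - main_index] and self.s.

import numpy as np

n_actions, n_states = 3, 4
other_policy = np.ones((n_actions, n_states))
other_policy /= np.sum(other_policy, axis=0)   # each column sums to 1, so every entry is 1/3
assert np.allclose(other_policy, 1.0 / n_actions)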
Example #15
    def construction(self):
        S, A = 5, 6
        exp = MDP.Experience(S, A)

        self.assertEqual(exp.getS(), S)
        self.assertEqual(exp.getA(), A)
        self.assertEqual(exp.getVisits(0, 0, 0), 0)
        self.assertEqual(exp.getReward(0, 0, 0), 0.0)

        self.assertEqual(exp.getVisits(S - 1, A - 1, S - 1), 0)
        self.assertEqual(exp.getReward(S - 1, A - 1, S - 1), 0.0)
Example #16
def evaluate_TD():
    print("Creating MDP.")
    mdp = MDP.MDP(10, 3)
    print("Running TD State-Value Estimation.")
    ts = []
    for i in range(1, 20):
        print(str(i) + "...")
        v = Temporal_difference.estimate(mdp, 0, 0.01, 30, 100, lambda x: 1)
        print("\t" + str(v[0]))
        ts.append(v[0])
    plot(ts)
Example #17
def test():
    '''Create the Rubik's Cube MDP and run Q-learning on it.'''
    rubiks_MDP = MDP.MDP()
    rubiks_MDP.register_start_state("wwoobbggrrrryyyyoowwggbb")
    rubiks_MDP.register_actions(Test_Rubiks.ACTIONS)
    rubiks_MDP.register_operators(Test_Rubiks.OPERATORS)
    #rubiks_MDP.generateAllStates()
    #print("Total number of generated states: " + str(len(rubiks_MDP.known_states)))
    rubiks_MDP.register_transition_function(Test_Rubiks.T)
    rubiks_MDP.register_reward_function(Test_Rubiks.R)
    #rubiks_MDP.random_episode(1000)
    rubiks_MDP.QLearning(0.98, 1, 0.1)
Example #18
def test():
    cube_MDP = MDP.MDP()
    cube_MDP.register_start_state(createInitialState())
    cube_MDP.register_actions(ACTIONS)
    cube_MDP.register_operators(OPERATORS)
    cube_MDP.register_transition_function(T)
    cube_MDP.register_reward_function(R)
    cube_MDP.register_describe_state(describeState)
    cube_MDP.register_goal_test(goalTest)
    cube_MDP.register_action_to_op(ACTION_TO_OP)
    cube_MDP.generateAllStates()
    cube_MDP.QLearning(0.8, 1000, 0.2)
    displayOptimalPolicy(cube_MDP)
Example #19
def evaluate_policy():
    print("Creating MDP.")
    mdp = MDP.MDP(10, 3)
    print("Running Monte Carlo State-Value Estimation.")
    TS = []

    for i in range(5, 8):
        print(str(i) + "...")
        v0 = Monte_Carlo.first_visit_eval(mdp, 0, 0.01, 10, i * 10,
                                          lambda x: 1)[0]
        print(" " + str(v0))
        TS.append(v0[0])
    plot(TS)
Example #20
def EvalutePredicted(SR, X_test, g_test):
    M = 50
    N = 30
    c1 = 1
    c2 = 1
    g_pred = []
    g_single = []

    y_pred = SR.predict(X_test)
    
    n = X_test.shape[0]
    for i in range(n//50):
        (lam, mu1, a, b) = X_test[i*50, :4]
        g_single = np.append(g_single, g_test[i*50])
        params = (M, N, c1, c2, lam, mu1, a, b)
        functionPredicted = MDP.ComposeF(np.append([0], y_pred[i*50:(i+1)*50]))
        f, newV, gMax = MDP.MDPSolver(params, f=functionPredicted)
        g_pred = np.append(g_pred, gMax)

    g_errors = np.divide(np.abs(g_single - g_pred), g_single) * 100
    print(np.round(np.percentile(g_errors, [50, 90, 95, 99, 99.99]), 2))
    return y_pred, g_pred, g_single, g_errors
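
The final summary is a relative-error percentile report; the same computation on a few made-up numbers.

import numpy as np

g_single = np.array([10.0, 20.0, 40.0])   # made-up reference values
g_pred = np.array([9.5, 21.0, 39.0])      # made-up predictions
g_errors = np.divide(np.abs(g_single - g_pred), g_single) * 100
print(np.round(np.percentile(g_errors, [50, 90, 95, 99, 99.99]), 2))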
Example #21
    def __init__(self):

        self.MDP = Diagram.MDP()

        self.lambdaQ: float = 0.99

        self.alpha: float = 0.1

        self.threshold: float = 0.001

        self.change_rate: float = 0.99

        self.q_value_dict = self.MDP.get_States()
Example #22
def find_schedule(M, LV, GV, N, delta, due_dates, release_dates, ALPHA, GAMMA,
                  EPSILON, EPOCHS, METHOD, STACT):

    # Generate heuristics for Q_learning rewards
    heur_job = heuristic_best_job(delta, LV, GV, N)
    heur_res = heuristic_best_resource(heur_job)
    heur_order = heuristic_order(delta, LV, GV, N)

    if STACT == "st_act":  # st_act for state-action pairs, act for only actions
        policy_init = np.zeros([2**N, N + 1])  # states, actions
    if STACT == "act":  # st_act for state-action pairs, act for only actions
        policy_init = np.zeros([N + 1])  # actions

    RL = MDP(LV, GV, N, policy_init, due_dates,
             release_dates)  # initialize MDP
    r_best = 99999
    best_schedule = []
    best_policy = np.zeros([LV, N + 1])
    epoch_best_found = 0
    timer_start = time.time()
    for epoch in range(EPOCHS):
        # if epoch%100==0:
        #     print(epoch)

        DONE = False
        z = 0
        RL.reset(due_dates, release_dates, LV, GV, N)

        # take timesteps until processing of all jobs is finished
        while not DONE:
            RL, DONE = RL.step(z, GV, N, METHOD, delta, ALPHA, GAMMA, EPSILON,
                               STACT, heur_job, heur_res, heur_order)
            z += 1

        schedule = RL.schedule.objectives()
        r = schedule.Cmax
        if r < r_best:
            r_best = r
            best_schedule = schedule
            epoch_best_found = epoch

            for i in range(len(RL.resources)):
                best_policy[i] = RL.resources[i].policy

            if METHOD == "JEPS":
                resources = RL.resources
                states = RL.states
                actions = RL.actions

                for i in range(len(resources)):
                    resource = update_policy_JEPS(resources[i], states,
                                                  actions, r_best, z, GAMMA,
                                                  STACT)
                    RL.resources[i] = resource

    timer_finish = time.time()
    calc_time = timer_finish - timer_start
    return r_best, best_schedule, best_policy, epoch_best_found, calc_time, RL
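
The STACT switch only changes the shape of the initial policy table (the 2**N rows presumably index subsets of the N jobs); a shape sketch with a hypothetical N.

import numpy as np

N = 4                                      # hypothetical number of jobs
policy_st_act = np.zeros([2**N, N + 1])    # state-action table: presumably one row per job subset
policy_act = np.zeros([N + 1])             # action-only table
print(policy_st_act.shape, policy_act.shape)   # (16, 5) (5,)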
Example #23
def main():

    R0 = (1, 0)
    Discount = 0.95
    N = 2

    #     V, action = MDP.mdp_value_interation(N, R0, Discount)

    V, action = MDP.mdp_policy_interation(N, R0, Discount)

    print("Optimal Action for " + str(R0) + ": " + str(action[R0]))
    print("Total reward: " + str(V[R0]))

    return 0
Example #24
def test():
    '''Create the MDP, run a 100-step random episode, then run value iteration and Q-learning.'''
    grid_MDP = MDP.MDP()
    grid_MDP.register_start_state((0, 0))
    grid_MDP.register_actions(ACTIONS)
    grid_MDP.register_operators(OPERATORS)
    grid_MDP.register_transition_function(T)
    grid_MDP.register_reward_function(R)
    grid_MDP.random_episode(100)
    grid_MDP.generateAllStates()
    grid_MDP.ValueIterations(0.9, 100)
    grid_print(grid_MDP.V)
    grid_MDP.QLearning(0.9, 1000, 0.05)
    QPrinter(grid_MDP.QValues)
    policyPrint(grid_MDP)
Example #25
    def doOnlineComputation(self, S0):

        S0 = tuple(S0)

        N = self.problem.venture.getNumVentures()
        M = self.problem.venture.getManufacturingFunds()
        E = self.problem.venture.getAdditionalFunds()
        Gamma = self.problem.getDiscountFactor()
        Prices = self.problem.getSalePrices()

        cost, action = MDP.MDP_greedy_search(self.problem, self.valueTable, S0,
                                             N, M, E, Gamma, Prices)

        self.valueTable[S0] = cost
        self.policyTable[S0] = action
Example #26
    def modelBasedRL(self,
                     s0,
                     defaultT,
                     initialR,
                     nEpisodes,
                     nSteps,
                     epsilon=0):

        cum_rewards = np.zeros((nEpisodes))

        cumActProb = np.cumsum(np.ones(self.mdp.nActions) / self.mdp.nActions)
        freq = np.zeros(
            [self.mdp.nActions, self.mdp.nStates, self.mdp.nStates])
        T = defaultT
        R = initialR
        model = MDP.MDP(T, R, self.mdp.discount)
        [policy, V, _] = model.policyIteration(np.zeros(model.nStates, int))
        for episId in xrange(nEpisodes):
            state = s0
            for iterId in xrange(nSteps):

                # choose action
                if epsilon > np.random.rand(1):
                    action = np.where(cumActProb >= np.random.rand(1))[0][0]
                else:
                    action = policy[state]

                # sample reward and next state
                [reward,
                 nextState] = self.sampleRewardAndNextState(state, action)
                cum_rewards[episId] += (self.mdp.discount**iterId) * reward

                # update counts
                freq[action, state, nextState] += 1
                asFreq = freq[action, state, :].sum()

                # update transition
                T[action, state, :] = freq[action, state, :] / asFreq

                # update reward
                R[action,
                  state] = (reward + (asFreq - 1) * R[action, state]) / asFreq

                # update policy
                [policy, V, _] = model.policyIteration(policy)

                state = nextState
        return [V, policy, cum_rewards]
Example #27
def generate_investment_sim(p_noise=0, **kwargs):
    P = np.array([[[1 - p_noise, p_noise], [1 - p_noise, p_noise]],
                  [[p_noise, 1 - p_noise], [p_noise, 1 - p_noise]]])
    R_state_1 = kwargs.get("R_state_1", 2)
    R1_std = kwargs.get("R1_std", math.sqrt(2))
    R1D = np.array([1, R_state_1])
    R1D_std = np.array([0, R1_std])
    R = OneDVec2ThreeDVec(R1D, U=2)
    R_std = OneDVec2ThreeDVec(R1D_std, U=2)

    sparse_flag = kwargs.get("sparse_flag", False)
    if sparse_flag:
        # sparse construction is not implemented; fail explicitly rather than return an unbound name
        raise NotImplementedError("sparse investment MDP is not implemented")
    mdp = MDP.MDP(P=P, R=R, R_std=R_std)
    return mdp
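
The transition tensor is indexed [action, state, next_state], and each (action, state) row must be a probability distribution; a quick sanity check with an illustrative noise level.

import numpy as np

p_noise = 0.1
P = np.array([[[1 - p_noise, p_noise], [1 - p_noise, p_noise]],
              [[p_noise, 1 - p_noise], [p_noise, 1 - p_noise]]])
assert np.allclose(P.sum(axis=-1), 1.0)   # every row sums to 1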
Example #28
def test():
    '''Create the MDP, then run value iteration and Q-learning on it.'''
    grid_MDP = MDP.MDP()
    grid_MDP.register_start_state((0, 0))
    grid_MDP.register_actions(ACTIONS)
    grid_MDP.register_operators(OPERATORS)
    grid_MDP.register_transition_function(T)
    grid_MDP.register_reward_function(R)
    #grid_MDP.random_episode(100)
    grid_MDP.generateAllStates()
    grid_MDP.valueIterations(0.9, 10)
    displayV(grid_MDP.V)
    grid_MDP.QLearning(0.9, 20000, 0.05)
    displayQ(grid_MDP.QValues)
    grid_MDP.extractPolicy(grid_MDP.QValues)
    displayOptimalPolicy(grid_MDP.optPolicy)
Example #29
def test():
    cube_MDP = MDP.MDP()
    cube_MDP.register_start_state(createInitialState())
    cube_MDP.register_actions(ACTIONS)
    cube_MDP.register_operators(OPERATORS)
    cube_MDP.register_transition_function(T)
    cube_MDP.register_reward_function(R)
    cube_MDP.register_describe_state(describeState)
    cube_MDP.register_goal_test(goalTest)
    cube_MDP.register_action_to_op(ACTION_TO_OP)
    cube_MDP.generateAllStates()
    cube_MDP.random_episode(10)
    cube_MDP.QLearning(0.8, 1000, 0.2)
    displayOptimalPolicy(cube_MDP)


# DO NOT USE Q LEARNING. IT WILL TAKE FOREVER TO GENERATE ALL POSSIBLE STATES. NOT WORTH YOUR TIME.
#test()
Example #30
def test():
    '''Create the MDP, run a 100-step random episode, then run value iteration and Q-learning.'''
    global grid_MDP
    grid_MDP = MDP.MDP()
    grid_MDP.register_start_state((0, 0))
    grid_MDP.register_actions(ACTIONS)
    grid_MDP.register_operators(OPERATORS)
    grid_MDP.register_transition_function(T)
    grid_MDP.register_reward_function(R)
    grid_MDP.random_episode(100)
    grid_MDP.generateAllStates()
    grid_MDP.ValueIterations(0.9, 100)
    draw_grid_with_V_values(grid_MDP.V, 3, 4)
    grid_MDP.QLearning(0.8, 50, 0.8)
    draw_grid_with_Q_values(grid_MDP.QValues, 3, 4)
    grid_MDP.extractPolicy()
    extractPolicy(grid_MDP.optPolicy)
    return grid_MDP
Example #31
    def create_and_convert_firegirl_pathways(self, pathway_count, years, start_ID):
        """Creates a set of FireGirl pathways, and then converts them into MDP pathways for use in the optimizer"""
        
        #setting up initial lists
        fg_pathways = [None] * pathway_count
        self.pathway_set = [None] * pathway_count
        
        for i in range(pathway_count):
            fg_pathways[i] = FireGirlPathway(i+start_ID)
            fg_pathways[i].Policy.b = self.Policy.b[:]
            fg_pathways[i].generateNewLandscape()
            fg_pathways[i].doYears(years)
            fg_pathways[i].updateNetValue()
            self.pathway_set[i] = MDP.convert_firegirl_pathway_to_MDP_pathway(fg_pathways[i])
        
        #normalizing pathways
        self.normalize_all_features()

        #populate initial weights
        self.calc_pathway_weights()
Example #32
def policy_evaluation_forest_management(pi,
                                        U,
                                        mdp: MDP.ForestManagement,
                                        epsilon=0.001):
    """Return an updated utility mapping U from each state in the MDP to its
    utility, using an approximation (modified policy iteration)."""
    gamma = mdp.gamma
    action = {'cut': 0, 'wait': 1}
    while True:
        delta = 0
        U1 = U.copy()
        for s in range(mdp.number_of_states):
            if pi[s] == action['cut']:
                value = mdp.reward_cut
            else:
                value = (1 - mdp.fire_prob) * mdp.gamma * U1[s + 1]
            U[s] = value
            delta = max(abs(U[s] - U1[s]), delta)
        if delta <= epsilon * (1 - gamma) / gamma:
            return U
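
A hedged usage sketch for the two forest-management routines above, assuming both functions (and their numpy/random imports) are in scope; SimpleNamespace stands in for an MDP.ForestManagement instance and every number is hypothetical.

from types import SimpleNamespace

forest = SimpleNamespace(number_of_states=10, gamma=0.9,      # hypothetical stand-in for
                         fire_prob=0.05, reward_cut=1.0,      # an MDP.ForestManagement instance
                         reward_wait=2.0)

U, pi = policy_improvement_forest_management(forest, epsilon=0.001)
print(pi)   # 0 = cut, 1 = wait, per the `action` mapping used above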
Example #33
 def MDP_vs_FG_1(self):
     #create a set of FireGirl pathways for both optimizers to use, plus a duplicate list of MDP-style pathways
     pathway_list = [None]*20
     MDP_list = [None]*20
     for i in range(20):
         pathway_list[i] = FireGirlPathway(i)
         pathway_list[i].generateNewLandscape()
         pathway_list[i].doYears(50)
         pathway_list[i].updateNetValue()
         MDP_list[i] = MDP.convert_firegirl_pathway_to_MDP_pathway(pathway_list[i])
     
     #creating optimizers
     opt_FG = FireGirlPolicyOptimizer()
     opt_MDP = MDP_PolicyOptimizer(11)
     
     #setting pathway lists
     opt_FG.pathway_set = pathway_list[:]
     opt_MDP.pathway_set = MDP_list[:]
     
     #populate initial weights
     opt_FG.calcPathwayWeights()
     opt_FG.pathway_weights_generation = opt_FG.pathway_weights[:]
     opt_MDP.calc_pathway_weights()
     #opt_MDP.pathway_weights_generation = opt_MDP.pathway_weights[:]
     
     #normalizing pathways
     opt_FG.normalizeAllFeatures()
     opt_MDP.normalize_all_features()
     
     #optimizing
     FG_output = opt_FG.optimizePolicy()
     MDP_output = opt_MDP.optimize_policy()
     
     print("FireGirl Optimizer Output:")
     print(FG_output)
     
     print("")
     print("")
     print("MDP Optimizer Output:")
     print(MDP_output)
Example #34
    def MDP_random_start_policies(self, pathway_count=75, years=100, start_ID=0, supp_var_cost=300, supp_fixed_cost=0):
        """Creates and returns a set of MDP pathways which were each generated with random policies"""

        pathways = [None]*pathway_count
        for i in range(pathway_count):
            pathways[i] = FireGirlPathway(i+start_ID)
            
            #setting suppression costs
            pathways[i].fire_suppression_cost_per_day = supp_fixed_cost
            pathways[i].fire_suppression_cost_per_cell = supp_var_cost

            #creating a random policy (skipping first one: leaving constant parameter = 0)
            for p in range(1, len(pathways[i].Policy.b)):
                pathways[i].Policy.b[p] = round(random.uniform(-1,1), 2)

            #generating landscape, running pathway simulations, and converting to MDP_pathway object
            pathways[i].generateNewLandscape()
            pathways[i].doYears(years)
            pathways[i].updateNetValue()
            pathways[i] = MDP.convert_firegirl_pathway_to_MDP_pathway(pathways[i])

        #return the pathways
        return pathways