Example #1
    def NetworkBase_timeStep(self, time, supportDepressionImpact, 
        concealDiscriminateImpact, discriminateConcealImpact, 
        discriminateDepressionImpact, concealDepressionImpact,
        support=None, conceal=None, discrimination=None, 
        attitude=None, depression=None, policyScore=None, bias=0): 
        ONLY_NON_DISCRIMINATORY = 1
        ONLY_DISCRIMINATORY = 2

        # "Natural gap" between passing of enforced policies
        TIME_GAP = 5

        # Considers the cases where the type of policy is externally
        # enforced (not proposed at random in simulation)
        if (policyScore or bias) and time % TIME_GAP == 0:
            newPolicy = Policy(time, score=policyScore, biasPass=bias)

            # Converts the numerical bias into a boolean indicating whether
            # the scores are biased towards discriminatory or supportive policies
            if bias == ONLY_NON_DISCRIMINATORY: onlyDisc = False
            else: onlyDisc = True

            self.NetworkBase_enforcePolicy(time, score=policyScore, 
                onlyDisc=onlyDisc)

        else:
            newPolicy = Policy(time)
            newPolicy.Policy_considerPolicy(self, time, self.policyCap)
        
        self.NetworkBase_updatePolicyScore(time)
        for agentID in self.Agents:
            self.Agents[agentID].Agent_updateAgent(time, supportDepressionImpact,
                concealDiscriminateImpact, discriminateConcealImpact, 
                discriminateDepressionImpact, concealDepressionImpact,
                support, conceal, discrimination, attitude, depression)
Example #2
def get_soft_policy_from_qf_dict(
    qf_dict: SAf,
    softmax: bool,
    epsilon: float
) -> Policy:
    if softmax:
        ret = Policy({s: find_softmax_action_probs(v) for s, v in
                      qf_dict.items()})
    else:
        ret = Policy({s: find_epsilon_action_probs(v, epsilon) for s, v in
                      qf_dict.items()})
    return ret
Example #3
    def NetworkBase_enforcePolicy(self, time, score=None, onlyDisc=False):
        ONLY_NON_DISCRIMINATORY = 1
        ONLY_DISCRIMINATORY = 2 

        if score and self.policyScore + score > self.policyCap:
            return

        if score:
            enforcedPolicy = Policy(time=time, score=score)
        else:
            # Maps from boolean value to the ints specified above
            biasType = int(onlyDisc) + 1
            enforcedPolicy = Policy(time=time, biasPass=biasType)
            
        self.NetworkBase_addToPolicies(enforcedPolicy, time)
Example #4
    def one_step_lookahead(self, V, pe=0):
        new_policy_mat = [[[None for y in range(self.length)]
                           for x in range(self.width)]
                          for dir in range(self.num_dirs)]
        for state in self.env.states:
            adj_states = self.env.getAdjStates(state)
            max_action_value = float("-inf")
            best_action = None
            for action_tuple in action_space:
                move, rotate = action_tuple
                action = Action(move, rotate)
                action_value = 0
                for nxt_state in adj_states:
                    nxt_x, nxt_y, nxt_dir = nxt_state.getState()
                    action_value += self.get_trans_prob(
                        pe, state, action,
                        nxt_state) * V[nxt_dir][nxt_x][nxt_y]
                if action_value > max_action_value:
                    max_action_value = action_value
                    best_action = action

            cur_x, cur_y, cur_dir = state.getState()
            new_policy_mat[cur_dir][cur_x][cur_y] = best_action

        new_policy = Policy(new_policy_mat)
        return new_policy
Example #5
def main():
    env = gym.make('CartPole-v1')
    pi = Policy(LEARNING_RATE, GAMMA)
    score = 0.0
    print_interval = 20

    for n_epi in range(10000):
        s = env.reset()
        done = False

        while not done:
            prob = pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = env.step(a.item())
            pi.put_data((r, prob[a]))
            s = s_prime
            score += r

        pi.train_net()
        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score: {}".format(
                n_epi, score / print_interval))
            score = 0.0
    env.close()
Example #6
    def __init__(self, eps, lr, gamma, batch_size, tau, max_memory, lambda_1,
                 lambda_2, lambda_3, n_steps, l_margin):
        # Input Parameters
        self.eps = eps  # eps-greedy
        self.gamma = gamma  # discount factor
        self.batch_size = batch_size
        self.tau = tau  # frequency of target replacement
        self.ed = 0.005  # bonus for demonstration # todo they aren't used
        self.ea = 0.001  # todo they aren't used
        self.l_margin = l_margin
        self.n_steps = n_steps
        self.lambda1 = lambda_1  # n-step return
        self.lambda2 = lambda_2  # supervised loss
        self.lambda3 = lambda_3  # L2

        self.counter = 0  # target replacement counter # todo change to iter_counter
        self.replay = Memory(capacity=max_memory)
        self.loss = nn.MSELoss()
        self.policy = Policy()  # todo change not have to pass architecture
        self.opt = optim.Adam(self.policy.predictNet.parameters(),
                              lr=lr,
                              weight_decay=lambda_3)

        self.replay.e = 0
        self.demoReplay = ddict(list)

        self.noisy = hasattr(self.policy.predictNet, "sample")
Example #7
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, self.num_states, self.num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(self.num_states, self.num_actions).to(device)

        self.value_net = Value(self.num_states).to(device)
        self.running_state = ZFilter((self.num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p from {}/{}_ppo.p".format(
                self.env_id, self.model_path, self.env_id))
            data = pickle.load(
                open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))
            self.policy_net, self.value_net, self.running_state = data.policy_net, data.value_net, data.running_state

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)
Example #8
def create_greedy_worker(networks, counter, config):
    logger = Logger("output_{0}_greedy.out".format(config['experiment']))
    environment = HFOEnv(port=6321, seed=86868686, numOpponents=1)
    environment.connectToServer()

    w_args = (100000, networks["learning"], environment, Policy(logger=logger),
              logger, counter)
    return mp.Process(target=policy_worker.run, args=w_args)
Example #9
def eval(model_type=model_type, model_path=model_path):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    env = LunarLander()

    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0

        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1

            action = model(
                torch.tensor(state, dtype=torch.float32,
                             device=device).unsqueeze(0)).argmax()

            state, reward, done = env.step(action)

            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
        env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")

        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
Example #10
 def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
     self.FILE = FILE
     self.device = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.policy = Policy().to(self.device)
     self.policy.load_state_dict(torch.load(self.FILE))
     self.policy.eval()
     self.criterion = nn.CrossEntropyLoss()
     self.learning_rate = learning_rate
     self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                       lr=self.learning_rate)
Example #11
    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(device)
        self.critic_target = QNetwork(critic_params).to(device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=self.__lr)

        #Policy
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        actor_params['arch_params'] = params['arch_params_actor']
        self.actor_local = Policy(actor_params).to(device)
        self.actor_target = Policy(actor_params).to(device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0
Example #12
    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']

        self.actor_local = Policy(policy_params['actor_params'])
        self.actor_local.load_state_dict(
            checkpoint['actor_params']['state_dict'])

        self.critic_local = QNetwork(qnet_params['critic_params'])
        self.critic_local.load_state_dict(
            checkpoint['critic_params']['state_dict'])
        return self
Example #13
    def __init__(self, env="LunarLanderContinuous-v2", gamma=0.99):
        self.policy = Policy(env_id=env)
        self.env = gym.make(env)
        self.runs = Runs(gamma=gamma)

        self.plotter = VisdomPlotter(env_name=env)

        self.device_cpu = torch.device("cpu")
        if torch.cuda.is_available():
            self.use_gpu = True
            self.device = torch.device("cuda")
        else:
            self.use_gpu = False
            self.device = torch.device("cpu")
Example #14
    def __evaluate_general_award(self, battle_info, card):
        free_pos = battle_info.field.get_empty_pos(self_side_flag=True)

        best_pos = -1
        best_award = -1
        for pos in free_pos:
            award = self.get_policy_award(battle_info, card, pos)
            print("Testing Card " + str(card.id) + " in pos" + str(pos) +
                  " AWard:" + str(award))
            if award > best_award:
                best_pos = pos
                best_award = award

        return Policy(card, best_pos, best_award)
Example #15
 def policy_iteration(self, tol=1e-4) -> DetPolicy:
     ''' Find the optimal policy using policy iteration '''
     pol = Policy({
         s: {a: 1. / len(v)
             for a in v}
         for s, v in self.state_action_dict.items()
     })
     vf = self.find_value_func_dict(pol)
     epsilon = tol * 1e4
     while epsilon >= tol:
         pol = self.find_improved_policy(pol)
         new_vf = self.find_value_func_dict(pol)
         epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
         vf = new_vf
     return pol
Example #16
 def parsePolicy(self, fname):
     width = None
     height = 0
     policy = []
     with open(fname) as file:
         while True:
             line = file.readline()
             if not line:
                 break
             if width is not None and (len(line) - 1) != width:
                 raise Exception("Input width inconsistent")
             width = len(line) - 1
             height += 1
             rowActions = self.parseLine(line)
             policy.extend(rowActions)
     policy = Policy(policy)
     policy.setWidth(width)
     policy.setHeight(height)
     return policy
Example #17
def create_workers(config, logger):
    counter = mp.Value('i', 0)

    learning_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    target_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    learning_network.load_state_dict(target_network.state_dict())

    optimizer = SharedAdam(learning_network.parameters(),
                           lr=config["learning_rate"])
    optimizer.share_memory()

    workers = []
    for idx in range(0, config["n_workers"]):
        networks = {"learning": learning_network, "target": target_network}

        # environment = create_environment(idx)
        policy = Policy(epsilon=config["startingEpsilons"][idx],
                        numUpdates=config["numPolicyUpdates"],
                        minEpsilon=config["minEpsilons"][idx],
                        logger=logger)
        trainingArgs = (idx, networks, optimizer, counter, policy, config,
                        logger)
        p = mp.Process(target=Worker.train, args=trainingArgs)

        logger.log("Starting process: {0}".format(idx))

        p.start()

        logger.log("Process started: {0}".format(idx))
        workers.append(p)
        logger.log("Worker Appended: {0}".format(idx))

    logger.log("Creating the greedy worker")
    p = create_greedy_worker(networks, counter, config)
    p.start()
    workers.append(p)
    logger.log("Greedy worker started and appended")

    return workers, target_network
Example #18
    def value_iter(self, discount, pe=0):
        prev_value = np.zeros((self.num_dirs, self.width, self.length))
        new_policy_matrix = [[[None for y in range(self.length)]
                              for x in range(self.width)]
                             for dir in range(self.num_dirs)]

        converge = False

        while not converge:
            # print("\nValue Iteration ")
            new_value = np.zeros((self.num_dirs, self.width, self.length))
            for cur_state in self.env.states:
                cur_x, cur_y, cur_dir = cur_state.getState()
                adj_states = self.env.getAdjStates(cur_state)
                best_action = None
                max_action_value = float("-inf")
                for action_tuple in action_space:
                    move, rotate = action_tuple
                    action = Action(move, rotate)
                    action_value = 0
                    for nxt_state in adj_states:
                        x, y, dir = nxt_state.getState()
                        action_value += self.get_trans_prob(
                            pe, cur_state, action,
                            nxt_state) * (self.get_reward(cur_state) +
                                          discount * prev_value[dir][x][y])
                    if action_value > max_action_value:
                        best_action = action
                        max_action_value = action_value

                new_policy_matrix[cur_dir][cur_x][cur_y] = best_action
                new_value[cur_dir][cur_x][cur_y] = max_action_value

            diff = np.sum(np.abs(new_value - prev_value))
            # print("Value diff: ", diff)
            if np.array_equal(new_value, prev_value):
                converge = True
            prev_value = new_value
        new_policy = Policy(new_policy_matrix)
        return new_policy, new_value
Example #19
    def __init__(
        self,
        _NAUDIO_COMMANDS,  #scalar, number of possible audio commands
        _EEG_INPUT_SHAPE,  #shape, (ntimepoints, nchan, nfreqs)
        _LOGDIR,  #pass in directory to write summaries and whatnot
        _POLICY_LR=1e-4,  #scalar, policy learning rate
        _VALUE_LR=1e-3,  #scalar, value learning rate
        _REWARD_MA_LEN=100,  #scalar
        _LSTM_CELLS=[
            30, 30, 30
        ]  # lstm dimensions, (cell0_size, cell1_size, ...) where the total length is the number of cells
    ):

        # These should not be changed by the user but may change later in the architecture
        self._InputShape = list(_EEG_INPUT_SHAPE)
        self._LSTMCells = list(_LSTM_CELLS)
        self._LSTMUnrollLength = 1
        self._ValueDiscount = 1.0

        self._Policy = Policy(_LEARNING_RATE=_POLICY_LR,
                              _ACTIONSPACE_SIZE=_NAUDIO_COMMANDS)
        self._Value = Value(_LEARNING_RATE=_VALUE_LR,
                            _DISCOUNT_RATE=self._ValueDiscount)
        self._Reward = Reward(_INPUT_SHAPE=_EEG_INPUT_SHAPE,
                              _MA_LENGTH=_REWARD_MA_LEN)
        self._Shared = Shared(_CELLS=_LSTM_CELLS,
                              _UNROLL_LENGTH=self._LSTMUnrollLength)

        # We store a version of the hidden state which we pass in every iteration
        self._HiddenStateShape = (len(_LSTM_CELLS), 2, self._LSTMUnrollLength,
                                  _LSTM_CELLS[-1])
        self._LocalHiddenState = np.zeros(self._HiddenStateShape)

        # Save the logdir
        self.mLogdir = _LOGDIR

        self._buildModel()
        self._buildSummaries()
        self._buildFeedDicts()
        self._initSession()
Example #20
    def train_policies(self, load_best_policy=False, load_reinforcement=False):
        if load_reinforcement:
            for i in range(0, game_setting.K):
                policy = Policy(self.game_setting)
                file_name = policy.load_reinforcement_model(i)
                self.policies.append([policy, file_name, 0, 0])
            return

        if load_best_policy:
            start = 1
            policy = Policy(self.game_setting)
            nr_of_training_cases = policy.load_best_model()
            self.policies.append([policy, nr_of_training_cases, 0, 0])
        else:
            start = 0
        policy = Policy(self.game_setting)

        max_cases = min(
            policy.import_data_and_train(max_cases=self.max_cases,
                                         test_nr_of_cases=True),
            self.max_cases)

        if self.negative_training_power > 0:
            for i in range(start, self.K):
                nr_of_cases = max(
                    0, max_cases // ((i + 1)**self.negative_training_power))

                if nr_of_cases > 0:
                    policy = Policy(self.game_setting)
                    actual_nr_of_cases = policy.import_data_and_train(
                        max_cases=nr_of_cases)
                else:
                    policy = Policy(self.game_setting, no_model=True)
                    actual_nr_of_cases = 0

                self.policies.append([policy, actual_nr_of_cases, 0, 0])
        else:
            for i in range(start, self.K):
                policy = Policy(self.game_setting)
                nr_of_cases = max(
                    int(max_cases * (self.K - i - 1) / (self.K - 1)), 0)
                if nr_of_cases > 0:
                    actual_nr_of_cases = policy.import_data_and_train(
                        max_cases=nr_of_cases)
                    self.policies.append([policy, actual_nr_of_cases, 0, 0])
                else:
                    self.policies.append([policy, 0, 0, 0])
Example #21
		def setUp(self):
			self.ss = (7,9)

			self.a_map = OrderedDict()
			self.a_map['U'] = (1,1)
			self.a_map['D'] = (-1,-1)
			self.a_map['R'] = (1,0)
			self.a_map['L'] = (1,-2)

			self.ws = WorldSpace(self.ss, self.a_map)

			self.p_kw = {}

			self.p_kw['discount_factor'] = 1
			self.p_kw['exploration_factor'] = 0.95
			self.p_kw['is_static'] = False
			self.p_kw['learn_rate'] = 0.001

			self.policy = Policy(self.ws, **self.p_kw)

			self.p_kw['value_type'] = Policy.STATE_VALUES
			self.p_kw['init_variance'] = 0.01
			self.tab_pol = TabularPolicy(self.ws, **self.p_kw)
Example #22
import ConfigParser
import json
import sys
import traceback

import IRecv_Module as IM

import logger
from BaseThread import BaseThread
from MPI_Wrapper import Client
from MPI_Wrapper import Tags
from Policy import Policy
from Task import SampleTask
from Task import TaskStatus
from WorkerRegistry import WorkerStatus

policy = Policy()

log = logger.getLogger('WorkerAgent')
wlog = None


def MSG_wrapper(**kwd):
    return json.dumps(kwd)


class HeartbeatThread(BaseThread):
    """
    ping to master to update status
    """
    def __init__(self, client, worker_agent):
        BaseThread.__init__(self, name='HeartbeatThread')
Example #23
testStickyWall = False
toImprovePolicy = False
selectPolicy = False

# run:
optimalPolicyThroughImprovement = None
optimalPolicyThroughValueIteration = None

if beliefTracking:
    bel = Belief(gridMap)
    #bel.explore(randomActionSelection)
    bel.explore(QMDP)

if testStickyWall:
    config = PolicyConfig(setStickyWalls=True)
    emptyPolicy = Policy([])
    emptyPolicy.setConfig(config)
    emptyPolicy.valueIteration(gridMap)
    print(emptyPolicy)
    emptyPolicy.resetValues()
    print("sticky policy:")
    print(emptyPolicy)
    for pSticky in [0.25, 0.5, 0.75, 0.9]:
        print("pSticky: " + str(pSticky))
        config.setStickyWallConfig(StickyWallConfig(pSticky))
        emptyPolicy.valueIteration(gridMap)
        emptyPolicy.resetValues()
        print(emptyPolicy)

if toImprovePolicy:
    # load default policy
Example #24
# 'policy' or 'dqn' to choose which type of model to evaluate
model_type = 'policy'
# model_type = 'dqn'
model_path = "policies/22-1-2021_13-44/policy0.tar"

env = LunarLander()
env.reset()
exit_program = False

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path))
model.eval()
state = env.get_state()

while not exit_program:
    env.render()
    action = model(
        torch.tensor(state, dtype=torch.float32,
                     device=device).unsqueeze(0)).argmax()

    state, reward, done = env.step(action)
Example #25
def get_uniform_policy(state_action_dict: Mapping[S, Set[A]]) -> Policy:
    return Policy({s: {a: 1. / len(v) for a in v} for s, v in
                   state_action_dict.items()})
Example #26
    def initializeEnv(self):
        """
        Initializes the actor and critic neural networks and variables related to training
        Can be called to reinitialize the network to its original state
        """
        # Set random seed
        self.statusBox.setText('Creating environment...')
        s = self.parameters['Learning']['random_seed']
        from random import seed

        if s != 0:
            seed(s)
            tf.random.set_random_seed(s)

        # Create environment
        envName = self.envSelectionDropdown.currentText().strip()
        try:
            self.env = gym.make(envName)
        except:
            import rl
            from rl.baselines import get_parameters
            config = get_parameters(envName)
            self.env = getattr(rl.environments, envName)(config=config)

        # Show screen
        try:
            self.env.render(mode="human")
        except:
            pass

        self.env.reset()
        self.done = False

        self.gamma = self.parameters['Learning']['gamma']
        self.lam = self.parameters['Learning']['lambda']
        self.policy_logvar = self.parameters['Learning']['log_variance']
        self.trajectories = []

        self.obs = self.env.observation_space.shape[0]
        try:
            self.actions = self.env.action_space.shape[0]
            self.actionWidget.setYRange(self.env.action_space.low[0] - .4,
                                        self.env.action_space.high[0] + .4)
        except:
            self.actions = self.env.action_space.n
            self.discrete = True

        # Create the list of deques that is used for averaging out the outputs of the actor network
        # during training of the network
        self.testAction = [deque(maxlen=5) for _ in range(self.actions)]

        self.valueFunction = NNValueFunction(self.obs, self.actions,
                                             self.parameters['Learning'],
                                             self.parameters['Networks'])
        self.policy = Policy(self.obs, self.actions,
                             self.parameters['Learning'],
                             self.parameters['Networks'], self.policy_logvar)
        self.policyLoss = [0]
        self.episode = 0
        self.mean_reward = []
        self.sums = 0.0
        self.mean_actions = np.zeros(
            [self.parameters['Learning']['batch_size'], 3])
        self.scaler = Scaler(self.env.observation_space.shape[0])
        self.observes, self.rewards, self.unscaled_obs = None, None, None
        self.step = 0
        self.statusBox.setText('Created {} environment.'.format(envName))
        self.buttonStatus('initialized')
Example #27
    feature_vector = convertFeatureVectorToFormat(rootstate.board.flatten('F'),
                                                  rootstate.toplay)

    training_data_file.write(",".join(
        str(int(input)) for input in feature_vector) + "|" +
                             ",".join(str(target)
                                      for target in target) + "|" + "\n")


game_setting = GameSetting()
file_path = training_data_file_path = DATA_DIR + 'n'.join(
    str(dim) for dim in game_setting.network_dimensions
) + "-" + str(time.time() + datetime.now().microsecond) + "-" + ''.join(
    random.SystemRandom().choice(string.ascii_uppercase + string.digits)
    for _ in range(5))
training_data_file = open(file_path, "w+")
"""
state = HexState1(game_setting)
print(state)
print(state.place_white((1,1)))
print(state.place_black((0,0)))
print(state.place_white((1,0)))
print(state.place_black((0,1)))
print(state)
print(state.winner())
"""
play_game(game_setting)
policy = Policy(game_setting)
policy.import_all_data_and_train()
play_game(game_setting, policy=policy)
training_data_file.close()
Example #28
    def __init__(self, world):
        self.w = world
        self.p = Policy(world)
        self.master = Tk()
        self.master.title("MDP Example: GridWorld")
        self.c = self.w.newCanvasToDraw(self.master)
        self.c.pack(side=LEFT, padx=10, pady=10)
        self.p.world.draw(self.c)

        self.frame = Frame(self.master, relief=RAISED, borderwidth=1)
        self.frame.pack(fill=BOTH, side=LEFT, expand=1)

        self.whatToShow = None
        self.turboMode = BooleanVar()
        self.algorithm = StringVar()
        self.algorithm.set("vi")

        self.bShowMap = Button(self.frame,
                               text="Show Map",
                               command=self.cbShowMap)

        self.bUtilities = Button(self.frame,
                                 text="Show Utilities",
                                 command=self.cbShowUtilities)

        self.bShowQvalues = Button(self.frame,
                                   text="Show Q-Values",
                                   command=self.cbShowQValues)

        self.bShowPolicy = Button(self.frame,
                                  text="Show Policy",
                                  command=self.cbShowPolicy)

        self.whatToShow = self.cbShowMap

        # COMPUTATION------------------------------------------
        self.frameComputation = Frame(self.frame)

        self.computationStarted = False
        self.bComputation = Button(self.frameComputation,
                                   text="Start Computation")

        self.bComputation.config(command=self.toggleComputation)
        self.bComputation.pack(side=TOP, padx=10, pady=5)

        self.radioBAlgorithms = []
        for text, mode in (("Value iteration", "vi"), ("Policy iteration",
                                                       "pi")):
            b = Radiobutton(self.frameComputation,
                            text=text,
                            variable=self.algorithm,
                            value=mode)
            b.pack(anchor=W, padx=10, pady=5)
            self.radioBAlgorithms.append(b)

        self.frameSleep = Frame(self.frameComputation)
        self.tSleep = Label(self.frameSleep, text="Sleep (sec): ")
        self.eSleep = Spinbox(self.frameSleep, from_=0, to=10, width=5)
        self.tSleep.pack(side=LEFT)
        self.eSleep.pack(side=LEFT)
        self.frameSleep.pack(side=TOP, padx=10, pady=5)

        self.turboModeCheck = Checkbutton(self.frameComputation,
                                          text="Turbo fix point",
                                          variable=self.turboMode)
        self.turboModeCheck.pack(side=TOP)

        self.tDebugModeIterations = Label(self.frameComputation,
                                          text="Iterations: 0")
        self.tDebugModeIterations.pack(side=TOP, padx=10, pady=5)

        self.bResetResults = Button(self.frameComputation,
                                    text="Reset Results")

        self.bResetResults.config(command=self.resetResults)
        self.bResetResults.pack(side=TOP, padx=10, pady=5)

        self.bShowMap.pack(side=TOP, padx=10, pady=5)
        self.bUtilities.pack(side=TOP, pady=5)
        self.bShowQvalues.pack(side=TOP, padx=10, pady=5)
        self.bShowPolicy.pack(side=TOP, pady=5)
        self.frameComputation.pack(side=BOTTOM, pady=20)
    print("----------------")
    print("This is the Policy")
    policy_data = {
        1: {
            'a': 0.4,
            'b': 0.6
        },
        2: {
            'a': 0.7,
            'c': 0.3
        },
        3: {
            'b': 1.0
        }
    }
    pol_obj = Policy(policy_data)
    print(pol_obj.policy_data)

    print("----------------")
    print("This is MRPRefined")
    mrp_refined_obj = mdp_refined_obj.get_mrp_refined(pol_obj)
    print("Transitions")
    print(mrp_refined_obj.mpgraph)
    print("Rewards Refined")
    print(mrp_refined_obj.rewards_refined)

    print("-----------------")
    print("This is MDP")
    print("Rewards")
    print(mdp_refined_obj.rewards)
Example #30
            'Study': 0.5,
            'FB': 0.5
        },
        'C2': {
            'Study': 0.5,
            'SLP': 0.5
        },
        'C3': {
            'Study': 0.5,
            'Pub': 0.5
        },
        'Facebook': {
            'FB': 0.5,
            'Quit': 0.5
        },
        'Sleep': {
            'SLP': 1
        }
    }

    mdp_obj = MDP(student, 0.999999)
    pol_obj = Policy(policy)
    mrp_obj = mdp_obj.find_mrp(pol_obj)

    #print('The sink states are: \n',mdp_obj.find_sink_states(), "\n")
    #print('The terminal states are: \n',mdp_obj.find_terminal_states(), "\n")
    print('The value obtained from pol evaluation is: \n',
          mdp_obj.policy_evaluation(pol_obj), "\n")
    print('The value obtained from pol evaluation is: \n',
          mdp_obj.find_value_func_dict(pol_obj), "\n")
    pol_obj_vi = print(mdp_obj.policy_iteration())