Example #1
def filter_open_cases(log):
    log_selected = attributes_filter.apply(
        log, ["Payment Handled"],
        parameters={
            xes_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY:
            constants.concept_key,
            "positive": True
        })
    util.print_filtered_cases_count(len(log), len(log_selected))
    return log_selected
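A minimal usage sketch for the helper above, assuming a recent pm4py and an XES event log; the file name is hypothetical and the util/constants modules come from the surrounding project:

import pm4py

log = pm4py.read_xes("receipt.xes")   # hypothetical file
selected = filter_open_cases(log)     # keeps the traces that contain a "Payment Handled" event
print(len(selected))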
Example #2
def train_prioritized(Q_Network, train_batch, w_batch, Exp, s_scale,
                      input_size, num_actions):

    state_t_batch = [item[0] for item in train_batch]
    state_t_batch = np.array(state_t_batch)
    state_t_1_batch = [item[1] for item in train_batch]
    state_t_1_batch = np.array(state_t_1_batch)

    action_batch = [item[2] for item in train_batch]
    reward_batch = [item[3] for item in train_batch]
    reward_batch = np.array(reward_batch)
    done_batch = [item[4] for item in train_batch]
    done_batch = np.array(done_batch)

    action_t_batch = Q_Network.evaluate_critic(state_t_batch)
    w_batch = np.transpose(np.tile(w_batch, (num_actions, 1)))

    if Exp == 'epsilon':
        action_t_1_batch = Q_Network.evaluate_critic(state_t_1_batch)
        q_t_1 = Q_Network.evaluate_target_critic(state_t_1_batch)

        for i in range(0, len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = reward_batch[
                    i] + GAMMA * q_t_1[i][np.argmax(action_t_1_batch[i])]

    elif Exp == 'softmax':
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.softV(action_t_1_batch, s_scale)

        for i in range(0, len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][
                    action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]

    elif Exp == 'sparsemax':
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.sparsemax(action_t_1_batch, s_scale)

        for i in range(0, len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][
                    action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]

    #Q_Network.train_critic_prioritized(state_t_batch, action_t_batch, w_batch)
    errors, cost, _ = Q_Network.train_critic_prioritized(
        state_t_batch, action_t_batch, w_batch)
    errors = np.sum(errors, axis=1)
    return errors, cost
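The np.tile/np.transpose line above broadcasts one importance-sampling weight per sampled transition across all action columns, so the weights can be applied element-wise to the per-action TD errors inside train_critic_prioritized. A tiny shape check with assumed values:

import numpy as np

w_batch = np.array([0.5, 1.0, 0.25])            # one importance-sampling weight per transition
num_actions = 4
w_tiled = np.transpose(np.tile(w_batch, (num_actions, 1)))
print(w_tiled.shape)                            # (3, 4): each weight repeated across the action columns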
Example #3
def train(Q_Network, train_batch, Exp, s_scale, input_size, num_actions):

    state_t_batch = [item[0] for item in train_batch]
    state_t_batch = np.array(state_t_batch)
    state_t_1_batch = [item[1] for item in train_batch]
    state_t_1_batch = np.array(state_t_1_batch)

    action_batch = [item[2] for item in train_batch]
    reward_batch = [item[3] for item in train_batch]
    reward_batch = np.array(reward_batch)
    done_batch = [item[4] for item in train_batch]
    done_batch = np.array(done_batch)

    action_t_batch = Q_Network.evaluate_critic(state_t_batch)

    if Exp == 'epsilon':
        action_t_1_batch = Q_Network.evaluate_critic(state_t_1_batch)
        q_t_1 = Q_Network.evaluate_target_critic(state_t_1_batch)

        for i in range(0, len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][action_batch[i]] = reward_batch[
                    i] + GAMMA * q_t_1[i][np.argmax(action_t_1_batch[i])]

    elif Exp == 'softmax':
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.softV(action_t_1_batch, s_scale)

        for i in range(0, len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][
                    action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]

    elif Exp == 'sparsemax':
        action_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)
        q_t_1 = Exploration.sparsemax(action_t_1_batch, s_scale)

        for i in range(0, len(train_batch)):
            if done_batch[i]:
                action_t_batch[i][action_batch[i]] = reward_batch[i]
            else:
                action_t_batch[i][
                    action_batch[i]] = reward_batch[i] + GAMMA * q_t_1[i]

    # Update critic by minimizing the loss
    Q_Network.train_critic(state_t_batch, action_t_batch)
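The 'epsilon' branch above builds Double DQN targets sample by sample: the online network picks the next action and the target network evaluates it. The same computation can be written in vectorized NumPy; a sketch under the assumption that the inputs are already arrays and that GAMMA is the module-level discount factor:

import numpy as np

GAMMA = 0.99  # assumed value for the module-level discount factor

def double_dqn_targets(q_t, q_online_t1, q_target_t1, actions, rewards, dones):
    # Start from the current Q estimates and overwrite only the entries of the taken actions.
    targets = np.array(q_t, dtype=float)
    idx = np.arange(len(rewards))
    best_next = np.argmax(q_online_t1, axis=1)                 # action chosen by the online network
    bootstrap = rewards + GAMMA * q_target_t1[idx, best_next]  # value evaluated by the target network
    targets[idx, actions] = np.where(dones, rewards, bootstrap)
    return targets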
Example #4
def main():
    # Prompt for run parameters (numeric menu choices and 1/0 flags).
    algo = int(input('Algorithm? 1 - Random direction, 2 - RSS gradient\n'))
    max_it = int(input('Max iterations?\n'))
    nb_samples = int(input('Nb of RSS samples for averaging?\n'))
    mode = int(input('Mode? 1 - Thread, 2 - Procedural\n'))
    write = int(input('Output file? 1/0 (y/n)\n'))

    sim = int(input('Simulation? 1/0 (y/n)\n'))
    roomba_sim = None
    if sim:
        roomba_sim = Roomba_sim.Roomba_sim()
        roomba_sim.daemon = True  # Thread is killed when the main thread exits
        roomba_sim.start()  # Start thread

    explore = Exploration.Exploration(roomba_sim)
    explore.daemon = True  # Thread is killed when the main thread exits
    explore.start()  # Start thread (needed for procedural mode too; it activates control of the robot)
    rssi_obj = RSSI_Measure.RSSI_Measure()
    localize = Localization.Localization(explore, rssi_obj, algo, max_it,
                                         nb_samples, mode, write)
    localize.daemon = True  # Thread is killed when the main thread exits
    localize.start()  # Start thread
    print('Algorithm Start')

    localize.join()
    explore.stop()
    explore.join()
Example #5
def train_prioritized(Q_Network, train_batch, w_batch, Exp, s_scale, input_size, num_actions, size_action_batch):
    
    state_t_batch, state_t_1_batch, action_batch, reward_batch, done_batch = zip(*train_batch)
    
    state_t_batch = np.array(state_t_batch)
    state_t_1_batch = np.array(state_t_1_batch)
    action_batch = np.array(action_batch)
    reward_batch = np.array(reward_batch)
    done_batch = np.array(done_batch)
    
    batch_size = len(done_batch)
    q_t_1_batch = []
    
    if Exp == 'epsilon':
        # Compute per-action Q-values with the target Q-network
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        
        # Compute V(next_state) according to the exploration scheme
        q_t_1_batch = np.max(q_t_1_batch, axis = 1)
        # Store the target for Q(state, action) in q_t_1_batch
        q_t_1_batch = reward_batch + GAMMA*q_t_1_batch#*(1-done_batch)
        
    elif Exp == 'softmax':
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        
        q_t_1_batch = Exploration.softV(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA*q_t_1_batch#*(1-done_batch)
    
    elif Exp == 'sparsemax':
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [batch_size, -1])
        
        q_t_1_batch = Exploration.sparsemax(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA*q_t_1_batch#*(1-done_batch)
        
    q_t_1_batch = np.reshape(q_t_1_batch,[-1,1])
    w_batch = np.reshape(w_batch,[-1,1])
    
    # Train the Q-network to minimize the error against q_t_1_batch, applying the importance-sampling weights
    errors, cost, _ = Q_Network.train_critic_prioritized(state_t_batch, action_batch, q_t_1_batch, w_batch)
    errors = np.sum(errors, axis=1)
    
    return errors, cost, state_t_batch
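Exploration.softV is not shown on this page. In soft Q-learning the soft state value is V(s) = s_scale * log sum_a exp(Q(s, a) / s_scale), so a plausible stand-in (an assumption, not necessarily the project's implementation) is:

import numpy as np

def softV(q_values, s_scale):
    # Scaled log-sum-exp over the action axis, shifted by the max for numerical stability.
    q = np.asarray(q_values, dtype=float)
    m = np.max(q, axis=-1, keepdims=True)
    return np.squeeze(m, axis=-1) + s_scale * np.log(np.sum(np.exp((q - m) / s_scale), axis=-1))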
Example #6
 def getExplorer(self):
     #
     # Replay.
     replay_buffer = ReplayBuffer(self.replaySize, self.frameHist)
     explorer = Exploration.EpsilonGreedy(self.explorationSched,
                                          TensorConfig.TensorConfig(),
                                          replay_buffer, self.env,
                                          self.q_func, self.maxSteps)
     return explorer
Example #7
def train(Q_Network, train_batch, Exp, s_scale, input_size, num_actions, size_action_batch):
    
    state_t_batch, state_t_1_batch, action_batch, reward_batch, done_batch = zip(*train_batch)
    
    state_t_batch = np.array(state_t_batch)
    state_t_1_batch = np.array(state_t_1_batch)
    action_batch = np.array(action_batch)
    reward_batch = np.array(reward_batch)
    done_batch = np.array(done_batch)
    
    q_t_1_batch = []
    
    if Exp == 'epsilon':
        
        for i in range(0, len(train_batch)):
            q_t_1_batch.append(np.reshape(Q_Network.get_target_q_batch(np.reshape(state_t_1_batch[i],[1,-1])),[1,-1])[0])
            
        q_t_1_batch = reward_batch + GAMMA*np.max(q_t_1_batch, axis=1)#*(1-done_batch)
        
    elif Exp == 'softmax':
        # Bootstrap from the target network using the soft value of the next state.
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [len(train_batch), -1])
        q_t_1_batch = Exploration.softV(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA*q_t_1_batch#*(1-done_batch)

    elif Exp == 'sparsemax':
        # Bootstrap from the target network using the sparsemax value of the next state.
        q_t_1_batch = Q_Network.get_target_q_batch(state_t_1_batch)
        q_t_1_batch = np.reshape(q_t_1_batch, [len(train_batch), -1])
        q_t_1_batch = Exploration.sparsemax(q_t_1_batch, s_scale)
        q_t_1_batch = reward_batch + GAMMA*q_t_1_batch#*(1-done_batch)
    
    q_t_1_batch = np.reshape(q_t_1_batch,[-1,1])
    
    cost, _ = Q_Network.train_critic(state_t_batch, action_batch, q_t_1_batch)
    
    return cost, state_t_batch
Example #8
def CreateAndTrainModel(modelToSavePath):
    discountFactor = 0.9
    learningRate = 1

    trainingEstimator = Model.GetModel(modelToSavePath + "_trainning")
    targetEstimator = Model.GetModel(modelToSavePath)

    Model.CreateModelIfDoesntExist(modelToSavePath + "_trainning",
                                   trainingEstimator)
    Model.CreateModelIfDoesntExist(modelToSavePath, targetEstimator)

    experienceContainer = ER.ExperienceContainer(10000)

    maxMovesCount = 10
    epochCount = 1000
    totalMoves = 0
    for epoch in range(epochCount):
        print('epoch:' + str(epoch))
        state = Game.CreateState(None, Game.Pos(1, 2), None)
        print(state)
        nbMoves = 0
        tooManyMoves = False
        while (not (state.IsFinished() or tooManyMoves)):
            nbMoves += 1
            totalMoves += 1

            action = Exploration.GetNextActionBoltzmann(
                Model.GetQValue(state, targetEstimator), epoch, epochCount)
            print(str(action) + '\n')

            nextState = Game.Move(state, action)

            experienceContainer.Add(state, action, nextState.GetReward(),
                                    nextState)

            state = nextState

            tooManyMoves = nbMoves == maxMovesCount

            print(state
                  if not tooManyMoves else 'lost because did too many moves')

            if totalMoves % 5 == 0:
                Model.TrainModelWithExperienceReplay(experienceContainer,
                                                     trainingEstimator,
                                                     targetEstimator,
                                                     discountFactor,
                                                     learningRate)

            if totalMoves % 25 == 0:
                targetEstimator = trainingEstimator

    OverrideDir(modelToSavePath, modelToSavePath + "_trainning")
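Exploration.GetNextActionBoltzmann is defined elsewhere in the project; it is called with the Q-values plus epoch and epochCount, which suggests an annealed temperature. A generic Boltzmann (softmax) selection with an assumed linear annealing schedule:

import numpy as np

def boltzmann_action(q_values, epoch, epoch_count, t_start=1.0, t_end=0.05, rng=None):
    # Anneal the temperature, then sample an action with probability proportional to exp(Q / T).
    rng = rng or np.random.default_rng()
    temperature = max(t_end, t_start * (1.0 - float(epoch) / epoch_count))
    q = np.asarray(q_values, dtype=float)
    probs = np.exp((q - q.max()) / temperature)
    probs /= probs.sum()
    return int(rng.choice(len(q), p=probs))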
Example #9
def Game():
    os.system("cls")
    print("Welcome to A Text Based RPG")
    print("Developed by Malachi Wadas \n\n\n\n")
    itp = input("(n)ew / (l)oad game? >   ")
    if itp in ["n", "N"]:
        Player = NewGame()
        Easyregions = [
            gen.GenerateRegion(rnd.choice([1, 2, 3]),
                               themes=rnd.choice([
                                   "fire", "ice", "earth", "lightning",
                                   "random", "special", ""
                               ]))
        ] * 200
        Medregions = [
            gen.GenerateRegion(rnd.choice([4, 5]),
                               themes=rnd.choices([
                                   "fire", "ice", "earth", "lightning",
                                   "random", "special", ""
                               ],
                                                  k=2))
        ] * 100
        Hardregions = [
            gen.GenerateRegion(rnd.choice([6, 7]),
                               themes=rnd.choices([
                                   "fire", "ice", "earth", "lightning",
                                   "random", "special", ""
                               ],
                                                  k=2))
        ] * 25
        Reg = rnd.sample(Easyregions + Medregions + Hardregions, 5)

        for i in range(len(Reg)):
            print(
                str(i) + ":  " + Reg[i]["Name"] + ", Regions: " +
                str(Reg[i]["Total Regions"]))
        itp = int(
            input("Which one do you wnat to explore? 0 - " +
                  str(len(Reg) - 1) + " >  "))
        while not (0 <= itp <= len(Reg) - 1):
            for i in range(len(Reg)):
                print(
                    str(i) + ":  " + Reg[i]["Name"] + ", Regions: " +
                    str(Reg[i]["Total Regions"]))
            itp = int(
                input("Which one do you wnat to explore? 0 - " +
                      str(len(Reg) - 1) + " >  "))
        reg = Reg[itp]
        Player["Current Location"] = reg
        expr.ExploreRegion(Player, reg)
    if itp in ["l", "L"]:
        quit()
Example #10
 def __init__(self, seed, envName, expName):
     super(Config, self).__init__(seed, envName, expName=expName)
     self.parallelCfg = Exploration.ExploreParallelCfg()
     self.parallelCfg.model = self.q_func
     self.parallelCfg.exploreSched = self.explorationSched
     self.parallelCfg.numFramesPerBuffer = self.frameHist + 1
     self.parallelCfg.sampleLatest = True
     self.parallelCfg.numEnv = 32
     self.batch_size = self.parallelCfg.numEnv
     self.logPeriod = int(self.parallelCfg.numEnv * 100)
     #
      # Don't need to wait since we are going sequentially, but allow for some randomness.
     self.learning_starts = 50
     self.learning_freq = 1
     self.epsilonStepSize = 1  # To match single step learning frequency.
Example #11
    def loadSprites(self):
        self.tank = Tank(self.screen)
        self.exploration = Exploration(self.screen, (100, 100))
        self.map = Map(self.height, self.height, 32, 32)

        enemy_tank0 = EnemyTank(self.screen)
        enemy_tank1 = EnemyTank(self.screen)
        enemy_tank2 = EnemyTank(self.screen)
        enemy_tank3 = EnemyTank(self.screen)
        enemy_tank4 = EnemyTank(self.screen)

        self.enemy_group = pygame.sprite.Group()
        self.enemy_group.add(enemy_tank0, enemy_tank1, enemy_tank2,
                             enemy_tank3, enemy_tank4)

        self.empty_map = EmptyMap(self.screen, self.width, self.height)
        """currently it is tank_sprites,later, you should decouple these"""
Example #12
def train_error(Q_Network, train_batch, Exp, s_scale, input_size, num_actions):

    state_t_batch = [item[0] for item in train_batch]
    state_t_batch = np.array(state_t_batch)
    state_t_1_batch = [item[1] for item in train_batch]
    state_t_1_batch = np.array(state_t_1_batch)

    action_batch = [item[2] for item in train_batch]
    reward_batch = [item[3] for item in train_batch]
    reward_batch = np.array(reward_batch)
    done_batch = [0 if item[4] else 1 for item in train_batch]  # 1 where the episode continues, 0 at terminal transitions
    done_batch = np.array(done_batch)

    q_t_batch = Q_Network.evaluate_critic(state_t_batch)
    q_t = [q_t_batch[i][action_batch[i]] for i in range(len(train_batch))]
    q_t = np.array(q_t)

    action_t_1_batch = Q_Network.evaluate_critic(state_t_1_batch)
    q_t_1_batch = Q_Network.evaluate_target_critic(state_t_1_batch)

    if Exp == 'epsilon':
        q_t_1 = [
            q_t_1_batch[i][np.argmax(action_t_1_batch[i])]
            for i in range(len(train_batch))
        ]
        q_t_1 = np.array(q_t_1)
        error_batch = GAMMA * q_t_1 * done_batch + reward_batch - q_t

    elif Exp == 'softmax':
        q_t_1 = Exploration.softV(q_t_1_batch, s_scale)
        error_batch = GAMMA * q_t_1 * done_batch + reward_batch - q_t

    elif Exp == 'sparsemax':
        #q_t_1 = Exploration.sparsemax(q_t_1_batch, s_scale)
        #error_batch = GAMMA*q_t_1*done_batch + reward_batch - q_t
        q_t_1 = [
            q_t_1_batch[i][np.argmax(action_t_1_batch[i])]
            for i in range(len(train_batch))
        ]
        q_t_1 = np.array(q_t_1)
        error_batch = GAMMA * q_t_1 * done_batch + reward_batch - q_t

    return error_batch
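Example #15 below turns TD errors like these into sampling priorities with pow(abs(q_t_1 - q_t) + eps, alpha). The same proportional-prioritization rule as a small helper (the eps and alpha defaults here are assumptions):

import numpy as np

def priorities_from_td_errors(errors, eps=1e-6, alpha=0.6):
    # Proportional prioritization: p_i = (|delta_i| + eps) ** alpha.
    return (np.abs(np.asarray(errors, dtype=float)) + eps) ** alpha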
Example #13
def PlayWithModel(modelPath):
    if not os.path.isdir(modelPath):
        print("No model: " + modelPath)
        return

    estimator = Model.GetModel(modelPath)

    state = Game.CreateState(None, Game.Pos(1, 2), None)
    print(state)

    maxMovesCount = 5
    nbMoves = 0
    while (not state.IsFinished()):
        if nbMoves == maxMovesCount:
            print('lost because did too many moves')
            return

        action = Exploration.GetBestAction(Model.GetQValue(state, estimator))
        print(str(action) + '\n')
        state.Move(action)
        print(state)
        nbMoves += 1
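Exploration.GetBestAction is the greedy counterpart of the Boltzmann helper used during training; a minimal stand-in (assumed, not the project's code):

import numpy as np

def GetBestAction(q_values):
    # Greedy selection: index of the largest Q-value.
    return int(np.argmax(q_values))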
Example #14
def explore_loop():
    global init_probing
    reached_goal = False
    finished = False

    if init_probing:
        ardoutQ.append("a")
        AhBot.turn_left()
        init_probed = False
        while not init_probed:
            if len(ardinQ) > 0:
                sensors = ardinQ.popleft()
                print("init ignored" + str(sensors))
                init_probed = True
        detect = exp_move("d")

    # main loop
    while not finished:
        arrow = 0  # detection result
        Explored.set_cleared(AhBot.x, AhBot.y)  # add the grid cells occupied by the robot to the CLEARED set
        steps = Exploration.next_step()  # compute the next step based on the current explored map

        prev_bot_state = str(AhBot.x - 1) + "_" + str(20 -
                                                      AhBot.y) + "_" + str(
                                                          AhBot.face % 4)
        for action in steps:
            arrow = exp_move(action)
            if AhBot.x == END_PT[0] and AhBot.y == END_PT[1]:
                reached_goal = True
            if reached_goal and AhBot.x == START_PT[0] and AhBot.y == START_PT[
                    1]:  # back to the starting point
                finished = True
                break
        # send exp_map and ahbot state to bluetooth
        update_bt(arrow, prev_bot_state)
Example #15
    def run_DQN(self, case_n, seed_n, Exp, Double, Dueling, Prioritized):
        sess = self.sess
        dis = self.dis
        REPLAY_MEMORY = self.REPLAY_MEMORY
        batch_size = self.batch_size

        Game = self.Game
        save_epi = self.save_epi
        max_episodes = self.max_episodes
        env = self.env
        input_size = self.input_size
        output_size = self.output_size
        
        alpha = self.alpha
        beta_init = self.beta_init
        eps = self.eps
        eps_div = self.eps_div
        s_scale = self.s_scale
        
        training_step = self.training_step
        copy_step = self.copy_step
        repu_num = self.repu_num
        
        ending_cond_epis = self.ending_cond_epis
        ending_cond_reward = self.ending_cond_reward
        
        conti_action_flag = self.conti_action_flag
        action_map = self.action_map
        
        env.seed(seed_n)
        np.random.seed(seed_n)
        tf.set_random_seed(seed_n)
        random.seed(seed_n)

        Q_Network = self.Q_Network
        
        end_episode = 0
        step_count_total = 0
        global_step = 0
        loss = 0

        replay_buffer = deque()
        Q_list = []
        TD_buffer = deque()
        steps_list = []
        step_avg_list = []
        global_step_list = []

        print("")
        print("CASE {}".format(case_n))
        print("  STATE DIM : {}, ACTION DIM : {}".format(input_size, self.action_dim))
        print("  Exp : {}".format(Exp))
        print("  Strategy : Double : {}, Dueling : {}, Prioritized : {}".format(Double, Dueling, Prioritized))
        
        for episode in range(1, max_episodes+1):
            done = False
            step_count = 0
            current_step = 0
            TD_error = 0
            state = env.reset()
            
            while not done:
                
                e = 1. / ((float(episode - 1) / eps_div) + 1)
                
                action = Exploration.choice_action(Exp, e, s_scale, Q_Network.evaluate_critic(np.reshape(state, [1, input_size]))[0])
                
                if conti_action_flag:
                    #action = np.array(action_map[action])
                    action0 = [action_map[action]]
                else:
                    action0 = action
                
                next_state, reward, done, _ = env.step(action0)
                step_count += reward
                global_step += 1
                current_step += 1
                
                if Prioritized:
                    q_t = np.max(Q_Network.evaluate_critic(np.reshape(state, [1, input_size])))
                    q_t_1 = np.max(Q_Network.evaluate_critic(np.reshape(next_state, [1, input_size])))
                    if done:
                        q_t_1 = reward
                    else:
                        q_t_1 = reward + dis*q_t_1
                    
                    TD_buffer.append(pow(abs(q_t_1-q_t)+eps,alpha))
                    if len(TD_buffer) > REPLAY_MEMORY:
                        TD_buffer.popleft()
                
                replay_buffer.append((state, next_state, action, reward, done))
                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()
                
                state = next_state
                
                minibatch = []
                TD_choice = []
                if global_step > batch_size and global_step % training_step == 0:
                    for re in range(repu_num):
                        minibatch = []
                        TD_choice = []
                        if Prioritized:
                            #TD_batch = Train.if_prioritized(Q_Network, replay_buffer, input_size, self.action_dim, eps, alpha)
                            TD_batch = np.array(TD_buffer)/sum(TD_buffer)

                            TD_choice = np.random.choice(len(TD_batch), size = batch_size, replace = False, p = TD_batch)
                            for i in range(batch_size):
                                minibatch.append(replay_buffer[TD_choice[i]])

                        else:
                            minibatch = random.sample(replay_buffer, batch_size)

                        Train.train(Q_Network, minibatch, Exp, s_scale, input_size, self.action_dim)

                        if Prioritized:
                            for i in range(batch_size):
                                state_m, next_state_m, action_m, reward_m, done_m = minibatch[i]
                                q_t = np.max(Q_Network.evaluate_critic(np.reshape(state_m, [1, input_size])))
                                q_t_1 = np.max(Q_Network.evaluate_critic(np.reshape(next_state_m, [1, input_size])))
                                if done_m:
                                    q_t_1 = reward_m
                                else:
                                    q_t_1 = reward_m + dis*q_t_1    

                                TD_buffer[TD_choice[i]] = pow(abs(q_t_1-q_t)+eps,alpha)
                    
                if global_step > batch_size and global_step % copy_step == 0:
                    Train.copy(Q_Network)
                    
            steps_list.append(step_count)
            global_step_list.append(global_step)
            
            # Print the running average of results
            if episode < ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / episode)

            if episode == ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / ending_cond_epis)

            if episode > ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                step_avg_list.append(step_count_total / ending_cond_epis)
            
            print("{}           {}".format(episode, round(step_avg_list[episode - 1], 3)))
            if Exp == 'epsilon':
                print ("                   ( Result : {},  Loss : {},  Epsilon : {},  Steps : {},  Global Steps : {} )"
                                   .format(round(step_count, 5), round(loss, 8), round(e, 5), current_step, global_step))
            else:
                print ("                   ( Result : {},  Loss : {},  Steps : {},  Global Steps : {} )"
                                   .format(round(step_count, 5), round(loss, 8), current_step, global_step))
            
            # Save the networks 
            if episode % save_epi == 0:
            #    #Q_Network.save_network(episode = episode, save_epi = save_epi)
            #    Action_Network.save_network(episode = episode, save_epi = save_epi)
                file_case = str(case_n)
                with open('/home/jolp/Desktop/Data/'+self.file_name+'_seed'+file_case, 'wb') as fout:
                    pickle.dump(step_avg_list, fout)
                with open('/home/jolp/Desktop/Data/'+self.file_name+'_global_'+'_seed'+file_case, 'wb') as fout2:
                    pickle.dump(global_step_list, fout2)

                x_values = list(range(1, episode+1))
                y_values = step_avg_list[:]
                plt.plot(x_values, y_values, c='green')
                plt.title(self.file_name)
                plt.grid(True)
                plt.show()
            
            end_episode += 1
            if step_avg_list[episode - 1] > ending_cond_reward:
                break

        print("--------------------------------------------------")
        print("--------------------------------------------------")
        for episode in range(end_episode + 1, max_episodes+1):
            state = env.reset()
            reward_sum = 0
            done = False
            while not done :
                #env.render()
                action = np.argmax(Q_Network.evaluate_critic(np.reshape(state, [1, input_size])))

                if conti_action_flag:
                    action = [action_map[action]]
                else:
                    action = action
                
                state, reward, done, _ = env.step(action)
                reward_sum += reward
                global_step += 1

                #if episode % save_epi == 0:
                #    Q_Network.save_network(episode = episode, save_epi = save_epi)
                #    Action_Network.save_network(episode = episode, save_epi = save_epi)

                if done :
                    steps_list.append(reward_sum)
                    global_step_list.append(global_step)
                    step_count_total += steps_list[episode - 1]
                    step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                    step_avg_list.append(step_count_total / ending_cond_epis)
                    print("{}           {}".format(episode, round(step_avg_list[episode - 1], 3)))
                    print ("                   ( Result : {} )".format(reward_sum))
        
            if episode % save_epi == 0:
                file_case = str(case_n)
                with open('/home/jolp/Desktop/Data/'+self.file_name+'_seed'+file_case, 'wb') as fout:
                    pickle.dump(step_avg_list, fout)
                with open('/home/jolp/Desktop/Data/'+self.file_name+'_global_'+'_seed'+file_case, 'wb') as fout2:
                    pickle.dump(global_step_list, fout2)

                x_values = list(range(1, episode+1))
                y_values = step_avg_list[:]
                plt.plot(x_values, y_values, c='green')
                plt.title(self.file_name)
                plt.grid(True)
                plt.show()
        
        
        file_case = str(case_n)
        with open('/home/jolp/Desktop/Data/'+self.file_name+'_seed'+file_case, 'wb') as fout:
            pickle.dump(step_avg_list, fout)
        with open('/home/jolp/Desktop/Data/'+self.file_name+'_global_'+'_seed'+file_case, 'wb') as fout2:
            pickle.dump(global_step_list, fout2)
        
        x_values = list(range(1, max_episodes+1))
        y_values = step_avg_list[:]
        plt.plot(x_values, y_values, c='green')
        plt.title(self.file_name)
        plt.grid(True)
        plt.show()
Example #16
def MiniBatchTrain(config, load_model=False):

    #  Set learning parameters
    gamma = config["discount-factor"]
    num_episodes = config["epochs"]
    learning_rate = config["learning-rate"]
    [eps_start, eps_stop, eps_steps] = config["epsilon-params"]
    model_id = config["model-id"]
    memory_size = config["replay-size"]
    exploration = Exploration.getExplorationFromArgs(config["exploration"])
    batch_size = config["batch-size"]
    update_mode = config["update-mode"]
    tensorboard_port = config["tensorboard"]

    # Initialize TF Network and variables
    Qout, inputs = Network.getNetworkFromArgs(config["architecture"])
    Qmean = tf.reduce_mean(Qout)
    Qmax = tf.reduce_max(Qout)
    predict = tf.argmax(Qout, 1)

    # Initialize TF output and optimizer
    nextQ = tf.placeholder(shape=[None, 4], dtype=tf.float32)
    loss = Losses.getLossFromArgs(config["loss"])(nextQ, Qout)
    trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    updateModel = trainer.minimize(loss)

    init = tf.global_variables_initializer()

    # Initialize tensorboard summary
    summary_op = Summary.init_summary_writer(model_name=model_id,
                                             var_list=[("loss", loss),
                                                       ("Qmean", Qmean),
                                                       ("Qmax", Qmax)],
                                             tb_port=tensorboard_port)

    # Random action parameter
    _epsilon = Gradients.Exponential(start=eps_start, stop=eps_stop)

    def epsilon(step):
        # Exponentially decreasing epsilon for the first eps_steps% of epochs, then constant at eps_stop from there on
        if step < num_episodes / (100.0 / eps_steps):
            return _epsilon(step / (num_episodes / (100.0 / eps_steps)))
        else:
            return eps_stop

    memory = ReplayMemory(memory_size)

    def update_model():
        if memory.full:
            replay = memory.sample(batch_size)
            state_list = []
            target_list = []

            for sample in replay:
                input = sample[0]
                action = sample[1]
                reward_list = sample[2]
                possible_states = sample[3]
                targetQ = []

                _, allQ = sess.run([predict, Qout],
                                   feed_dict={inputs: [input]})

                if update_mode == "single":
                    next_state = possible_states[action]
                    next_input = normalize(next_state.grid_to_input())
                    Q1 = sess.run(Qout, feed_dict={inputs: [next_input]})
                    maxQ1 = np.max(Q1)
                    targetQ = allQ

                    targetQ[0, action] = reward_list[action] + \
                          (0 if next_state.halt else gamma * maxQ1)

                elif update_mode == "all":
                    next_inputs = [
                        normalize(s.grid_to_input()) for s in possible_states
                    ]
                    Q1 = sess.run(Qout, feed_dict={inputs: next_inputs})
                    maxQs = [np.max(Q) for Q in Q1]

                    targetQ = allQ

                    for k in range(4):
                        if possible_states[k].valid:
                            targetQ[0, k] = reward_list[k] + \
                             (0 if possible_states[k].halt else gamma * maxQs[k])

                state_list.insert(0, input)
                target_list.insert(0, targetQ[0])

            _, summary = sess.run([updateModel, summary_op],
                                  feed_dict={
                                      inputs: state_list,
                                      nextQ: target_list
                                  })
            Summary.write_summary_operation(summary, total_steps + steps)

    with tf.Session() as sess:
        sess.run(init)

        total_steps = 0

        for i in range(num_episodes):
            # Reset environment and get first new observation
            state = Game.new_game(4)
            reward_sum = 0
            steps = 0
            rand_steps = 0
            invalid_steps = 0
            # The Q-Network
            while not state.halt:

                s = normalize(state.grid_to_input())

                steps += 1
                if i == 0:
                    state.printstate()
                    print ""
                # Choose an action by greedily (with e chance of random action) from the Q-network
                a, allQ = sess.run([predict, Qout], feed_dict={inputs: [s]})

                possible_states, action, ra, invalid_prediction = exploration(
                    a[0], allQ, i, epsilon, state)

                if ra:
                    rand_steps += 1
                if invalid_prediction:
                    invalid_steps += 1

                reward_list = []
                for k, nextstate in enumerate(possible_states):
                    r = reward(state, nextstate)
                    if r != 0:
                        r = np.log2(nextstate.score - state.score) / 2.0
                    reward_list.insert(k, r)

                reward_sum += reward_list[action]

                # [normalized input, action taken, rewards for all actions, all possible states]
                memory.push([s, action, reward_list, possible_states])

                # update step
                update_model()

                state = possible_states[action]

            maxtile = max([max(state.grid[k]) for k in range(len(state.grid))])
            stat = {
                'max-tile': maxtile,
                'score': state.score,
                'steps': steps,
                'r': reward_sum,
                'rand-steps': "{0:.3f}".format(float(rand_steps) / steps)
            }
            total_steps += steps

            Summary.write_scalar_summaries(
                [
                    ("steps", steps),
                    ("epsilon", epsilon(i)),
                    ("score", state.score),
                    ("rand-steps", float(rand_steps) / steps),
                    ("maxtile", maxtile),
                    # ("invalid-steps", steps)
                ],
                i)

            print(i, "\t", stat)

        sess.close()
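The epsilon() helper above decays exploration over the first eps_steps percent of the episodes and then holds it at eps_stop. Gradients.Exponential is not shown on this page, so an equivalent standalone schedule (with assumed defaults) looks like:

def exponential_epsilon(step, num_episodes, eps_start=1.0, eps_stop=0.1, eps_steps=25):
    # The decay window covers the first eps_steps percent of episodes; afterwards epsilon stays at eps_stop.
    cutoff = num_episodes / (100.0 / eps_steps)
    if step >= cutoff:
        return eps_stop
    frac = step / cutoff                               # goes from 0 to 1 across the decay window
    return eps_start * (eps_stop / eps_start) ** frac  # exponential interpolation from eps_start to eps_stop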
Example #17
    def run_DQN(self, case_n, seed_n, Exp, Double, Dueling, Prioritized):
        sess = self.sess
        dis = self.dis
        REPLAY_MEMORY = self.REPLAY_MEMORY
        replay_memory = self.replay_memory
        batch_size = self.batch_size

        Game = self.Game
        save_epi = self.save_epi
        max_episodes = self.max_episodes
        env = self.env
        input_size = self.input_size
        output_size = self.output_size

        alpha = self.alpha
        beta_init = self.beta_init
        eps = self.eps
        eps_div = self.eps_div
        s_scale = self.s_scale

        training_step = self.training_step
        copy_step = self.copy_step
        repu_num = self.repu_num

        ending_cond_epis = self.ending_cond_epis
        ending_cond_reward = self.ending_cond_reward

        conti_action_flag = self.conti_action_flag
        action_map = self.action_map

        env.seed(seed_n)
        np.random.seed(seed_n)
        tf.set_random_seed(seed_n)
        random.seed(seed_n)

        Q_Network = self.Q_Network

        end_episode = 0
        step_count_total = 0
        global_step = 0
        loss = 0

        replay_buffer = deque()
        Q_list = []
        TD_buffer = deque()
        steps_list = []
        step_avg_list = []
        global_step_list = []

        print("")
        print("CASE {}".format(case_n))
        print("  STATE DIM : {}, ACTION DIM : {}".format(
            input_size, self.action_dim))
        print("  Exp : {}".format(Exp))
        print(
            "  Strategy : Double : {}, Dueling : {}, Prioritized : {}".format(
                Double, Dueling, Prioritized))

        t = t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = 0
        for episode in range(1, max_episodes + 1):
            t1 = time.time()
            #print(    "TIME {} --- EPISODE {}".format(t1-t,episode))
            t = t1

            done = False
            step_count = 0
            current_step = 0
            cost = 0
            state = env.reset()

            while not done:
                t7 = time.time()
                e = 1. / ((float(episode - 1) / eps_div) + 1)

                action = Exploration.choice_action(
                    Exp, e, s_scale,
                    Q_Network.evaluate_critic(
                        np.reshape(state, [1, input_size]))[0])

                if conti_action_flag:
                    action0 = [action_map[action]]
                else:
                    action0 = action

                next_state, reward, done, _ = env.step(action0)
                step_count += reward
                global_step += 1
                current_step += 1

                #t2 = time.time()
                #print(    "TIME {} --- 1 {}  {}".format(t2-t7,episode,current_step))
                if Prioritized:
                    replay_memory.save_experience(state, action, reward,
                                                  next_state, done)

                else:
                    replay_buffer.append(
                        (state, next_state, action, reward, done))
                    if len(replay_buffer) > REPLAY_MEMORY:
                        replay_buffer.popleft()
                #t2 = time.time()
                #print(    "TIME {} --- 2 {}  {}".format(t2-t7,episode,current_step))

                state = next_state

                replay_memory.anneal_per_importance_sampling(
                    global_step, max_episodes * 1000)
                if global_step > batch_size and global_step % training_step == 0:
                    for re in range(repu_num):
                        minibatch = []
                        if Prioritized:
                            idx, priorities, w_batch, experience = replay_memory.retrieve_experience(
                                batch_size)
                            minibatch = self.format_experience(
                                experience, minibatch)
                            errors, cost = Train.train_prioritized(
                                Q_Network, minibatch, w_batch, Exp, s_scale,
                                input_size, output_size)
                            #print(errors)

                            #t2 = time.time()
                            #print(    "TIME {} --- 3 {}  {}".format(t2-t7,episode,current_step))
                            """
                            errors = []
                            for i in range(batch_size):
                                state_m, next_state_m, action_m, reward_m, done_m = minibatch[i]
                                q_t = np.max(Q_Network.evaluate_critic(np.reshape(state_m, [1, input_size])))
                                q_t_1 = np.max(Q_Network.evaluate_critic(np.reshape(next_state_m, [1, input_size])))
                                if done_m:
                                    q_t_1 = reward_m
                                else:
                                    q_t_1 = reward_m + dis*q_t_1
                                errors.append(q_t_1-q_t)
                            """
                            #errors = Train.train_error(Q_Network, minibatch, Exp, s_scale, input_size, output_size)
                            #t2 = time.time()
                            #print(    "TIME {} --- 4 {}  {}".format(t2-t7,episode,current_step))
                            replay_memory.update_experience_weight(idx, errors)

                        else:
                            minibatch = random.sample(replay_buffer,
                                                      batch_size)
                            Train.train(Q_Network, minibatch, Exp, s_scale,
                                        input_size, self.action_dim)

                if global_step > batch_size and global_step % copy_step == 0:
                    Train.copy(Q_Network)

                #t8 = time.time()
                #print(    "TIME {} --- CYCLE {}  {}".format(t8-t7,episode,current_step))

            steps_list.append(step_count)
            global_step_list.append(global_step)

            # Print the running average of results
            if episode < ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / episode)

            if episode == ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / ending_cond_epis)

            if episode > ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                step_avg_list.append(step_count_total / ending_cond_epis)

            print("{}           {}".format(
                episode, round(step_avg_list[episode - 1], 3)))
            if Exp == 'epsilon' or Exp == 'sparsemax':
                print(
                    "                   ( Result : {},  Loss : {},  Epsilon : {},  Steps : {},  Global Steps : {} )"
                    .format(round(step_count, 5), round(cost, 5), round(e, 5),
                            current_step, global_step))
            else:
                print(
                    "                   ( Result : {},  Loss : {},  Steps : {},  Global Steps : {} )"
                    .format(round(step_count, 5), round(cost, 5), current_step,
                            global_step))

            # Save the networks
            if episode % save_epi == 0:
                file_case = str(case_n)
                Q_Network.save_network(game_name=self.file_name + '_seed' +
                                       file_case,
                                       episode=episode,
                                       save_epi=save_epi)

                with open(
                        '/home/jolp/Desktop/Data/' + self.file_name + '_seed' +
                        file_case, 'wb') as fout:
                    pickle.dump(step_avg_list, fout)
                with open(
                        '/home/jolp/Desktop/Data/' + self.file_name +
                        '_global_' + '_seed' + file_case, 'wb') as fout2:
                    pickle.dump(global_step_list, fout2)

                x_values = list(range(1, episode + 1))
                y_values = step_avg_list[:]
                plt.plot(x_values, y_values, c='green')
                plt.title(self.file_name)
                plt.grid(True)
                plt.show()

            end_episode += 1

            if step_avg_list[episode - 1] > ending_cond_reward:
                break

        print("--------------------------------------------------")
        print("--------------------------------------------------")
        for episode in range(end_episode + 1, max_episodes + 1):
            state = env.reset()
            reward_sum = 0
            done = False
            while not done:
                #env.render()
                action = np.argmax(
                    Q_Network.evaluate_critic(
                        np.reshape(state, [1, input_size])))

                if conti_action_flag:
                    action = [action_map[action]]
                else:
                    action = action

                state, reward, done, _ = env.step(action)
                reward_sum += reward
                global_step += 1

                #if episode % save_epi == 0:
                #    Q_Network.save_network(episode = episode, save_epi = save_epi)
                #    Action_Network.save_network(episode = episode, save_epi = save_epi)

                if done:
                    steps_list.append(reward_sum)
                    global_step_list.append(global_step)
                    step_count_total += steps_list[episode - 1]
                    step_count_total -= steps_list[episode - 1 -
                                                   ending_cond_epis]
                    step_avg_list.append(step_count_total / ending_cond_epis)
                    print("{}           {}".format(
                        episode, round(step_avg_list[episode - 1], 3)))
                    print("                   ( Result : {} )".format(
                        reward_sum))

            if episode % save_epi == 0:
                file_case = str(case_n)
                Q_Network.save_network(game_name=self.file_name + '_seed' +
                                       file_case,
                                       episode=episode,
                                       save_epi=save_epi)
                with open(
                        '/home/jolp/Desktop/Data/' + self.file_name + '_seed' +
                        file_case, 'wb') as fout:
                    pickle.dump(step_avg_list, fout)
                with open(
                        '/home/jolp/Desktop/Data/' + self.file_name +
                        '_global_' + '_seed' + file_case, 'wb') as fout2:
                    pickle.dump(global_step_list, fout2)

                x_values = list(range(1, episode + 1))
                y_values = step_avg_list[:]
                plt.plot(x_values, y_values, c='green')
                plt.title(self.file_name)
                plt.grid(True)
                plt.show()

        file_case = str(case_n)
        with open(
                '/home/jolp/Desktop/Data/' + self.file_name + '_seed' +
                file_case, 'wb') as fout:
            pickle.dump(step_avg_list, fout)
        with open(
                '/home/jolp/Desktop/Data/' + self.file_name + '_global_' +
                '_seed' + file_case, 'wb') as fout2:
            pickle.dump(global_step_list, fout2)

        x_values = list(range(1, max_episodes + 1))
        y_values = step_avg_list[:]
        plt.plot(x_values, y_values, c='green')
        plt.title(self.file_name)
        plt.grid(True)
        plt.show()
Example #18
 def getExplorer(self):
     explorer = Exploration.ParallelExplorer(self.parallelCfg)
     return explorer
Example #19
def SmoothMFD (Db, a, Wkt, Window=GaussWin, Par=50.,
                           Delta=0.1, SphereGrid=False,
                           Box=[], Buffer=[], Grid=[],
                           Threshold=-100, Unwrap=False,
                           ZeroRates=False):

  if Par <= 0:
    Par = np.inf

  # Catalogue selection
  DbS = Sel.AreaSelect(Db, Wkt, Owrite=0, Buffer=Buffer, Unwrap=Unwrap)
  x,y,z = Exp.GetHypocenter(DbS)

  # Creating the mesh grid
  P = CU.Polygon()
  P.Load(Wkt)

  # Unwrapping coordinates
  if Unwrap:
    x = [i if i > 0. else i+360. for i in x]
    P.Unwrap()

  if Grid:
    XY = [G for G in Grid if P.IsInside(G[0], G[1])]
  else:
    if SphereGrid:
      XY = P.SphereGrid(Delta=Delta, Unwrap=Unwrap)
    else:
      XY = P.CartGrid(Dx=Delta, Dy=Delta, Bounds=Box)

  Win = []
  for xyP in XY:
    Win.append(0)
    for xyE in zip(x,y):
      Dis = CU.WgsDistance(xyP[1], xyP[0], xyE[1], xyE[0])
      Win[-1] += Window(Dis, Par)

  # Scaling and normalising the rates
  Norm = np.sum(Win)

  A = []; X = []; Y = []
  for I,W in enumerate(Win):

    aT = -np.inf
    if Norm > 0. and W > 0.:
      aT = a + np.log10(W/Norm)
      if aT < Threshold:
        # Filter below threshold
        aT = -np.inf

    if ZeroRates:
      A.append(aT)
      X.append(XY[I][0])
      Y.append(XY[I][1])
    else:
      if aT > -np.inf:
        A.append(aT)
        X.append(XY[I][0])
        Y.append(XY[I][1])

  if Unwrap:
    # Wrap back longitudes
    X = [x if x < 180. else x-360. for x in X]

  return X, Y, A
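The default kernel GaussWin is defined elsewhere in the package; it is called as Window(Dis, Par) with a distance and the bandwidth Par, so a plausible Gaussian form (an assumption about the original) is:

import numpy as np

def GaussWin(distance, bandwidth):
    # Gaussian weight that decays with distance; bandwidth plays the role of the Par argument above.
    return np.exp(-(distance ** 2) / (2.0 * bandwidth ** 2))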
Example #20
    def run_DQN(self, seed_n, Exp, Double, Prioritized):
        ############## copy parameters ##############
        sess = self.sess
        dis = self.dis
        REPLAY_MEMORY = self.REPLAY_MEMORY
        replay_memory = self.replay_memory
        batch_size = self.batch_size
        size_action_batch = self.size_action_batch

        Game = self.Game
        save_epi = self.save_epi
        save_network = self.save_network
        max_episodes = self.max_episodes
        max_steps = self.max_steps
        env = self.env
        random_action = self.random_action

        input_size = self.input_size
        output_size = self.output_size

        alpha = self.alpha
        beta_init = self.beta_init
        beta_max_step = self.beta_max_step
        eps = self.eps
        eps_div = self.eps_div
        s_scale = self.s_scale

        training_step = self.training_step
        copy_step = self.copy_step
        action_copy_step = self.action_copy_step
        action_train = self.action_train
        weighted_train = self.weighted_train
        repu_num = self.repu_num

        DDPG = self.DDPG

        ending_cond_epis = self.ending_cond_epis
        ending_cond_reward = self.ending_cond_reward

        env.seed(seed_n)
        np.random.seed(seed_n)
        tf.set_random_seed(seed_n)
        random.seed(seed_n)
        #############################################

        Q_Network = self.Q_Network
        A_batch = Q_Network.get_action_batch()
        if DDPG:
            Action_Network = self.Action_Network

        # grad_inv setup used when training the DDPG actor network
        action_max = np.array(env.action_space.high).tolist()
        action_min = np.array(env.action_space.low).tolist()
        action_bounds = [action_max, action_min]
        grad_inv = grad_inverter(sess, action_bounds)

        case_n = seed_n + 1
        end_episode = 0
        step_count_total = 0
        global_step = 0
        loss = 0
        e = 1.

        replay_buffer = deque()
        Q_list = []
        TD_buffer = deque()
        steps_list = []
        step_avg_list = []
        global_step_list = []

        average_distance = []
        rate_of_adjacent = []

        print("")
        print("CASE {}".format(case_n))
        print("  STATE DIM : {}, ACTION DIM : {}".format(
            input_size, self.action_dim))
        print("  Exp : {}".format(Exp))
        if DDPG:
            print("  Strategy : Double : {},  Prioritized : {},  DDPG : {}".
                  format(Double, Prioritized, DDPG))
        elif random_action:
            if action_train:
                print(
                    "  Strategy : Double : {},  Prioritized : {},  ACTION : RANDOM,  ACTION TRAIN 'ON'"
                    .format(Double, Prioritized))
            else:
                print(
                    "  Strategy : Double : {},  Prioritized : {},  ACTION : RANDOM"
                    .format(Double, Prioritized))
        else:
            if action_train:
                print(
                    "  Strategy : Double : {},  Prioritized : {},  ACTION : DISCRETIZATION,  ACTION TRAIN 'ON'"
                    .format(Double, Prioritized))
            else:
                print(
                    "  Strategy : Double : {},  Prioritized : {},  ACTION : DISCRETIZATION"
                    .format(Double, Prioritized))
        print("")

        for episode in range(1, max_episodes + 1):

            done = False
            step_count = 0
            current_step = 0
            cost = 0
            state = env.reset()

            while not done:
                # Adjust epsilon; once it drops below 0.001 it no longer decreases.
                if e > 0.001:
                    #e = 1. / ((float(episode - 1) / eps_div) + 1)
                    e = 1. / ((float(global_step) / eps_div) + 1)

                t4 = time.time()
                if DDPG:  # With DDPG enabled, the actor network chooses the action
                    action = Action_Network.evaluate_actor(
                        np.reshape(state, [1, input_size]))[0]
                else:  # Otherwise, fetch per-action Q-values for the state via get_q_batch and choose an action according to the exploration scheme
                    action0 = Exploration.choice_action(Exp, e, s_scale,\
                                                 np.reshape(Q_Network.get_q_batch(np.reshape(state,[1,-1])),[1,-1])[0])
                    action = A_batch[action0]

                next_state, reward, done, _ = env.step(action)
                step_count += reward
                global_step += 1
                current_step += 1

                # With prioritized replay, store the transition in the tree (replay_memory); otherwise store it in replay_buffer for uniform sampling
                if Prioritized:
                    replay_memory.save_experience(state, action, reward,
                                                  next_state, done)
                else:
                    replay_buffer.append(
                        (state, next_state, action, reward, done))
                    if len(replay_buffer) > REPLAY_MEMORY:
                        replay_buffer.popleft()

                state = next_state

                if global_step <= beta_max_step:
                    replay_memory.anneal_per_importance_sampling(
                        global_step, beta_max_step)

                # Run a training update every training_step steps
                if global_step > batch_size and global_step % training_step == 0:
                    for re in range(repu_num):  # Repeat training repu_num times (usually 1).
                        if Prioritized:
                            # Sample a batch from replay_memory
                            idx, priorities, w_batch, experience = replay_memory.retrieve_experience(
                                batch_size)
                            minibatch = self.format_experience(experience)
                            if DDPG:
                                # With DDPG, train both the Q-network and the actor network
                                errors, cost = Train.train_prioritized_DDPG(
                                    Q_Network, Action_Network, minibatch,
                                    w_batch, output_size, grad_inv)
                                replay_memory.update_experience_weight(
                                    idx, errors)

                            else:
                                # Without DDPG, train only the Q-network
                                errors, cost, state_t_batch = Train.train_prioritized(
                                    Q_Network, minibatch, w_batch, Exp,
                                    s_scale, input_size, output_size,
                                    size_action_batch)
                                replay_memory.update_experience_weight(
                                    idx, errors)

                                # Train the action set every action_copy_step steps; with action_train False this is the RAS algorithm
                                if action_train and global_step % action_copy_step == 0:
                                    action_weight = []

                                    if weighted_train:  # WARAS algorithm
                                        # Compute the weights
                                        for k in range(batch_size):
                                            state_t = np.reshape(
                                                state_t_batch[k], [1, -1])

                                            q_batch = Q_Network.get_q_batch(
                                                state_t)
                                            q_batch = np.reshape(
                                                q_batch, [1, -1])[0]
                                            q_batch = q_batch * 10.
                                            max_q = np.max(q_batch)
                                            q_batch = np.exp(q_batch - max_q)
                                            action_weight.append(q_batch)

                                    else:  # ARAS algorithm
                                        # use uniform weights of 1 for every action
                                        action_weight = np.ones(
                                            [batch_size, size_action_batch])
                                    # train the action set of the Q network with these weights
                                    Q_Network.train_weighted_actor(
                                        state_t_batch, action_weight)

                                    # update the target action set
                                    Q_Network.update_action_target_critic()
                                    A_batch = Q_Network.get_action_batch()
                                    t_A_batch = Q_Network.get_target_action_batch(
                                    )
                                    """
                                    # Find pairs of actions that lie close together and resample them
                                    A_batch, t_A_batch = self.realign_action_batch(A_batch, t_A_batch)
                                    Q_Network.realign_action_batch(A_batch, t_A_batch)
                                    A_batch = Q_Network.get_action_batch()
                                    t_A_batch = Q_Network.get_target_action_batch()
                                    """
                        else:  # When not Prioritized, sample a uniform random minibatch for training
                            minibatch = random.sample(replay_buffer,
                                                      batch_size)
                            if DDPG:
                                cost = Train.train_DDPG(
                                    Q_Network, Action_Network, minibatch,
                                    output_size, grad_inv)

                            else:
                                cost, state_t_batch = Train.train(
                                    Q_Network, minibatch, Exp, s_scale,
                                    input_size, output_size, size_action_batch)

                # Update the target networks every copy_step steps
                if global_step % copy_step == 0:
                    if DDPG:
                        # Update target Critic and actor network
                        Q_Network.update_target_critic()
                        Q_Network.update_action_target_critic()
                        Action_Network.update_target_actor()

                    else:
                        Q_Network.update_target_critic()
                        Q_Network.update_action_target_critic()

            steps_list.append(step_count)
            global_step_list.append(global_step)

            # Track the running average of results over the last ending_cond_epis episodes
            if episode < ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / episode)

            if episode == ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_avg_list.append(step_count_total / ending_cond_epis)

            if episode > ending_cond_epis:
                step_count_total += steps_list[episode - 1]
                step_count_total -= steps_list[episode - 1 - ending_cond_epis]
                step_avg_list.append(step_count_total / ending_cond_epis)

            print("{}           {}".format(
                episode, round(step_avg_list[episode - 1], 3)))
            if DDPG:
                print ("                   ( Result : {},  Loss : {},  Steps : {},  Global Steps : {} )"
                                   #.format(round(step_count, 3), round(cost, 5), current_step, global_step))
                                   .format(round(step_count, 3), 0, current_step, global_step))
            elif Exp == 'epsilon' or Exp == 'sparsemax':
                print ("                   ( Result : {},  Loss : {},  Epsilon : {},  Steps : {},  Global Steps : {} )"
                                   #.format(round(step_count, 3), round(cost, 5), round(e, 4), current_step, global_step))
                                   .format(round(step_count, 3), 0, round(e, 5), current_step, global_step))
            else:
                print ("                   ( Result : {},  Loss : {},  Steps : {},  Global Steps : {} )"
                                   #.format(round(step_count, 3), round(cost, 5), current_step, global_step))
                                   .format(round(step_count, 3), 0, current_step, global_step))

            distance, per_of_sim, per_of_sim2 = self.get_action_variance(
                A_batch)
            print(
                "                   ( Action Batch  ::::  Distance : {},  Percent : {}%({}%) )"
                .format(distance, per_of_sim, per_of_sim2))
            average_distance.append(distance)
            rate_of_adjacent.append(per_of_sim)

            # Save the networks
            if episode % save_epi == 0:
                file_case = str(case_n)
                if save_network:
                    Q_Network.save_network(game_name=self.file_name + '_seed' +
                                           file_case,
                                           episode=episode,
                                           save_epi=save_epi)

                with open(
                        '/home/minjae/Desktop/JOLP/' + self.file_name +
                        '_seed' + file_case, 'wb') as fout:
                    pickle.dump(step_avg_list, fout)
                with open(
                        '/home/minjae/Desktop/JOLP/' + self.file_name +
                        '_global_' + '_seed' + file_case, 'wb') as fout2:
                    pickle.dump(global_step_list, fout2)

                x_values = list(range(1, episode + 1))
                y_values = step_avg_list[:]
                plt.plot(x_values, y_values, c='green')
                plt.title(self.file_name)
                plt.grid(True)
                plt.show()

                with open(
                        '/home/minjae/Desktop/JOLP/' +
                        'Average_of_Distance_(' + self.file_name + '_seed' +
                        file_case + ')', 'wb') as fout:
                    pickle.dump(average_distance, fout)
                with open(
                        '/home/minjae/Desktop/JOLP/' + 'Rate_of_Adjacent_(' +
                        self.file_name + '_global_' + '_seed' + file_case +
                        ')', 'wb') as fout2:
                    pickle.dump(rate_of_adjacent, fout2)

                p_values = list(range(1, episode + 1))
                q_values = average_distance[:]
                r_values = rate_of_adjacent[:]
                plt.plot(p_values, q_values, c='r')
                plt.title('Average of Distance between Actions')
                plt.grid(True)
                plt.show()
                plt.plot(p_values, r_values, c='b')
                plt.title('Rate of Adjacent Actions')
                plt.grid(True)
                plt.show()

            end_episode += 1

            # Stop training once the running average reaches the target reward
            if step_avg_list[episode - 1] > ending_cond_reward:
                break
            # Stop training once max_steps global steps have been taken
            if global_step > max_steps:
                break

        print("--------------------------------------------------")
        print("--------------------------------------------------")

        # If training stopped early because the target was reached, run the remaining episodes with the greedy policy
        for episode in range(end_episode + 1, max_episodes + 1):

            if global_step > max_steps:
                break

            state = env.reset()
            reward_sum = 0
            done = False
            while not done:
                # Choose the action with the maximum Q value
                action = np.argmax(
                    Q_Network.evaluate_critic(
                        np.reshape(state, [1, input_size])))

                if conti_action_flag:
                    action = [action_map[action]]

                state, reward, done, _ = env.step(action)
                reward_sum += reward
                global_step += 1

                if done:
                    steps_list.append(reward_sum)
                    global_step_list.append(global_step)
                    step_count_total += steps_list[episode - 1]
                    step_count_total -= steps_list[episode - 1 -
                                                   ending_cond_epis]
                    step_avg_list.append(step_count_total / ending_cond_epis)
                    print("{}           {}".format(
                        episode, round(step_avg_list[episode - 1], 3)))
                    print("                   ( Result : {} )".format(
                        reward_sum))

            if episode % save_epi == 0:
                file_case = str(case_n)
                if save_network:
                    Q_Network.save_network(game_name=self.file_name + '_seed' +
                                           file_case,
                                           episode=episode,
                                           save_epi=save_epi)
                with open(
                        '/home/minjae/Desktop/JOLP/' + self.file_name +
                        '_seed' + file_case, 'wb') as fout:
                    pickle.dump(step_avg_list, fout)
                with open(
                        '/home/minjae/Desktop/JOLP/' + self.file_name +
                        '_global_' + '_seed' + file_case, 'wb') as fout2:
                    pickle.dump(global_step_list, fout2)

                x_values = list(range(1, episode + 1))
                y_values = step_avg_list[:]
                plt.plot(x_values, y_values, c='green')
                plt.title(self.file_name)
                plt.grid(True)
                plt.show()

        # Save the results
        file_case = str(case_n)
        with open(
                '/home/minjae/Desktop/JOLP/' + self.file_name + '_seed' +
                file_case, 'wb') as fout:
            pickle.dump(step_avg_list, fout)
        with open(
                '/home/minjae/Desktop/JOLP/' + self.file_name + '_global_' +
                '_seed' + file_case, 'wb') as fout2:
            pickle.dump(global_step_list, fout2)

        # Plot the results
        x_values = list(range(1, len(step_avg_list) + 1))
        y_values = step_avg_list[:]
        plt.plot(x_values, y_values, c='green')
        plt.title(self.file_name)
        plt.grid(True)
        plt.show()

        with open(
                '/home/minjae/Desktop/JOLP/' + 'Average_of_Distance_(' +
                self.file_name + '_seed' + file_case + ')', 'wb') as fout:
            pickle.dump(average_distance, fout)
        with open(
                '/home/minjae/Desktop/JOLP/' + 'Rate_of_Adjacent_(' +
                self.file_name + '_global_' + '_seed' + file_case + ')',
                'wb') as fout2:
            pickle.dump(rate_of_adjacent, fout2)

        p_values = list(range(1, episode + 1))
        q_values = average_distance[:]
        r_values = rate_of_adjacent[:]
        plt.plot(p_values, q_values, c='r')
        plt.title('Average of Distance between Actions')
        plt.grid(True)
        plt.show()
        plt.plot(p_values, r_values, c='b')
        plt.title('Rate of Adjacent Actions')
        plt.grid(True)
        plt.show()
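
The weighted branch above (WARAS) turns the critic's Q-values for each sampled state into per-action weights through a numerically stabilised exponential (scale by 10, subtract the row maximum, exponentiate) before calling train_weighted_actor. A minimal stand-alone sketch of just that weighting step, using NumPy and a hypothetical q_batch array in place of Q_Network.get_q_batch:

import numpy as np

def waras_weights(q_batch, scale=10.0):
    # Exponential weights over a batch of Q-values; subtracting the per-state
    # maximum keeps exp() from overflowing (the best action gets weight 1).
    q = np.asarray(q_batch, dtype=np.float64) * scale
    q = q - q.max(axis=-1, keepdims=True)
    return np.exp(q)  # shape: (batch_size, size_action_batch)

# Hypothetical Q-values for 2 states and 4 candidate actions.
print(waras_weights([[0.1, 0.3, 0.2, 0.0],
                     [1.0, 0.9, 0.8, 0.7]]))

ARAS simply replaces these weights with ones, so the actor update treats every candidate action equally.
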
Exemple #21
0
def filter_cases_before_2018(log):
    log_filtered = EventLog(
        [case for case in log if case[0]['time:timestamp'].year >= 2018])
    util.print_filtered_cases_count(len(log), len(log_filtered))
    return log_filtered
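
filter_cases_before_2018 keeps only the cases whose first event carries a timestamp from 2018 onwards. A minimal stand-alone sketch of the same check, using plain lists and dicts in place of a pm4py EventLog:

from datetime import datetime

# Toy stand-in for an event log: each case is a list of event dicts.
log = [
    [{"time:timestamp": datetime(2017, 5, 1)}],
    [{"time:timestamp": datetime(2018, 3, 1)}],
]

# Same predicate as above: keep cases whose first event is from 2018 or later.
log_filtered = [case for case in log if case[0]["time:timestamp"].year >= 2018]
print(len(log), "->", len(log_filtered))  # 2 -> 1
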
Exemple #22
0
    for key, values in criteria.items():
        for value in values:
            tofilter_log = attributes_filter.apply(
                tofilter_log, [value],
                parameters={
                    xes_constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY: key,
                    "positive": True
                })
    tofilter_cases = [
        case.attributes[constants.concept_key] for case in tofilter_log
    ]
    log_filtered = EventLog([
        case for case in log
        if case.attributes[constants.concept_key] not in tofilter_cases
    ])
    util.print_filtered_cases_count(len(log), len(log) - len(tofilter_log))
    return log_filtered
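
The snippet above first narrows tofilter_log down to the cases that match every attribute criterion, then rebuilds the log without those case ids. A simplified stand-alone sketch of that exclude-by-criteria idea, assuming each case is a plain dict of attributes (the hypothetical case_id key stands in for the case's concept:name) rather than a pm4py Trace:

def exclude_matching_cases(log, criteria):
    # Drop every case whose attributes satisfy all key/value criteria;
    # `criteria` maps an attribute key to the list of values to exclude.
    def matches(case):
        return all(case.get(key) in values for key, values in criteria.items())

    to_remove = {case["case_id"] for case in log if matches(case)}
    return [case for case in log if case["case_id"] not in to_remove]

# Hypothetical usage: remove every EMPLOYEE-handled case.
log = [
    {"case_id": "1", "org:role": "EMPLOYEE"},
    {"case_id": "2", "org:role": "SUPERVISOR"},
]
print(exclude_matching_cases(log, {"org:role": ["EMPLOYEE"]}))  # keeps case 2 only
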


def determine_stage(words):
    """Return the known stage keyword contained in `words`, or the rejoined name."""
    stages = ['declaration', 'permit', 'request', 'trip']
    for word in words:
        if word.lower() in stages:
            return word.lower()
    return ' '.join(words)
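
determine_stage scans the words of an activity name for one of the four known stage keywords and falls back to rejoining the words when none is found. Two illustrative calls:

print(determine_stage(["Permit", "SUBMITTED", "by", "EMPLOYEE"]))  # 'permit'
print(determine_stage(["Payment", "Handled"]))                     # 'Payment Handled'
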


def determine_action(words):
    for word in words:
        if word.isupper():