Example #1
    def test_text_init(self):
        "Test the Probe object creation from text"

        # 1. Create Probe object from text
        myobj = probe.Probe(text=aoc_17.from_text(EXAMPLE_TEXT))

        # 2. Make sure it has the expected values
        self.assertEqual(myobj.part2, False)
        self.assertEqual(len(myobj.text), 1)
        self.assertEqual(myobj.start, (0, 0))
        self.assertEqual(myobj.target, [20, 30, -10, -5])
        self.assertEqual(myobj.position, (0, 0))
        self.assertEqual(myobj.velocity, None)
        self.assertEqual(myobj.height, 0)

        # 3. Check methods
        myobj.reload((7, 2))
        self.assertEqual(myobj.position, (0, 0))
        self.assertEqual(myobj.velocity, (7, 2))
        self.assertEqual(myobj.is_in_target(), False)
        self.assertEqual(myobj.is_possible(), True)
        myobj.step()
        self.assertEqual(myobj.position, (7, 2))
        self.assertEqual(myobj.velocity, (6, 1))
        self.assertEqual(myobj.is_in_target(), False)
        self.assertEqual(myobj.is_possible(), True)
        myobj.step()
        self.assertEqual(myobj.position, (13, 3))
        self.assertEqual(myobj.velocity, (5, 0))
        self.assertEqual(myobj.is_in_target(), False)
        self.assertEqual(myobj.is_possible(), True)
        myobj.step()
        myobj.step()
        myobj.step()
        myobj.step()
        self.assertEqual(myobj.position, (27, -3))
        self.assertEqual(myobj.velocity, (1, -4))
        self.assertEqual(myobj.is_in_target(), False)
        self.assertEqual(myobj.is_possible(), True)
        myobj.step()
        self.assertEqual(myobj.position, (28, -7))
        self.assertEqual(myobj.velocity, (0, -5))
        self.assertEqual(myobj.is_in_target(), True)
        self.assertEqual(myobj.is_possible(), True)
        myobj.step()
        self.assertEqual(myobj.position, (28, -12))
        self.assertEqual(myobj.velocity, (0, -6))
        self.assertEqual(myobj.is_in_target(), False)
        self.assertEqual(myobj.is_possible(), False)
        self.assertEqual(myobj.height, 3)

        self.assertEqual(myobj.fire((7, 2)), 3)
        self.assertEqual(myobj.fire((6, 3)), 6)
        self.assertEqual(myobj.fire((9, 0)), 0)
        self.assertEqual(myobj.fire((17, -4)), -1)
        self.assertEqual(myobj.fire((6, 9)), 45)

        self.assertEqual(myobj.highest(), 45)
        self.assertEqual(myobj.count(), 112)
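
The assertions above pin down the probe's motion model (Advent of Code 2021, day 17): each step adds the velocity to the position, drag pulls the x velocity toward zero, gravity lowers the y velocity by one, height tracks the highest y reached, and fire returns the peak height on a hit or -1 on a miss. The sketch below is only an illustration consistent with those assertions; the class name and method bodies are assumptions, not the tested implementation, and highest() and count() (the part-one and part-two answers) are omitted.

# Sketch only: a Probe-like class consistent with the assertions in the test above.
class ProbeSketch:

    def __init__(self, target):
        self.target = target        # [x_min, x_max, y_min, y_max], e.g. [20, 30, -10, -5]
        self.start = (0, 0)
        self.position = (0, 0)
        self.velocity = None
        self.height = 0

    def reload(self, velocity):
        "Reset to the start position with a new launch velocity"
        self.position = self.start
        self.velocity = velocity
        self.height = 0

    def step(self):
        "Move one step: add velocity, apply drag in x and gravity in y, track peak height"
        (x, y), (vx, vy) = self.position, self.velocity
        x, y = x + vx, y + vy
        vx = vx - 1 if vx > 0 else vx + 1 if vx < 0 else 0
        vy -= 1
        self.position, self.velocity = (x, y), (vx, vy)
        self.height = max(self.height, y)

    def is_in_target(self):
        "Is the current position inside the target area?"
        x_min, x_max, y_min, y_max = self.target
        x, y = self.position
        return x_min <= x <= x_max and y_min <= y <= y_max

    def is_possible(self):
        "Can the probe still reach the target from here?"
        _x_min, x_max, y_min, _y_max = self.target
        x, y = self.position
        vx, _vy = self.velocity
        return not (y < y_min or (x > x_max and vx >= 0))

    def fire(self, velocity):
        "Simulate a launch; return the peak height on a hit, -1 on a miss"
        self.reload(velocity)
        while self.is_possible():
            if self.is_in_target():
                return self.height
            self.step()
        return -1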
Example #2
    def test_part_two(self):
        "Test part two example of Probe object"

        # 1. Create Probe object from text
        myobj = probe.Probe(part2=True, text=aoc_17.from_text(PART_TWO_TEXT))

        # 2. Check the part two result
        self.assertEqual(myobj.part_two(verbose=False), PART_TWO_RESULT)
Example #3
    def test_part_one(self):
        "Test part one example of Probe object"

        # 1. Create Probe object from text
        myobj = probe.Probe(text=aoc_17.from_text(PART_ONE_TEXT))

        # 2. Check the part one result
        self.assertEqual(myobj.part_one(verbose=False), PART_ONE_RESULT)
Example #4
    def probe_callback(self, data):
        theprobe = probe.Probe(data, self.theta)
        wallOffset = theprobe.offsetBetweenWalls(90,5)
        forwardOffset = theprobe.offsetBetweenWalls(45,5)
        slope = -theprobe.theWalls(70,90,110,3)
        forwardDistance = theprobe.averageRanges(-20,20)
        stopCondition = (theprobe.averageRanges(-10,10) < .25)
        # leftRearObject = theprobe.objectDetection(-90, -125, 1, 0.9, "Left")   # front angle, rear angle, max distance, distance required to detect object
        # rightRearObject = theprobe.objectDetection(90, 125, 1, 0.9, "Right")   # front angle, rear angle, max distance, distance required to detect object
        

        #leftRearObject = theprobe.objectDetection3(25,110,135,1,0.5,"Right")
        rightSideObj = theprobe.objectDetection3(25,90,130,1.5,0.5,"Right")
        leftSideObj = theprobe.objectDetection3(-25,-90,-130,1.5,0.5,"Left")

        #theprobe.edgeDetection(45,120,5)
        
        # print "distance: %f\tangle: %f\n" % theprobe.closestObjectsAngle()
        # print "distance: %f\tangle: %f\n" % theprobe.closestObjectsAngleFullRanges()



        # print "\n\n\n\n\nAngle\tH-Dist\t\tA-Dist" #Hypotenuse Distance/Distance to wall from Lidar #Adjacent Distance/X Component of distance to wall
        # theprobe.perpendicularLineDistance(-120)
        # theprobe.perpendicularLineDistance(-105)
        # theprobe.perpendicularLineDistance(-90)
        # theprobe.perpendicularLineDistance(-75)
        # theprobe.perpendicularLineDistance(-60)
        # theprobe.perpendicularLineDistance(-45)
        # theprobe.perpendicularLineDistance(-30)
        # theprobe.perpendicularLineDistance(-15)
        # theprobe.perpendicularLineDistance(0)
        # theprobe.perpendicularLineDistance(15)
        # theprobe.perpendicularLineDistance(30)
        # theprobe.perpendicularLineDistance(45)
        # theprobe.perpendicularLineDistance(60)
        # theprobe.perpendicularLineDistance(75)
        # theprobe.perpendicularLineDistance(90)
        # theprobe.perpendicularLineDistance(105)
        # theprobe.perpendicularLineDistance(120)
        # rightRearObject = theprobe.objectDetection(-30,30,1,0.9,"Center")
        # leftRearObject = theprobe.objectDetection(-30,30,1,0.9,"Center")
        # theprobe.objectDetection2(45,90,135, 2, 1.5, "test") #frontAngle, midAngle, rearAngle, tempMaxDistance, objectDetectionCutoffDistance, debugDirectionString

        
        overRideSpeed = 0.6
        overRideTurn = rightSideObj
        if leftSideObj != 0: overRideTurn = -leftSideObj

        #slope = -theprobe.theWalls(45,80,100,3)
        #theprobe.averageWallSlope(-45,-135)
        if not self.pilotMode:
            self.stopList = []
        self.lidarData = [slope, wallOffset, forwardOffset, forwardDistance, stopCondition, self.stopList, overRideSpeed, overRideTurn]
Example #5
def read_probe(num = 10000):
  #reading Probe Data
  print 'Reading probe points ...', num
  with open('Partition6467ProbePoints.csv', 'rb') as csvfile:
      spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
      for i, row in enumerate(spamreader):
          probe_obj=probe.Probe(row)
          probe_data.append(probe_obj)
          if i > num:
            break
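
The reader above is Python 2 (print statement, 'rb' file mode for csv.reader). A rough Python 3 equivalent is sketched below; the filename and the module-level probe_data list come from the original snippet, while the probe.Probe(row) signature is assumed unchanged.

import csv

import probe

probe_data = []

def read_probe(num=10000):
    "Read roughly num probe points from the partition CSV (Python 3 sketch)"
    print('Reading probe points ...', num)
    with open('Partition6467ProbePoints.csv', newline='') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for i, row in enumerate(spamreader):
            probe_data.append(probe.Probe(row))
            if i > num:
                break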
Example #6
    def test_empty_init(self):
        "Test the default Probe creation"

        # 1. Create default Probe object
        myobj = probe.Probe()

        # 2. Make sure it has the default values
        self.assertEqual(myobj.part2, False)
        self.assertEqual(myobj.text, None)
        self.assertEqual(myobj.start, (0, 0))
        self.assertEqual(myobj.target, None)
        self.assertEqual(myobj.position, (0, 0))
        self.assertEqual(myobj.velocity, None)
        self.assertEqual(myobj.height, 0)
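
Together with the text-based test in Example #1, these defaults suggest a constructor that only fills in target when input text is supplied. A small sketch of such an __init__ is shown below, assuming the text is a single Advent-of-Code-style line such as "target area: x=20..30, y=-10..-5"; the real parsing may differ.

import re

# Sketch only: a constructor consistent with the defaults asserted above.
class ProbeDefaults:

    def __init__(self, text=None, part2=False):
        self.part2 = part2
        self.text = text
        self.start = (0, 0)
        self.position = (0, 0)
        self.velocity = None
        self.height = 0
        self.target = None
        if text:
            # e.g. "target area: x=20..30, y=-10..-5" -> [20, 30, -10, -5]
            self.target = [int(number) for number in re.findall(r'-?\d+', text[0])]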
Example #7
def part_two(args, input_lines):
    "Process part two of the puzzle"

    # 1. Create the puzzle solver
    solver = probe.Probe(part2=True, text=input_lines)

    # 2. Determine the solution for part two
    solution = solver.part_two(verbose=args.verbose, limit=args.limit)
    if solution is None:
        print("There is no solution")
    else:
        print("The solution for part two is %s" % (solution))

    # 3. Return result
    return solution is not None
Example #8
def read_probe():
    #reading Probe Data
    print 'reading probe points ...'
    with open('Partition6467ProbePoints.csv', 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        i = 0
        for row in spamreader:
            probe_obj = probe.Probe(row)
            probe_data.append(probe_obj)
            # x.append(probe_obj.longitude)
            # y.append(probe_obj.latitude)

            # print 'No=',i,'latitude=',probe_obj.latitude,'longitude=', probe_obj.longitude
            i = i + 1
            if i > 100:
                break
Example #9
def main():
    appList = json.load(open('./src/data/applications.json'))
    logging.info('Using SSDP to discover nodes')
    nodes = DIAL.discover()
    logging.info('Found {} nodes'.format(len(nodes)))
    for node in nodes:
        print('Name: {}\nManufacturer: {}\nModel: {}'.format(
            node['friendlyName'], node['manufacturer'], node['model']))
        url = node['application-url']
        logging.info('Probing {}'.format(url))
        p = probe.Probe(url, appList)
        availableApps = p.checkAllApps()
        for app in availableApps:
            print('Found: {}'.format(app))

    logging.info('Done')
Example #10
    def probe_callback(self, data):
        theprobe = probe.Probe(data, self.theta)
        wallOffset = theprobe.offsetBetweenWalls(90, 5)
        forwardOffset = theprobe.offsetBetweenWalls(45, 5)
        slope = -theprobe.theWalls(70, 90, 110, 3)
        forwardDistance = theprobe.averageRanges(-20, 20)
        stopCondition = (theprobe.averageRanges(-10, 10) < .25)
        leftRearObject = theprobe.objectDetection(
            -90, -125, 1, 0.9, "Left"
        )  # front angle, rear angle, max distance, distance required to detect object
        rightRearObject = theprobe.objectDetection(
            90, 125, 1, 0.9, "Right"
        )  # front angle, rear angle, max distance, distance required to detect object

        #slope = -theprobe.theWalls(45,80,100,3)
        #theprobe.averageWallSlope(-45,-135)
        if not self.pilotMode:
            self.stopList = []
        self.lidarData = [
            slope, wallOffset, forwardOffset, forwardDistance, stopCondition,
            self.stopList
        ]
Example #11
    def __init__(self, target, ai_settings, screen, probes, treasures):
        super().__init__(target, ai_settings, screen, probes, treasures)

        self.rank = self.ai_settings.black_dragon_rank

        self.image = self.ai_settings.black_dragon_left[0]
        self.image_left = self.ai_settings.black_dragon_left
        self.image_right = self.ai_settings.black_dragon_right
        self.boom_image = self.ai_settings.black_dragon_boom

        self.width = self.ai_settings.black_dragon_width
        self.height = self.ai_settings.black_dragon_height

        self.adjusted_borny = random.uniform(
            self.orig_borny,
            (self.orig_borny + ai_settings.maze_block_width - self.width))
        self.adjusted_bornx = random.uniform(
            self.orig_bornx,
            (self.orig_bornx + ai_settings.maze_block_height - self.height))

        self.rect = pygame.Rect(self.adjusted_borny, self.adjusted_bornx,
                                self.ai_settings.black_dragon_width,
                                self.ai_settings.black_dragon_height)

        self.health = self.ai_settings.black_dragon_health
        self.speed = ai_settings.black_dragon_speed_factor
        self.atk_distance = self.ai_settings.black_dragon_atk_distance
        self.atk = self.ai_settings.black_dragon_atk
        self.ATKPRT = self.ai_settings.black_dragon_ATKPRBT * float(
            random.uniform(0.5, 1.5))

        # Create the first probe of this monster
        self.probe = p.Probe(self.target, self.rect, self.ai_settings,
                             self.screen)
        probes.append(self.probe)
        self.x_speed = 0
        self.y_speed = 0
        self.is_target_within_range = False
Example #12
    def detect(self, target, probes):
        # A new probe should be created only after the last one "dies"
        if not self.probe.is_alive:
            self.probe = p.Probe(target, self.rect, self.ai_settings,
                                 self.screen)
            probes.append(self.probe)
Example #13
def test_function(config, config_suffix=None):

    config_main = config['main']
    config_probe = config['probe']
    config_VAE = config['VAE']
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    config_ablation = config['ablation']
    use_pi_e = config_ablation['use_pi_e']
    phase = config_main['phase']
    assert (phase == 'validation' or phase == 'test')

    domain = config_main['domain']

    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'mujoco':
        domain_name = "config_mujoco.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("test_ablation.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_domain['N_test_instances']
    N_episodes = config_domain['N_test_episodes']
    test_steps = config_domain['test_steps']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain, phase)

    # Instantiate probe policy
    n_probe_steps = config_domain['traj_length']
    assert (n_probe_steps < test_steps)
    if use_pi_e:
        pi_e = probe.Probe(config_probe, n_state, n_action)
    else:
        # initial z
        z_avg = pickle.load(open('../results/%s/z_avg.p' % dir_name, 'rb'))

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    vae = vae_import.VAE(n_state,
                         n_action,
                         n_probe_steps,
                         seed=seed,
                         **config_VAE)

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action,
                         config_PER['activate'], config_VAE['n_latent'])

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)

    saver = tf.train.Saver()
    print("Restoring variables from %s" % dir_name)
    saver.restore(sess, '../results/%s/%s' % (dir_name, model_name))

    reward_total = 0
    cumulative_reward = np.zeros((test_steps, N_instances))
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to",
              hpmdp.instance_param_set)

        # N_episodes should be 1, but we let it be flexible in case needed
        for idx_episode in range(1, N_episodes + 1):

            reward_episode = 0

            collected_probe_traj = False
            while not collected_probe_traj:

                # list of (state, action) pairs
                traj_probe = []
                state = hpmdp.reset()
                episode_step = 0
                done = False

                probe_finished_early = False
                # Generate probe trajectory
                for step in range(1, n_probe_steps + 1):

                    if use_pi_e:
                        action = pi_e.run_actor(state, sess)
                    else:
                        action = pi_c.run_actor(state, z_avg, sess, epsilon=0)
                    # print("Probe step %d action %d" % (step, action))
                    action_1hot = np.zeros(n_action)
                    action_1hot[action] = 1
                    traj_probe.append((state, action_1hot))
                    state_next, reward, done = hpmdp.step(action)
                    reward_episode += reward
                    cumulative_reward[episode_step,
                                      idx_instance - 1] = reward_episode
                    state = state_next
                    episode_step += 1
                    if done and step < n_probe_steps:
                        probe_finished_early = True
                        print(
                            "test_ablation.py : done is True while generating probe trajectory"
                        )
                        break

                if not probe_finished_early:
                    collected_probe_traj = True

            # Use VAE to estimate hidden parameter
            z = vae.encode(sess, traj_probe)

            print(z)

            if config_DDQN['activate']:
                # Start control policy
                while not done and episode_step < test_steps:
                    # Use DDQN with prioritized replay for this
                    action = pi_c.run_actor(state, z, sess, epsilon=0)
                    state_next, reward, done = hpmdp.step(action)
                    reward_episode += reward
                    cumulative_reward[episode_step,
                                      idx_instance - 1] = reward_episode
                    state = state_next
                    episode_step += 1
                print(reward_episode)
                # If episode ended earlier than test_steps, fill in the
                # rest of the cumulative rewards with the last value
                if episode_step < test_steps:
                    remaining = np.ones(test_steps -
                                        episode_step) * reward_episode
                    cumulative_reward[episode_step:,
                                      idx_instance - 1] = remaining

                reward_total += reward_episode

    header = 'Step'
    for idx in range(1, N_instances + 1):
        header += ',R_%d' % idx
    indices = np.arange(1, test_steps + 1).reshape(test_steps, 1)
    concated = np.concatenate([indices, cumulative_reward], axis=1)
    save_loc = '_'.join(dir_name.split('_')[:-1])
    os.makedirs('../results/%s' % save_loc, exist_ok=True)
    run_number = dir_name.split('_')[-1]
    np.savetxt('../results/%s/test_%s.csv' % (save_loc, run_number),
               concated,
               delimiter=',',
               fmt='%.3e',
               header=header)

    print("Avg episode reward", reward_total / float(N_instances * N_episodes))
Example #14
def train_function(config, config_suffix=None):

    config_main = config['main']
    config_probe = config['probe']
    autoencoder = config_main['autoencoder']
    if autoencoder == 'VAE':
        config_VAE = config['VAE']
    else:
        raise ValueError("Other autoencoders not supported")
    config_DDQN = config['DDQN']
    config_PER = config['PER']
    phase = config_main['phase']
    assert (phase == 'train')

    domain = config_main['domain']

    # Domain-specific parameters (e.g. state and action space dimensions)
    if domain == '2D':
        domain_name = "config_2D.json"
    elif domain == 'acrobot':
        domain_name = "config_acrobot.json"
    elif domain == 'hiv':
        if config_suffix is not None:
            domain_name = "config_hiv{}.json".format(config_suffix)
        else:
            domain_name = "config_hiv.json"
    elif domain == 'lander':
        domain_name = "config_lander.json"
    elif domain == 'cancer':
        domain_name = "config_cancer.json"
    else:
        raise ValueError("train.py : domain not recognized")
    with open(domain_name) as f:
        config_domain = json.load(f)

    n_state = config_domain['n_state']
    n_action = config_domain['n_action']
    min_samples_before_train = config_domain['min_samples_before_train']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    N_instances = config_main['N_instances']
    N_episodes = config_main['N_episodes']
    period = config_main['period']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']

    os.makedirs('../results/%s' % dir_name, exist_ok=True)

    # Instantiate HPMDP
    hpmdp = HiPMDP.HiPMDP(domain, config_domain)

    # Instantiate probe policy
    n_probe_steps = config_domain['traj_length']
    pi_e = probe.Probe(config_probe, n_state, n_action)

    # Instantiate VAE
    buffer_size_vae = config_VAE['buffer_size']
    batch_size_vae = config_VAE['batch_size']
    del config_VAE['buffer_size']
    if autoencoder == 'VAE':
        vae = vae_import.VAE(n_state,
                             n_action,
                             n_probe_steps,
                             seed=seed,
                             **config_VAE)
    else:
        raise ValueError('Other autoencoders not supported')

    # Instantiate control policy
    if config_DDQN['activate']:
        pi_c = ddqn.DDQN(config_DDQN, n_state, n_action,
                         config_PER['activate'], config_VAE['n_latent'])
        epsilon_start = config_DDQN['epsilon_start']
        epsilon_end = config_DDQN['epsilon_end']
        epsilon_decay = np.exp(
            np.log(epsilon_end / epsilon_start) / (N_instances * N_episodes))
        steps_per_train = config_DDQN['steps_per_train']

    # TF session
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    if config_DDQN['activate']:
        sess.run(pi_c.list_initialize_target_ops)
        epsilon = epsilon_start

    if config_VAE['dual']:
        sess.run(vae.list_equate_dual_ops)

    writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)

    saver = tf.train.Saver()

    # use the DQN version of the replay, so instance_count and bnn-specific params do not matter
    exp_replay_param = {
        'episode_count': N_instances * N_episodes,
        'instance_count': 0,
        'max_task_examples': hpmdp.max_steps_per_episode,
        'ddqn_batch_size': config_DDQN['batch_size'],
        'num_strata_samples': config_PER['num_strata_samples'],
        'PER_alpha': config_PER['alpha'],
        'PER_beta_zero': config_PER['beta_zero'],
        'bnn_batch_size': 0,
        'bnn_start': 0,
        'dqn_start': min_samples_before_train
    }

    buf = ExperienceReplay.ExperienceReplay(
        exp_replay_param, buffer_size=config_PER['buffer_size'])

    # Logging
    header = "Episode,R_avg,R_p\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)
    reward_period = 0
    reward_p_period = 0

    list_trajs = []  # circular buffer to store probe trajectories for VAE
    idx_traj = 0  # counter for list_trajs
    control_step = 0
    train_count_probe = 1
    train_count_vae = 1
    train_count_control = 1
    total_episodes = 0
    t_start = time.time()
    # Iterate through random instances from the HPMDP
    for idx_instance in range(1, N_instances + 1):

        hpmdp.switch_instance()
        print("idx_instance", idx_instance, " | Switching instance to",
              hpmdp.instance_param_set)

        # Iterate through many episodes
        for idx_episode in range(1, N_episodes + 1):

            total_episodes += 1

            # list of (state, action) pairs
            traj_probe = []
            state = hpmdp.reset()
            done = False
            reward_episode = 0

            # Generate probe trajectory
            probe_finished_early = False
            for step in range(1, n_probe_steps + 1):

                action = pi_e.run_actor(state, sess)
                action_1hot = np.zeros(n_action)
                action_1hot[action] = 1
                traj_probe.append((state, action_1hot))
                state_next, reward, done = hpmdp.step(action)
                state = state_next
                reward_episode += reward

                if done and step < n_probe_steps:
                    probe_finished_early = True
                    print(
                        "train.py : done is True while generating probe trajectory"
                    )
                    break

            if probe_finished_early:
                # Skip over pi_e and VAE training if probe finished early
                continue

            if idx_traj >= len(list_trajs):
                list_trajs.append(traj_probe)
            else:
                list_trajs[idx_traj] = traj_probe
            idx_traj = (idx_traj + 1) % buffer_size_vae

            # Compute probe reward using VAE
            if config_probe['reward'] == 'vae':
                reward_e = vae.compute_lower_bound(traj_probe, sess)
            elif config_probe['reward'] == 'total_variation':
                reward_e = pi_e.compute_reward(traj_probe)
            elif config_probe['reward'] == 'negvae':
                # this reward encourages maximizing entropy
                reward_e = -vae.compute_lower_bound(traj_probe, sess)

            # Write TensorBoard summaries once every `period` episodes
            if total_episodes % period == 0:
                summarize = True
            else:
                summarize = False

            # Train probe policy
            pi_e.train_step(sess, traj_probe, reward_e, train_count_probe,
                            summarize, writer)
            train_count_probe += 1

            # Train VAE
            if len(list_trajs) >= batch_size_vae:
                vae.train_step(sess, list_trajs, train_count_vae, summarize,
                               writer)
                train_count_vae += 1

            # Use VAE to estimate hidden parameter
            z = vae.encode(sess, traj_probe)

            if config_DDQN['activate']:
                # Start control policy
                summarized = False
                while not done:
                    # Use DDQN with prioritized replay for this
                    action = pi_c.run_actor(state, z, sess, epsilon)
                    state_next, reward, done = hpmdp.step(action)
                    control_step += 1
                    reward_episode += reward

                    buf.add(
                        np.reshape(
                            np.array(
                                [state, action, reward, state_next, done, z]),
                            (1, 6)))
                    state = state_next

                    if control_step >= min_samples_before_train and control_step % steps_per_train == 0:
                        batch, IS_weights, indices = buf.sample(control_step)
                        if not summarized:
                            # Write TF summary at first train step of the last episode of every instance
                            td_loss = pi_c.train_step(sess, batch, IS_weights,
                                                      indices,
                                                      train_count_control,
                                                      summarize, writer)
                            summarized = True
                        else:
                            td_loss = pi_c.train_step(sess, batch, IS_weights,
                                                      indices,
                                                      train_count_control,
                                                      False, writer)
                        train_count_control += 1

                        if config_PER['activate']:
                            buf.update_priorities(
                                np.hstack(
                                    (np.reshape(td_loss, (len(td_loss), -1)),
                                     np.reshape(indices, (len(indices), -1)))))

                reward_period += reward_episode
                reward_p_period += reward_e

                if epsilon > epsilon_end:
                    epsilon *= epsilon_decay

                # Logging
                if total_episodes % period == 0:
                    s = "%d,%.2f,%.2f\n" % (total_episodes,
                                            reward_period / float(period),
                                            reward_p_period / float(period))
                    print(s)
                    with open("../results/%s/log.csv" % dir_name, 'a') as f:
                        f.write(s)
                    if config_domain[
                            'save_threshold'] and reward_period / float(
                                period) > config_domain['save_threshold']:
                        saver.save(
                            sess, '../results/%s/%s.%d' %
                            (dir_name, model_name, total_episodes))
                    reward_period = 0
                    reward_p_period = 0

    with open("../results/%s/time.txt" % dir_name, 'a') as f:
        f.write("%.5e" % (time.time() - t_start))

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))