        # One-hot encode the chosen action
        temp_act[action] = 1.
        actions.append(temp_act)

        # Store reward
        values.append(reward)

        # Time step
        t += 1

        # Get intrinsic reward
        if t <= mem_size:
            # Warm-up phase: fill the transition memory before computing
            # intrinsic rewards
            states_mem.append(state)
            actions_mem.append(temp_act)
            next_states_mem.append(next_state)
            if t == mem_size:
                # Cache the latent statistics once the memory is full
                pre_z_mean, pre_z_std = tuc.dump_z_mean_std(
                    state, temp_act[np.newaxis, :])
            intrinsic_reward_1 = 0
            intrinsic_reward_2 = 0
        else:
            # Update the TUC model on the stored transitions
            tuc.train_tuc(np.vstack(states_mem), np.vstack(next_states_mem),
                          np.vstack(actions_mem))
            z_mean, z_std = tuc.dump_z_mean_std(state, temp_act[np.newaxis, :])
            # Exploration bonus: KL shift between the previous and current
            # latent distributions, averaged over the memory size
            intrinsic_reward_1 = KL_divergence(pre_z_mean, pre_z_std, z_mean,
                                               z_std) / mem_size
            # Regret estimate for the chosen action as a second intrinsic term
            intrinsic_reward_2 = tuc.dump_regret(state, action)
            # Current statistics become the reference for the next step
            pre_z_mean = z_mean
            pre_z_std = z_std
            # Update memory
            states_mem.append(state)
            actions_mem.append(temp_act)
            next_states_mem.append(next_state)
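
The KL_divergence helper used above is not defined in this snippet. A minimal sketch, assuming it computes the closed-form KL divergence between the two diagonal Gaussians described by the dumped means and standard deviations (the actual helper may differ in direction or normalization):

import numpy as np

def KL_divergence(mu_1, std_1, mu_2, std_2, eps=1e-8):
    # KL( N(mu_1, std_1^2) || N(mu_2, std_2^2) ) for diagonal Gaussians,
    # summed over the latent dimensions
    var_1 = np.square(std_1) + eps
    var_2 = np.square(std_2) + eps
    return np.sum(0.5 * np.log(var_2 / var_1)
                  + (var_1 + np.square(mu_1 - mu_2)) / (2. * var_2)
                  - 0.5)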
Example #2
                # find the max bounding box in the region image
                new_iou = find_max_bounding_box(gt_masks, region_mask,
                                                classes_gt_objects,
                                                CLASS_OBJECT)
                # Extrinsic reward based on the change in IoU after the move
                reward = get_reward_movement(iou, new_iou)
                iou = new_iou

                # update model-based module
                action_np_vec = np.zeros([1, 6])
                action_np_vec[0, action - 1] = 1.
                action_vec = torch.from_numpy(action_np_vec).float().cuda()
                actions_matrix.append(action_vec)

                if step == 0:
                    # First step: cache latent statistics as the reference for
                    # the KL-based exploration reward
                    pre_mean, pre_std = TUC_dynamic.dump_z_mean_std(
                        state, action_vec)

                TUC_dynamic.train_enc_dec(state, next_state, action_vec)

                if step > 0:
                    mean, std = TUC_dynamic.dump_z_mean_std(state, action_vec)
                    # Exploration bonus from the shift in latent statistics
                    intrinsic_reward = TUC_dynamic.dump_exploration_reward(
                        pre_mean, pre_std, mean, std)

            if i > 0:
                # Regret-based penalty for the chosen action
                penalty = TUC_dynamic.dump_regret(state, action - 1)

            #print(intrinsic_reward,penalty)
            #print(ratio_1*((epochs-epoch)/epochs)*intrinsic_reward - ratio_2*(epoch/epochs)*penalty)

            # Store the transition in memory
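
The commented-out print above suggests how the two signals are meant to be blended: the exploration bonus is annealed down and the regret penalty annealed up as training progresses. A minimal sketch of that schedule, assuming hypothetical weights ratio_1 and ratio_2 and that the result is added to the extrinsic reward:

def combined_reward(reward, intrinsic_reward, penalty, epoch, epochs,
                    ratio_1=0.1, ratio_2=0.1):
    # ratio_1 / ratio_2 are placeholder weights; the schedules mirror the
    # expression in the commented-out debug print above
    bonus = ratio_1 * ((epochs - epoch) / epochs) * intrinsic_reward
    cost = ratio_2 * (epoch / epochs) * penalty
    return reward + bonus - cost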