Exemple #1
0
def tif_txt_extract(input_files_path,
                    output_files_path,
                    ocr_path=None,
                    verbose=False):
    """Extract text from every TIF file in a directory using Tesseract OCR.

    Each ``*.tif`` / ``*.TIF`` file found directly inside
    ``input_files_path`` is read page by page, every page is passed to
    ``pytesseract.image_to_string`` and the concatenated text is appended to
    ``<file stem>.txt`` inside ``output_files_path``.

    Args:
        input_files_path: Directory that is searched for TIF files.
        output_files_path: Directory that receives the ``.txt`` files.
        ocr_path: Path of the Tesseract executable. Only applied on Windows,
            and only when a value is actually given.
        verbose: Forwarded to ``__verbose_print`` for the progress message.

    Returns:
        None. Processing is best effort: any exception is caught and
        printed, which also stops the remaining files from being processed.
    """
    try:
        # Do not overwrite pytesseract's default command with ``None``.
        if platform.system() == 'Windows' and ocr_path is not None:
            pytesseract.pytesseract.tesseract_cmd = ocr_path

        # A set removes duplicates on case-insensitive file systems, where
        # both glob patterns match the same files.
        files = set(
            glob.glob(input_files_path + "/*.TIF") +
            glob.glob(input_files_path + "/*.tif"))

        total = len(files)
        for i, file in enumerate(files, start=1):
            stem = os.path.splitext(os.path.basename(file))[0]

            pages = MultiImage(file, plugin='pil')
            text = ''.join(
                pytesseract.image_to_string(Image.fromarray(page))
                for page in pages)

            out_path = os.path.join(output_files_path, stem + ".txt")
            # "a+" keeps the original append semantics; the context manager
            # guarantees the handle is closed even if the write fails.
            with open(out_path, "a+", encoding="utf-8") as out_file:
                out_file.write(text)

            __verbose_print(
                str(i) + " of " + str(total) + " file(s) completed", verbose)
    except Exception as e:
        print(e)
Exemple #2
0
    def __tifutil(self, file, output_path):
        """Save every page of a TIF file as a separate JPEG image.

        Pages are written into ``output_path`` as ``<basename>_<n>.jpg``,
        where ``<basename>`` is the input file name (extension included) and
        ``<n>`` is the 1-based page number.
        """
        base_name = os.path.basename(file)
        pages = MultiImage(file, plugin='pil')

        page_no = 0
        for page in pages:
            page_no += 1
            target = (output_path + os.sep + base_name + '_' + str(page_no) +
                      ".jpg")
            Image.fromarray(page).save(target, 'JPEG')
    def __init__(self,
                 slide_fn,
                 level=2,
                 tile_size=128,
                 mask_fn=None,
                 data_provider=None):
        """Load one slide level, its tissue mask and, optionally, a label mask.

        Args:
            slide_fn: Path of the multi-level slide image.
            level: Index of the slide level stored in ``self.img``.
            tile_size: Stored as ``self.tile_size``; not used in this method.
            mask_fn: Optional path of the label mask image.
            data_provider: Optional data provider identifier. ``self.mask``
                is only created when both ``mask_fn`` and ``data_provider``
                are given.
        """
        self.slide_fn = slide_fn
        self.level = level
        self.tile_size = tile_size

        # Open the slide once and reuse the handle for both levels.
        slide = MultiImage(self.slide_fn)
        self.img = slide[self.level].copy()
        # First two shape entries in reversed order
        # (presumably (width, height) -- TODO confirm axis order).
        self.dims = np.array(self.img.shape[:2][::-1])
        # The tissue mask is always computed from level 2, whatever ``level``
        # was requested.
        self.ds_img = slide[2].copy()
        self.tissue_mask = makeTissueMask(self.ds_img)

        self.mask_fn = mask_fn
        self.data_provider = data_provider
        if self.mask_fn is not None and self.data_provider is not None:
            # Collapse the last axis of the mask level by summing over it.
            self.mask = MultiImage(mask_fn)[level].sum(axis=-1)

        self.tile_coords = None
Exemple #4
0
 def __tile(self, img_path, mask_path, number_of_tiles=12):
     """Cut an image/mask pair into square patches and keep a fixed number.

     The last level of both files is padded (image with 255, mask with 0) so
     that height and width become multiples of the patch size, then split
     into ``patch x patch x 3`` tiles. If fewer than ``number_of_tiles``
     tiles exist, blank tiles are appended. The ``number_of_tiles`` tiles
     with the smallest pixel sum of the image are returned, together with
     the mask tiles at the same positions.

     Returns:
         Tuple ``(img, mask)``, each of shape
         ``(number_of_tiles, patch, patch, 3)``.
     """
     patch = self.__patch_size
     img = MultiImage(img_path)[-1]
     mask = MultiImage(mask_path)[-1]

     # Amount of padding needed to reach the next multiple of the patch size.
     height, width = img.shape[0], img.shape[1]
     pad_rows = (patch - height % patch) % patch
     pad_cols = (patch - width % patch) % patch
     # Padding is split as evenly as possible between both sides.
     border = [[pad_rows // 2, pad_rows - pad_rows // 2],
               [pad_cols // 2, pad_cols - pad_cols // 2], [0, 0]]
     img = np.pad(img, border, constant_values=255)
     mask = np.pad(mask, border, constant_values=0)

     # (rows, patch, cols, patch, 3) -> (rows * cols, patch, patch, 3)
     img = img.reshape(img.shape[0] // patch, patch, img.shape[1] // patch,
                       patch, 3)
     img = img.transpose(0, 2, 1, 3, 4).reshape(-1, patch, patch, 3)
     mask = mask.reshape(mask.shape[0] // patch, patch,
                         mask.shape[1] // patch, patch, 3)
     mask = mask.transpose(0, 2, 1, 3, 4).reshape(-1, patch, patch, 3)

     # Top up with blank tiles when the image yields too few of them.
     missing = number_of_tiles - len(img)
     if missing > 0:
         filler = [[0, missing], [0, 0], [0, 0], [0, 0]]
         mask = np.pad(mask, filler, constant_values=0)
         img = np.pad(img, filler, constant_values=255)

     # Ascending pixel sum: the tiles with the lowest sums come first.
     sums = img.reshape(img.shape[0], -1).sum(-1)
     order = np.argsort(sums)[:number_of_tiles]
     return img[order], mask[order]
Exemple #5
0
    def __getitem__(self, item):
        """Return the tiled image tensor and ordinal label for one sample.

        The label is encoded as five float32 flags in which position ``i``
        is 1 when ``i < label`` and 0 otherwise.
        """
        image_path = os.path.join(self.data_dir, self.images[item])
        grade = self.labels[item]

        target = np.array([(1 if k < grade else 0) for k in range(5)],
                          dtype=np.float32)

        # Load the requested TIFF level and convert RGB -> BGR.
        pixels = cv.cvtColor(
            MultiImage(image_path)[self.tiff_scale], cv.COLOR_RGB2BGR)
        tiles = tile(pixels, self.sz, self.N, self.transforms, self.random)

        # Move the last axis to the front before wrapping in a tensor.
        return torch.from_numpy(tiles.transpose(2, 0, 1)), target
Exemple #6
0
    def __getitem__(self, item):
        """Return the tiled, optionally augmented image tensor of one sample.

        ``.tiff`` is appended to image names that carry no file extension.
        """
        file_name = self.images[item]
        _, extension = os.path.splitext(file_name)
        if not extension:
            file_name = file_name + ".tiff"
        full_path = os.path.join(self.data_dir, file_name)

        # Load the requested TIFF level and convert RGB -> BGR.
        pixels = cv.cvtColor(
            MultiImage(full_path)[self.tiff_scale], cv.COLOR_RGB2BGR)
        pixels, _boxes = get_minimal_image(pixels)
        pixels = tile(pixels, self.tile_size, self.num_tiles)

        if self.transforms is not None:
            pixels = self.transforms(image=pixels)["image"]

        # Move the last axis to the front before wrapping in a tensor.
        return torch.from_numpy(pixels.transpose(2, 0, 1))
Exemple #7
0
    def __cropPatchesFromImage(self, image_name, downsample_level=2):
        """Crop random square patches around cell coordinates of an image.

        Args:
            image_name: Path of the multi-level image.
            downsample_level: Image level to crop from (0, 1 or 2; the
                resolution downsample factors are 1, 4 and 16).

        Returns:
            List with ``self.__patches_per_image`` patches. A candidate is
            accepted when its mean pixel value is below 230; the fifth
            candidate is accepted unconditionally.
        """
        size = self.__patch_size
        patch_shape = (size, size)
        max_attempts = 5

        multi_image = MultiImage(image_name)
        image_to_crop = multi_image[downsample_level]
        image_shape = image_to_crop.shape
        # Scale factor applied to the cell coordinates to reach the
        # resolution of the cropped level.
        resolution_relation = 4 ** (2 - downsample_level)

        # Candidate coordinates from where patches are selected.
        cell_coordinates = self.__getCellCoordinatesFromImage(
            multi_image, resolution_relation, image_shape)

        patches = []
        for _ in range(self.__patches_per_image):
            for attempt in range(1, max_attempts + 1):
                random_index = random.randint(0, cell_coordinates.shape[1] - 1)

                # Scale the coordinate and clamp it so that the patch stays
                # inside the image.
                start_y, start_x = \
                    cell_coordinates[:, random_index] * resolution_relation
                start_x = max(0, min(start_x, image_shape[1] - size))
                start_y = max(0, min(start_y, image_shape[0] - size))
                end_x = start_x + size
                end_y = start_y + size

                patch = image_to_crop[start_y:end_y, start_x:end_x]

                # Resize if the image itself is smaller than the patch size.
                if patch.shape[:2] != patch_shape:
                    patch = cv2.resize(
                        patch, dsize=patch_shape,
                        interpolation=cv2.INTER_CUBIC)

                # Accept patches with enough colored area (not pure white);
                # the last attempt is always accepted.
                if np.mean(patch) < 230 or attempt >= max_attempts:
                    patches.append(patch)
                    break
        return patches
Exemple #8
0
    def __getitem__(self, idx):
        """Return one sample: ``(image, target)`` for train/val, else ``image``.

        Train/val images are read from ``.png`` files with OpenCV; other
        samples are read from the last level of a ``.tiff`` file and resized.
        The target concatenates the encodings returned by ``cat`` for the
        ISUP grade (6 classes) and the two Gleason score entries (5 classes
        each).
        """
        path = self.image_path + self.image_id[idx]
        labelled = self.is_train or self.is_val

        if labelled:
            image = cv2.imread(path + '.png')
        else:
            # NOTE(review): cv2.resize takes dsize as (width, height); the
            # (HEIGHT, WIDTH) order only matters if they differ -- confirm.
            image = cv2.resize(MultiImage(path + '.tiff')[-1],
                               (HEIGHT, WIDTH))

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # NOTE(review): reshape (rather than transpose) is what the original
        # code does; presumably a channels-first layout was intended -- verify
        # before changing, since trained weights may depend on it.
        image = self.aug(image=image)['image'].reshape((3, HEIGHT, WIDTH))

        if not labelled:
            return FloatTensor(image)

        parts = [
            cat([self.data.isup_grade[idx]], num_classes=6),
            cat([self.data.gleason_score[idx][0]], num_classes=5),
            cat([self.data.gleason_score[idx][1]], num_classes=5),
        ]
        target = np.concatenate(parts, axis=1)
        return FloatTensor(image), FloatTensor(target)
Exemple #9
0
def gettextFrom_tiff_Image(file):
    """Print the OCR text of every frame of a (multi-page) TIFF file."""
    for frame in MultiImage(file, plugin='pil'):
        print(pytesseract.image_to_string(Image.fromarray(frame)))
 def setUp(self):
     """Load the multipage TIF fixture when PIL is available."""
     if not PIL_available:
         return
     # The multipage TIF file was created with imagemagick:
     # convert im1.tif im2.tif -adjoin multipage.tif
     fixture = os.path.join(data_dir, 'multipage.tif')
     self.img = MultiImage(fixture)
    def __cropPatchesFromImage(self, image_name, downsample_level=None):
        """Crop random square patches centred on cell coordinates.

        Args:
            image_name: Path of the multi-level image.
            downsample_level: 0, 1, 2 or ``None`` (resolution downsample
                factors 1, 4 and 16). With ``None`` the first half of the
                patches is read from level 0 through OpenSlide and the second
                half from the last ``MultiImage`` level. With an explicit
                level every patch is cropped from that ``MultiImage`` level.

        Returns:
            List with ``self.__patches_per_image`` patches. Up to five
            candidates are drawn per patch; the first one whose mean pixel
            value is below 230 is kept, otherwise the last candidate.
        """
        patch_shape = (self.__patch_size, self.__patch_size)

        # MultiImage is used for the low resolution image and OpenSlide for
        # the high resolution one (to save memory and time).
        multi_image = MultiImage(image_name)
        use_mixed_resolutions = downsample_level is None
        image_slide = None
        if use_mixed_resolutions:
            image_slide = OpenSlide(image_name)
            image_to_crop = multi_image[-1]
        else:
            image_to_crop = multi_image[downsample_level]
            # (rows, cols, channels) -> (cols, rows), matching the layout of
            # OpenSlide's level_dimensions used in mixed mode.
            image_shape = tuple(image_to_crop.shape[::-1][1:])
            resolution_relation = 4**(2 - downsample_level)

        try:
            # Candidate coordinates from where patches are selected.
            cell_coordinates = self.__getCellCoordinatesFromImage(multi_image)

            patches = []
            for i in range(self.__patches_per_image):

                # Mixed mode alternates between level 0 (first half) and
                # level 2 (second half); the mid level is never used.
                if use_mixed_resolutions:
                    downsample_level = int(
                        i * 2 / self.__patches_per_image) * 2
                    image_shape = image_slide.level_dimensions[
                        downsample_level]
                    resolution_relation = 4**(2 - downsample_level)

                # Try up to five candidates for a good patch.
                for _ in range(5):
                    random_index = random.randint(
                        0, cell_coordinates.shape[1] - 1)

                    # Scale the coordinate to the target resolution and
                    # centre the patch on it by subtracting half the patch
                    # size, then clamp so the patch stays inside the image.
                    start_y, start_x = (
                        cell_coordinates[:, random_index] *
                        resolution_relation - int(0.5 * self.__patch_size))
                    start_x = max(
                        0, min(start_x, image_shape[0] - self.__patch_size))
                    start_y = max(
                        0, min(start_y, image_shape[1] - self.__patch_size))
                    end_x, end_y = (np.array([start_x, start_y]) +
                                    self.__patch_size)

                    # Only read through OpenSlide when a slide was actually
                    # opened; an explicit ``downsample_level=0`` previously
                    # hit an undefined ``image_slide`` here.
                    if image_slide is not None and downsample_level == 0:
                        patch = np.array(
                            image_slide.read_region((start_x, start_y), 0,
                                                    patch_shape))[..., :3]
                    else:
                        patch = image_to_crop[start_y:end_y, start_x:end_x]

                    # Pad with white on the bottom/right if the image is
                    # smaller than the patch size.
                    if patch.shape[:2] != patch_shape:
                        padding = np.subtract(patch_shape, patch.shape[:2])
                        padding = ([0, padding[0]], [0, padding[1]], [0, 0])
                        patch = np.pad(patch, padding, constant_values=255)

                    # Patch has enough colored areas (not pure white);
                    # otherwise draw another candidate.
                    if np.mean(patch) < 230:
                        break
                patches.append(patch)
            return patches
        finally:
            # Release the native OpenSlide handle, which was leaked before.
            if image_slide is not None:
                image_slide.close()
Exemple #12
0
    return result_img, minimal_boxes


# In[6]:


# Build, for every image in IMAGES, the compact representation returned by
# get_minimal_image and report the mean area ratio of compact to original.
names = os.listdir(IMAGES)
compact_representation = {}

mean_ratio = 0

for name in tqdm(names):
    img_path = os.path.join(IMAGES, name)

    # Only the last level of the image is used.
    img = MultiImage(img_path)[-1]

    compact_image, minimal_boxes = get_minimal_image(img)
    compact_representation[name] = dict(
        original_size=img.shape[:2],
        rectangles=minimal_boxes,
    )

    compact_area = np.prod(compact_image.shape[:2])
    original_area = np.prod(img.shape[:2])
    mean_ratio += compact_area / original_area
print(f"Mean ratio: {mean_ratio / len(names)}")
#
#
# # In[7]:
#
#
# with open("../dataset/compact_representation.json", "w") as file:
#     json.dump(compact_representation, file)

Exemple #13
0
import os
import json
import sys
sys.path.append("../")

from tqdm import tqdm
from skimage.io import MultiImage
import cv2 as cv

from utils.data_utils import get_tile
import matplotlib.pyplot as plt

# Export one tiled PNG per image listed in the JSON file of boxes.
images_dir = "../input/prostate-cancer-grade-assessment/train_images"
output_dir = "../input/256_36_hsv"

with open("../notebooks/256_36_hsv.json", 'r') as json_file:
    data = json.load(json_file)

os.makedirs(output_dir, exist_ok=True)


for image_id, boxes in tqdm(data.items()):
    source = os.path.join(images_dir, image_id) + ".tiff"
    target = os.path.join(output_dir, image_id) + ".png"

    # Level 1 of the slide is tiled with the stored boxes.
    tiles = get_tile(MultiImage(source)[1], boxes, 256, 36)
    # Convert RGB -> BGR for OpenCV and invert the intensities before saving.
    cv.imwrite(target, 255 - cv.cvtColor(tiles, cv.COLOR_RGB2BGR))
    def _worker(paths: Tuple[Path, Optional[Path]], namespace) -> None:
        """Pre-process one slide (and its mask, if any) and queue a record.

        Args:
            paths: ``(image_path, mask_path)``. The supplied mask path is
                ignored; the mask location is always derived from
                ``image_path`` by swapping the directory and suffix names.
            namespace: Object exposing ``self`` (the owner, whose ``_writer``
                receives the record) and ``train_meta`` (the metadata table).

        Returns:
            None. The function returns early when layer 0 of the image cannot
            be read; errors raised during pre-processing are printed together
            with the slide name instead of being propagated.
        """
        self = namespace.self
        train_meta = namespace.train_meta
        image_path, _ = paths
        name = image_path.stem
        mask_path = Path(
            str(image_path).replace("train_images",
                                    "train_label_masks").replace(
                                        ".tiff", "_mask.tiff"))

        image_slide = MultiImage(str(image_path))
        large_image = get_layer_safely(image_slide, layer=0)
        # Only open the mask file when it actually exists.
        large_mask = None
        if mask_path.exists():
            large_mask = get_layer_safely(
                MultiImage(str(mask_path)), layer=0, is_mask=True)
        small_image = get_layer_safely(image_slide, layer=2)
        if large_image is None:
            return

        if small_image is None:
            # Fall back to downscaling layer 0 by 1/16 when layer 2 is
            # unavailable.
            scale = 1 / 16
            small_image = cv2.resize(large_image,
                                     dsize=(0, 0),
                                     fx=scale,
                                     fy=scale,
                                     interpolation=cv2.INTER_LANCZOS4)

        try:
            pre_processor = ImagePreProcessor(reduce_memory=False)
            large_image = pre_processor.dual(large_image, small_image)
            if large_mask is not None:
                large_mask = pre_processor.single(large_mask)

            row = train_meta[train_meta.image_id == name].iloc[0]
            data_provider = row["data_provider"]
            gleason_score = row["gleason_score"]
            label = row["isup_grade"]

            # OpenSlide is only needed for the slide metadata; close the
            # native handle as soon as it has been read.
            slide = OpenSlide(str(image_path))
            try:
                additional = {
                    "data_provider": data_provider,
                    "gleason_score": gleason_score,
                    "image_shape": large_image.shape[:2],
                    "source_image_shape": slide.dimensions,
                    "x_resolution":
                    float(slide.properties["tiff.XResolution"]),
                    "y_resolution":
                    float(slide.properties["tiff.YResolution"]),
                    "resolution_unit": slide.properties["tiff.ResolutionUnit"]
                }
            finally:
                slide.close()

            if large_mask is None:
                visualization = None
            else:
                masked = draw_overlay_mask(
                    large_image,
                    large_mask,
                    color_map=get_color_map(data_provider, normalized=False))
                title_text = f"{data_provider} - id={name[:10]} isup={label} gleason={gleason_score}"
                visualization = plot_meta(
                    masked,
                    title_text,
                    color_map=get_color_map(data_provider, normalized=True),
                    classname_map=get_classname_map(data_provider),
                    show_keys=list(np.unique(large_mask)))

            record = Record(large_image,
                            large_mask,
                            visualization,
                            name,
                            label,
                            phase=Phase.TRAIN,
                            additional=additional)
            self._writer.put(record)

        except Exception as e:
            # Best effort: one failing slide must not stop the whole run.
            print(f"{name} - {e}")
    def __getitem__(self, idx):
        """Return the most informative patches of one slide plus its label.

        The slide level ``self.level`` is scored on a subsampled grid; the
        ``self.num_patches`` highest scoring cells are cropped into a white
        (255) patch stack. The label is a length-5 tensor whose first
        ``isup_grade`` entries are 1.

        Returns:
            ``(patches, label)`` or, when ``self.use_mask`` is set,
            ``(patches, (mask, label))``. ``patches`` is permuted to
            ``(num_patches, 3, patch_size, patch_size)`` and ``mask`` holds
            one binary plane for each of the 6 mask values.
        """
        image_id = self.df['image_id'].iloc[idx]
        size = self.patch_size

        # MultiImage (skimage) seemed slightly faster than OpenSlide here.
        image = MultiImage(os.path.join(self.root_path, 'train_images',
                                        image_id + '.tiff'),
                           conserve_memory=False)[self.level]

        # Score regions on a subsampled image, since NumPy is slow on the
        # full-resolution data. The score is the product of the mean squared
        # distance from 255 and the variance, so blank areas score low.
        stride = size // 8

        def blank_score(block, axis):
            return np.mean((block - 255)**2, axis=axis) * np.var(block,
                                                                 axis=axis)

        subsampled = image[::stride, ::stride]
        cells = size // stride
        proportion_blank = block_reduce(subsampled,
                                        block_size=(cells, cells, 3),
                                        func=blank_score)

        # Highest score first; convert flat grid indices back to pixel
        # offsets of the patch corners.
        ranked = np.argsort(proportion_blank, axis=None)[::-1]
        grid_cols = proportion_blank.shape[1]
        x = ranked % grid_cols * size
        y = ranked // grid_cols * size

        patches = np.full((self.num_patches, size, size, 3),
                          255,
                          dtype=np.uint8)
        n_filled = min(self.num_patches, x.shape[0])
        for k in range(n_filled):
            crop = image[y[k]:y[k] + size, x[k]:x[k] + size]
            # Crops at the image border may be smaller than a full patch.
            patches[k, :crop.shape[0], :crop.shape[1]] = crop
        image = patches

        label = torch.zeros(5)
        label[:self.df['isup_grade'].iloc[idx]] = 1

        if not self.use_mask:
            if self.transforms:
                # Augmentations are applied to each patch separately.
                for k in range(self.num_patches):
                    image[k] = self.transforms(image=image[k])['image']
            return torch.tensor(image).permute(0, 3, 1, 2), label

        mask_level = MultiImage(os.path.join(self.root_path,
                                             'train_label_masks',
                                             image_id + '_mask.tiff'),
                                conserve_memory=False)[self.level]
        # Only the first channel of the mask is used.
        mask_level = mask_level[..., 0]

        mask = np.zeros((self.num_patches, size, size), dtype=np.uint8)
        for k in range(n_filled):
            crop = mask_level[y[k]:y[k] + size, x[k]:x[k] + size]
            mask[k, :crop.shape[0], :crop.shape[1]] = crop

        # Different data providers have different mask formats; normalise
        # them to be the same.
        if self.df['data_provider'].iloc[idx] == 'karolinska':
            mask[mask == 2] = 3
            mask[mask == 1] = 2

        if self.transforms:
            # Augmentations are applied to each patch/mask pair separately.
            for k in range(self.num_patches):
                augmented = self.transforms(image=image[k], mask=mask[k])
                image[k] = augmented['image']
                mask[k] = augmented['mask']

        # Convert the mask to one binary plane per mask value.
        mask_binary = np.zeros(
            (mask.shape[0], 6, mask.shape[1], mask.shape[2]))
        for value in range(6):
            mask_binary[:, value] = (value == mask)
        mask = mask_binary

        return torch.tensor(image).permute(0, 3, 1,
                                           2), (torch.tensor(mask), label)
Exemple #16
0
"""Display the frames of scikit-image's sample multipage TIF side by side."""
import os

import matplotlib.pyplot as plt
# ``mpl_toolkits.axes_grid`` is a deprecated alias package; ``axes_grid1``
# is its maintained replacement and provides the same ``AxesGrid`` class.
from mpl_toolkits.axes_grid1 import AxesGrid

from skimage.io import MultiImage
from skimage import data_dir

# Load the multi-layer image
fname = os.path.join(data_dir, 'multipage.tif')
img = MultiImage(fname)

# Create an image grid with one row and two columns
fig = plt.figure()
grid = AxesGrid(fig, rect=(1, 1, 1), nrows_ncols=(1, 2), axes_pad=0.1)

# Plot the layers on the image grid, hiding the tick marks
for i, frame in enumerate(img):
    axes = grid[i]
    axes.imshow(frame, cmap=plt.cm.gray)
    axes.set_xlabel('Frame %s' % i)
    axes.set_xticks([])
    axes.set_yticks([])

plt.show()