Example #1
0
def test_list_folders():
    """Check ``iou.list_folders`` with no filter, a single substring and a list of substrings.

    A temporary directory is populated with four sub-directories and one plain
    file. The test expects the plain file never to be reported and the
    ``substrs`` argument to select only the folders whose names contain one of
    the given substrings.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # set up temp_dir subdirs
        dirnames = [
            'tf_txt',
            'othertf_txt',
            'test_csv',
            'test_out',
        ]
        for dirname in dirnames:
            os.mkdir(os.path.join(temp_dir, dirname))

        # add extra file, which should never show up in the folder listings
        pathlib.Path(os.path.join(temp_dir, 'test_badfile.txt')).touch()

        # NOTE: compare sorted copies. list.sort() sorts in place and returns
        # None, so `a.sort() == b.sort()` is always `None == None` and would
        # pass no matter what list_folders returned.

        # test substrs is None (default)
        get_all = iou.list_folders(temp_dir)
        assert sorted(get_all) == sorted(dirnames)

        # test substrs is not list (single string)
        get_txt = iou.list_folders(temp_dir, substrs='_txt')
        assert sorted(get_txt) == sorted(dirnames[:2])

        # test substrs is list
        get_test_and_other = iou.list_folders(temp_dir,
                                              substrs=['test_', 'other'])
        assert sorted(get_test_and_other) == sorted(dirnames[1:])
Example #2
0
def compute_complete_expression_matrices(segmentation_labels,
                                         tiff_dir,
                                         img_sub_folder,
                                         is_mibitiff=False,
                                         points=None,
                                         batch_size=5):
    """
    This function takes the segmented data and computes the expression matrices batch-wise
    while also validating inputs

    Inputs:
        segmentation_labels (xarray): an xarray with the segmented data; its 'fovs'
            coordinate must contain every point that is going to be processed
        tiff_dir (str): the name of the directory which contains the single_channel_inputs
        img_sub_folder (str): the name of the folder where the TIF images are located
            (only used when is_mibitiff is False)
        is_mibitiff (bool): a flag to indicate whether or not the base images are MIBItiffs
        points (list): a list of points we wish to analyze, if None will default to all points.
            The list passed in is not modified.
        batch_size (int): how large we want each of the batches of points to be when computing,
            adjust as necessary for speed and memory considerations

    Returns:
        combined_normalized_data (pandas): a DataFrame containing the size_norm transformed data
        combined_transformed_data (pandas): a DataFrame containing the arcsinh transformed data

    Raises:
        ValueError: if any of the points is missing from the fovs of segmentation_labels
    """

    # if no points are specified, then load all the points
    if points is None:
        # handle mibitiffs with an assumed file structure
        if is_mibitiff:
            filenames = io_utils.list_files(tiff_dir, substrs=['.tif'])
            points = io_utils.extract_delimited_names(filenames,
                                                      delimiter=None)
        # otherwise assume the tree-like directory as defined for tree loading
        else:
            points = io_utils.list_folders(tiff_dir)

    # check segmentation_labels for given points (img loaders will fail otherwise)
    known_fovs = segmentation_labels['fovs'].values
    missing_points = [point for point in points if point not in known_fovs]
    if missing_points:
        raise ValueError(
            f"Invalid point values specified: "
            f"points {','.join(missing_points)} not found in segmentation_labels fovs"
        )

    # get full filenames from given points
    filenames = io_utils.list_files(tiff_dir, substrs=points)

    # work on sorted copies so that the caller's list is never reordered in place
    points = sorted(points)
    filenames = sorted(filenames)

    # NOTE(review): points and filenames are batched by position, which assumes
    # list_files returns exactly one file per point in the mibitiff case - confirm
    cohort_len = len(points)

    # collect the per-batch results and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0 and re-copied the data on every batch
    normalized_batches = []
    transformed_batches = []

    # iterate over all the batches
    for start in range(0, cohort_len, batch_size):
        batch_names = points[start:start + batch_size]
        batch_files = filenames[start:start + batch_size]

        # extract the image data for each batch
        if is_mibitiff:
            image_data = data_utils.load_imgs_from_mibitiff(
                data_dir=tiff_dir, mibitiff_files=batch_files)
        else:
            image_data = data_utils.load_imgs_from_tree(
                data_dir=tiff_dir,
                img_sub_folder=img_sub_folder,
                fovs=batch_names)

        # as well as the labels corresponding to each of them
        current_labels = segmentation_labels.loc[batch_names, :, :, :]

        # segment the imaging data
        cell_size_normalized_data, arcsinh_transformed_data = generate_expression_matrix(
            segmentation_labels=current_labels, image_data=image_data)

        normalized_batches.append(cell_size_normalized_data)
        transformed_batches.append(arcsinh_transformed_data)

    # pd.concat raises on an empty list, so keep returning empty frames when
    # there was nothing to process
    if not normalized_batches:
        return pd.DataFrame(), pd.DataFrame()

    combined_cell_size_normalized_data = pd.concat(normalized_batches)
    combined_arcsinh_transformed_data = pd.concat(transformed_batches)

    return combined_cell_size_normalized_data, combined_arcsinh_transformed_data
def load_imgs_from_tree(data_dir,
                        img_sub_folder=None,
                        fovs=None,
                        channels=None,
                        dtype="int16",
                        variable_sizes=False):
    """Load the images of a folder-per-fov directory tree into a single xarray.

    Args:
        data_dir (str): directory containing one folder per fov
        img_sub_folder (str): optional sub-folder inside each fov folder that holds the
            images. None reads directly from the fov folder.
        fovs (list): optional list of fov folders to load. When None, every folder
            listed in data_dir is loaded, in sorted order.
        channels (list): optional list of images to load. When None, the files of the
            first fov matching '.tif', '.jpg' or '.png' are used, sorted. When given
            without image extensions, the names are looked up among the files of the
            first fov.
        dtype (str/type): dtype of the array used to store the values. Replaced by the
            image dtype (with a warning) when the images are float and this is not.
        variable_sizes (bool): if true, each image is written into the top-left corner
            of a zero-filled 1024 x 1024 slot instead of a slot sized like the first image

    Returns:
        img_xr (xr.DataArray): xarray with dims [fovs, rows, cols, channels]

    Raises:
        ValueError: if no fovs or no images are found, or if the loaded data contains
            negative values
    """

    def _img_path(fov_name, img_name):
        # full path of one image inside the tree
        return os.path.join(data_dir, fov_name, img_sub_folder, img_name)

    # default to every fov folder, in sorted order
    if fovs is None:
        fovs = sorted(iou.list_folders(data_dir))

    if len(fovs) == 0:
        raise ValueError(f"No fovs found in directory, {data_dir}")

    # an empty sub-folder name makes the paths point at the fov folder itself
    img_sub_folder = "" if img_sub_folder is None else img_sub_folder

    if channels is None:
        # no image names supplied: take whatever the first fov contains, alphabetically
        channels = iou.list_files(os.path.join(data_dir, fovs[0],
                                               img_sub_folder),
                                  substrs=['.tif', '.jpg', '.png'])
        channels.sort()
    else:
        has_extension = [
            name.endswith(("tif", "tiff", "jpg", "png")) for name in channels
        ]
        if not all(has_extension):
            # names given without extension: resolve them against the first fov's files
            channels = iou.list_files(os.path.join(data_dir, fovs[0],
                                                   img_sub_folder),
                                      substrs=channels)

    if len(channels) == 0:
        raise ValueError("No images found in designated folder")

    # the first image determines the data dtype and the default image size
    test_img = io.imread(_img_path(fovs[0], channels[0]))

    # float images need a float dtype, otherwise override what was requested
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating) and not np.issubdtype(
            dtype, np.floating):
        warnings.warn(
            f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
            f"because the loaded images are floats")
        dtype = data_dtype

    # fixed 1024 x 1024 slots when sizes vary, otherwise the size of the first image
    if variable_sizes:
        num_rows, num_cols = 1024, 1024
    else:
        num_rows, num_cols = test_img.shape[0], test_img.shape[1]

    img_data = np.zeros((len(fovs), num_rows, num_cols, len(channels)),
                        dtype=dtype)

    for fov_idx, fov_name in enumerate(fovs):
        for chan_idx, chan_name in enumerate(channels):
            loaded = io.imread(_img_path(fov_name, chan_name))
            if variable_sizes:
                # smaller images leave the remainder of the slot zero-padded
                img_data[fov_idx, :loaded.shape[0], :loaded.shape[1],
                         chan_idx] = loaded
            else:
                img_data[fov_idx, :, :, chan_idx] = loaded

    # negative values are treated as a sign that dtype was too small for the data
    if img_data.min() < 0:
        raise ValueError(
            "Integer overflow from loading TIF image, try a larger dtype")

    # channel coordinate is the file name without its extension
    img_names = [os.path.splitext(chan_name)[0] for chan_name in channels]

    return xr.DataArray(
        img_data,
        coords=[fovs, range(num_rows), range(num_cols), img_names],
        dims=["fovs", "rows", "cols", "channels"])
    io.imsave(os.path.join(save_dir, 'Carbon.tiff'), carbon.astype('float32'))

# Figure 4b
# extract data and create outlines for visualization
# (visible here: the saved cell and nuclear segmentation label arrays are opened from data_dir)
cell_labels = xr.open_dataarray(
    os.path.join(data_dir, 'segmentation_labels_cell.xr'))
nuc_labels = xr.open_dataarray(
    os.path.join(data_dir, 'segmentation_labels_nuc.xr'))

# compute subcellular localization of imaging signal
# channel image file names; NOTE(review): not used in the visible part of the script
img_list = [
    'CD44.tif', 'COX2.tif', 'ECAD.tif', 'GLUT1.tif', 'HER2.tif', 'HH3.tif',
    'Ki67.tif', 'P.tif', 'PanKRT.tif', 'pS6.tif'
]

# NOTE(review): `folders` and `fovs` below come from two identical calls;
# confirm whether both names are really needed
folders = io_utils.list_folders(data_dir, 'Point')

fovs = io_utils.list_folders(data_dir, 'Point')

# Since each point has different channels, we need to segment them one at a time
# combine the cell and nuclear label values along the last axis into one array whose
# last coordinate is ['whole_cell', 'nuclear']; remaining coords and dims come from cell_labels
segmentation_labels = xr.DataArray(np.concatenate(
    (cell_labels.values, nuc_labels.values), axis=-1),
                                   coords=[
                                       cell_labels.fovs, cell_labels.rows,
                                       cell_labels.cols,
                                       ['whole_cell', 'nuclear']
                                   ],
                                   dims=cell_labels.dims)

# start from empty DataFrames; presumably filled in by later code not visible here
compartment_df_pred_raw = pd.DataFrame()
compartment_df_pred_norm = pd.DataFrame()