def test_list_folders():
    """Check ``iou.list_folders`` with the default, a single-string and a list ``substrs``.

    A temporary directory is populated with four sub-directories and one regular file.
    The expected results below imply that only directories are returned, and that the
    optional ``substrs`` argument filters them by substring.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # set up temp_dir subdirs
        dirnames = [
            'tf_txt',
            'othertf_txt',
            'test_csv',
            'test_out',
        ]
        for dirname in dirnames:
            os.mkdir(os.path.join(temp_dir, dirname))

        # add extra file, which must never show up in the folder listings below
        pathlib.Path(os.path.join(temp_dir, 'test_badfile.txt')).touch()

        # NOTE: list.sort() sorts in place and returns None, so comparing the results of
        # two .sort() calls is always `None == None`. sorted() returns the sorted list,
        # which makes these assertions actually compare the folder names.

        # test substrs is None (default)
        get_all = iou.list_folders(temp_dir)
        assert sorted(get_all) == sorted(dirnames)

        # test substrs is not list (single string)
        get_txt = iou.list_folders(temp_dir, substrs='_txt')
        assert sorted(get_txt) == sorted(dirnames[0:2])

        # test substrs is list
        get_test_and_other = iou.list_folders(temp_dir, substrs=['test_', 'other'])
        assert sorted(get_test_and_other) == sorted(dirnames[1:])
def compute_complete_expression_matrices(segmentation_labels, tiff_dir, img_sub_folder,
                                         is_mibitiff=False, points=None, batch_size=5):
    """
    This function takes the segmented data and computes the expression matrices batch-wise
    while also validating inputs

    Inputs:
        segmentation_labels (xarray): an xarray with the segmented data
        tiff_dir (str): the name of the directory which contains the single_channel_inputs
        img_sub_folder (str): the name of the folder where the TIF images are located
        is_mibitiff (bool): a flag to indicate whether or not the base images are MIBItiffs
        points (list): a list of points we wish to analyze, if None will default to all points.
            The list passed in is not modified.
        batch_size (int): how large we want each of the batches of points to be when computing,
            adjust as necessary for speed and memory considerations

    Returns:
        combined_cell_size_normalized_data (pandas): a DataFrame containing the size_norm
            transformed data
        combined_arcsinh_transformed_data (pandas): a DataFrame containing the arcsinh
            transformed data

    Raises:
        ValueError: if any requested point is not present in segmentation_labels['fovs']
    """

    # if no points are specified, then load all the points
    if points is None:
        # handle mibitiffs with an assumed file structure
        if is_mibitiff:
            filenames = io_utils.list_files(tiff_dir, substrs=['.tif'])
            points = io_utils.extract_delimited_names(filenames, delimiter=None)
        # otherwise assume the tree-like directory as defined for tree loading
        else:
            points = io_utils.list_folders(tiff_dir)

    # check segmentation_labels for given points (img loaders will fail otherwise)
    known_fovs = segmentation_labels['fovs'].values
    missing_points = [point for point in points if point not in known_fovs]
    if missing_points:
        raise ValueError(
            f"Invalid point values specified: "
            f"points {','.join(missing_points)} not found in segmentation_labels fovs"
        )

    # get full filenames from given points
    filenames = sorted(io_utils.list_files(tiff_dir, substrs=points))

    # sort a copy of the points: an in-place points.sort() would silently reorder the
    # list owned by the caller
    points = sorted(points)

    # collect the per-batch results and concatenate once at the end; DataFrame.append is
    # deprecated (removed in pandas 2.0) and re-copies the accumulated frame on every call
    cell_size_normalized_batches = []
    arcsinh_transformed_batches = []

    # iterate over all the batches; points and filenames are sliced with the same window
    for start in range(0, len(points), batch_size):
        batch_names = points[start:start + batch_size]
        batch_files = filenames[start:start + batch_size]

        # extract the image data for each batch
        if is_mibitiff:
            image_data = data_utils.load_imgs_from_mibitiff(
                data_dir=tiff_dir, mibitiff_files=batch_files)
        else:
            image_data = data_utils.load_imgs_from_tree(
                data_dir=tiff_dir, img_sub_folder=img_sub_folder, fovs=batch_names)

        # as well as the labels corresponding to each of them
        current_labels = segmentation_labels.loc[batch_names, :, :, :]

        # segment the imaging data
        cell_size_normalized_data, arcsinh_transformed_data = generate_expression_matrix(
            segmentation_labels=current_labels, image_data=image_data)

        cell_size_normalized_batches.append(cell_size_normalized_data)
        arcsinh_transformed_batches.append(arcsinh_transformed_data)

    # pd.concat raises on an empty list, so keep returning empty frames when there was
    # nothing to process
    if not cell_size_normalized_batches:
        return pd.DataFrame(), pd.DataFrame()

    combined_cell_size_normalized_data = pd.concat(cell_size_normalized_batches)
    combined_arcsinh_transformed_data = pd.concat(arcsinh_transformed_batches)

    return combined_cell_size_normalized_data, combined_arcsinh_transformed_data
def load_imgs_from_tree(data_dir, img_sub_folder=None, fovs=None, channels=None,
                        dtype="int16", variable_sizes=False, max_image_size=1024):
    """Takes a set of imgs from a directory structure and loads them into an xarray.

    Args:
        data_dir (str): directory containing folders of images
        img_sub_folder (str): optional name of image sub-folder within each fov
        fovs (list): optional list of folders to load imgs from. Default loads all folders
        channels (list): optional list of imgs to load, otherwise loads all imgs
        dtype (str/type): dtype of array which will be used to store values
        variable_sizes (bool): if true, will pad loaded images with zeros to fit into array
        max_image_size (int): side length of the square, zero-padded canvas used when
            variable_sizes is true. Ignored otherwise. Defaults to 1024.

    Returns:
        img_xr (xr.DataArray): xarray with shape [fovs, x_dim, y_dim, tifs]

    Raises:
        ValueError: if no fovs or no images are found, if an image is larger than
            max_image_size while variable_sizes is true, or if the loaded data contains
            negative values (treated as overflow of the requested dtype)
    """

    if fovs is None:
        # get all fovs
        fovs = iou.list_folders(data_dir)
        fovs.sort()

    # applies to explicitly supplied fovs too, so an empty list fails with a clear
    # message instead of an IndexError on fovs[0] below
    if len(fovs) == 0:
        raise ValueError(f"No fovs found in directory, {data_dir}")

    if img_sub_folder is None:
        # no img_sub_folder, change to empty string to read directly from base folder
        img_sub_folder = ""

    # the first fov is used as the reference for channel names, image shape and dtype
    first_fov_dir = os.path.join(data_dir, fovs[0], img_sub_folder)

    # get imgs from first fov if no img names supplied
    if channels is None:
        channels = iou.list_files(first_fov_dir, substrs=['.tif', '.jpg', '.png'])

        # if taking all channels from directory, sort them alphabetically
        channels.sort()
    # otherwise, fill channel names with correct file extension
    elif not all(img.endswith(("tif", "tiff", "jpg", "png")) for img in channels):
        channels = iou.list_files(first_fov_dir, substrs=channels)

    if len(channels) == 0:
        raise ValueError("No images found in designated folder")

    test_img = io.imread(os.path.join(first_fov_dir, channels[0]))

    # check to make sure that float dtype was supplied if image data is float
    data_dtype = test_img.dtype
    if np.issubdtype(data_dtype, np.floating):
        if not np.issubdtype(dtype, np.floating):
            warnings.warn(
                f"The supplied non-float dtype {dtype} was overwritten to {data_dtype}, "
                f"because the loaded images are floats")
            dtype = data_dtype

    # variable-size images share one square zero-padded canvas; otherwise every image
    # is expected to match the shape of the first image that was read
    if variable_sizes:
        num_rows, num_cols = max_image_size, max_image_size
    else:
        num_rows, num_cols = test_img.shape[0], test_img.shape[1]

    img_data = np.zeros((len(fovs), num_rows, num_cols, len(channels)), dtype=dtype)

    for fov_idx, fov in enumerate(fovs):
        for chan_idx, channel in enumerate(channels):
            loaded_img = io.imread(os.path.join(data_dir, fov, img_sub_folder, channel))
            if variable_sizes:
                if loaded_img.shape[0] > num_rows or loaded_img.shape[1] > num_cols:
                    raise ValueError(
                        f"Image {channel} in fov {fov} has shape {loaded_img.shape[:2]}, "
                        f"which exceeds max_image_size={max_image_size}")
                # write into the top-left corner, the remainder stays zero-padded
                img_data[fov_idx, :loaded_img.shape[0], :loaded_img.shape[1], chan_idx] = \
                    loaded_img
            else:
                img_data[fov_idx, :, :, chan_idx] = loaded_img

    # check to make sure that dtype wasn't too small for range of data
    if np.min(img_data) < 0:
        raise ValueError(
            "Integer overflow from loading TIF image, try a larger dtype")

    row_coords, col_coords = range(num_rows), range(num_cols)

    # remove the file extension from each image name
    img_names = [os.path.splitext(img)[0] for img in channels]

    img_xr = xr.DataArray(img_data,
                          coords=[fovs, row_coords, col_coords, img_names],
                          dims=["fovs", "rows", "cols", "channels"])

    return img_xr
io.imsave(os.path.join(save_dir, 'Carbon.tiff'), carbon.astype('float32')) # Figure 4b # extract data and create outlines for visualization cell_labels = xr.open_dataarray( os.path.join(data_dir, 'segmentation_labels_cell.xr')) nuc_labels = xr.open_dataarray( os.path.join(data_dir, 'segmentation_labels_nuc.xr')) # compute subcellular localization of imaging signal img_list = [ 'CD44.tif', 'COX2.tif', 'ECAD.tif', 'GLUT1.tif', 'HER2.tif', 'HH3.tif', 'Ki67.tif', 'P.tif', 'PanKRT.tif', 'pS6.tif' ] folders = io_utils.list_folders(data_dir, 'Point') fovs = io_utils.list_folders(data_dir, 'Point') # Since each point has different channels, we need to segment them one at a time segmentation_labels = xr.DataArray(np.concatenate( (cell_labels.values, nuc_labels.values), axis=-1), coords=[ cell_labels.fovs, cell_labels.rows, cell_labels.cols, ['whole_cell', 'nuclear'] ], dims=cell_labels.dims) compartment_df_pred_raw = pd.DataFrame() compartment_df_pred_norm = pd.DataFrame()