def load_features_and_labels(path, feature_table_name='', label_table_name='', outlier_table_name=None):
    """Load the marker feature matrix and infected labels from tables in one file.

    Features are all columns whose name contains 'marker'. If an outlier table
    name is given, rows flagged as outliers are removed from features and labels.
    Returns (features, labels, feature_names).
    """
    with open_file(path, 'r') as f:
        cols, table = read_table(f, feature_table_name)
    label_ids = table[:, cols.index('label_ids')]

    # every column containing 'marker' in its name is treated as a feature
    feat_names, feat_ids = [], []
    for col_id, col_name in enumerate(cols):
        if 'marker' in col_name:
            feat_names.append(col_name)
            feat_ids.append(col_id)
    feats = table[:, feat_ids]

    with open_file(path, 'r') as f:
        cols, table = read_table(f, label_table_name)
    # feature and label tables must describe the same set of cells
    assert np.array_equal(label_ids, table[:, cols.index('label_ids')])
    labels = table[:, cols.index('infected_labels')]

    # filter out outliers from the training data if we have an outlier table
    if outlier_table_name is not None:
        with open_file(path, 'r') as f:
            cols, table = read_table(f, outlier_table_name)
        keep = table[:, cols.index('is_outlier')] != 1
        feats, labels = feats[keep], labels[keep]

    return feats, labels, feat_names
def export_default_table(table_file, table_name, output_path, output_format=None, skip_existing=True):
    """Export a single table to `output_path`.

    `table_file` may be a path or an already opened file object. If
    `skip_existing` is set and the output already exists, nothing is done.
    """
    if skip_existing and os.path.exists(output_path):
        return
    if isinstance(table_file, str):
        # got a path -> open the file ourselves
        with open_file(table_file, 'r') as f:
            columns, table = read_table(f, table_name)
    else:
        # already an opened file object
        columns, table = read_table(table_file, table_name)
    export_table(columns, table, output_path, output_format)
def copy_labels(in_path, out_path):
    """Copy the label table from `in_path` into `out_path`.

    Bug fix: the previous version read the table but then called
    `write_table(f)` without passing the key, column names or table data,
    so nothing was ever written (and the call would fail on the missing
    required arguments).
    """
    in_table_key = ''
    with open_file(in_path, 'r') as f:
        col_names, table = read_table(f, in_table_key)
    with open_file(out_path, 'a') as f:
        # NOTE(review): argument order assumed to mirror read_table's
        # (file, key, columns, table) — confirm against write_table's definition.
        write_table(f, in_table_key, col_names, table)
def _load_table(path, columns=None):
    """Load one image table and prepend plate / well / site and anchor columns.

    NOTE(review): relies on names from the enclosing scope / module:
    `table_name`, `props_table_name`, `initial_columns`, `plate_name`,
    `image_name_to_well_name`, `image_name_to_site_name` — none are visible here.

    Returns the assembled table; if `columns` was None, returns
    (table, columns) instead so the caller learns the column layout.
    """
    # remember whether the caller asked us to derive the column names
    return_cols = columns is None
    with open_file(path, 'r') as f:
        this_columns, this_table = read_table(f, table_name)
        has_props_table = has_table(f, props_table_name)
        if has_props_table:
            # anchor (x, y) coordinates come from the properties table
            prop_cols, prop_table = read_table(f, props_table_name)
            anchors = np.concatenate([prop_table[:, prop_cols.index('anchor_x')][:, None],
                                      prop_table[:, prop_cols.index('anchor_y')][:, None]], axis=1)
        else:
            # no properties table -> fill both anchor columns with None
            n_cells = len(this_table)
            anchors = np.array(2 * [n_cells * [None]]).T
    if columns is None:
        columns = initial_columns + this_columns
    if len(anchors) != len(this_table):
        # props table has extra rows: keep only anchors whose label id also
        # occurs in the main table
        assert has_props_table
        label_ids1 = this_table[:, this_columns.index('label_id')]
        label_ids2 = prop_table[:, prop_cols.index('label_id')]
        assert len(label_ids2) > len(label_ids1)
        # we assume it's sorted by label ids !
        keep_anchors = np.isin(label_ids2, label_ids1)
        anchors = anchors[keep_anchors]
    # derive well and site identifiers from the image file name
    image_name = os.path.splitext(os.path.split(path)[1])[0]
    well_name = image_name_to_well_name(image_name)
    site_name = image_name_to_site_name(image_name)
    # constant per-row identifier columns
    plate_col = np.array([plate_name] * len(this_table))
    well_col = np.array([well_name] * len(this_table))
    site_col = np.array([site_name] * len(this_table))
    res_table = np.concatenate([plate_col[:, None], well_col[:, None], site_col[:, None],
                                anchors, this_table], axis=1)
    if return_cols:
        return res_table, columns
    else:
        return res_table
def load_labels(path):
    """Return the proofread infected labels stored in `path`, or None if absent."""
    key = 'proofread_infected_labels'
    with h5py.File(path, 'r') as f:
        if not has_table(f, key):
            print("Did not find proofread labels for", path)
            return None
        print("Loading proofread labels for", path)
        _, infected_labels = read_table(f, key)
    # the second column holds the actual label values
    return infected_labels[:, 1]
def cell_outliers_per_plate(plate):
    """Count cells and flagged cell outliers over all images of one plate.

    Returns (n_cells, n_outliers).
    """
    outlier_tab_name = 'cell_segmentation_serum_IgG/serum_IgG_outliers'
    n_total = 0
    n_outliers = 0
    for image_file in glob(os.path.join(plate, '*.h5')):
        with h5py.File(image_file, 'r') as f:
            cols, tab = read_table(f, outlier_tab_name)
        n_total += len(tab)
        n_outliers += (tab[:, cols.index('is_outlier')] == 1).sum()
    return n_total, n_outliers
def get_well_bg_fractions(name, root):
    """Read the per-well background table of a plate.

    Returns ({well: serum_IgG_median}, {well: background_fraction}).
    """
    table_path = os.path.join(root, name, f'{name}_table.hdf5')
    with h5py.File(table_path, 'r') as f:
        col_names, table = read_table(f, 'wells/backgrounds')
    wells = table[:, col_names.index('well_name')]
    fractions = table[:, col_names.index('background_fraction')]
    values = table[:, col_names.index('serum_IgG_median')]
    return dict(zip(wells, values)), dict(zip(wells, fractions))
def get_actual_bg_well(name, root, channel_dict, bg_table_key='plate/backgrounds_from_min_well'):
    """Look up the background well used for each channel from the plate table.

    `channel_dict` maps channel names to the table column that holds the well id.
    Returns {channel_name: well_id}.
    """
    table_path = os.path.join(root, name, f'{name}_table.hdf5')
    with h5py.File(table_path, 'r') as f:
        col_names, table = read_table(f, bg_table_key)
    bg_wells = {}
    for chan_name, well_key in channel_dict.items():
        bg_wells[chan_name] = table[0, col_names.index(well_key)]
    return bg_wells
def quantify_well_or_image(folder, name):
    """Count rows, IgG outliers and manual outliers in the `{name}/default` table.

    Returns (n, n_outliers, n_manual).
    """
    plate_name = os.path.split(folder)[1]
    table_path = os.path.join(folder, f'{plate_name}_table.hdf5')
    with h5py.File(table_path, 'r') as f:
        cols, tab = read_table(f, f'{name}/default')
    n = len(tab)
    n_outliers = (tab[:, cols.index('IgG_is_outlier')] == 1).sum()
    # manually flagged outliers carry 'manual: 1' in their outlier type string
    outlier_type = tab[:, cols.index('IgG_outlier_type')]
    n_manual = sum(1 for otype in outlier_type if 'manual: 1' in otype)
    return n, n_outliers, n_manual
def determine_igm_mad(root):
    """Print the serum_IgM_mad for every IgM plate and their mean (plate2 excluded)."""
    plates = glob(os.path.join(root, '*IgM*'))
    print(plates)
    mad_key = 'serum_IgM_mad'
    mads = []
    for plate in plates:
        plate_name = os.path.split(plate)[1]
        table_path = os.path.join(plate, f'{plate_name}_table.hdf5')
        with open_file(table_path, 'r') as f:
            cols, table = read_table(f, 'plate/backgrounds_min_well')
        mad = table[:, cols.index(mad_key)]
        print(plate_name, ":", mad)
        # plate2 is printed but excluded from the mean
        if 'plate2' not in plate_name:
            mads.append(mad)
    print()
    print("Mean IgM MAD:")
    print(np.mean(mads))
def get_image_edges_and_labels(path, saturation_factor=1.5, edge_width=2, return_seg=False):
    """Load an image as an rgb composite plus cell edge map and infection labels.

    Channels are background subtracted (median over the area outside all cells)
    and the composite is optionally saturated in hsv space. If `return_seg` is
    set, the cell segmentation is returned as well.
    """
    with h5py.File(path, 'r') as f:
        serum = normalize(read_image(f, 'serum_IgG'))
        marker = quantile_normalize(read_image(f, 'marker'))
        nuclei = normalize(read_image(f, 'nuclei'))
        seg = read_image(f, 'cell_segmentation')
        _, infected_labels = read_table(f, 'infected_cell_labels')
        # one label row per segment id (including background), two columns
        assert len(infected_labels) == seg.max() + 1
        assert infected_labels.shape[1] == 2

    # subtract each channel's median intensity over the background area
    bg_mask = seg == 0

    def subtract_bg(raw):
        raw -= np.median(raw[bg_mask])
        return raw

    serum, marker, nuclei = subtract_bg(serum), subtract_bg(marker), subtract_bg(nuclei)

    infected_labels = infected_labels[:, 1]
    edges = get_edge_segmentation(seg, edge_width)

    raw = np.concatenate([marker[..., None], serum[..., None], nuclei[..., None]], axis=-1)
    if saturation_factor > 1:
        # boost the saturation channel in hsv space
        hsv = skc.rgb2hsv(raw)
        hsv[..., 1] *= saturation_factor
        raw = skc.hsv2rgb(hsv).clip(0, 1)

    if return_seg:
        return raw, edges, infected_labels, seg
    return raw, edges, infected_labels
def bg_dict_for_plots(bg_params, table_path):
    """Build a {channel: description} dict explaining how each background was obtained.

    A string entry in `bg_params` names a table with the computed background;
    any other entry is taken to be a fixed background value.
    """
    bg_dict = {}
    with open_file(table_path, 'r') as f:
        for channel_name, bg_param in bg_params.items():
            if not isinstance(bg_param, str):
                # a fixed numerical background value was passed
                bg_dict[channel_name] = f'background fixed to {bg_param}'
                continue
            assert has_table(f, bg_param)
            cols, table = read_table(f, bg_param)
            bg_src = bg_param.split('/')[-1]
            if bg_param == 'plate/backgrounds':
                bg_val = table[0, cols.index(f'{channel_name}_median')]
                bg_msg = f' as {bg_val}'
            elif bg_param == 'plate/backgrounds_min_well':
                bg_val = table[0, cols.index(f'{channel_name}_median')]
                bg_wells = table[0, cols.index(f'{channel_name}_min_well')]
                bg_src = f' the wells {bg_wells}'
                bg_msg = f' as {bg_val}'
            else:
                bg_msg = ''
            bg_dict[channel_name] = f'background computed from {bg_src}{bg_msg}'
    return bg_dict
def export_scores(folder_list, output_path, score_patterns=DEFAULT_SCORE_PATTERNS, table_name='wells/default',
                  metadata_repository=None, filter_outliers=True):
    """Collect the score columns of all plates in `folder_list` into one exported table.

    Columns whose name contains one of `score_patterns` are gathered across all
    plates; columns missing from a plate are padded with None so every plate
    shares the same layout. If `filter_outliers` is set, outlier scores are
    replaced by nan; if `metadata_repository` is given, per-well db metadata
    (cohort info and test results) is appended to each row.
    """
    def name_matches_score(name):
        return any(pattern in name for pattern in score_patterns)

    # first pass: find all column names that match the pattern
    result_columns = []
    for folder in folder_list:
        plate_name = os.path.split(folder)[1]
        table_path = os.path.join(folder, plate_name + '_table.hdf5')
        if not os.path.exists(table_path):
            raise RuntimeError(f"Did not find a result table @ {table_path}")
        with open_file(table_path, 'r') as f:
            col_names, _ = read_table(f, table_name)
        assert 'well_name' in col_names
        col_names = [col_name for col_name in col_names if name_matches_score(col_name)]
        result_columns.extend(col_names)
    # union of all matching columns over all plates, prefixed by well / plate ids
    result_columns = ['well_name'] + list(set(result_columns))
    columns = ['plate_name'] + result_columns

    # append cohort_id, elisa results and cohort_type (positive/control/unknow) if we have db
    if metadata_repository is not None:
        db_metadata = ['cohort_id', 'cohort', 'cohort_type'] + TEST_NAMES
        columns += db_metadata

    # second pass: load the tables
    table = []
    for folder in folder_list:
        plate_name = os.path.split(folder)[1]
        table_path = os.path.join(folder, plate_name + '_table.hdf5')
        with open_file(table_path, 'r') as f:
            this_result_columns, this_result_table = read_table(f, table_name)
        if filter_outliers:
            this_result_table = outliers_to_nan(this_result_table, this_result_columns, score_patterns)
        this_len = len(this_result_table)
        plate_col = np.array([plate_name] * this_len)
        # map each result column to its position in this plate's table; -1 marks a missing column
        col_ids = [this_result_columns.index(name) if name in this_result_columns else -1
                   for name in result_columns]
        # missing columns (-1) are filled with a None column of matching length
        this_table = [np.array([None] * this_len)[:, None] if col_id == -1 else
                      this_result_table[:, col_id:col_id + 1]
                      for col_id in col_ids]
        this_table = np.concatenate([plate_col[:, None]] + this_table, axis=1)
        # extend table with the values from DB
        if metadata_repository is not None:
            # column 1 is 'well_name' by construction above
            metadata = _get_db_metadata(this_table[:, 1], metadata_repository, plate_name)
            assert len(metadata) == len(this_table)
            assert metadata.shape[1] == len(db_metadata), f"{metadata.shape[1], len(db_metadata)}"
            this_table = np.concatenate([this_table, metadata], axis=1)
        table.append(this_table)

    logger.info(f'Columns: {columns}')
    table = np.concatenate(table, axis=0)
    export_table(columns, table, output_path)