def clustering(self, dist=10):
    """Handle data for finding clusters of cells."""
    kws = {'Dist': dist}  # Maximum distance for considering clustering
    data = None
    # Listing of paths of channels on which clusters are to be found
    cluster_chans = [p for p in self.channelPaths for t in Sett.cluster_channels
                     if t.lower() == p.stem.lower()]
    for path in cluster_chans:  # Loop paths, read file, and find clusters
        try:
            data = system.read_data(path, header=0)
        except (FileNotFoundError, AttributeError):
            msg = "No file for channel {}".format(path.stem)
            lg.logprint(LAM_logger, "{}: {}".format(self.name, msg), 'w')
            print("-> {}".format(msg))
        # Discard earlier versions of found clusters, if present
        if data is not None:
            data = data.loc[:, ~data.columns.str.contains('ClusterID')]
            data.name = path.stem  # The name of the clustering channel
            # Find clusters
            self.find_distances(data, vol_incl=Sett.cl_inclusion,
                                compare=Sett.cl_incl_type, clusters=True, **kws)
def get_mps(self, mp_name: str, use_mp: bool, datadir: pl.Path) -> pd.Series:
    """Collect MPs for sample anchoring."""
    mp_bin = None  # Ensure a defined return value if MP data turns out empty
    if use_mp:
        try:
            # Get measurement point for anchoring
            mp_dir_path = next(self.channelpaths.pop(i) for i, s in
                               enumerate(self.channelpaths)
                               if str('_' + mp_name + '_') in str(s))
            mp_path = next(mp_dir_path.glob("*Position.csv"))
            mp_data = system.read_data(mp_path, header=Sett.header_row, test=False)
            mp_data = mp_data.loc[:, ['Position X', 'Position Y']]
            if not mp_data.empty:
                mp_bin = self.project_mps(mp_data, datadir, filename="MPs.csv")
                mp_df = pd.DataFrame({'MP': mp_bin.values.codes})
                mp_df.to_csv(self.sampledir.joinpath("MPs.csv"), index=False)
        except (StopIteration, ValueError, UnboundLocalError):
            mp_bin = None
            msg = f'Could not find MP position for {self.name}'
            lg.logprint(LAM_logger, msg, 'e')
            print(" -> Failed to find MP position data.")
    else:
        # Set measurement point values to zero when MPs are not used
        mp_bin = pd.Series(0, name=self.name)
        system.save_to_file(mp_bin, datadir, "MPs.csv")
        system.save_to_file(mp_bin, self.sampledir, "MPs.csv", append=False)
    return mp_bin
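# Illustrative sketch, not part of the module: project_mps() is defined
# elsewhere on this class. Conceptually, the MP coordinates are projected onto
# the sample's vector and the projected position is assigned to a projection
# bin, whose categorical code is then written to MPs.csv above. A minimal,
# hypothetical version of that final binning step, assuming a position already
# normalized to [0, 1] along the vector:
import numpy as np
import pandas as pd

def position_to_bin(norm_position: float, n_bins: int) -> int:
    """Assign a normalized position along the vector to a projection bin."""
    edges = np.linspace(0, 1, n_bins + 1)
    binned = pd.cut([norm_position], bins=edges, include_lowest=True)
    return int(binned.codes[0])  # Categorical code, as used for the 'MP' column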
def __init__(self, paths=None, child=False):
    if child:
        return
    # Create variables related to all samples; these are later passed on to
    # child classes.
    Samplegroups._groups = sorted(Store.samplegroups)
    Samplegroups._chanPaths = list(paths.datadir.glob('Norm_*'))
    Samplegroups.sample_paths = [p for p in paths.samplesdir.iterdir()
                                 if p.is_dir()]
    Samplegroups._addData = list(paths.datadir.glob('Avg_*'))
    # Data and other usable directories
    Samplegroups.paths = paths
    # Total length of the data matrix needed to hold all anchored samples
    Samplegroups.bin_length = Store.totalLength
    # Get MPs of all samples
    mp_path = paths.datadir.joinpath('MPs.csv')
    Samplegroups.sample_mps = system.read_data(mp_path, header=0, test=False)
    # If the anchor point index is defined, store the start index of samples
    if Store.center is not None:
        Samplegroups.center_bin = Store.center
    # Assign a color for each sample group
    groupcolors = sns.xkcd_palette(Sett.palette_colors)
    for i, grp in enumerate(Samplegroups._groups):
        Samplegroups.grp_palette.update({grp: groupcolors[i]})
    lg.logprint(LAM_logger, 'Sample groups established.', 'i')
def avg_add_data(self, paths: system.Paths, data_names: dict, total_len: int):
    """Find bin averages of additional data."""
    samples = self.starts.index
    for sample in samples:
        sample_dir = paths.samplesdir.joinpath(sample)
        data_file = sample_dir.glob(str(self.channel + '.csv'))
        data = system.read_data(next(data_file), header=0)
        for data_type in data_names.keys():
            sample_data = data.loc[:, data.columns.str.contains(str(data_type))]
            if sample_data.empty:
                continue
            binned_data = data.loc[:, 'DistBin']
            bins = np.arange(0, Sett.projBins)
            for col in sample_data:
                avg_s = pd.Series(np.full(total_len, np.nan), name=sample)
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore', category=RuntimeWarning)
                    insert = [np.nanmean(sample_data.loc[binned_data == i, col])
                              for i in bins]
                insert = [0 if np.isnan(v) else v for v in insert]
                start = int(self.starts.at[sample])
                end = int(start + Sett.projBins)
                avg_s[start:end] = insert
                filename = str('Avg_{}_{}.csv'.format(self.channel, col))
                system.save_to_file(avg_s, paths.datadir, filename)
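# Illustrative sketch, not part of the module: the loop above averages each
# additional-data column per 'DistBin' and writes the result into a full-length
# series starting at the sample's anchor offset. A standalone equivalent of
# that bin-averaging step using a pandas groupby (the column name passed in,
# e.g. 'Intensity Mean', is a hypothetical example):
import numpy as np
import pandas as pd

def bin_average(data: pd.DataFrame, col: str, n_bins: int,
                total_len: int, start: int) -> pd.Series:
    """Average `col` per 'DistBin' and anchor the result into a longer series."""
    per_bin = data.groupby('DistBin')[col].mean()
    per_bin = per_bin.reindex(range(n_bins)).fillna(0)  # empty bins become zero
    full = pd.Series(np.full(total_len, np.nan))
    full[start:start + n_bins] = per_bin.to_numpy()
    return full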
def distance_mean(self, dist=25):
    """Prepare and handle data for cell-to-cell distances."""
    kws = {'Dist': dist}  # Maximum distance used to find cells
    # List paths of channels where distances are to be found
    dist_chans = [p for p in self.channelPaths for t in Sett.distance_channels
                  if t.lower() == p.stem.lower()]

    if Sett.use_target:  # If distances are found against another channel:
        target = Sett.target_chan  # Get the name of the target channel
        try:  # Find the target's data file, read it, and add it to keywords
            file = '{}.csv'.format(target)
            test_namer = re.compile(file, re.I)
            target_path = [p for p in self.channelPaths
                           if test_namer.fullmatch(str(p.name))]
            test_data = system.read_data(target_path[0], header=0)
            kws.update({'test_data': test_data})
        except (FileNotFoundError, IndexError):
            msg = "No file for channel {}".format(target)
            lg.logprint(LAM_logger, "{}: {}".format(self.name, msg), 'w')
            print("-> {}".format(msg))
            return

    # Loop through the channels, read the data, and find distances
    for path in dist_chans:
        try:
            data = system.read_data(path, header=0)
        except FileNotFoundError:
            msg = "No file for channel {}".format(path.stem)
            lg.logprint(LAM_logger, "{}: {}".format(self.name, msg), 'w')
            print("-> {}".format(msg))
            return
        # Discard earlier versions of calculated distances, if present
        data = data.loc[:, ~data.columns.str.startswith('Nearest_')]
        # Find distances
        data.name = path.stem
        self.find_distances(data, vol_incl=Sett.inclusion,
                            compare=Sett.incl_type, **kws)
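# Illustrative sketch, not part of the module: find_distances() is implemented
# elsewhere in this class. The core operation it performs, finding each cell's
# nearest neighbour within a maximum distance, can be sketched with SciPy's
# KD-tree. The position column names below are assumptions for the example; for
# distances within a single channel the query would need k=2 to skip the cell
# itself.
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

def nearest_within(data: pd.DataFrame, target: pd.DataFrame,
                   max_dist: float) -> pd.Series:
    """Distance from each cell in `data` to its nearest cell in `target`.

    Cells with no neighbour within `max_dist` get NaN.
    """
    cols = ['Position X', 'Position Y', 'Position Z']
    tree = cKDTree(target[cols].to_numpy())
    dist, _ = tree.query(data[cols].to_numpy(), distance_upper_bound=max_dist)
    return pd.Series(np.where(np.isinf(dist), np.nan, dist), index=data.index)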
def read_channel(self, path):
    """Read channel data into a dataframe."""
    try:
        data = system.read_data(str(path), header=Sett.header_row)
        channel = self.name
        if (channel.lower() not in [c.lower() for c in Store.channels]
                and channel.lower() != Sett.MPname.lower()):
            Store.channels.append(self.name)
        return data
    except ValueError:
        lg.logprint(LAM_logger, 'Cannot read channel path {}'.format(path), 'ex')
def __init__(self, path, groups, plot_dir, stat_dir):
    self.dataerror = False
    self.error_vars = {}
    self.plot_dir = plot_dir
    self.stat_dir = stat_dir
    self.filename = path.stem
    self.data = system.read_data(path, header=0, test=False, index_col=0)
    # Test that data exists
    if self.data is None or self.data.empty:
        self.dataerror = True
    self.groups = groups
    self.test_grps = [g for g in groups if g != Sett.cntrlGroup]
    self.stat_data = None
def mww_test(self, channel_path):
    """Perform MWW-test for a data set of two groups."""
    self.error = False
    self.channel = ' '.join(str(channel_path.stem).split('_')[1:])
    data = system.read_data(channel_path, header=0, test=False)
    # Test that data exists and has non-zero numeric values
    cols = data.any().index
    valid_data = data.loc[:, cols]
    valid_grp_n = cols.map(lambda x: str(x).split('_')[0]).unique().size
    if not valid_data.any().any() or valid_grp_n < 2:
        self.error = True
    # Find group-specific data
    grp_data = valid_data.T.groupby(lambda x: str(x).split('_')[0])
    try:
        self.ctrl_data = grp_data.get_group(self.ctrl_grp).T
        self.test_data = grp_data.get_group(self.test_grp).T
    except KeyError:  # If sample group not found, i.e. no sample has data
        self.error = True
    if self.error:
        print(f"WARNING: {self.channel} - Insufficient data, skipped.")
        return

    stat_cols = ['U Score', 'Corr. Greater', 'P Greater', 'Reject Greater',
                 'Corr. Lesser', 'P Lesser', 'Reject Lesser',
                 'Corr. Two-sided', 'P Two-sided', 'Reject Two-sided']
    stat_data = pd.DataFrame(index=data.index, columns=stat_cols)
    if Sett.windowed:  # If doing rolling window stats
        stat_data = self.windowed_test(stat_data)
    else:  # Bin-by-bin stats
        stat_data = self.bin_test(stat_data)
    # Correct for multiple testing:
    stat_data = correct(stat_data, stat_data.iloc[:, 2], 1, 3)  # greater
    stat_data = correct(stat_data, stat_data.iloc[:, 5], 4, 6)  # lesser
    stat_data = correct(stat_data, stat_data.iloc[:, 8], 7, 9)  # two-sided
    # Save statistics
    filename = f'Stats_{self.title} = {self.channel}.csv'
    system.save_to_file(stat_data, self.stat_dir, filename, append=False)
    self.stat_data = stat_data
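# Illustrative sketch, not part of the module: bin_test(), windowed_test(), and
# correct() are implemented elsewhere. The per-bin statistic is a Mann-Whitney U
# test between the two sample groups followed by a multiple-testing correction.
# A minimal standalone version for the two-sided case, assuming FDR
# (Benjamini-Hochberg) as the correction method:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

def mww_per_bin(ctrl: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
    """Row-wise (bin-wise) two-sided MWW test between two sample groups."""
    pvals = []
    for i in ctrl.index:
        c, t = ctrl.loc[i].dropna(), test.loc[i].dropna()
        if c.empty or t.empty:
            pvals.append(np.nan)
            continue
        pvals.append(mannwhitneyu(c, t, alternative='two-sided').pvalue)
    res = pd.DataFrame({'P Two-sided': pvals}, index=ctrl.index)
    mask = res['P Two-sided'].notna()
    if mask.any():
        rej, corr, *_ = multipletests(res.loc[mask, 'P Two-sided'], method='fdr_bh')
        res.loc[mask, 'Corr. Two-sided'] = corr
        res.loc[mask, 'Reject Two-sided'] = rej
    return res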
def get_vect_data(self, channel):
    """Get channel data that is used for vector creation."""
    try:
        # Search string:
        namer = str("_{}_".format(channel))
        namerreg = re.compile(namer, re.I)
        # Search found paths with the string
        dir_path = [self.channelpaths[i] for i, s in enumerate(self.channelpaths)
                    if namerreg.search(str(s))][0]
        vect_path = next(dir_path.glob('*Position.csv'))
        vect_data = system.read_data(vect_path, header=Sett.header_row)  # Read data
    except (FileNotFoundError, IndexError):  # If data file not found
        msg = 'No valid datafile for vector creation.'
        if LAM_logger is not None:
            lg.logprint(LAM_logger, msg, 'w')
        print('-> {}'.format(msg))
        vect_data = None
    return vect_data
def read_channel(self, path, groups, drop=False, name_sep=1):
    """Read channel data and concatenate sample group info into DF."""
    data = system.read_data(path, header=0, test=False)
    read_data = pd.DataFrame()
    # Loop through the given groups and give an identification variable to
    # each sample belonging to the group.
    for grp in groups:
        namerreg = re.compile('^{}_'.format(grp), re.I)
        # Get only the samples that belong to the loop's current group
        temp = data.loc[:, data.columns.str.contains(namerreg)].T
        if Sett.Drop_Outliers and drop:  # Conditionally drop outliers
            temp = drop_outlier(temp)
        temp['Sample Group'] = grp  # Assign sample group identification
        if read_data.empty:
            read_data = temp
        else:
            read_data = pd.concat([read_data, temp])
    # Find the name of the data under analysis from its filepath
    name = '_'.join(str(path.stem).split('_')[name_sep:])
    center = self.center_bin  # The bin to which samples are centered
    return read_data, name, center
def get_widths(samplesdir, datadir):
    """Find widths of samples along their vectors."""
    msg = "Necessary files for width approximation not found for "

    for path in [p for p in samplesdir.iterdir() if p.is_dir()]:
        data, vector_data = None, None
        # Find necessary data files:
        files = [p for p in path.iterdir() if p.is_file()]
        # Search terms
        vreg = re.compile('^vector.', re.I)  # vector
        dreg = re.compile(f'^{Sett.vectChannel}.csv', re.I)  # channel data
        try:  # Match terms to found paths
            vect_paths = [p for p in files if vreg.match(p.name)]
            data_paths = [p for p in files if dreg.match(p.name)]
            # Read found paths
            vector_data = system.read_vector(vect_paths)
            data = system.read_data(data_paths[0], header=0)
        # Error handling
        except (StopIteration, IndexError):
            full_msg = msg + path.name
            print(f"WARNING: {full_msg}")
            if vector_data is None:  # if vector not found
                print("-> Could not read vector data.")
                continue
            if data is None:  # if channel data not found
                print("-> Could not read channel data")
                print("Make sure the vector channel is set correctly\n")
                continue
            lg.logprint(LAM_logger, full_msg, 'w')
        # Compute widths
        process.DefineWidths(data, vector_data, path, datadir)
def get_counts(paths):
    """Handle data to anchor samples and find cell counts."""
    try:  # Test that MPs are found for the samples
        mps = system.read_data(next(paths.datadir.glob('MPs.csv')), header=0,
                               test=False)
    except (FileNotFoundError, StopIteration):
        msg = "MPs.csv NOT found!"
        print("ERROR: {}".format(msg))
        lg.logprint(LAM_logger, msg, 'c')
        msg = "-> Perform 'Count' before continuing.\n"
        print("{}".format(msg))
        lg.logprint(LAM_logger, msg, 'i')
        raise SystemExit

    # Find the smallest and largest anchor bin-numbers of the dataset
    mp_max, mp_min = mps.max(axis=1).values[0], mps.min(axis=1).values[0]
    # Store the bin number of the row onto which samples are anchored
    Store.center = mp_max
    # Find the size of the needed dataframe, i.e. one that fits all anchored samples
    mp_diff = mp_max - mp_min

    if not any([Sett.process_counts, Sett.process_samples]):
        # Find all sample groups in the analysis from the found MPs.
        found_samples = [p for p in paths.samplesdir.iterdir() if p.is_dir()]
        samples = mps.columns.tolist()
        if len(found_samples) != len(samples):  # Test whether sample numbers match
            msg = "Mismatch of sample N between MPs.csv and sample folders"
            print('WARNING: {}'.format(msg))
            lg.logprint(LAM_logger, msg, 'w')
        groups = set({s.casefold(): s.split('_')[0] for s in samples}.values())
        Store.samplegroups = sorted(groups)
        Store.channels = [c.stem.split('_')[1]
                          for c in paths.datadir.glob("All_*.csv")]
        try:
            # If required lengths of matrices haven't been defined because
            # Process and Count are both False, get the sizes from files.
            chan = Sett.vectChannel
            path = paths.datadir.joinpath("Norm_{}.csv".format(chan))
            temp = system.read_data(path, test=False, header=0)
            Store.totalLength = temp.shape[0]  # Length of anchored matrices
            path = paths.datadir.joinpath("All_{}.csv".format(chan))
            temp = system.read_data(path, test=False, header=0)
            Sett.projBins = temp.shape[0]
        except AttributeError:
            msg = ("Cannot determine length of sample matrix\n"
                   "-> Must perform 'Count' before continuing.")
            lg.logprint(LAM_logger, msg, 'c')
            print("ERROR: {}".format(msg))
        return

    # The total length of the needed matrix when using 'Count'
    Store.totalLength = int(Sett.projBins + mp_diff)

    # Counting and anchoring of data:
    if Sett.process_counts:
        lg.logprint(LAM_logger, 'Begin normalization of channels.', 'i')
        print('\n---Normalizing sample data---')
        # Get combined channel files of all samples
        countpaths = paths.datadir.glob('All_*')
        for path in countpaths:
            name = str(path.stem).split('_')[1]
            print(' {} ...'.format(name))
            # Anchor the sample's data to the full data matrix
            ch_counts = Normalize(path)
            ch_counts.starts, norm_counts = ch_counts.normalize_samples(
                mps, Store.totalLength, Store.center)
            # Get average bin counts
            ch_counts.averages(norm_counts)
            # Get averages of additional data per bin
            ch_counts.avg_add_data(paths, Sett.AddData, Store.totalLength)

        # Approximate the width of samples
        if Sett.measure_width:
            print(' Width ...')
            width_path = paths.datadir.joinpath('Sample_widths.csv')
            width_counts = Normalize(width_path)
            _, _ = width_counts.normalize_samples(
                mps * 2, Store.totalLength * 2, Store.center * 2,
                name='Sample_widths_norm')
        lg.logprint(LAM_logger, 'Channels normalized.', 'i')
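# Illustrative sketch, not part of the module: normalize_samples() is
# implemented in the Normalize class elsewhere. The anchoring idea is to shift
# every sample's binned counts so that all anchor bins (MPs) line up on the
# same row of a common, longer matrix of length projBins + (mp_max - mp_min).
# A minimal sketch, assuming equal-length per-sample count columns and one MP
# bin per sample given as a Series:
import numpy as np
import pandas as pd

def anchor_samples(counts: pd.DataFrame, mps: pd.Series, total_len: int,
                   center: int) -> pd.DataFrame:
    """Place each sample column so that its MP bin lands on row `center`."""
    out = pd.DataFrame(np.nan, index=range(total_len), columns=counts.columns)
    for sample in counts.columns:
        start = center - int(mps[sample])  # start row of this sample's data
        values = counts[sample].to_numpy()
        out.iloc[start:start + len(values), out.columns.get_loc(sample)] = values
    return out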
def __init__(self, path):
    self.path = pl.Path(path)
    self.channel = str(self.path.stem).split('_')[1]
    self.counts = system.read_data(path, header=0, test=False)
    self.starts = None
def read_additional(self, data_keys):
    """Read relevant additional data of channel."""

    def _test_variance(data):
        """Test if additional data column contains variance."""
        for column in data.columns.difference(['ID']):
            test = data.loc[:, column].dropna()
            test = (test - test.min()) / test.max()
            if test.std() < 0.01:
                self.datafail.append(column)
                data.loc[:, column] = np.nan
        return data

    def _rename_id(data):
        """Rename filename identification of channel."""
        # I.e. as defined by settings.channelID
        for column in data.columns:
            id_str = str(column).split('_')[-1]
            if id_str in Sett.channelID.keys():
                new_id = Sett.channelID.get(id_str)
                data.rename(columns={column: column.replace(f'_{id_str}',
                                                            f'-{new_id}')},
                            inplace=True)
        return data

    add_data = pd.DataFrame(self.data.loc[:, 'ID'])
    for key, values in data_keys.items():
        paths = list(self.path.glob(f'*{values[0]}*'))
        if not paths:
            print(f"-> {self.name} {key} file not found")
            continue
        if len(paths) == 1:
            namer = re.compile(f'^{key}', re.I)
            if paths[0] == self.pospath and any(
                    self.data.columns.str.contains(namer)):
                continue
            if paths[0] == self.pospath and not any(
                    self.data.columns.str.contains(namer)):
                print(f"'{key}' not in {self.pospath.name} of "
                      f"{self.sample.name} on channel {self.name}")
            temp_data = system.read_data(str(paths[0]), header=Sett.header_row)
            cols = temp_data.columns.map(lambda x, matcher=namer:
                                         bool(re.match(matcher, x)) or x == 'ID')
            temp_data = temp_data.loc[:, cols]
            add_data = pd.merge(add_data, temp_data, on='ID')
        else:  # If multiple files, e.g. intensity, get all
            for path in paths:
                # Search identifier for the column from filename
                strings = str(path.stem).split(f'{values[0]}_')
                id_string = strings[1].split('_')[0]
                # Locate columns
                temp_data = system.read_data(str(path), header=Sett.header_row)
                temp_data = temp_data.loc[:, [key, 'ID']]
                for col in [c for c in temp_data.columns if c != 'ID']:
                    rename = str(col + '_' + id_string)
                    temp_data.rename(columns={key: rename}, inplace=True)
                add_data = pd.merge(add_data, temp_data, on='ID')
    # Drop invariant data
    add_data = _test_variance(add_data)
    if Sett.replaceID:
        add_data = _rename_id(add_data)
    self.data = pd.merge(self.data, add_data, on='ID')