def clustering(self, dist=10):
        """Handle data for finding clusters of cells."""
        kws = {'Dist': dist}  # Maximum distance for considering clustering
        data = None

        # List the paths of channels on which clusters are to be found
        cluster_chans = [
            p for p in self.channelPaths for t in Sett.cluster_channels
            if t.lower() == p.stem.lower()
        ]
        for path in cluster_chans:  # Loop paths, read file, and find clusters
            try:
                data = system.read_data(path, header=0)
            except (FileNotFoundError, AttributeError):
                msg = "No file for channel {}".format(path.stem)
                lg.logprint(LAM_logger, "{}: {}".format(self.name, msg), 'w')
                print("-> {}".format(msg))
                continue  # Skip channel so data from a previous channel is not reused

            # Discard earlier versions of found clusters, if present
            if data is not None:
                data = data.loc[:, ~data.columns.str.contains('ClusterID')]
                data.name = path.stem  # The name of the clustering channel

                # Find clusters
                self.find_distances(data,
                                    vol_incl=Sett.cl_inclusion,
                                    compare=Sett.cl_incl_type,
                                    clusters=True,
                                    **kws)
 def get_mps(self, mp_name: str, use_mp: bool,
             datadir: pl.Path) -> pd.Series:
     """Collect MPs for sample anchoring."""
     if use_mp:
         try:  # Get measurement point for anchoring
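             # Pop the MP directory from the list of channel paths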
             mp_dir_path = next(
                 self.channelpaths.pop(i)
                 for i, s in enumerate(self.channelpaths)
                 if str('_' + mp_name + '_') in str(s))
             mp_path = next(mp_dir_path.glob("*Position.csv"))
             mp_data = system.read_data(mp_path,
                                        header=Sett.header_row,
                                        test=False)
             mp_data = mp_data.loc[:, ['Position X', 'Position Y']]
             if not mp_data.empty:
                 mp_bin = self.project_mps(mp_data,
                                           datadir,
                                           filename="MPs.csv")
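                 # The categorical codes of the projected MP give its bin index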
                 mp_df = pd.DataFrame({'MP': mp_bin.values.codes})
                 mp_df.to_csv(self.sampledir.joinpath("MPs.csv"),
                              index=False)
         except (StopIteration, ValueError, UnboundLocalError):
             mp_bin = None
             msg = f'could not find MP position for {self.name}'
             lg.logprint(LAM_logger, msg, 'e')
             print("    -> Failed to find MP position data.")
     else:  # Set measurement point values to zero when MPs are not used
         mp_bin = pd.Series(0, name=self.name)
         system.save_to_file(mp_bin, datadir, "MPs.csv")
         system.save_to_file(mp_bin,
                             self.sampledir,
                             "MPs.csv",
                             append=False)
     return mp_bin
    def __init__(self, paths=None, child=False):
        if child:
            return

        # Create variables related to all samples that are later passed on to
        # child classes.
        Samplegroups._groups = sorted(Store.samplegroups)
        Samplegroups._chanPaths = list(paths.datadir.glob('Norm_*'))
        Samplegroups.sample_paths = [
            p for p in paths.samplesdir.iterdir() if p.is_dir()
        ]
        Samplegroups._addData = list(paths.datadir.glob('Avg_*'))

        # Data and other usable directories
        Samplegroups.paths = paths

        # Total length of needed data matrix of all anchored samples
        Samplegroups.bin_length = Store.totalLength

        # Get MPs of all samples
        mp_path = paths.datadir.joinpath('MPs.csv')
        Samplegroups.sample_mps = system.read_data(mp_path,
                                                   header=0,
                                                   test=False)

        # If the anchoring point is defined, store the bin that samples are anchored to
        if Store.center is not None:
            Samplegroups.center_bin = Store.center

        # Assign color for each sample group
        groupcolors = sns.xkcd_palette(Sett.palette_colors)
        for i, grp in enumerate(Samplegroups._groups):
            Samplegroups.grp_palette.update({grp: groupcolors[i]})

        lg.logprint(LAM_logger, 'Sample groups established.', 'i')
 def avg_add_data(self, paths: system.Paths, data_names: dict,
                  total_len: int):
     """Find bin averages of additional data."""
     samples = self.starts.index
     for sample in samples:
         sample_dir = paths.samplesdir.joinpath(sample)
         data_file = sample_dir.glob(str(self.channel + '.csv'))
         data = system.read_data(next(data_file), header=0)
         for data_type in data_names.keys():
              sample_data = data.loc[:, data.columns.str.contains(str(data_type))]
             if sample_data.empty:
                 continue
             binned_data = data.loc[:, 'DistBin']
             bins = np.arange(0, Sett.projBins)
             for col in sample_data:
                 avg_s = pd.Series(np.full(total_len, np.nan), name=sample)
                 with warnings.catch_warnings():
                     warnings.simplefilter('ignore',
                                           category=RuntimeWarning)
                     insert = [
                         np.nanmean(sample_data.loc[binned_data == i, col])
                         for i in bins
                     ]
                     insert = [0 if np.isnan(v) else v for v in insert]
                 start = int(self.starts.at[sample])
                 end = int(start + Sett.projBins)
                 avg_s[start:end] = insert
                 filename = str('Avg_{}_{}.csv'.format(self.channel, col))
                 system.save_to_file(avg_s, paths.datadir, filename)
    def distance_mean(self, dist=25):
        """Prepare and handle data for cell-to-cell distances."""
        kws = {'Dist': dist}  # Maximum distance used to find cells

        # List paths of channels where distances are to be found
        dist_chans = [
            p for p in self.channelPaths for t in Sett.distance_channels
            if t.lower() == p.stem.lower()
        ]

        if Sett.use_target:  # If distances are measured against another channel
            target = Sett.target_chan  # Get the name of the target channel
            try:  # Find target's data file, read, and update data to keywords
                file = '{}.csv'.format(target)
                test_namer = re.compile(re.escape(file), re.I)  # Match the exact file name, case-insensitively
                target_path = [
                    p for p in self.channelPaths
                    if test_namer.fullmatch(str(p.name))
                ]
                test_data = system.read_data(target_path[0], header=0)
                kws.update({'test_data': test_data})
            except (FileNotFoundError, IndexError):
                msg = "No file for channel {}".format(target)
                lg.logprint(LAM_logger, "{}: {}".format(self.name, msg), 'w')
                print("-> {}".format(msg))
                return

        # Loop through the channels, read, and find distances
        for path in dist_chans:
            try:
                data = system.read_data(path, header=0)
            except FileNotFoundError:
                msg = "No file for channel {}".format(path.stem)
                lg.logprint(LAM_logger, "{}: {}".format(self.name, msg), 'w')
                print("-> {}".format(msg))
                return
            # Discard earlier versions of calculated distances, if present
            data = data.loc[:, ~data.columns.str.startswith('Nearest_')]
            # Find distances
            data.name = path.stem
            self.find_distances(data,
                                vol_incl=Sett.inclusion,
                                compare=Sett.incl_type,
                                **kws)
 def read_channel(self, path):
     """Read channel data into a dataframe."""
     try:
         data = system.read_data(str(path), header=Sett.header_row)
         channel = self.name
         if channel.lower() not in [
                 c.lower() for c in Store.channels
         ] and channel.lower() != Sett.MPname.lower():
             Store.channels.append(self.name)
         return data
     except ValueError:
         lg.logprint(LAM_logger, 'Cannot read channel path {}'.format(path),
                     'ex')
    def __init__(self, path, groups, plot_dir, stat_dir):
        self.dataerror = False
        self.error_vars = {}
        self.plot_dir = plot_dir
        self.stat_dir = stat_dir
        self.filename = path.stem
        self.data = system.read_data(path, header=0, test=False, index_col=0)

        # Test that data exists
        if self.data is None or self.data.empty:
            self.dataerror = True

        self.groups = groups
        self.test_grps = [g for g in groups if g != Sett.cntrlGroup]
        self.stat_data = None
    def mww_test(self, channel_path):
        """Perform MWW-test for a data set of two groups."""
        self.error = False
        self.channel = ' '.join(str(channel_path.stem).split('_')[1:])
        data = system.read_data(channel_path, header=0, test=False)

        # Test that data exists and has non-zero numeric values
        cols = data.columns[data.any()]  # Keep only columns with non-zero values
        valid_data = data.loc[:, cols]
        valid_grp_n = cols.map(lambda x: str(x).split('_')[0]).unique().size

        if not valid_data.any().any() or valid_grp_n < 2:
            self.error = True

        # Find group-specific data
        grp_data = valid_data.T.groupby(lambda x: str(x).split('_')[0])
        try:
            self.ctrl_data = grp_data.get_group(self.ctrl_grp).T
            self.test_data = grp_data.get_group(self.test_grp).T
        except KeyError:  # If sample group not found, i.e. no sample has data
            self.error = True

        if self.error:
            print(f"WARNING: {self.channel} - Insufficient data, skipped.")
            return

        stat_cols = [
            'U Score', 'Corr. Greater', 'P Greater', 'Reject Greater',
            'Corr. Lesser', 'P Lesser', 'Reject Lesser', 'Corr. Two-sided',
            'P Two-sided', 'Reject Two-sided'
        ]
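        # One row of statistics per bin; filled by the windowed or bin-by-bin test below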
        stat_data = pd.DataFrame(index=data.index, columns=stat_cols)

        if Sett.windowed:  # If doing rolling window stats
            stat_data = self.windowed_test(stat_data)

        else:  # Bin-by-bin stats:
            stat_data = self.bin_test(stat_data)

        # Correct for multiple testing:
        stat_data = correct(stat_data, stat_data.iloc[:, 2], 1, 3)  # greater
        stat_data = correct(stat_data, stat_data.iloc[:, 5], 4, 6)  # lesser
        stat_data = correct(stat_data, stat_data.iloc[:, 8], 7, 9)  # 2-sided

        # Save statistics
        filename = f'Stats_{self.title} = {self.channel}.csv'
        system.save_to_file(stat_data, self.stat_dir, filename, append=False)
        self.stat_data = stat_data
 def get_vect_data(self, channel):
     """Get channel data that is used for vector creation."""
     try:
         # Search string:
         namer = str("_{}_".format(channel))
         namerreg = re.compile(namer, re.I)
         # Search found paths with string
         dir_path = [
             self.channelpaths[i] for i, s in enumerate(self.channelpaths)
             if namerreg.search(str(s))
         ][0]
         vect_path = next(dir_path.glob('*Position.csv'))
         vect_data = system.read_data(vect_path,
                                      header=Sett.header_row)  # Read data
     except (FileNotFoundError, IndexError):  # If data file not found
         msg = 'No valid datafile for vector creation.'
         if LAM_logger is not None:
             lg.logprint(LAM_logger, msg, 'w')
         print('-> {}'.format(msg))
         vect_data = None
     return vect_data
    def read_channel(self, path, groups, drop=False, name_sep=1):
        """Read channel data and concatenate sample group info into DF."""
        data = system.read_data(path, header=0, test=False)
        read_data = pd.DataFrame()

        # Loop through given groups and give an identification variable for
        # each sample belonging to the group.
        for grp in groups:
            namerreg = re.compile('^{}_'.format(grp), re.I)
            # Get only the samples that belong to the loop's current group
            temp = data.loc[:, data.columns.str.contains(namerreg)].T
            if Sett.Drop_Outliers and drop:  # Conditionally drop outliers
                temp = drop_outlier(temp)
            temp['Sample Group'] = grp  # Assign sample group identifier
            if read_data.empty:
                read_data = temp
            else:
                read_data = pd.concat([read_data, temp])

        # Finding the name of the data under analysis from its filepath
        name = '_'.join(str(path.stem).split('_')[name_sep:])
        center = self.center_bin  # Getting the bin to which samples are centered
        return read_data, name, center
def get_widths(samplesdir, datadir):
    """Find widths of samples along their vectors."""
    msg = "Necessary files for width approximation not found for "
    for path in [p for p in samplesdir.iterdir() if p.is_dir()]:
        # Reset per sample so a previous sample's data is never reused
        data, vector_data = None, None

        # Find necessary data files:
        files = [p for p in path.iterdir() if p.is_file()]

        # Search terms
        vreg = re.compile('^vector.', re.I)  # vector
        dreg = re.compile(f'^{Sett.vectChannel}.csv', re.I)  # channel data

        try:  # Match terms to found paths
            vect_paths = [p for p in files if vreg.match(p.name)]
            data_paths = [p for p in files if dreg.match(p.name)]
            # Read found paths
            vector_data = system.read_vector(vect_paths)
            data = system.read_data(data_paths[0], header=0)

        # Error handling
        except (StopIteration, IndexError):
            name = path.name
            full_msg = msg + name
            print(f"WARNING: {full_msg}")
            lg.logprint(LAM_logger, full_msg, 'w')
            if vector_data is None:  # Vector file not found or unreadable
                print("-> Could not read vector data.")
                continue
            if data is None:  # Channel data not found or unreadable
                print("-> Could not read channel data.")
                print("Make sure channel is set right (vector channel)\n")
                continue

        # Compute widths
        process.DefineWidths(data, vector_data, path, datadir)
def get_counts(paths):
    """Handle data to anchor samples and find cell counts."""
    try:  # Test that MPs are found for the sample
        mps = system.read_data(next(paths.datadir.glob('MPs.csv')),
                               header=0,
                               test=False)
    except (FileNotFoundError, StopIteration):
        msg = "MPs.csv NOT found!"
        print("ERROR: {}".format(msg))
        lg.logprint(LAM_logger, msg, 'c')
        msg = "-> Perform 'Count' before continuing.\n"
        print("{}".format(msg))
        lg.logprint(LAM_logger, msg, 'i')
        raise SystemExit

    # Find the smallest and largest anchor bin-number of the dataset
    mp_max, mp_min = mps.max(axis=1).values[0], mps.min(axis=1).values[0]
    # Store the bin number of the row that samples are anchored to
    Store.center = mp_max
    # Find the size of the needed dataframe, i.e. so that all anchored samples fit
    mp_diff = mp_max - mp_min

    if not any([Sett.process_counts, Sett.process_samples]):
        # Find all sample groups in the analysis from the found MPs.
        found_samples = [p for p in paths.samplesdir.iterdir() if p.is_dir()]
        samples = mps.columns.tolist()
        if len(found_samples) != len(samples):  # Test whether sample numbers match
            msg = "Mismatch of sample N between MPs.csv and sample folders"
            print('WARNING: {}'.format(msg))
            lg.logprint(LAM_logger, msg, 'w')
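        # Derive group names (the prefix before '_') from sample names, case-insensitively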
        groups = set({s.casefold(): s.split('_')[0] for s in samples}.values())
        Store.samplegroups = sorted(groups)
        Store.channels = [
            c.stem.split('_')[1] for c in paths.datadir.glob("All_*.csv")
        ]
        # If required matrix lengths haven't been defined because Process and
        # Count are both False, get the sizes from existing files.
        try:
            chan = Sett.vectChannel
            path = paths.datadir.joinpath("Norm_{}.csv".format(chan))
            temp = system.read_data(path, test=False, header=0)
            Store.totalLength = temp.shape[0]  # Length of anchored matrices
            path = paths.datadir.joinpath("All_{}.csv".format(chan))
            temp = system.read_data(path, test=False, header=0)
            Sett.projBins = temp.shape[0]
        except AttributeError:
            msg = "Cannot determine length of sample matrix\n-> Must perform 'Count' before continuing."
            lg.logprint(LAM_logger, msg, 'c')
            print("ERROR: {}".format(msg))
        return

    # The total length of needed matrix when using 'Count'
    Store.totalLength = int(Sett.projBins + mp_diff)

    # Counting and anchoring of data:
    if Sett.process_counts:
        lg.logprint(LAM_logger, 'Begin normalization of channels.', 'i')
        print('\n---Normalizing sample data---')
        # Get combined channel files of all samples
        countpaths = paths.datadir.glob('All_*')
        for path in countpaths:
            name = str(path.stem).split('_')[1]
            print('  {}  ...'.format(name))
            # Anchor sample's data to the full data matrix
            ch_counts = Normalize(path)
            ch_counts.starts, norm_counts = ch_counts.normalize_samples(
                mps, Store.totalLength, Store.center)
            # Get average bin counts
            ch_counts.averages(norm_counts)
            # Get averages of additional data per bin
            ch_counts.avg_add_data(paths, Sett.AddData, Store.totalLength)

        # Approximate width of sample
        if Sett.measure_width:
            print('  Width  ...')
            width_path = paths.datadir.joinpath('Sample_widths.csv')
            width_counts = Normalize(width_path)
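        # Width data is stored at twice the bin resolution, hence the doubled values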
            _, _ = width_counts.normalize_samples(mps * 2,
                                                  Store.totalLength * 2,
                                                  Store.center * 2,
                                                  name='Sample_widths_norm')
        lg.logprint(LAM_logger, 'Channels normalized.', 'i')
 def __init__(self, path):
     self.path = pl.Path(path)
     self.channel = str(self.path.stem).split('_')[1]
     self.counts = system.read_data(path, header=0, test=False)
     self.starts = None
    def read_additional(self, data_keys):
        """Read relevant additional data of channel."""
        def _test_variance(data):
            """Test if additional data column contains variance."""
            for column in data.columns.difference(['ID']):
                test = data.loc[:, column].dropna()
                test = (test - test.min()) / test.max()
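                # Near-zero spread after scaling means the column is effectively invariant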
                if test.std() < 0.01:
                    self.datafail.append(column)
                    data.loc[:, column] = np.nan
            return data

        def _rename_id(data):
            """Rename filename identification of channel."""
            # I.e. as defined by settings.channelID
            for column in data.columns:
                id_str = str(column).split('_')[-1]
                if id_str in Sett.channelID.keys():
                    new_id = Sett.channelID.get(id_str)
                    new_col = column.replace(f'_{id_str}', f'-{new_id}')
                    data.rename(columns={column: new_col}, inplace=True)
            return data

        add_data = pd.DataFrame(self.data.loc[:, 'ID'])
        for key, values in data_keys.items():
            paths = list(self.path.glob(f'*{values[0]}*'))
            if not paths:
                print(f"-> {self.name} {key} file not found")
                continue
            if len(paths) == 1:
                namer = re.compile(f'^{key}', re.I)
                if paths[0] == self.pospath and any(
                        self.data.columns.str.contains(namer)):
                    continue
                if paths[0] == self.pospath and not any(
                        self.data.columns.str.contains(namer)):
                    print(
                        f"'{key}' not in {self.pospath.name} of {self.sample.name} on channel {self.name}"
                    )
                temp_data = system.read_data(str(paths[0]),
                                             header=Sett.header_row)
                cols = temp_data.columns.map(
                    lambda x, matcher=namer: bool(re.match(matcher, x)) or x == 'ID')
                temp_data = temp_data.loc[:, cols]
                add_data = pd.merge(add_data, temp_data, on='ID')
            else:  # If multiple files, e.g. intensity, get all
                for path in paths:
                    # Search identifier for column from filename
                    strings = str(path.stem).split(f'{values[0]}_')
                    id_string = strings[1].split('_')[0]
                    # Locate columns
                    temp_data = system.read_data(str(path),
                                                 header=Sett.header_row)
                    temp_data = temp_data.loc[:, [key, 'ID']]
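                    # Tag columns with the file identifier to keep data from multiple files distinct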
                    for col in [c for c in temp_data.columns if c != 'ID']:
                        rename = str(col + '_' + id_string)
                        temp_data.rename(columns={key: rename}, inplace=True)
                    add_data = pd.merge(add_data, temp_data, on='ID')
        # Drop invariant data
        add_data = _test_variance(add_data)
        if Sett.replaceID:
            add_data = _rename_id(add_data)
        self.data = pd.merge(self.data, add_data, on='ID')