Code example #1
File: process.py Project: hietakangas-laboratory/LAM
    def point_handedness(self):
        """
        Find handedness of projected points compared to vector.
        Returns DF with added column 'hand', with possible values [-1, 0, 1]
        that correspond to [right side, on vector, left side] respectively.
        """
        def _get_sign(arr, p1_x, p1_y, p2_x, p2_y):
            """Find which side of vector a feature is."""
            x_val, y_val = arr[0], arr[1]
            val = math.copysign(1, (p2_x - p1_x) * (y_val - p1_y) -
                                (p2_y - p1_y) * (x_val - p1_x))
            return val

        # Define bin edges
        edges, edge_points = self.get_vector_edges(multip=2)
        data = self.data.sort_values(by='NormDist')
        # Find features in every bin and define hand-side
        for ind, point1 in enumerate(edge_points[:-1]):
            point2 = edge_points[ind + 1]
            p1x, p1y = point1.x, point1.y
            p2x, p2y = point2.x, point2.y
            d_index = data.loc[(data.NormDist >= edges[ind])
                               & (data.NormDist < edges[ind + 1])].index
            points = data.loc[d_index, ['Position X', 'Position Y']]
            # Assign hand-side of features
            data.loc[d_index,
                     'hand'] = points.apply(_get_sign,
                                            args=(p1x, p1y, p2x, p2y),
                                            axis=1,
                                            raw=True).replace(np.nan, 0)
        data = data.sort_index()
        # Save calculated data
        channel_string = str('{}.csv'.format(Sett.vectChannel))
        system.save_to_file(data, self.sampledir, channel_string, append=False)
        return data
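Note: the `_get_sign` helper above decides the side of the vector from the sign of a 2D cross product. A minimal standalone sketch of that test follows; the coordinates and the name `side_of_vector` are illustrative only.

import math

def side_of_vector(p1, p2, pt):
    """Sign of the cross product (p2 - p1) x (pt - p1): +1 or -1 by side."""
    cross = (p2[0] - p1[0]) * (pt[1] - p1[1]) - (p2[1] - p1[1]) * (pt[0] - p1[0])
    return math.copysign(1, cross)

print(side_of_vector((0, 0), (1, 0), (0.5, 1)))   # 1.0, point on one side
print(side_of_vector((0, 0), (1, 0), (0.5, -1)))  # -1.0, point on the other side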
Code example #2
File: process.py Project: hietakangas-laboratory/LAM
 def project_mps(self, positions, datadir, filename="some.csv"):
     """For the projection of spot coordinates onto the vector."""
     xy_positions = list(
         zip(positions['Position X'], positions['Position Y']))
     # The shapely package requires transformation into MultiPoint for the
     # projection.
     points = gm.MultiPoint(xy_positions)
     # Find point of projection on the vector.
     positions["VectPoint"] = [
         self.vector.interpolate(self.vector.project(gm.Point(x)))
         for x in points
     ]
     # Find normalized distance (0->1)
     positions["NormDist"] = [
         self.vector.project(x, normalized=True)
         for x in positions["VectPoint"]
     ]
     # Determine the bin of each feature
     edges = np.linspace(0, 1, Sett.projBins + 1)
     labels = np.arange(0, Sett.projBins)
     positions["DistBin"] = pd.cut(positions["NormDist"],
                                   edges,
                                   labels=labels)
     mp_bin = pd.Series(positions.loc[:, "DistBin"], name=self.name)
     self.data = positions
     self.test_projection(Sett.MPname)
     # Save the obtained data:
     system.save_to_file(mp_bin.astype(int), datadir, filename)
     return mp_bin
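Note: the binning step uses pd.cut to map each normalized distance in [0, 1] to an integer bin label. A small sketch of that step; the bin count and distances below are illustrative, not the project defaults (include_lowest is added here so a distance of exactly 0 also gets bin 0, as in code example #4).

import numpy as np
import pandas as pd

proj_bins = 4  # stand-in for Sett.projBins
norm_dist = pd.Series([0.05, 0.30, 0.55, 0.999])
edges = np.linspace(0, 1, proj_bins + 1)
labels = np.arange(0, proj_bins)
dist_bin = pd.cut(norm_dist, edges, labels=labels, include_lowest=True)
print(dist_bin.astype(int).tolist())  # [0, 1, 2, 3]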
Code example #3
File: process.py Project: hietakangas-laboratory/LAM
    def average_width(self, datadir):
        """Calculate width based on feature distance and side."""
        def _get_approx_width(sub_data):
            """Approximate sample's width at bin."""
            width = 0
            for val in [-1, 1]:
                distances = sub_data.loc[(sub_data.hand == val)].ProjDist
                if not distances.empty:
                    temp = distances.groupby(
                        pd.qcut(distances, 10, duplicates='drop')).mean()
                    if not temp.empty:
                        width += temp.tolist()[-1]
            return width

        edges = self.get_vector_edges(multip=2, points=False)
        cols = ['NormDist', 'ProjDist', 'hand']
        data = self.data.sort_values(by='NormDist').loc[:, cols]
        # Create series to hold width results
        res = pd.Series(name=self.name, index=pd.RangeIndex(stop=len(edges)))
        # Loop segments and get widths:
        for ind, _ in enumerate(edges[:-1]):
            d_index = data.loc[(data.NormDist >= edges[ind])
                               & (data.NormDist < edges[ind + 1])].index
            res.iat[ind] = _get_approx_width(data.loc[d_index, :])
        filename = 'Sample_widths.csv'
        system.save_to_file(res, datadir, filename)
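Note: `_get_approx_width` estimates one side's extent as the mean of the top decile of projection distances (the last pd.qcut bin). A sketch of that single step on synthetic distances:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
distances = pd.Series(rng.uniform(0, 50, size=200))  # ProjDist of one hand-side
deciles = pd.qcut(distances, 10, duplicates='drop')
side_extent = distances.groupby(deciles).mean().tolist()[-1]
print(round(side_extent, 2))  # mean of the top decile ~ extent on this side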
Code example #4
File: process.py Project: hietakangas-laboratory/LAM
    def project_channel(self, channel):
        """For projecting coordinates onto the vector."""
        data = channel.data
        xy_positions = list(zip(data['Position X'], data['Position Y']))
        # Transformation into Multipoints required for projection:
        points = gm.MultiPoint(xy_positions)
        # Find projection distance on the vector.
        proj_vector_dist = [self.vector.project(x) for x in points]
        # Find the exact point of projection
        proj_points = [self.vector.interpolate(p) for p in proj_vector_dist]
        # Find distance between feature and the point of projection
        proj_dist = [p.distance(proj_points[i]) for i, p in enumerate(points)]
        # Find normalized distance (0->1)
        data["NormDist"] = [d / self.vector_length for d in proj_vector_dist]
        # Determine bins of each feature
        edges = np.linspace(0, 1, Sett.projBins + 1)
        labels = np.arange(0, Sett.projBins)
        data["DistBin"] = pd.cut(data["NormDist"],
                                 labels=labels,
                                 bins=edges,
                                 include_lowest=True).astype('int')

        # Assign data to DF and save the dataframe:
        data["VectPoint"] = [(round(p.x, 3), round(p.y, 3))
                             for p in proj_points]
        data["ProjDist"] = proj_dist
        self.data = data
        self.test_projection(channel.name)
        channel_string = f'{channel.name}.csv'
        system.save_to_file(data, self.sampledir, channel_string, append=False)
        return data
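Note: the projection relies on shapely's LineString.project (distance along the vector to the closest point) and interpolate (the point at that distance). A minimal sketch with toy coordinates:

import shapely.geometry as gm

vector = gm.LineString([(0, 0), (10, 0)])
feature = gm.Point(3, 4)

dist_along = vector.project(feature)         # 3.0, distance along the vector
proj_point = vector.interpolate(dist_along)  # POINT (3 0), point of projection
norm_dist = dist_along / vector.length       # 0.3, normalized position (0->1)
proj_dist = feature.distance(proj_point)     # 4.0, offset from the vector
print(dist_along, norm_dist, proj_dist)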
Code example #5
File: process.py Project: hietakangas-laboratory/LAM
 def get_mps(self, mp_name: str, use_mp: bool,
             datadir: pl.Path) -> pd.Series:
     """Collect MPs for sample anchoring."""
     if use_mp:
         try:  # Get measurement point for anchoring
             mp_dir_path = next(
                 self.channelpaths.pop(i)
                 for i, s in enumerate(self.channelpaths)
                 if str('_' + mp_name + '_') in str(s))
             mp_path = next(mp_dir_path.glob("*Position.csv"))
             mp_data = system.read_data(mp_path,
                                        header=Sett.header_row,
                                        test=False)
             mp_data = mp_data.loc[:, ['Position X', 'Position Y']]
             if not mp_data.empty:
                 mp_bin = self.project_mps(mp_data,
                                           datadir,
                                           filename="MPs.csv")
                 mp_df = pd.DataFrame({'MP': mp_bin.values.codes})
                 mp_df.to_csv(self.sampledir.joinpath("MPs.csv"),
                              index=False)
         except (StopIteration, ValueError, UnboundLocalError):
             mp_bin = None
             msg = f'could not find MP position for {self.name}'
             lg.logprint(LAM_logger, msg, 'e')
             print("    -> Failed to find MP position data.")
     else:  # Sets measurement point values to zero when MP's are not used
         mp_bin = pd.Series(0, name=self.name)
         system.save_to_file(mp_bin, datadir, "MPs.csv")
         system.save_to_file(mp_bin,
                             self.sampledir,
                             "MPs.csv",
                             append=False)
     return mp_bin
Code example #6
    def count_clusters(self, data, name):
        """Count total clustered cells per bin."""

        # Find bins of the clustered cells to find counts per bin
        binned_data = data.loc[data.dropna(subset=['ClusterID']).index,
                               'DistBin']

        # Sort values and then get counts
        bins = binned_data.sort_values().to_numpy()
        unique, counts = np.unique(bins, return_counts=True)
        idx = np.arange(0, Sett.projBins)

        # Create series to store the cell count data
        binned_counts = pd.Series(np.full(len(idx), 0),
                                  index=idx,
                                  name=self.name)
        binned_counts.loc[unique] = counts
        filename = 'Clusters-{}.csv'.format(name)
        system.save_to_file(binned_counts, self.paths.datadir, filename)

        # Relate the counts to context, i.e. anchor them at the MP
        insert, _ = process.relate_data(binned_counts, self.MP,
                                        self.center_bin, self.bin_length)

        # Save the data
        counts_series = pd.Series(data=insert, name=self.name)
        filename = 'ClNorm_Clusters-{}.csv'.format(name)
        system.save_to_file(counts_series, self.paths.datadir, filename)
Code example #7
File: process.py Project: hietakangas-laboratory/LAM
    def normalize_samples(self, mps, array_length, center, name=None):
        """For inserting sample data into larger matrix, centered with MP."""
        # Create empty data array => insert in DF
        cols = self.counts.columns
        arr = np.full((array_length, len(cols)), np.nan)
        data = pd.DataFrame(arr, columns=cols)

        # Create empty series for holding each sample's starting index
        sample_start = pd.Series(np.full(len(cols), np.nan), index=cols)
        for col in self.counts.columns:
            handle = self.counts[col].values
            mp_bin = mps.at[0, col]
            # Insert sample's count data into larger, anchored dataframe:
            insert, insx = relate_data(handle, mp_bin, center, array_length)
            data[col] = insert
            # Save starting index of the sample
            sample_start.at[col] = insx

        check_anchor_quality(sample_start)
        # Save anchored data
        if name is None:
            name = f'Norm_{self.channel}'
        filename = f'{name}.csv'
        data = data.sort_index(axis=1)
        system.save_to_file(data, self.path.parent, filename, append=False)
        return sample_start, data
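Note: relate_data() is not shown in these examples; the sketch below is a hypothetical minimal version of the anchoring idea it serves here, shifting one sample's counts so that its MP bin lands on a shared center index of a longer array. The function name and numbers are illustrative only.

import numpy as np

def anchor_counts(counts, mp_bin, center, total_length):
    """Place counts into a NaN-filled array so that index mp_bin maps to center."""
    arr = np.full(total_length, np.nan)
    start = int(center - mp_bin)  # starting index of the sample
    arr[start:start + len(counts)] = counts
    return arr, start

insert, start = anchor_counts(np.array([5, 8, 13, 7]), mp_bin=1, center=6,
                              total_length=12)
print(start, insert)  # 5 [nan nan nan nan nan 5. 8. 13. 7. nan nan nan]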
Code example #8
File: process.py Project: hietakangas-laboratory/LAM
 def avg_add_data(self, paths: system.Paths, data_names: dict,
                  total_len: int):
     """Find bin averages of additional data."""
     samples = self.starts.index
     for sample in samples:
         sample_dir = paths.samplesdir.joinpath(sample)
         data_file = sample_dir.glob(str(self.channel + '.csv'))
         data = system.read_data(next(data_file), header=0)
         for data_type in data_names.keys():
             sample_data = data.loc[
                 :, data.columns.str.contains(str(data_type))]
             if sample_data.empty:
                 continue
             binned_data = data.loc[:, 'DistBin']
             bins = np.arange(0, Sett.projBins)
             for col in sample_data:
                 avg_s = pd.Series(np.full(total_len, np.nan), name=sample)
                 with warnings.catch_warnings():
                     warnings.simplefilter('ignore',
                                           category=RuntimeWarning)
                     insert = [
                         np.nanmean(sample_data.loc[binned_data == i, col])
                         for i in bins
                     ]
                     insert = [0 if np.isnan(v) else v for v in insert]
                 start = int(self.starts.at[sample])
                 end = int(start + Sett.projBins)
                 avg_s[start:end] = insert
                 filename = str('Avg_{}_{}.csv'.format(self.channel, col))
                 system.save_to_file(avg_s, paths.datadir, filename)
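Note: the per-bin averaging above groups features by their projection bin, takes the mean of an additional data column per bin, suppresses the warning that empty bins raise, and replaces the resulting NaN with 0. A small sketch with toy data:

import warnings
import numpy as np
import pandas as pd

proj_bins = 3  # stand-in for Sett.projBins
data = pd.DataFrame({'DistBin': [0, 0, 1, 1],
                     'Intensity Mean': [10.0, 14.0, 3.0, 5.0]})
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=RuntimeWarning)
    means = [np.nanmean(data.loc[data.DistBin == i, 'Intensity Mean'])
             for i in np.arange(proj_bins)]
means = [0 if np.isnan(v) else v for v in means]
print(means)  # [12.0, 4.0, 0]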
Code example #9
File: process.py Project: hietakangas-laboratory/LAM
 def find_counts(self, channel_name, datadir):
     """Gather projected features and find bin counts."""
     counts = np.bincount(self.data['DistBin'], minlength=Sett.projBins)
     counts = pd.Series(np.nan_to_num(counts), name=self.name)
     channel_string = f'All_{channel_name}.csv'
     system.save_to_file(counts, datadir, channel_string)
     if channel_name == Sett.vectChannel:
         test_count_projection(counts, self.name)
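Note: np.bincount with minlength guarantees a count for every projection bin even when the trailing bins are empty. The bin assignments below are illustrative only:

import numpy as np
import pandas as pd

proj_bins = 5  # stand-in for Sett.projBins
dist_bin = np.array([0, 0, 1, 1, 1, 3])  # DistBin of each projected feature
counts = np.bincount(dist_bin, minlength=proj_bins)
print(pd.Series(counts, name='sample_1').tolist())  # [2, 3, 0, 1, 0]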
Code example #10
File: process.py Project: hietakangas-laboratory/LAM
 def create_median(self):
     # Extract point coordinates of the vector:
     positions = self.vect_data
     x, y = positions.loc[:, 'Position X'], positions.loc[:, 'Position Y']
     line_df = self.median_vector(x, y, Sett.medianBins)
     if line_df is not None and not line_df.empty:
         system.save_to_file(line_df,
                             self.sampledir,
                             'Vector.csv',
                             append=False)
Code example #11
    def stats(self):
        """Calculate statistics of one variable between two groups."""
        # Group all data by sample groups
        grp_data = self.data.groupby(['Sample Group'])

        # Find data of control group
        ctrl_data = grp_data.get_group(Sett.cntrlGroup)

        # Make indices for DataFrame
        cols = ['U Score', 'P Two-sided', 'Reject Two-sided']  # Needed columns
        mcol = pd.MultiIndex.from_product([self.test_grps, cols],
                                          names=['Sample Group', 'Statistics'])
        variables = self.data.Variable.unique()  # Index

        # Create the DataFrame
        total_stats = pd.DataFrame(index=variables, columns=mcol)
        total_stats.sort_index(level=['Sample Group', 'Statistics'],
                               inplace=True)

        # Test each group against the control:
        for grp in self.test_grps:
            test_data = grp_data.get_group(grp)

            # Loop all variables to test
            for variable in variables:
                # Get data of both groups
                c_vals = ctrl_data.loc[
                    (ctrl_data.Variable == variable),
                    ctrl_data.columns.difference(['Sample Group', 'Variable'])]
                t_vals = test_data.loc[
                    (test_data.Variable == variable),
                    test_data.columns.difference(['Sample Group', 'Variable'])]

                # Perform test
                test_values = self.total_mww(grp, c_vals, t_vals, variable)

                # Insert values to result DF
                total_stats.loc[variable, (grp, cols)] = test_values

            if grp in self.error_vars.keys():
                print(
                    f"WARNING: {self.filename} - No data on {', '.join(self.error_vars[grp])}"
                )

        # Save statistics
        savename = self.filename + ' Stats.csv'
        system.save_to_file(total_stats,
                            self.stat_dir,
                            savename,
                            append=False,
                            w_index=True)

        # Store to object
        self.stat_data = total_stats
Code example #12
File: process.py Project: hietakangas-laboratory/LAM
 def create_skeleton(self):
     # Extract point coordinates of the vector:
     positions = self.vect_data
     x, y = positions.loc[:, 'Position X'], positions.loc[:, 'Position Y']
     coord_df, bin_array, skeleton = self.binarize_coords(
         x, y, Sett.SkeletonResize, Sett.BDiter, Sett.SigmaGauss)
     line_df = self.skeleton_vector(coord_df)
     if line_df is not None and not line_df.empty:
         system.save_to_file(line_df,
                             self.sampledir,
                             'Vector.csv',
                             append=False)
     pfunc.skeleton_plot(self.sampledir, self.name, bin_array, skeleton)
Code example #13
    def mww_test(self, channel_path):
        """Perform MWW-test for a data set of two groups."""
        self.error = False
        self.channel = ' '.join(str(channel_path.stem).split('_')[1:])
        data = system.read_data(channel_path, header=0, test=False)

        # Test that data exists and has non-zero numeric values
        cols = data.columns[data.any()]
        valid_data = data.loc[:, cols]
        valid_grp_n = cols.map(lambda x: str(x).split('_')[0]).unique().size

        if not valid_data.any().any() or valid_grp_n < 2:
            self.error = True

        # Find group-specific data
        grp_data = valid_data.T.groupby(lambda x: str(x).split('_')[0])
        try:
            self.ctrl_data = grp_data.get_group(self.ctrl_grp).T
            self.test_data = grp_data.get_group(self.test_grp).T
        except KeyError:  # If sample group not found, i.e. no sample has data
            self.error = True

        if self.error:
            print(f"WARNING: {self.channel} - Insufficient data, skipped.")

        stat_cols = [
            'U Score', 'Corr. Greater', 'P Greater', 'Reject Greater',
            'Corr. Lesser', 'P Lesser', 'Reject Lesser', 'Corr. Two-sided',
            'P Two-sided', 'Reject Two-sided'
        ]
        stat_data = pd.DataFrame(index=data.index, columns=stat_cols)

        if Sett.windowed:  # If doing rolling window stats
            stat_data = self.windowed_test(stat_data)

        else:  # Bin-by-bin stats:
            stat_data = self.bin_test(stat_data)

        # Correct for multiple testing:
        stat_data = correct(stat_data, stat_data.iloc[:, 2], 1, 3)  # greater
        stat_data = correct(stat_data, stat_data.iloc[:, 5], 4, 6)  # lesser
        stat_data = correct(stat_data, stat_data.iloc[:, 8], 7, 9)  # 2-sided

        # Save statistics
        filename = f'Stats_{self.title} = {self.channel}.csv'
        system.save_to_file(stat_data, self.stat_dir, filename, append=False)
        self.stat_data = stat_data
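Note: bin_test(), windowed_test() and correct() are not shown here (nor is total_mww() from the earlier stats example). The sketch below only illustrates the kind of statistics involved, assuming SciPy's Mann-Whitney U test and a Benjamini-Hochberg correction from statsmodels; it is not the project's exact implementation.

import numpy as np
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

ctrl = np.array([4.1, 5.0, 3.8, 4.6, 5.2])   # control group, one bin/variable
test = np.array([6.3, 5.9, 7.1, 6.0, 5.5])   # test group, same bin/variable

u_stat, p_two_sided = mannwhitneyu(ctrl, test, alternative='two-sided')
reject, p_corr, _, _ = multipletests([p_two_sided], method='fdr_bh')
print(u_stat, p_two_sided, p_corr[0], reject[0])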
Code example #14
File: process.py Project: hietakangas-laboratory/LAM
 def averages(self, norm_counts: pd.DataFrame):
     """Find bin averages of channels."""
     # Find groups of each sample based on samplenames
     samples = norm_counts.columns.tolist()
     groups = set({s.casefold(): s.split('_')[0] for s in samples}.values())
     cols = ["{}_All".format(g) for g in groups]
     averages = pd.DataFrame(index=norm_counts.index, columns=cols)
     for grp in groups:  # For each group found in data
         namer = "{}_".format(grp)
         group_data = norm_counts.loc[
             :, norm_counts.columns.str.startswith(namer)]
         # Calculate group averages
         averages.loc[:, "{}_All".format(grp)] = group_data.mean(axis=1)
     # Save average data
     filename = str('ChanAvg_{}.csv'.format(self.channel))
     system.save_to_file(averages, self.path.parent, filename, append=False)
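Note: sample columns share a group prefix before the first underscore, and each group's per-bin mean becomes a '<group>_All' column. A compact sketch with made-up sample names:

import pandas as pd

norm_counts = pd.DataFrame({'ctrl_1': [1.0, 2.0], 'ctrl_2': [3.0, 4.0],
                            'exp_1': [5.0, 6.0]})
groups = {s.split('_')[0] for s in norm_counts.columns}
averages = pd.DataFrame(index=norm_counts.index)
for grp in sorted(groups):
    mask = norm_counts.columns.str.startswith(f'{grp}_')
    averages[f'{grp}_All'] = norm_counts.loc[:, mask].mean(axis=1)
print(averages)  # ctrl_All = [2.0, 3.0], exp_All = [5.0, 6.0]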
Code example #15
File: process.py Project: hietakangas-laboratory/LAM
    def find_sample_vector(self, path):  # path = data directory
        """Find sample's vector data."""
        try:  # Find sample's vector file
            paths = list(self.sampledir.glob('Vector.*'))
            self.vector = system.read_vector(paths)
            self.vector_length = self.vector.length
            length_series = pd.Series(self.vector_length, name=self.name)
            system.save_to_file(length_series, path, 'Length.csv')

        # If vector file not found
        except (FileNotFoundError, IndexError):
            msg = f'Vector-file NOT found for {self.name}'
            lg.logprint(LAM_logger, msg, 'e')
            print(f'ERROR: {msg}')
        except (AttributeError, ValueError):  # If vector file is faulty
            msg = f'Faulty vector for {self.name}'
            lg.logprint(LAM_logger, msg, 'c')
            print(f'CRITICAL: {msg}')
Code example #16
File: process.py Project: hietakangas-laboratory/LAM
def find_existing(paths: system.Paths):
    """Get MPs and count old projections when not projecting during 'Count'."""
    msg = 'Collecting pre-existing data.'
    print(msg)
    lg.logprint(LAM_logger, msg, 'i')
    mps = pd.DataFrame(columns=Store.samples)
    for smpl in Store.samples:
        smplpath = paths.samplesdir.joinpath(smpl)
        # FIND MP
        if Sett.useMP:
            try:
                mp_df = pd.read_csv(smplpath.joinpath('MPs.csv'))
                mp_bin = mp_df.iat[0, 0]
            except FileNotFoundError:
                msg = "MP-data not found."
                add = "Provide MP-data or set useMP to False."
                print(f"ERROR: {msg}\n{add}")
                raise SystemExit
        else:
            mp_bin = 0
        mps.loc[0, smpl] = mp_bin
        # FIND CHANNEL COUNTS
        for path in [
                p for p in smplpath.iterdir() if p.suffix == '.csv'
                and p.stem not in ['Vector', 'MPs', Sett.MPname]
        ]:
            data = pd.read_csv(path)
            try:
                counts = np.bincount(data['DistBin'], minlength=Sett.projBins)
                counts = pd.Series(np.nan_to_num(counts), name=smpl)
                channel_string = str(f'All_{path.stem}.csv')
                system.save_to_file(counts, paths.datadir, channel_string)
            except ValueError:  # If channel has not been projected
                print(f"Missing projection data: {path.stem} - {smpl}")
                print("-> Set project=True and perform Count")
                continue
    mps.to_csv(paths.datadir.joinpath('MPs.csv'))
    samples = mps.columns.tolist()
    groups = set({s.casefold(): s.split('_')[0] for s in samples}.values())
    Store.samplegroups = sorted(groups)
Code example #17
    def find_distances(self,
                       data,
                       vol_incl=200,
                       compare='smaller',
                       clusters=False,
                       **kws):
        """Calculate cell-to-cell distances or find clusters."""
        def _find_clusters():
            """Find cluster 'seeds' and merge to create full clusters."""
            def __merge(seeds):
                """Merge seeds that share cells."""
                cells = sum(seeds, [])  # List of all cells

                # Create a list containing a set for each cell ID so it can
                # be scanned repeatedly while merging:
                cells = [{x} for x in set(cells)]

                # Loop through a set of each seed
                for item in map(set, seeds):
                    # For each seed, find IDs from the set of cell
                    # IDs and merge them

                    # ID-sets not in seed:
                    out = [x for x in cells if not x & item]
                    # found ID-sets:
                    m_seeds = [x for x in cells if x & item]

                    # make union of the ID sets that are found
                    m_seeds = set([]).union(*m_seeds)

                    # Reassign cells to contain the newly merged ID-sets
                    cells = out + [m_seeds]
                yield cells

            max_dist = kws.get('Dist')  # max distance to consider clustering
            treedata = xy_pos[['x', 'y', 'z']]

            # Create K-D tree and query for nearest
            tree = KDTree(treedata)
            seed_ids = tree.query_radius(treedata, r=max_dist)

            # Merging of the seeds
            seed_lst = [xy_pos.iloc[a, :].ID.tolist() for a in seed_ids]
            cl_gen = __merge(seed_lst)

            # Change the generator into list of lists and drop clusters of size
            # under/over limits
            all_cl = [
                list(y) for x in cl_gen for y in x
                if y and Sett.cl_min <= len(y) <= Sett.cl_max
            ]
            return all_cl

        def _find_nearest():
            """Iterate passed data to determine nearby cells."""

            max_dist = kws.get('Dist')  # distance used for subsetting target

            # If distances are found to features on another channel:
            if 'target_xy' in locals():
                target = target_xy
                comment = Sett.target_chan
                filename = f'Avg_{data.name} VS {comment}_Distance Means.csv'
            else:  # If using the same channel:
                target = xy_pos
                comment = data.name
                filename = f'Avg_{data.name}_Distance Means.csv'

            # Creation of DF to store found data (later concatenated to data)
            cols = [f'Nearest_Dist_{comment}', f'Nearest_ID_{comment}']
            new_data = pd.DataFrame(index=xy_pos.index)

            # KD tree
            treedata = target[['x', 'y', 'z']]
            tree = KDTree(treedata)
            dist, ind = tree.query(xy_pos[['x', 'y', 'z']], k=2)

            col_dict = {
                cols[0]: dist[:, 1],
                cols[1]: target.iloc[ind[:, 1]].ID.values
            }
            new_data = new_data.assign(**col_dict)

            # Concatenate the obtained data with the read data.
            new_data = pd.concat([data, new_data], axis=1)

            # limit data based on max_dist
            new_data[cols] = new_data[cols].where(
                (new_data[cols[0]] <= max_dist))

            # Get bin and distance to nearest cell for each cell, calculate
            # average distance within each bin.
            binned_data = new_data.loc[:, 'DistBin']
            distances = new_data.loc[:, cols[0]].astype('float64')
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=RuntimeWarning)
                means = [
                    np.nanmean(distances[binned_data.values == k])
                    for k in np.arange(0, Sett.projBins)
                ]
            return new_data, means, filename

        if vol_incl > 0:  # Subsetting of data based on cell volume
            data_ind = subset_data(data, compare, vol_incl, self.name)
            if 'test_data' in kws.keys():  # Obtain target channel if used
                test_data = kws.pop('test_data')
                test_data.name = data.name
                test_ind = subset_data(test_data, compare, vol_incl, self.name)
        elif 'test_data' in kws.keys():
            test_data = kws.pop('test_data')
            test_ind = test_data.index
            data_ind = data.index
        else:
            data_ind = data.index

        # Accessing the data for the analysis via the indexes taken before.
        # Cells for which the nearest cells will be found:
        xy_pos = data.loc[
            data_ind,
            ['Position X', 'Position Y', 'Position Z', 'ID', 'DistBin']]
        renames = {'Position X': 'x', 'Position Y': 'y', 'Position Z': 'z'}
        xy_pos.rename(columns=renames, inplace=True)  # rename for dot notation

        if 'test_ind' in locals():  # Get data from target channel, if used
            target_xy = test_data.loc[
                test_ind, ['Position X', 'Position Y', 'Position Z', 'ID']]
            target_xy.rename(columns=renames, inplace=True)

        if not clusters:  # Finding nearest distances
            new_data, means, filename = _find_nearest()
            means_series = pd.Series(means, name=self.name)
            insert, _ = process.relate_data(means_series, self.MP,
                                            self.center_bin, self.bin_length)
            means_insert = pd.Series(data=insert, name=self.name)
            system.save_to_file(means_insert, self.paths.datadir, filename)

        else:  # Finding clusters
            if not xy_pos.empty:
                all_cl = _find_clusters()
            else:
                all_cl = False

            # Create dataframe for storing the obtained data
            cl_data = pd.DataFrame(index=data.index,
                                   columns=['ID', 'ClusterID'])
            cl_data = cl_data.assign(ID=data.ID)  # Copy ID column

            # Give name from a continuous range to each of the found clusters
            # and add it to cell-specific data (for each belonging cell).
            if all_cl:
                for i, vals in enumerate(all_cl):
                    vals = [int(v) for v in vals]
                    cl_data.loc[cl_data.ID.isin(vals), 'ClusterID'] = i + 1
            else:
                print(f"-> No clusters found for {self.name}.")
                cl_data.loc[:, 'ClusterID'] = np.nan

            # Merge obtained data with the original data
            new_data = data.merge(cl_data, how='outer', copy=False, on=['ID'])
            self.count_clusters(new_data, data.name)

        # Overwrite original sample data with the data containing new columns
        write_name = '{}.csv'.format(data.name)
        system.save_to_file(new_data, self.path, write_name, append=False)
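Note: the nearest-neighbour search and cluster seeding rely on a KD-tree (assumed here to be sklearn.neighbors.KDTree, matching the query/query_radius calls above). query(k=2) returns each point itself as the first hit, so column 1 holds the nearest other point; query_radius() gives the neighbourhoods used as cluster seeds. Toy coordinates only:

import numpy as np
from sklearn.neighbors import KDTree

xyz = np.array([[0.0, 0.0, 0.0],
                [1.0, 0.0, 0.0],
                [0.0, 2.0, 0.0],
                [5.0, 5.0, 5.0]])
tree = KDTree(xyz)

dist, ind = tree.query(xyz, k=2)
print(dist[:, 1])  # distance from each point to its nearest other point
print(ind[:, 1])   # row index of that nearest point

seed_ids = tree.query_radius(xyz, r=2.5)
print([list(s) for s in seed_ids])  # neighbourhood of each point (cluster seeds)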
Code example #18
    def get_totals(self):
        """Count sample & channel -specific cell totals."""
        def _read_and_sum():
            """Read path and sum cell numbers of bins for each sample."""
            chan_data, __, _ = self.read_channel(path, self._groups, drop=drpb)
            # Get sum of cells for each sample
            ch_sum = chan_data.sum(axis=1, skipna=True, numeric_only=True)
            # Get group of each sample
            groups = chan_data.loc[:, 'Sample Group']
            # Change the sum data into dataframe and add group identifiers
            ch_sum = ch_sum.to_frame().assign(group=groups.values)
            ch_sum.rename(columns={'group': 'Sample Group'}, inplace=True)
            return ch_sum

        lg.logprint(LAM_logger, 'Finding total counts', 'i')
        drpb = Sett.Drop_Outliers  # Find if dropping outliers
        datadir = self.paths.datadir
        full_df = pd.DataFrame()

        # Loop through files containing cell count data, read, and find sums
        for path in datadir.glob('All*'):
            ch_sum = _read_and_sum()
            channel = path.stem.split('_')[1]  # Channel name
            ch_sum = ch_sum.assign(Variable=channel)
            full_df = pd.concat([full_df, ch_sum],
                                ignore_index=False,
                                sort=False)

        # Save dataframe containing sums of each channel for each sample
        system.save_to_file(full_df,
                            datadir,
                            'Total Counts.csv',
                            append=False,
                            w_index=True)

        # Find totals of additional data
        for channel in [
                c for c in Store.channels
                if c not in ['MP', 'R45', Sett.MPname]
        ]:
            full_df = pd.DataFrame()

            for path in datadir.glob('Avg_{}_*'.format(channel)):
                chan_data, __, _ = self.read_channel(path,
                                                     self._groups,
                                                     drop=drpb)
                # Assign channel identifier
                add_name = path.stem.split('_')[2:]  # Channel name
                chan_data = chan_data.assign(Variable='_'.join(add_name))

                # Concatenate new data to full set
                full_df = pd.concat([full_df, chan_data],
                                    ignore_index=False,
                                    sort=False)

            if full_df.empty:
                continue

            # Adjust column order so that identifiers are first
            ordered = ['Sample Group', 'Variable']
            cols = ordered + (full_df.columns.drop(ordered).tolist())
            full_df = full_df[cols]

            # Drop samples that have invariant data
            full_df = full_df[
                full_df.iloc[:, :-3].nunique(axis=1, dropna=True) > 1]

            # Save dataframe containing sums of each channel for each sample
            filename = 'Total {} AddData.csv'.format(channel)
            system.save_to_file(full_df,
                                datadir,
                                filename,
                                append=False,
                                w_index=True)

        # Find totals of data obtained from distance calculations
        full_df = pd.DataFrame()
        for path in chain(datadir.glob('Clusters-*.csv'),
                          datadir.glob('*Distance Means.csv'),
                          datadir.glob('Sample_widths_norm.csv')):
            if 'Clusters-' in path.name:
                name = "{} Clusters".format(path.stem.split('-')[1])
            elif 'Distance Means' in path.name:
                name = "{} Distances".format(path.name.split('_')[1])
            else:
                name = "Widths"
            chan_data, __, _ = self.read_channel(path, self._groups, drop=drpb)

            # Assign data type identifier
            chan_data = chan_data.assign(Variable=name)
            full_df = pd.concat([full_df, chan_data],
                                ignore_index=False,
                                sort=False)

        if not full_df.empty:  # If data obtained
            # Adjust column order so that identifiers are first
            ordered = ['Sample Group', 'Variable']
            cols = ordered + (full_df.columns.drop(ordered).tolist())
            full_df = full_df[cols]

            # Save DF
            filename = 'Total Distance Data.csv'
            system.save_to_file(full_df,
                                datadir,
                                filename,
                                append=False,
                                w_index=True)

        lg.logprint(LAM_logger, 'Total counts done', 'i')