Example 1
    def _apply_data_ops_auto(self,
                             process_name: str,
                             df: Optional[pd.DataFrame] = None):
        """
        Apply subset and filter operations to a dataframe where the operations
        are derived from the `branch_spec`, consecutively applying data operations,
        beginning with those corresponding to the first process, and ending
        with those corresponding to the process specified by `process_name`
        Args:
            process_name:

        Returns:

        """
        subset_list = self.spec.get_subset_list(process_name)
        filter_list = self.spec.get_filter_list(process_name)
        if df is None:
            self.set_meta(None)
        for (subset, filter_) in zip(subset_list, filter_list):
            df = self._apply_data_op(df, subset, "subset")
            df = self._apply_data_op(df, filter_, "filter")
        df.replace(" ", "_", regex=True, inplace=True)
        partitions_list = self.spec.get_partition_list(process_name)
        partitions = set().union(*partitions_list)
        if partitions:
            df = label_df_partitions(df, partitions, encodings=True)
        return df
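
The `_apply_data_op` helper and the spec accessors used above are not shown. A minimal standalone sketch of such a helper, assuming each subset/filter operation is a mapping of column name to allowed values (the operation format and the boolean-mask approach are assumptions, not taken from the original code):

import pandas as pd

def apply_data_op(df: pd.DataFrame, op: dict, op_type: str) -> pd.DataFrame:
    # Hypothetical stand-in for `_apply_data_op`: a "subset" keeps matching
    # rows, a "filter" drops them; an empty operation is a no-op.
    if not op:
        return df
    mask = pd.Series(True, index=df.index)
    for column, allowed in op.items():
        mask &= df[column].isin(allowed)
    return df[mask] if op_type == "subset" else df[~mask]

# Usage on invented metadata:
meta = pd.DataFrame({"tissue": ["lung", "liver", "lung"],
                     "qc": ["pass", "fail", "pass"]})
meta = apply_data_op(meta, {"tissue": ["lung"]}, "subset")  # keep lung cells
meta = apply_data_op(meta, {"qc": ["fail"]}, "filter")      # drop failed cells
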
Example 2
 def _get_cell_meta(self, process_name: str) -> pd.DataFrame:
     """
     Read in cell metadata and perform modifications:
         - replace any spaces with underscores
         - merge metadata from process precursors and the current process
         - apply data operations (subset, filter, partition)
     Args:
         process_name: process whose precursor metadata is merged in and
             whose partition columns are used for labelling
     Returns:
         df: modified metadata dataframe
     """
     try:
         df = pd.read_csv(self["root"].path_map["meta"], sep="\t", index_col=0)
     except FileNotFoundError:
         df = pd.DataFrame(self.rna.cell_ids.copy())
         df.columns = ["cell_id"]
         df.index = df["cell_id"]
         df.drop(columns=["cell_id"], inplace=True)
     if process_name is not None:
         precursor_names = self.spec.get_precursors_lookup(incl_current=True)[process_name]
         for precursor_name in precursor_names:
             try:
                 process_meta = self[precursor_name].process_meta
             except FileNotFoundError:
                 pass
             else:
                 intersect_cols = set(df.columns).intersection(set(process_meta.columns))
                 process_meta.drop(intersect_cols, axis=1, inplace=True)
                 df = df.merge(process_meta, left_index=True, right_index=True)
     df.replace(" ", "_", regex=True, inplace=True)
     partitions_list = self.spec.get_partition_list(process_name)
     partitions = set().union(*partitions_list)
     if partitions:
         df = label_df_partitions(df, partitions, encodings=True)
     return df
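
The merge loop above drops any columns that already exist in the accumulated metadata before joining on the cell index, so values gathered earlier take precedence over a precursor's duplicates. A toy, standalone illustration of that behaviour (both DataFrames are invented):

import pandas as pd

root_meta = pd.DataFrame(
    {"sample": ["s1", "s1", "s2"], "n_genes": [900, 1200, 800]},
    index=["cell_a", "cell_b", "cell_c"],
)
cluster_meta = pd.DataFrame(
    {"n_genes": [0, 0, 0], "cluster": ["c0", "c1", "c0"]},
    index=["cell_a", "cell_b", "cell_c"],
)

# Drop overlapping columns from the incoming frame so existing values win,
# then merge on the shared cell index.
overlap = set(root_meta.columns).intersection(cluster_meta.columns)
merged = root_meta.merge(
    cluster_meta.drop(columns=list(overlap)),
    left_index=True, right_index=True,
)
# merged keeps `n_genes` from root_meta and gains `cluster` from cluster_meta
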
Example 3
 def get_cell_meta(self, df=None):
     """Read cell metadata from disk (or build it from cell IDs), derive
     helper columns, then apply subset/filter and partition labelling from
     the spec. If `df` is provided, go straight to the data operations."""
     # TODO: MEMORY DUPLICATION - we want to keep file access pure?
     if df is None:
         # TODO: fix this
         try:
             # df = self.f["cell_metadata"].copy()
             df = pd.read_csv(self.root_dir / "meta.tsv", sep="\t", index_col=0)
         except FileNotFoundError:
             df = pd.DataFrame(self.rna.cell_ids.copy())
             df.columns = ["cell_id"]
             df.index = df["cell_id"]
             df.drop(columns=["cell_id"], inplace=True)
         df.replace(" ", "_", regex=True, inplace=True)
         if "to_bucket_var" in df and "bucketed_var" not in df:
             df["bucketed_var"] = pd.cut(df["to_bucket_var"], bins=(0, 20, 40, 60, 80), labels=(10, 30, 50, 70),)
         if "str_var_preprocessed" in df and "str_var_processed" not in df:
             df["str_var_processed"] = df["str_var_preprocessed"].str.extract(r"([A-Z]\d)")
         # TODO: fill in once `process_run.done` feature is ready
         df = self._meta_add_downstream_data(df)
     df = self._subset_filter(df, self.spec, self.schema)
     if self.spec.partition_set:
         df = label_df_partitions(df, self.spec.partition_set, encodings=True)
     return df
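
The two derived columns above use standard pandas operations: `pd.cut` for binning a numeric column and `Series.str.extract` for pulling a code out of a string column. A self-contained version of the same pattern, on invented values:

import pandas as pd

df = pd.DataFrame({
    "to_bucket_var": [5, 25, 45, 65],
    "str_var_preprocessed": ["A1_rep1", "B2_rep1", "A1_rep2", "C3_rep1"],
})

# Bin the numeric column into labelled buckets (same edges/labels as above).
df["bucketed_var"] = pd.cut(
    df["to_bucket_var"], bins=(0, 20, 40, 60, 80), labels=(10, 30, 50, 70)
)

# Extract a letter-digit code via a regex capture group (expand=False -> Series).
df["str_var_processed"] = df["str_var_preprocessed"].str.extract(
    r"([A-Z]\d)", expand=False
)
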
Example 4
 def set_partition(self, process_name: Optional[str] = None, encodings=True):
     """Add columns to metadata to indicate partition from spec"""
     columns = self.spec[process_name]["partition"]
     self._meta = label_df_partitions(self.meta, columns, encodings)
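
All four examples delegate partition labelling to `label_df_partitions`, whose implementation is not included here. A minimal sketch of what such a helper could do, assuming it combines the partition columns into a single label and optionally adds an integer encoding (the output column names and the encoding scheme are assumptions):

import pandas as pd

def label_df_partitions(df: pd.DataFrame, columns, encodings: bool = False) -> pd.DataFrame:
    # Hypothetical sketch: build a combined partition label from the given
    # columns and, if requested, an integer code per unique partition.
    df = df.copy()
    columns = list(columns)
    df["partition"] = df[columns].astype(str).agg("_".join, axis=1)
    if encodings:
        df["partition_code"] = df["partition"].astype("category").cat.codes
    return df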