Beispiel #1
0
 def set_groupdata_dict(self, groupdata_file: str):
     """Load a groupdata table and index its sample names by group.

     The file is read as a header-less, tab-separated table whose two
     columns are taken as ``sample_name`` and ``group_name``.

     Sets the following attributes:
         groupdata_file: the path that was passed in.
         groupdata_digest_name: ``Utilities.filename_only`` of the path with
             every ``".groupdata"`` substring removed.
         groupdata_dict: ``{group_name: sorted unique sample names}``, with
             keys inserted in sorted order.
         raw_all_sample_names_list: sorted unique sample names of the table.

     :param groupdata_file: path to the tab-separated groupdata file.
     """
     self.groupdata_file = groupdata_file
     self.groupdata_digest_name = Utilities.filename_only(self.groupdata_file).replace(".groupdata", "")
     # With explicit ``names`` pandas treats the file as header-less, so say so.
     groupdata_df = pd.read_table(self.groupdata_file, sep="\t", header=None, names=["sample_name", "group_name"])
     sample_names = groupdata_df["sample_name"]
     group_names = groupdata_df["group_name"]
     # Select the column as a Series: iterating a one-column DataFrame yields
     # its column labels, which would map every group to ["sample_name"].
     self.groupdata_dict = {
         group: sorted(set(sample_names[group_names == group]))
         for group in sorted(set(group_names))
     }
     self.raw_all_sample_names_list = sorted(set(sample_names))
Beispiel #2
0
 def split(self, output_dir: str):
     """Write every column of ``self.pivot_df`` to its own TSV file.

     The output directory is created if it does not exist. For each column
     the file is named after the part of ``Utilities.filename_only(column)``
     that precedes the first underscore, with a ``.tsv`` extension. The
     written table contains the reset index plus the column itself, renamed
     to ``self.value_col_name``. Each written path is appended to
     ``self._sample_names_list``.

     :param output_dir: directory that receives the per-column files.
     """
     output_dir = Utilities.ends_with_slash(output_dir)
     os.makedirs(output_dir, exist_ok=True)
     # Note: the dataframe must have only index and value columns
     for column in list(self.pivot_df):
         prefix = Utilities.filename_only(column).split("_")[0]
         target_file = "{}{}.tsv".format(output_dir, prefix)
         # Turn the index into regular columns, then give the value column
         # its canonical name before dumping.
         column_df = self.pivot_df[column].reset_index()
         column_df = column_df.rename(columns={column: self.value_col_name})
         column_df.to_csv(target_file, sep="\t", header=True, index=False)
         self._sample_names_list.append(target_file)
 def evaluate_sampledata():
     """Collect paired-end read files and write the group and sample tables.

     For each hard-coded directory mask, every path matching
     ``<mask>R1*.fastq*`` is treated as a forward-read file. Its mate path
     is obtained by replacing ``"R1"`` with ``"R2"``; when that mate is a
     regular file the sample path is ``"<R1>\\t<R2>"``, otherwise it is an
     empty string. The sample name is the file name (as returned by
     ``Utilities.filename_only``) with everything from ``"_S"`` onwards
     removed.

     Two header-less, tab-separated files are written after
     ``ProjectDescriber.directory`` has been created:
         ``ProjectDescriber.groupdata``: sample name and the constant
             ``"group_id"``.
         ``ProjectDescriber.sampledata``: sample name and sample path.
     """
     import glob
     import os
     import pandas as pd
     from meta.scripts.Utilities import Utilities
     import re
     #
     sample_names_list = []
     sample_paths_list = []
     for dir_mask in [
             "/data2/bio/ecoli_komfi/raw_reads/*",
             "/data2/bio/ecoli_komfi/raw_reads2/*"
     ]:
         # Globbing in-process instead of parsing `ls` output: a mask without
         # matches simply yields nothing rather than an error message that
         # would be mistaken for a file name.
         for forward_reads in sorted(
                 glob.glob("{}R1*.fastq*".format(dir_mask))):
             # NOTE(review): replaces every "R1" occurrence in the whole
             # path, not only the read-direction tag - confirm that sample
             # and directory names never contain "R1".
             reverse_reads = forward_reads.replace("R1", "R2")
             if os.path.isfile(reverse_reads):
                 sample_path = "{a}\t{b}".format(a=forward_reads,
                                                 b=reverse_reads)
             else:
                 sample_path = ""
             sample_names_list.append(
                 re.sub("_S.*$", "", Utilities.filename_only(forward_reads)))
             sample_paths_list.append(sample_path)
     #
     df = pd.DataFrame({
         "sample_name": sample_names_list,
         "sample_path": sample_paths_list
     },
                       columns=["sample_name", "sample_path"])
     df["group_id"] = "group_id"
     #
     os.makedirs(ProjectDescriber.directory, exist_ok=True)
     df.loc[:,
            ["sample_name", "group_id"]].to_csv(ProjectDescriber.groupdata,
                                                sep='\t',
                                                index=False,
                                                header=False)
     # The sample paths contain tabs themselves, so a CSV writer would quote
     # them; writing the lines directly keeps the file free of '"' without a
     # `sed` post-processing step.
     with open(ProjectDescriber.sampledata, mode="w",
               encoding="utf-8") as sampledata_file:
         sampledata_file.writelines(
             "{}\t{}\n".format(sample_name, sample_path)
             for sample_name, sample_path in zip(df["sample_name"],
                                                 df["sample_path"]))