key=lambda x: len(x),
              reverse=True)[0].format("fasta")
    for i in assemblies
}
# Quick sanity stats for each property: its total length plus a 50-character
# preview. Iterating .items() avoids the original's second per-key lookup
# (props.get(k) was called twice per key).
# NOTE(review): `props` is defined elsewhere in this file; values are assumed
# to be sliceable sequences (strings or lists) -- confirm against the producer.
props_stats = {
    key: {
        "length": len(value),
        "head": value[:50],
    }
    for key, value in props.items()
}

# Run BLAST on the largest contig of every assembly (parallel workers),
# then parse each report on a single core.
blast_reports = Utilities.multi_core_queue(mp_get_and_blast_largest_contig, assemblies)
headers = Utilities.single_core_queue(process_blast_report, blast_reports)

# Download the GenBank reference matching each BLAST hit (parallel workers)
# and tabulate the parsed records.
genbank_reports = Utilities.multi_core_queue(mp_download_reference_genbank, headers)
genbank_rows = Utilities.single_core_queue(process_genbank_report, genbank_reports)
reference_df = pd.DataFrame(genbank_rows)


def _sample_name_from_assembly(path):
    """Derive the sample name: file stem with its trailing '_<token>' dropped."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return "_".join(stem.split("_")[:-1])


reference_df["sample_name"] = reference_df["assembly_file"].apply(_sample_name_from_assembly)
reference_df.sort_values("sample_name", inplace=True)
reference_table = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata")

Utilities.dump_tsv(reference_df, reference_table)
# Beispiel #2
# 0
                            axis=0,
                            ignore_index=True,
                            sort=False)
            # NOTE(review): fragment of a plotting method whose def is outside
            # this view; a complete copy of the same logic appears later in the
            # file (inside `process`). Code below is unchanged.
            # Any gaps left after assembling the outer-ring rows become "Other".
            major_raw_ds = major_raw_ds.fillna("Other")
            # Outer donut ring: raw values, labels rotated along the wedges.
            # Colors were precomputed as ";"-joined RGBA component strings in
            # the "color" column and are decoded back into float tuples here.
            pie_ext = ax.pie(
                major_raw_ds[sample_name],
                radius=1,
                labels=major_raw_ds[RAW_LABEL_COL_NAME],
                labeldistance=1 - _WEDGE_WIDTH / 2,
                rotatelabels=True,
                wedgeprops=_WEDGE_PROPERTIES,
                textprops=_LABEL_PROPERTIES,
                colors=major_raw_ds["color"].apply(lambda x: tuple(
                    float(i) for i in x.split(";"))).values.tolist())
            # Export visualization tables
            Utilities.dump_tsv(
                major_digest_df.reset_index(),
                "{}_inner_values.tsv".format(sample_export_mask))
            Utilities.dump_tsv(
                major_raw_ds, "{}_outer_values.tsv".format(sample_export_mask))
            # Set labels
            ax.set_xlabel(y_col_name)
            ax.set_ylabel(value_col_name)
            plt.tight_layout()
            # Export PNG (figure title doubles as the output file name)
            pie_file = "{}_double_donut.png".format(sample_export_mask)
            fig.suptitle(pie_file, fontsize=_BASE_FONT_SIZE)
            plt.savefig(pie_file, dpi=300, bbox_inches="tight")
            plt.close("all")
            plt.clf()
 def export(self):
     """Back up the annotation file, then overwrite it with the merged table."""
     import shutil
     backup_path = "{}.bak".format(self.annotation_file)
     shutil.copy2(self.annotation_file, backup_path)
     Utilities.dump_tsv(self.merged_df, table_file=self.annotation_file)
# Beispiel #4
# 0
# Collect the RefSeq-named ("GCF...") leaves/clades of the tree.
node_names = [
    clade.name for clade in tree.find_clades()
    if clade.name is not None and clade.name.startswith("GCF")
]

# Extract per-genome annotations from the first record of each GenBank file.
annotations_list = []
for node_name in node_names:
    # node_name example: "GCF_005377825.1_ASM537782v1"
    genbank_file = os.path.join(genbank_dir,
                                "{}_genomic.gbff".format(node_name))
    seq_records = list(SeqIO.parse(genbank_file, "genbank"))
    annotation_dict = {
        i: flatten_string(seq_records[0].annotations.get(i))
        for i in ["organism", "date", "comment"]
    }
    annotation_dict["comment"] = remove_maintenance_comments(
        annotation_dict["comment"])
    # BUGFIX: the original pattern "[S|s]train" placed a literal '|' inside the
    # character class; "[Ss]" expresses the intended case alternation. Raw
    # string literals are used for all regexes.
    annotation_dict["strain"] = Utilities.safe_findall(
        r"[Ss]train:* ([^ ]+)", seq_records[0].description)
    annotation_dict["refseq_id"] = Utilities.safe_findall(
        r"GCF_[^_]+", node_name)
    # The assembly ID is whatever remains after removing the RefSeq accession.
    annotation_dict["assembly_id"] = node_name.replace(
        annotation_dict["refseq_id"], "").strip("_")
    annotations_list.append(annotation_dict)

annotations_df = pd.DataFrame(annotations_list)
Utilities.dump_tsv(
    annotations_df,
    "/data1/bio/projects/inicolaeva/klebsiella_infants/roary/newick/iTOL_collapsed_tree_annotation.tsv"
)
# Beispiel #5
# 0
import pandas as pd
from meta.scripts.Utilities import Utilities

#%%

sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))


def _read_stats_task(reads_file):
    """Wrap one reads file into the task dict format consumed by Utilities.wrapper."""
    return {
        "func": Utilities.count_reads_statistics,
        "kwargs": {"reads_file": reads_file, "type_": "fastq_gz"},
    }


queue = [
    _read_stats_task(path)
    for path in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))
]

raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper, queue, async_=True)

#%%

# Tabulate the per-file statistics; reduce paths to basenames and derive the
# sample name from the text preceding the first '[' in the file name.
raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df["reads_file"].apply(os.path.basename)
raw_reads_base_stat_df["sample_name"] = raw_reads_base_stat_df["reads_file"].str.extract(r"(.+)\[")

Utilities.dump_tsv(raw_reads_base_stat_df,
                   os.path.join(sra_dir, "raw_reads_base_stats.tsv"))
# Beispiel #6
# 0
    }
    # NOTE(review): body fragment -- the enclosing def (taking `_sample_name`)
    # and the _SPECIES mapping are outside this view. Code below is unchanged.
    # The leading digits of the sample name encode the host species.
    first_digits = Utilities.safe_findall("^\d+", _sample_name)
    if len(first_digits) > 0:
        first_digits = int(first_digits)
        for k in _SPECIES:
            if first_digits == _SPECIES.get(k):
                return k
    # Fall back to a placeholder when no numeric prefix matches a known species.
    print("Cannot define species: '{}'".format(_sample_name))
    return "_"


# Annotate every sample with the host species inferred from its name prefix.
raw_sampledata_df["host"] = raw_sampledata_df["sample_name"].apply(define_species)

Utilities.dump_tsv(raw_sampledata_df,
                   ProjectDescriber.SAMPLE_DATA_FILE,
                   col_names=["sample_name", "raw_reads"])
print(ProjectDescriber.SAMPLE_DATA_FILE)
# /data1/bio/projects/vradchenko/lactobacillus_salivarius/sample_data/raw.sampledata

# Prepare Sequence Read Archive table.
# Column headers required by the NCBI SRA submission template.
SRA_TEMPLATE_COL_NAMES = [
    "biosample_accession", "library_ID", "title", "library_strategy",
    "library_source", "library_selection", "library_layout", "platform",
    "instrument_model", "design_description", "filetype", "filename",
    "filename2", "filename3", "filename4", "assembly", "fasta_file",
    "bioproject_accession"
]

sra_dir = os.path.join(ProjectDescriber.ROOT_DIR, "sra")
sra_reads_dir = os.path.join(sra_dir, "reads")
os.makedirs(sra_reads_dir, exist_ok=True)
# Beispiel #7
# 0
    "Tet": "Tetracyclines",
    "Tmt": "Trimethoprim",
    "Bla": "CBL",
    "Bla_ESBL": "ESBL",
    "Bla_broad": "BSBL",
    "Bla_broad_inhR": "BSBL-inhR"
},
                            inplace=True)

# Phenotype table: sample metadata side-by-side with the antibiogram results.
phenotype_df = pd.concat(
    [initial_sample_data_df, antibiogram_df], axis=1, sort=False
).sort_index()
phenotype_df.index.names = [INDEX_COL_NAME]
phenotype_df = process_header(phenotype_df).transpose().reset_index()

Utilities.dump_tsv(phenotype_df, os.path.join(article_dir, "phenotype.tsv"))
Utilities.dump_string(phenotype_df.to_latex(index=False, header=True),
                      os.path.join(article_dir, "phenotype.tex"))

# Genotype table: NCBI accessions, combined assembly statistics and
# Kleborate typing results, aligned on the shared sample index.
genotype_frames = [
    ncbi_accessions_df,
    combined_assembly_statistics_df,
    kleborate_results_df,
]
genotype_df = pd.concat(genotype_frames, axis=1, sort=False).sort_index()
genotype_df.index.names = [INDEX_COL_NAME]
genotype_df = process_header(genotype_df,
                             capitalize=False).transpose().reset_index()
# Beispiel #8
# 0
 def process(self):
     """Digest the annotated pivot per keyword column and render a double-donut
     chart per sample.

     Side effects only: writes TSV digests and PNG figures under mirrored
     paths rooted at DATA_DIGEST_DIR, and mutates matplotlib's global
     rcParams. Relies on module-level collaborators (projectDescriber,
     Utilities, digestAssociationsKeeper, plt, pd and the *_COL_NAME /
     donut-size constants) defined elsewhere in this file.
     """
     # Persist the full annotated pivot before any digesting.
     value_col_name_raw_pivot_annotated_mask = self.create_mirrored_path(
         [projectDescriber.DATA_DIGEST_DIR, self.value_col_name],
         makedirs=True)
     Utilities.dump_tsv(
         self.raw_annotated_pivot.reset_index(),
         "{}_raw_annotated_pivot.tsv".format(
             value_col_name_raw_pivot_annotated_mask))
     # One digest per keyword-bearing column.
     for col_name_with_keywords in KEYWORDS_ASSOCIATIVE_PAIRS:
         df_to_digest = self.raw_annotated_pivot.loc[:, [
             col_name_with_keywords
         ] + self.sample_names]
         associations = KEYWORDS_ASSOCIATIVE_PAIRS.get(
             col_name_with_keywords)
         # The host column derives its association dict from its own values.
         if col_name_with_keywords == HOST_COL_NAME:
             associations = digestAssociationsKeeper.generate_genera_dict(
                 df_to_digest[col_name_with_keywords].values.tolist())
         digest_df, raw_ds = digestAssociationsKeeper.digest_df(
             df_to_digest,
             associations=associations,
             columns_with_keywords=[col_name_with_keywords])
         raw_ds = Utilities.left_merge(
             raw_ds,
             self.raw_annotated_pivot[RAW_LABEL_COL_NAME].reset_index(),
             REFERENCE_COL_NAME).fillna("")
         # Keep the shortest whitespace-delimited token as a compact row label.
         raw_ds[RAW_LABEL_COL_NAME] = raw_ds[RAW_LABEL_COL_NAME].apply(
             lambda x: min(Utilities.remove_empty_values(
                 [i for i in x.strip().split(" ")]),
                           key=len))
         keyword_export_mask = self.create_mirrored_path([
             projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
             col_name_with_keywords
         ],
                                                         makedirs=True)
         Utilities.dump_tsv(digest_df.reset_index(),
                            "{}_digest.tsv".format(keyword_export_mask))
         Utilities.dump_tsv(raw_ds,
                            "{}_raw.tsv".format(keyword_export_mask))
         # One double-donut figure per sample column.
         for sample_name in digest_df.columns:
             _BASE_FONT_SIZE = 15
             _WEDGE_WIDTH = 0.3
             _WEDGE_PROPERTIES = dict(width=_WEDGE_WIDTH, edgecolor="w")
             _LABEL_PROPERTIES = dict(fontsize=_BASE_FONT_SIZE,
                                      rotation_mode="anchor",
                                      verticalalignment="center",
                                      horizontalalignment="center")
             # Inner ring: the top digest groups, remainder lumped as "Other".
             major_digest_df = Utilities.get_n_majors_from_df(
                 digest_df, sample_name, n=INNER_DONUT_GROUPS - 1)
             # Create visualization
             fig, ax = plt.subplots()
             plt.rcParams.update({
                 "font.size": _BASE_FONT_SIZE,
                 "figure.figsize": (20, 20)
             })
             ax.axis("equal")
             y_col_name = major_digest_df.columns[0]
             # Returning value: [[wedges...], [labels...], [values...]]
             pie_int = ax.pie(major_digest_df[sample_name],
                              radius=1 - _WEDGE_WIDTH,
                              labels=major_digest_df.index,
                              labeldistance=1 - _WEDGE_WIDTH,
                              rotatelabels=False,
                              autopct=self.make_autopct(
                                  major_digest_df[y_col_name]),
                              pctdistance=1 - _WEDGE_WIDTH / 2.0,
                              wedgeprops=_WEDGE_PROPERTIES,
                              textprops=_LABEL_PROPERTIES)
             # Combine color values in 'RGBA' format into the one dictionary
             pie_int_colors = {
                 pie_int[1][idx].get_text(): wedge.get_facecolor()
                 for idx, wedge in enumerate(pie_int[0])
             }
             # Manual sort the dataset with raw values prior to the order of digest keywords
             major_raw_ds = pd.DataFrame()
             for digest_keyword in major_digest_df.index:
                 if digest_keyword == "Other":
                     major_raw_ds_append = pd.DataFrame(
                         major_digest_df.loc["Other"]).transpose()
                     major_raw_ds_append.index.name = DIGEST_LABEL_COL_NAME
                     major_raw_ds_append = major_raw_ds_append.reset_index()
                 else:
                     # Raw rows belonging to this digest keyword, capped at
                     # OUTER_DONUT_SUBGROUPS with the remainder lumped.
                     major_raw_ds_append_right = raw_ds.loc[
                         raw_ds[DIGEST_LABEL_COL_NAME] == digest_keyword, [
                             REFERENCE_COL_NAME, sample_name,
                             DIGEST_LABEL_COL_NAME, RAW_LABEL_COL_NAME
                         ]]
                     major_raw_ds_append_left = Utilities.get_n_majors_from_df(
                         major_raw_ds_append_right.set_index(
                             REFERENCE_COL_NAME),
                         sample_name,
                         n=OUTER_DONUT_SUBGROUPS -
                         1).rename(index={
                             "Other": digest_keyword
                         }).reset_index()
                     major_raw_ds_append = Utilities.left_merge(
                         major_raw_ds_append_left,
                         major_raw_ds_append_right, REFERENCE_COL_NAME)
                     major_raw_ds_append[
                         RAW_LABEL_COL_NAME] = major_raw_ds_append[
                             RAW_LABEL_COL_NAME].fillna(
                                 "{}_Other".format(digest_keyword))
                     major_raw_ds_append[
                         DIGEST_LABEL_COL_NAME] = major_raw_ds_append[
                             DIGEST_LABEL_COL_NAME].fillna("Other")
                 # Fade the parent wedge's RGBA color across the subgroup rows;
                 # for larger groups the alpha floors at _MINIMAL_ALPHA.
                 pie_ext_append_colors = []
                 for row_number in major_raw_ds_append.index.values:
                     row_color = pie_int_colors.get(digest_keyword)
                     if not row_color:
                         continue
                     row_old_alpha = row_color[3]
                     _MINIMAL_ALPHA = 0.2
                     if major_raw_ds_append.shape[0] < 4:
                         row_new_alpha = row_old_alpha - (
                             row_old_alpha * row_number * _MINIMAL_ALPHA)
                     else:
                         row_new_alpha = row_old_alpha - (
                             (row_old_alpha - _MINIMAL_ALPHA) * row_number /
                             float(major_raw_ds_append.shape[0] - 1))
                     # Colors are serialized as ";"-joined component strings.
                     pie_ext_append_colors.append(";".join(
                         str(i)
                         for i in list(row_color[:3]) + [row_new_alpha]))
                 major_raw_ds_append["color"] = pie_ext_append_colors
                 if major_raw_ds_append.shape[0] > 0:
                     if major_raw_ds.shape[0] == 0:
                         major_raw_ds = major_raw_ds_append
                     else:
                         major_raw_ds = pd.concat(
                             [major_raw_ds, major_raw_ds_append],
                             axis=0,
                             ignore_index=True,
                             sort=False)
             major_raw_ds = major_raw_ds.fillna("Other")
             # Outer ring: raw values colored by the faded parent colors.
             pie_ext = ax.pie(
                 major_raw_ds[sample_name],
                 radius=1,
                 labels=major_raw_ds[RAW_LABEL_COL_NAME],
                 labeldistance=1 - _WEDGE_WIDTH / 2,
                 rotatelabels=True,
                 wedgeprops=_WEDGE_PROPERTIES,
                 textprops=_LABEL_PROPERTIES,
                 colors=major_raw_ds["color"].apply(lambda x: tuple(
                     float(i) for i in x.split(";"))).values.tolist())
             # Export visualization tables
             sample_export_mask = self.create_mirrored_path([
                 projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
                 col_name_with_keywords, sample_name
             ],
                                                            makedirs=True)
             Utilities.dump_tsv(
                 major_digest_df.reset_index(),
                 "{}_inner_values.tsv".format(sample_export_mask))
             Utilities.dump_tsv(
                 major_raw_ds,
                 "{}_outer_values.tsv".format(sample_export_mask))
             # Set labels
             ax.set_xlabel(y_col_name)
             ax.set_ylabel(self.value_col_name)
             plt.tight_layout()
             # Export PNG (figure title doubles as the output file name)
             pie_file = "{}_double_donut.png".format(sample_export_mask)
             fig.suptitle(pie_file, fontsize=_BASE_FONT_SIZE)
             plt.savefig(pie_file, dpi=300, bbox_inches="tight")
             plt.close("all")
             plt.clf()
projectDescriber = ProjectDescriber()
rawSampledataDF = Utilities.load_tsv(
    "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata"
)

# Lay out the working directories for the trimmed reads.
rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads")
cutadaptDir = os.path.join(rawReadsDir, "cutadapt")
os.makedirs(cutadaptDir, exist_ok=True)

# Adapter-trim every sample in parallel and persist the resulting sampledata.
cutadaptResults = Utilities.multi_core_queue(run_cutadapt,
                                             queue=rawSampledataDF.values.tolist())
cutadaptResultsDF = pd.DataFrame.from_dict(cutadaptResults).sort_values("sample_name")
Utilities.dump_tsv(cutadaptResultsDF,
                   table_file=projectDescriber.SAMPLE_DATA_FILE,
                   col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"])

# Assemble the trimmed reads (one sample at a time) and persist the
# assembly sampledata alongside the main one.
spadesDir = os.path.join(rawReadsDir, "spades")
spadesResults = Utilities.single_core_queue(run_spades, cutadaptResultsDF.values.tolist())
spadesResultsDF = pd.DataFrame.from_dict(spadesResults).sort_values("sample_name")
spadesResultsSampleData = os.path.join(
    os.path.dirname(projectDescriber.SAMPLE_DATA_FILE), "assemblies.sampledata")
Utilities.dump_tsv(spadesResultsDF,
                   table_file=spadesResultsSampleData,
                   col_names=["sample_name", "assembly"])
print("\n".join([projectDescriber.SAMPLE_DATA_FILE, spadesResultsSampleData]))
"""
        "genome_assembly_bp_valid"] * 100 / combined_statistics_df[
            "sample_reads_bp"]
# Estimated genome coverage = total read bases, scaled by the fraction of
# reads that went into the assembly, divided by the reference genome size.
combined_statistics_df[
    "genome_assembly_coverage"] = combined_statistics_df["sample_reads_bp"] * (
        combined_statistics_df["genome_assembled_reads_percentage"] /
        100) / combined_statistics_df["reference_genome_bp"]
# Render coverage as a human-readable string, e.g. "12.34x".
combined_statistics_df["genome_assembly_coverage"] = combined_statistics_df[
    "genome_assembly_coverage"].apply(lambda x: "{0:.2f}x".format(x))
combined_statistics_file = os.path.join(ProjectDescriber.ROOT_DIR,
                                        "sample_data",
                                        "combined_assembly_statistics.tsv")
combined_statistics_df.reset_index(inplace=True)

# Export everything except the path-bearing columns (those ending in "file").
Utilities.dump_tsv(combined_statistics_df,
                   combined_statistics_file,
                   col_names=[
                       i for i in combined_statistics_df.columns
                       if not i.endswith("file")
                   ])

print(combined_statistics_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/combined_assembly_statistics.tsv
# Copied into the ./datasets directory

# Decontamination (after NCBI submission)
contamination_reports_dir = os.path.join(assemblies_target_dir,
                                         "contamination")
decontaminated_assemblies_dir = os.path.join(assemblies_target_dir,
                                             "decontaminated")
_ = [
    os.makedirs(i, exist_ok=True)
    for i in (contamination_reports_dir, decontaminated_assemblies_dir)
# Beispiel #11
# 0
    # NOTE(review): body fragment of a strain-name builder; the enclosing def
    # and the origin of `name`/`s` are outside this view -- presumably both
    # derive from the sample identifier. Code below is unchanged.
    if "undetermined" in name.lower():
        name = "UND"
    # First character of the trailing "_"-separated token, lower-cased
    # (presumably a lane/mate suffix letter) -- TODO confirm against callers.
    suffix = s.split("_")[-1][0].lower()
    return "_".join([_OWNER_PREFIX, name, suffix])


# Flatten the index into a regular column before annotating.
combined_statistics_df.reset_index(inplace=True)
# Derive a suggested strain name from each sample identifier.
combined_statistics_df["suggested_strain_name"] = combined_statistics_df[
    INDEX_COL_NAME].apply(define_strain_name)
combined_statistics_file = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR,
                                        "combined_assembly_statistics.tsv")

# Export every non-file column (skipping the raw reads listing), with the
# index column first and the remainder alphabetically ordered.
export_columns = sorted(
    column for column in combined_statistics_df.columns
    if "file" not in column and column not in ("raw_reads", INDEX_COL_NAME)
)
Utilities.dump_tsv(combined_statistics_df,
                   combined_statistics_file,
                   col_names=[INDEX_COL_NAME] + export_columns)

print(combined_statistics_file)
# /data1/bio/projects/vradchenko/lactobacillus_salivarius/sample_data/combined_assembly_statistics.tsv
# Copy the data

# Subset and rename columns for the NCBI genome metadata sheet.
ncbi_genome_metadata_df = combined_statistics_df.loc[:, [
    INDEX_COL_NAME, "real_assembly_coverage", "assembly_file"
]].copy()
ncbi_genome_metadata_df.rename(
    columns={"real_assembly_coverage": "genome_coverage"}, inplace=True)
ncbi_genome_metadata_df["assembly_date"] = ncbi_genome_metadata_df[
# Beispiel #12
# 0
# Bucket the reads files by mate tag (R1/R2), sorted so the two columns pair up
# deterministically row-by-row.
raw_reads_dict = {
    tag: sorted(
        path for path in raw_reads_files_list
        if "_{}_".format(tag) in os.path.splitext(os.path.basename(path))[0]
    )
    for tag in ("R1", "R2")
}
# Combine the dict into the pandas.DataFrame object
raw_sampledata_df = pd.DataFrame.from_dict(raw_reads_dict)
# Are reads files corresponding to each other?
assert all((raw_sampledata_df["R1"].str.replace("_R1_", "_R2_") == raw_sampledata_df["R2"]).values.tolist() + (
            raw_sampledata_df["R2"].str.replace("_R2_", "_R1_") == raw_sampledata_df["R1"]).values.tolist())
# Get the sample names from reads file names.
# BUGFIX: the original pattern used "[1|2]" (a character class containing a
# literal '|') and unescaped dots; "[12]" and "\." express the actual intent.
raw_sampledata_df["sample_name"] = raw_sampledata_df["R1"].map(
    lambda x: Utilities.safe_findall(r"(.+)_S[0-9]{2}_R[12]_001\.fastq\.gz", os.path.basename(x)))
# Export sampledata
project_describer = ProjectDescriber()
raw_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "raw_reads.sampledata")

Utilities.dump_tsv(df=raw_sampledata_df, table_file=raw_sampledata_file, col_names=["sample_name", "R1", "R2"])

print(raw_sampledata_file)  # /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata
# Create more detailed sampledata
raw_sampledata_df["reads_files"] = raw_sampledata_df.loc[:, ["R1", "R2"]].apply(lambda x: ";".join(x), axis=1)
raw_sampledata_df["taxon"] = "Klebsiella pneumoniae"
pipeline_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "raw_reads_pipeline.sampledata")

# NOTE(review): the column created above is "reads_files" but "reads" is
# requested below -- verify Utilities.dump_tsv tolerates missing col_names
# entries or fix whichever side is wrong.
Utilities.dump_tsv(df=raw_sampledata_df, table_file=pipeline_sampledata_file,
                   col_names=["sample_name", "reads", "taxon"])

print(pipeline_sampledata_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads_pipeline.sampledata

reads_stats_list = Utilities.single_core_queue(Utilities.get_reads_stats_from_fq_gz,
                                               raw_sampledata_df["R1"].values.tolist())