# (continuation of an expression begun above: for each assembly, sort its
# contigs by length and render the largest one as FASTA — presumably building
# BLAST query sequences; TODO confirm against the preceding chunk)
key=lambda x: len(x), reverse=True)[0].format("fasta") for i in assemblies }
# Quick sanity stats per entry: total length and the first 50 characters
props_stats = {
    k: {"length": len(props.get(k)), "head": props.get(k)[:50]}
    for k in props}
# Create BLAST queries
blast_reports = Utilities.multi_core_queue(mp_get_and_blast_largest_contig, assemblies)
headers = Utilities.single_core_queue(process_blast_report, blast_reports)
# Create GenBank queries
genbank_reports = Utilities.multi_core_queue(mp_download_reference_genbank, headers)
reference_df = pd.DataFrame(
    Utilities.single_core_queue(process_genbank_report, genbank_reports))
# Sample name = assembly file basename with its trailing "_<suffix>" token dropped
reference_df["sample_name"] = reference_df["assembly_file"].apply(
    lambda x: "_".join(
        os.path.splitext(os.path.basename(x))[0].split("_")[:-1]))
reference_df.sort_values("sample_name", inplace=True)
# Persist the per-sample reference table for downstream steps
reference_table = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata")
Utilities.dump_tsv(reference_df, reference_table)
# (continuation of a pd.concat call begun above — appends the per-keyword raw
# rows to the accumulated outer-ring dataset)
axis=0, ignore_index=True, sort=False)
major_raw_ds = major_raw_ds.fillna("Other")
# Outer ring of the double donut: raw values, colors pre-encoded as
# semicolon-separated RGBA strings in the "color" column
pie_ext = ax.pie(
    major_raw_ds[sample_name],
    radius=1,
    labels=major_raw_ds[RAW_LABEL_COL_NAME],
    labeldistance=1 - _WEDGE_WIDTH / 2,
    rotatelabels=True,
    wedgeprops=_WEDGE_PROPERTIES,
    textprops=_LABEL_PROPERTIES,
    colors=major_raw_ds["color"].apply(lambda x: tuple(
        float(i) for i in x.split(";"))).values.tolist())
# Export visualization tables
Utilities.dump_tsv(
    major_digest_df.reset_index(),
    "{}_inner_values.tsv".format(sample_export_mask))
Utilities.dump_tsv(
    major_raw_ds,
    "{}_outer_values.tsv".format(sample_export_mask))
# Set labels
ax.set_xlabel(y_col_name)
ax.set_ylabel(value_col_name)
plt.tight_layout()
# Export PNG
pie_file = "{}_double_donut.png".format(sample_export_mask)
fig.suptitle(pie_file, fontsize=_BASE_FONT_SIZE)
plt.savefig(pie_file, dpi=300, bbox_inches="tight")
plt.close("all")
plt.clf()
def export(self):
    """Write the merged table back to the annotation file.

    A copy of the current annotation file (metadata included, via
    ``shutil.copy2``) is kept as ``<annotation_file>.bak`` before the
    file is overwritten with ``self.merged_df``.
    """
    import shutil
    backup_file = "{}.bak".format(self.annotation_file)
    shutil.copy2(self.annotation_file, backup_file)
    Utilities.dump_tsv(self.merged_df, table_file=self.annotation_file)
# Collect the RefSeq ("GCF_*") leaf names from the tree and build an iTOL
# annotation table (organism, date, comment, strain, accession IDs) from the
# matching GenBank flat files.
node_names = [
    j for j in [i.name for i in tree.find_clades()]
    if j is not None and j.startswith("GCF")
]
annotations_list = []
for node_name in node_names:
    # Example node name: "GCF_005377825.1_ASM537782v1"
    genbank_file = os.path.join(genbank_dir,
                                "{}_genomic.gbff".format(node_name))
    seq_records = list(SeqIO.parse(genbank_file, "genbank"))
    # Only the first record's annotations are used (assumes one chromosome-level
    # record per GBFF — TODO confirm for multi-record assemblies)
    first_record = seq_records[0]
    annotation_dict = {
        i: flatten_string(first_record.annotations.get(i))
        for i in ["organism", "date", "comment"]
    }
    annotation_dict["comment"] = remove_maintenance_comments(
        annotation_dict["comment"])
    # Fixed regex: the original class "[S|s]" also matched a literal "|", and
    # ":*" allowed runs of colons; "[Ss]train:?" matches "Strain"/"strain"
    # with at most one colon, same realistic matches as before.
    annotation_dict["strain"] = Utilities.safe_findall(
        r"[Ss]train:? ([^ ]+)", first_record.description)
    annotation_dict["refseq_id"] = Utilities.safe_findall(
        r"GCF_[^_]+", node_name)
    # Assembly ID = node name minus its RefSeq accession prefix
    annotation_dict["assembly_id"] = node_name.replace(
        annotation_dict["refseq_id"], "").strip("_")
    annotations_list.append(annotation_dict)
annotations_df = pd.DataFrame(annotations_list)
Utilities.dump_tsv(
    annotations_df,
    "/data1/bio/projects/inicolaeva/klebsiella_infants/roary/newick/iTOL_collapsed_tree_annotation.tsv"
)
import os

import pandas as pd

from meta.scripts.Utilities import Utilities

# NOTE(review): "import os" was missing from the original import block even
# though os.path is used below — added here.
#%%
sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))
# Build one counting job per read file and run them in parallel
queue = [{
    "func": Utilities.count_reads_statistics,
    "kwargs": {
        "reads_file": i,
        "type_": "fastq_gz"
    }
} for i in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))]
raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper, queue,
                                                  async_=True)
#%%
raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
# Keep only the basename; sample name is everything before the first "["
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df[
    "reads_file"].apply(os.path.basename)
raw_reads_base_stat_df["sample_name"] = raw_reads_base_stat_df[
    "reads_file"].str.extract(r"(.+)\[")
Utilities.dump_tsv(raw_reads_base_stat_df,
                   os.path.join(sra_dir, "raw_reads_base_stats.tsv"))
}
    # (tail of define_species(_sample_name), whose def and _SPECIES mapping
    # begin in a previous chunk) Extract the leading digits of the sample name
    first_digits = Utilities.safe_findall("^\d+", _sample_name)
    if len(first_digits) > 0:
        first_digits = int(first_digits)
        # Return the species whose numeric code equals the sample's prefix
        for k in _SPECIES:
            if first_digits == _SPECIES.get(k):
                return k
    # Placeholder host value when no species code matches
    print("Cannot define species: '{}'".format(_sample_name))
    return "_"


# Annotate every sample with its inferred host species
raw_sampledata_df["host"] = raw_sampledata_df["sample_name"].apply(
    define_species)
Utilities.dump_tsv(raw_sampledata_df,
                   ProjectDescriber.SAMPLE_DATA_FILE,
                   col_names=["sample_name", "raw_reads"])
print(ProjectDescriber.SAMPLE_DATA_FILE)
# /data1/bio/projects/vradchenko/lactobacillus_salivarius/sample_data/raw.sampledata

# Prepare Sequence Read Archive table
# Column layout of the NCBI SRA submission template
SRA_TEMPLATE_COL_NAMES = [
    "biosample_accession", "library_ID", "title", "library_strategy",
    "library_source", "library_selection", "library_layout", "platform",
    "instrument_model", "design_description", "filetype", "filename",
    "filename2", "filename3", "filename4", "assembly", "fasta_file",
    "bioproject_accession"
]
sra_dir = os.path.join(ProjectDescriber.ROOT_DIR, "sra")
os.makedirs(os.path.join(sra_dir, "reads"), exist_ok=True)
# (continuation of a DataFrame.rename call begun above — maps antibiogram
# column abbreviations to human-readable antibiotic class names)
"Tet": "Tetracyclines",
"Tmt": "Trimethoprim",
"Bla": "CBL",
"Bla_ESBL": "ESBL",
"Bla_broad": "BSBL",
"Bla_broad_inhR": "BSBL-inhR"
}, inplace=True)
# Phenotype table: sample metadata side by side with the antibiogram
phenotype_df = pd.concat([initial_sample_data_df, antibiogram_df],
                         axis=1,
                         sort=False).sort_index()
phenotype_df.index.names = [INDEX_COL_NAME]
phenotype_df = process_header(phenotype_df).transpose().reset_index()
# Utilities.dump_tsv(phenotype_df, os.path.join(article_dir, "phenotype.tsv"))
# Export as LaTeX for the article
Utilities.dump_string(phenotype_df.to_latex(index=False, header=True),
                      os.path.join(article_dir, "phenotype.tex"))
# Genotype table: NCBI accessions + assembly statistics + Kleborate results
genotype_df = pd.concat(
    [
        ncbi_accessions_df, combined_assembly_statistics_df,
        kleborate_results_df
    ],
    axis=1,
    sort=False).sort_index()  # .sort_values(["Patient ID", "Sample Number"])
genotype_df.index.names = [INDEX_COL_NAME]
# genotype_df.replace({"_": "\\_"}, regex=True)
genotype_df = process_header(genotype_df,
                             capitalize=False).transpose().reset_index()
#
def process(self):
    """Digest the annotated pivot per keyword column and draw double-donut plots.

    For every keyword column in KEYWORDS_ASSOCIATIVE_PAIRS the pivot is
    digested into keyword groups; for each sample a two-ring ("double donut")
    pie chart is rendered — inner ring: top digest groups, outer ring: the top
    raw entries within each group, shaded by decreasing alpha — and the
    underlying tables are exported as TSV next to the PNG.
    """
    value_col_name_raw_pivot_annotated_mask = self.create_mirrored_path(
        [projectDescriber.DATA_DIGEST_DIR, self.value_col_name], makedirs=True)
    Utilities.dump_tsv(
        self.raw_annotated_pivot.reset_index(),
        "{}_raw_annotated_pivot.tsv".format(
            value_col_name_raw_pivot_annotated_mask))
    for col_name_with_keywords in KEYWORDS_ASSOCIATIVE_PAIRS:
        # Keep only the keyword column plus the per-sample value columns
        df_to_digest = self.raw_annotated_pivot.loc[:, [
            col_name_with_keywords
        ] + self.sample_names]
        associations = KEYWORDS_ASSOCIATIVE_PAIRS.get(
            col_name_with_keywords)
        # Host column: associations are generated from the genera present
        if col_name_with_keywords == HOST_COL_NAME:
            associations = digestAssociationsKeeper.generate_genera_dict(
                df_to_digest[col_name_with_keywords].values.tolist())
        digest_df, raw_ds = digestAssociationsKeeper.digest_df(
            df_to_digest,
            associations=associations,
            columns_with_keywords=[col_name_with_keywords])
        # Attach raw labels and collapse each to its shortest whitespace token
        raw_ds = Utilities.left_merge(
            raw_ds, self.raw_annotated_pivot[RAW_LABEL_COL_NAME].reset_index(),
            REFERENCE_COL_NAME).fillna("")
        raw_ds[RAW_LABEL_COL_NAME] = raw_ds[RAW_LABEL_COL_NAME].apply(
            lambda x: min(Utilities.remove_empty_values(
                [i for i in x.strip().split(" ")]), key=len))
        keyword_export_mask = self.create_mirrored_path([
            projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
            col_name_with_keywords
        ], makedirs=True)
        Utilities.dump_tsv(digest_df.reset_index(),
                           "{}_digest.tsv".format(keyword_export_mask))
        Utilities.dump_tsv(raw_ds, "{}_raw.tsv".format(keyword_export_mask))
        for sample_name in digest_df.columns:
            # Plot styling constants (inner/outer ring width, label fonts)
            _BASE_FONT_SIZE = 15
            _WEDGE_WIDTH = 0.3
            _WEDGE_PROPERTIES = dict(width=_WEDGE_WIDTH, edgecolor="w")
            _LABEL_PROPERTIES = dict(fontsize=_BASE_FONT_SIZE,
                                     rotation_mode="anchor",
                                     verticalalignment="center",
                                     horizontalalignment="center")
            # Top INNER_DONUT_GROUPS - 1 groups; the rest fold into "Other"
            major_digest_df = Utilities.get_n_majors_from_df(
                digest_df, sample_name, n=INNER_DONUT_GROUPS - 1)
            # Create visualization
            fig, ax = plt.subplots()
            plt.rcParams.update({
                "font.size": _BASE_FONT_SIZE,
                "figure.figsize": (20, 20)
            })
            ax.axis("equal")
            y_col_name = major_digest_df.columns[0]
            # Returning value: [[wedges...], [labels...], [values...]]
            pie_int = ax.pie(major_digest_df[sample_name],
                             radius=1 - _WEDGE_WIDTH,
                             labels=major_digest_df.index,
                             labeldistance=1 - _WEDGE_WIDTH,
                             rotatelabels=False,
                             autopct=self.make_autopct(
                                 major_digest_df[y_col_name]),
                             pctdistance=1 - _WEDGE_WIDTH / 2.0,
                             wedgeprops=_WEDGE_PROPERTIES,
                             textprops=_LABEL_PROPERTIES)
            # Combine color values in 'RGBA' format into the one dictionary
            pie_int_colors = {
                pie_int[1][idx].get_text(): wedge.get_facecolor()
                for idx, wedge in enumerate(pie_int[0])
            }
            # Manual sort the dataset with raw values prior to the order of digest keywords
            major_raw_ds = pd.DataFrame()
            for digest_keyword in major_digest_df.index:
                if digest_keyword == "Other":
                    # "Other" has no raw members — reuse its digest row directly
                    major_raw_ds_append = pd.DataFrame(
                        major_digest_df.loc["Other"]).transpose()
                    major_raw_ds_append.index.name = DIGEST_LABEL_COL_NAME
                    major_raw_ds_append = major_raw_ds_append.reset_index()
                else:
                    # Raw rows belonging to this digest group
                    major_raw_ds_append_right = raw_ds.loc[
                        raw_ds[DIGEST_LABEL_COL_NAME] == digest_keyword, [
                            REFERENCE_COL_NAME, sample_name,
                            DIGEST_LABEL_COL_NAME, RAW_LABEL_COL_NAME
                        ]]
                    # Top OUTER_DONUT_SUBGROUPS - 1 members; remainder is
                    # relabeled with the group name itself
                    major_raw_ds_append_left = Utilities.get_n_majors_from_df(
                        major_raw_ds_append_right.set_index(
                            REFERENCE_COL_NAME),
                        sample_name,
                        n=OUTER_DONUT_SUBGROUPS - 1).rename(index={
                            "Other": digest_keyword
                        }).reset_index()
                    major_raw_ds_append = Utilities.left_merge(
                        major_raw_ds_append_left, major_raw_ds_append_right,
                        REFERENCE_COL_NAME)
                    major_raw_ds_append[
                        RAW_LABEL_COL_NAME] = major_raw_ds_append[
                            RAW_LABEL_COL_NAME].fillna(
                                "{}_Other".format(digest_keyword))
                    major_raw_ds_append[
                        DIGEST_LABEL_COL_NAME] = major_raw_ds_append[
                            DIGEST_LABEL_COL_NAME].fillna("Other")
                # Shade outer-ring wedges: alpha decreases per row within a
                # group, floored at _MINIMAL_ALPHA for 4+ rows
                pie_ext_append_colors = []
                for row_number in major_raw_ds_append.index.values:
                    row_color = pie_int_colors.get(digest_keyword)
                    if not row_color:
                        continue
                    row_old_alpha = row_color[3]
                    _MINIMAL_ALPHA = 0.2
                    if major_raw_ds_append.shape[0] < 4:
                        row_new_alpha = row_old_alpha - (
                            row_old_alpha * row_number * _MINIMAL_ALPHA)
                    else:
                        row_new_alpha = row_old_alpha - (
                            (row_old_alpha - _MINIMAL_ALPHA) * row_number /
                            float(major_raw_ds_append.shape[0] - 1))
                    # Colors are serialized as "r;g;b;a" strings for export
                    pie_ext_append_colors.append(";".join(
                        str(i)
                        for i in list(row_color[:3]) + [row_new_alpha]))
                major_raw_ds_append["color"] = pie_ext_append_colors
                if major_raw_ds_append.shape[0] > 0:
                    if major_raw_ds.shape[0] == 0:
                        major_raw_ds = major_raw_ds_append
                    else:
                        major_raw_ds = pd.concat(
                            [major_raw_ds, major_raw_ds_append],
                            axis=0,
                            ignore_index=True,
                            sort=False)
            major_raw_ds = major_raw_ds.fillna("Other")
            # Outer ring: raw values, colors decoded from the "r;g;b;a" strings
            pie_ext = ax.pie(
                major_raw_ds[sample_name],
                radius=1,
                labels=major_raw_ds[RAW_LABEL_COL_NAME],
                labeldistance=1 - _WEDGE_WIDTH / 2,
                rotatelabels=True,
                wedgeprops=_WEDGE_PROPERTIES,
                textprops=_LABEL_PROPERTIES,
                colors=major_raw_ds["color"].apply(lambda x: tuple(
                    float(i) for i in x.split(";"))).values.tolist())
            # Export visualization tables
            sample_export_mask = self.create_mirrored_path([
                projectDescriber.DATA_DIGEST_DIR, self.value_col_name,
                col_name_with_keywords, sample_name
            ], makedirs=True)
            Utilities.dump_tsv(
                major_digest_df.reset_index(),
                "{}_inner_values.tsv".format(sample_export_mask))
            Utilities.dump_tsv(
                major_raw_ds,
                "{}_outer_values.tsv".format(sample_export_mask))
            # Set labels
            ax.set_xlabel(y_col_name)
            ax.set_ylabel(self.value_col_name)
            plt.tight_layout()
            # Export PNG
            pie_file = "{}_double_donut.png".format(sample_export_mask)
            fig.suptitle(pie_file, fontsize=_BASE_FONT_SIZE)
            plt.savefig(pie_file, dpi=300, bbox_inches="tight")
            plt.close("all")
            plt.clf()
# Trim raw reads with cutadapt, assemble them with SPAdes, and record both
# stages' outputs as sampledata tables.
projectDescriber = ProjectDescriber()
rawSampledataDF = Utilities.load_tsv(
    "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata"
)
# Prepare path
rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads")
cutadaptDir = os.path.join(rawReadsDir, "cutadapt")
os.makedirs(cutadaptDir, exist_ok=True)
# Trim reads
cutadaptResults = Utilities.multi_core_queue(
    run_cutadapt, queue=rawSampledataDF.values.tolist())
cutadaptResultsDF = pd.DataFrame.from_dict(cutadaptResults).sort_values(
    "sample_name")
Utilities.dump_tsv(
    cutadaptResultsDF,
    table_file=projectDescriber.SAMPLE_DATA_FILE,
    col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"])
# Assemble reads
# NOTE(review): spadesDir is computed but not used in this chunk — presumably
# consumed inside run_spades or later in the file; verify.
spadesDir = os.path.join(rawReadsDir, "spades")
spadesResults = Utilities.single_core_queue(run_spades,
                                            cutadaptResultsDF.values.tolist())
spadesResultsDF = pd.DataFrame.from_dict(spadesResults).sort_values(
    "sample_name")
spadesResultsSampleData = os.path.join(
    os.path.dirname(projectDescriber.SAMPLE_DATA_FILE),
    "assemblies.sampledata")
Utilities.dump_tsv(spadesResultsDF,
                   table_file=spadesResultsSampleData,
                   col_names=["sample_name", "assembly"])
print("\n".join([projectDescriber.SAMPLE_DATA_FILE, spadesResultsSampleData]))
"""
# (continuation of an assignment begun above — percentage of sample read bases
# that ended up in the valid assembly)
    "genome_assembly_bp_valid"] * 100 / combined_statistics_df[
        "sample_reads_bp"]
# Estimated coverage = assembled read bases over the reference genome length
combined_statistics_df[
    "genome_assembly_coverage"] = combined_statistics_df["sample_reads_bp"] * (
        combined_statistics_df["genome_assembled_reads_percentage"] /
        100) / combined_statistics_df["reference_genome_bp"]
# Render coverage as e.g. "42.00x"
combined_statistics_df["genome_assembly_coverage"] = combined_statistics_df[
    "genome_assembly_coverage"].apply(lambda x: "{0:.2f}x".format(x))
combined_statistics_file = os.path.join(ProjectDescriber.ROOT_DIR,
                                        "sample_data",
                                        "combined_assembly_statistics.tsv")
combined_statistics_df.reset_index(inplace=True)
# Export every column except the *file path columns
Utilities.dump_tsv(combined_statistics_df,
                   combined_statistics_file,
                   col_names=[
                       i for i in combined_statistics_df.columns
                       if not i.endswith("file")
                   ])
print(combined_statistics_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/combined_assembly_statistics.tsv
# Copied into the ./datasets directory

# Decontamination (after NCBI submission)
contamination_reports_dir = os.path.join(assemblies_target_dir,
                                         "contamination")
decontaminated_assemblies_dir = os.path.join(assemblies_target_dir,
                                             "decontaminated")
# NOTE(review): this list comprehension continues past this chunk
_ = [
    os.makedirs(i, exist_ok=True)
    for i in (contamination_reports_dir, decontaminated_assemblies_dir)
    # (tail of define_strain_name(s), whose def begins in a previous chunk)
    # Undetermined samples get a fixed placeholder name
    if "undetermined" in name.lower():
        name = "UND"
    # Suffix = first character of the last underscore-separated token, lowered
    suffix = s.split("_")[-1][0].lower()
    return "_".join([_OWNER_PREFIX, name, suffix])


combined_statistics_df.reset_index(inplace=True)
combined_statistics_df["suggested_strain_name"] = combined_statistics_df[
    INDEX_COL_NAME].apply(define_strain_name)
combined_statistics_file = os.path.join(ProjectDescriber.SAMPLE_DATA_DIR,
                                        "combined_assembly_statistics.tsv")
# Export with the index column first, then all non-file, non-raw-reads columns
Utilities.dump_tsv(
    combined_statistics_df,
    combined_statistics_file,
    col_names=[INDEX_COL_NAME] + sorted([
        i for i in combined_statistics_df.columns
        if "file" not in i and i not in ("raw_reads", INDEX_COL_NAME)
    ]))
print(combined_statistics_file)
# /data1/bio/projects/vradchenko/lactobacillus_salivarius/sample_data/combined_assembly_statistics.tsv

# Copy the data
ncbi_genome_metadata_df = combined_statistics_df.loc[:, [
    INDEX_COL_NAME, "real_assembly_coverage", "assembly_file"
]].copy()
ncbi_genome_metadata_df.rename(
    columns={"real_assembly_coverage": "genome_coverage"}, inplace=True)
# NOTE(review): this statement continues past this chunk
ncbi_genome_metadata_df["assembly_date"] = ncbi_genome_metadata_df[
# Pair R1/R2 read files, sanity-check the pairing, derive sample names, and
# export both the plain and the pipeline sampledata tables.
raw_reads_dict = {
    i: sorted([
        j for j in raw_reads_files_list
        if "_{}_".format(i) in os.path.splitext(os.path.basename(j))[0]
    ])
    for i in ("R1", "R2")}
# Combine the dict into the pandas.DataFrame object
raw_sampledata_df = pd.DataFrame.from_dict(raw_reads_dict)
# Are reads files corresponding to each other?
assert all((raw_sampledata_df["R1"].str.replace("_R1_", "_R2_") ==
            raw_sampledata_df["R2"]).values.tolist() +
           (raw_sampledata_df["R2"].str.replace("_R2_", "_R1_") ==
            raw_sampledata_df["R1"]).values.tolist())
# Get the sample names from reads file names.
# Fixed regex: raw string, escaped dots, "[12]" instead of "[1|2]" (which also
# matched a literal "|"), and "S[0-9]+" instead of "S[0-9]{2}" so single-digit
# sample numbers (e.g. "_S1_") are matched as well.
raw_sampledata_df["sample_name"] = raw_sampledata_df["R1"].map(
    lambda x: Utilities.safe_findall(r"(.+)_S[0-9]+_R[12]_001\.fastq\.gz",
                                     os.path.basename(x)))
# Export sampledata
project_describer = ProjectDescriber()
raw_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data",
                                   "raw_reads.sampledata")
Utilities.dump_tsv(df=raw_sampledata_df,
                   table_file=raw_sampledata_file,
                   col_names=["sample_name", "R1", "R2"])
print(raw_sampledata_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata

# Create more detailed sampledata
raw_sampledata_df["reads_files"] = raw_sampledata_df.loc[:, [
    "R1", "R2"
]].apply(lambda x: ";".join(x), axis=1)
# Fix: the pipeline sampledata below is dumped with a "reads" column which the
# original never created (it only made "reads_files"); provide it under both
# names to stay compatible with any later use of "reads_files".
raw_sampledata_df["reads"] = raw_sampledata_df["reads_files"]
raw_sampledata_df["taxon"] = "Klebsiella pneumoniae"
pipeline_sampledata_file = os.path.join(project_describer.ROOT_DIR,
                                        "sample_data",
                                        "raw_reads_pipeline.sampledata")
Utilities.dump_tsv(df=raw_sampledata_df,
                   table_file=pipeline_sampledata_file,
                   col_names=["sample_name", "reads", "taxon"])
print(pipeline_sampledata_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads_pipeline.sampledata
reads_stats_list = Utilities.single_core_queue(
    Utilities.get_reads_stats_from_fq_gz,
    raw_sampledata_df["R1"].values.tolist())