def generate(pair_2d_array: list, regex: str = DEFAULT_REGEX, extension: str = DEFAULT_READS_EXTENSION):
    """
    Build a SampleDataArray from grouped read-file paths.

    :param pair_2d_array: list of lists, each inner list holding the read
        files (e.g. an R1/R2 pair) belonging to one sample
    :param regex: pattern used to extract the sample name from the first
        file's basename after ``extension`` is stripped
    :param extension: reads-file extension removed before applying ``regex``
    :return: SampleDataArray with one SampleDataLine per sample
    :raises ValueError: if the regex yields no sample name, or if the parsed
        name is absent from any file path of the group
    """
    arr = SampleDataArray()
    for sample_read_files in pair_2d_array:
        sample_read_files = sorted(sample_read_files)
        sample_file = os.path.basename(sample_read_files[0])
        sample_name = Utilities.safe_findall(
            regex, re.sub(f"{extension}$", "", sample_file))
        if len(sample_name) == 0:
            raise ValueError(
                f"Cannot process the file '{sample_file}' with the regex '{regex}'"
            )
        # Sanity check: every file of the group must carry the parsed name
        if any(sample_name not in i for i in sample_read_files):
            raise ValueError(
                f"Some files from the list '{sample_read_files}' do not contain {sample_name} parsed by the regex '{regex}'"
            )
        if sample_name in arr.lines:
            print(
                f"Duplicate sample data line key, the regex check is considered: '{sample_name}'"
            )
            # De-duplicate by appending an increasing numeric suffix
            counter = 0
            base_name = sample_name
            while sample_name in arr.lines:
                counter += 1
                sample_name = f"{base_name}.{counter}"
        arr.lines[sample_name] = SampleDataLine(sample_name, sample_read_files)
    return arr
def process_blast_report(high_scoring_pairs: list):
    """
    Extract reference identity fields from the top BLAST high-scoring pair.

    :param high_scoring_pairs: list of dicts sorted so that the best hit is
        first; each dict carries at least 'title' and 'assembly_file'
    :return: dict with the assembly file, the full reference header and the
        GenInfo accession ID parsed from an NCBI-style 'gi|...|' header
    """
    first_report = high_scoring_pairs[0]
    reference_header = first_report.get("title")
    # Raw string avoids invalid-escape-sequence warnings on modern Python
    accession_id = Utilities.safe_findall(r"\|* *gi\| *([^|]+) *\|",
                                          reference_header)
    return dict(assembly_file=first_report.get("assembly_file"),
                reference_header=reference_header,
                accession_id=accession_id)
def get_genera_dict(input_list: list):
    """
    Collect genus-like words (capitalized, at least five letters) from the
    given strings and map each one to an empty tuple.

    :param input_list: iterable possibly containing non-string entries,
        which are ignored
    :return: dict of sorted genus-like words, each keyed to ()
    """
    candidates = [
        Utilities.safe_findall("([A-Z][a-z]{4,})", entry).strip()
        for entry in set(input_list)
        if isinstance(entry, str)
    ]
    genera_dict = dict()
    for candidate in sorted(candidates):
        if len(candidate) > 0:
            genera_dict[candidate] = ()
    return genera_dict
def _mp_parse_nfasta_header(header):
    """
    Parse a CARD-style nucleotide FASTA header into annotation fields.

    :param header: pipe-delimited header, e.g.
        'gb|ACC|+|123-456|ARO:3000000|gene description [Host organism]'
    :return: pandas Series with former_id, genbank_id, is_antisense_strand,
        locus, aro_id, host, gene_description and gene_symbol
    :raises ValueError: if no gene-symbol candidate survives the filters
        (``min`` over an empty sequence)
    """
    output_dict = dict(former_id=header)
    # Raw strings throughout: the originals triggered invalid-escape warnings
    output_dict["genbank_id"] = Utilities.safe_findall(
        r"^gb\|([^|]+)", header)
    # The third pipe-separated field encodes the strand, e.g. '-'-prefixed
    output_dict["is_antisense_strand"] = header.split("|")[2].startswith(
        "-")
    output_dict["locus"] = Utilities.safe_findall(r"\|(\d+\-\d+)", header)
    output_dict["aro_id"] = Utilities.safe_findall(r"\|ARO:(\d+)", header)
    gene_chunk = header.split("|")[-1]
    # Host is the bracketed organism name at the end of the last chunk
    output_dict["host"] = Utilities.safe_findall(r"\[(.+)\]", gene_chunk)
    output_dict["gene_description"] = gene_chunk.replace(
        "[{}]".format(output_dict["host"]), "").strip()
    _MIN_GENE_SYMBOL_LENGTH = 3
    _NON_GENE_SYMBOL_WORDS = ("DNA", "RNA")
    # Gene symbol heuristic: shortest sufficiently-long word of the
    # description that is not a generic token
    output_dict["gene_symbol"] = min([
        j for j in [
            i.strip() for i in output_dict.get("gene_description").split(" ")
        ] if len(j) >= _MIN_GENE_SYMBOL_LENGTH and j not in _NON_GENE_SYMBOL_WORDS
    ],
                                     key=len)
    return Utilities.dict2pd_series(output_dict)
def _mp_parse_nfasta_header(header: str):
    """
    Parse a VFDB FASTA header by progressively peeling matched fragments.

    Each regex extracts one field; the matched chunk (re-created from the
    template) is then deleted from the header before the next regex runs,
    so the tuple order is significant. Spaces inside the templates are
    significant as well.

    :param header: VFDB-style header, e.g.
        'VFG000001(gb|ACC) (sym) name [gene name] [Host organism]'
    :return: dict of stripped string fields keyed by the names below
    """
    # (key, extraction regex, template used to delete the matched chunk)
    _VFDB_REGEXES = (
        ("vfdb_id", r"^VFG(\d+)", "VFG{}"),
        ("gene_accession_id", r"\(([^\(]+)\) ", "({}) "),
        ("gene_symbol", r"^\(([^\(]+)\) ", "({}) "),
        ("gene_host", r"\[([^\]]+)\]$", "[{}]"),
        ("gene_name", r" \[([^\]]+)\] $", " [{}] "),
        ("gene_description", ".*", "{}"),
    )
    out = {"former_id": header}
    # Unpack directly in the loop header instead of a separate assignment
    for key, regex, replacement in _VFDB_REGEXES:
        out[key] = Utilities.safe_findall(regex, header)
        if len(out.get(key)) > 0:
            header = header.replace(replacement.format(out.get(key)), "")
    return {k: out.get(k).strip() for k in out}
def define_species(_sample_name: str):
    """
    Map a sample name beginning with digits to its known species string.

    :param _sample_name: sample name whose leading digits identify a strain
        (e.g. '336g...' -> 'Bacillus subtilis BZR 336g')
    :return: the matched species key, or '_' (with a console warning) when
        the name has no leading digits or the number is unknown
    """
    _SPECIES = {
        "Bacillus subtilis BZR 336g": 336,
        "Bacillus subtilis BZR 517": 517,
        "Lactobacillus salivarius": 1,
        "Lactobacillus curvatus": 2,
        "Lactobacillus heilongjiangensis": 8
    }
    first_digits = Utilities.safe_findall(r"^\d+", _sample_name)
    if len(first_digits) > 0:
        sample_number = int(first_digits)
        # Iterate items once instead of a per-key dict.get() lookup
        for species_name, species_number in _SPECIES.items():
            if sample_number == species_number:
                return species_name
    print("Cannot define species: '{}'".format(_sample_name))
    return "_"
def _mp_parse_nfasta_header(header):
    """
    Parse a TADB-style (toxin-antitoxin database) FASTA header.

    The header is mutated in place during parsing (bracketed tags are
    whitespace-normalized first), so the statement order matters.

    :param header: pipe-delimited header string
    :return: dict with former_id, category, tadb_id, geninfo_id, refseq_id,
        is_antisense_strand, locus, description, gene_symbol and host;
        every string value is stripped of surrounding whitespace
    :raises ValueError: if the second pipe-delimited chunk does not start
        with 'T', 'AT' or 'RE'
    """
    out = {"former_id": header}
    # Normalize whitespace inside every square-bracketed tag
    for tag in re.findall("\[(.+)\]", header):
        header = header.replace("[{}]".format(tag),
                                "[{}]".format(tag.strip()))
    header_chunks = [i.strip() for i in header.split("|")]
    # Second chunk encodes the record category, e.g. 'T123' / 'AT45' / 'RE6'
    category_chunk = header_chunks[1].upper()
    if category_chunk.startswith("T"):
        out["category"] = "Toxin"
    elif category_chunk.startswith("AT"):
        out["category"] = "Antitoxin"
    elif category_chunk.startswith("RE"):
        out["category"] = "Regulator"
    else:
        raise ValueError("Cannot define the header's category: {}".format(header))
    out["tadb_id"] = Utilities.safe_findall("([0-9]+)", category_chunk)
    out["geninfo_id"] = Utilities.safe_findall("gi\|([0-9]+)\|",
                                               header.lower())
    # Primary RefSeq ID source: 'REF|...|' section of the upper-cased header
    ref = Utilities.safe_findall("REF\|(.+)\|", header.upper()).split("|")[0]
    if len(ref) == 0:
        # Fallback: accession-like token, e.g. 'NC_000913.3'
        try:
            ref = Utilities.safe_findall("((N|Y)(C|P|Z)_\d+\.\d*)",
                                         header.upper())[0].split("|")[0]
        except IndexError:
            pass
    out["refseq_id"] = ref
    # Locus like '|:c123-456'; a leading 'c' marks the antisense strand
    locus = Utilities.safe_findall("\|:([c]{0,1}[0-9\-]+)", header.lower())
    out["is_antisense_strand"] = locus.startswith("c")
    out["locus"] = locus.replace("c", "")
    tail = header_chunks[-1]
    # Bool-indexing trick: ["", "c"][flag] re-adds the 'c' prefix when the
    # strand is antisense so the locus substring can be removed.
    # NOTE(review): `ref` was parsed from the UPPER-cased header; replacing
    # it against the original-case tail may be a no-op — confirm.
    out["description"] = tail.split("[")[0].replace(
        ":{}{}".format(["", "c"][out["is_antisense_strand"]], out["locus"]),
        "").replace(ref, "")
    # Gene symbol is the last bracketed tag of the tail
    out["gene_symbol"] = Utilities.safe_findall("\[([^\[]+)\]$", tail)
    host = ""
    # With multiple bracketed tags, the first one is assumed to be the host
    if tail.count("[") > 1:
        host = Utilities.safe_findall("\[([^\[]+)\]", tail, 0)
    if len(host) == 0:
        # Fallback: binomial-looking name, e.g. 'Escherichia coli'
        host = Utilities.safe_findall("([A-Z][a-z]+ [a-z]+[\.]{0,1})", tail)
    out["host"] = host
    # Final pass: strip every string value
    for key in out:
        if isinstance(out.get(key), str):
            out[key] = out.get(key).strip()
    return out
def process_genbank_report(d: dict):
    """
    Summarize the first GenBank record of a parsed assembly report.

    :param d: dict with 'genbank_records' (Biopython SeqRecord list, first
        record is used) and 'assembly_file'
    :return: dict with the strain string ('Genus species strain-name'),
        NCBI taxonomy ID, reference accession, CDS count, reference length
        in bp and the record description
    """
    genbank_records = d.get("genbank_records")
    genbank_record = genbank_records[0]
    cds_number = len([i for i in genbank_record.features if i.type == "CDS"])
    # Qualifiers of the (single) 'source' feature carry organism metadata
    qualifiers_dict = [
        i.qualifiers for i in genbank_record.features if i.type == "source"
    ][0]
    # First two words of the organism name, e.g. ['Klebsiella', 'pneumoniae']
    organism = Utilities.remove_empty_values(
        qualifiers_dict.get("organism")[0].split(" "))[:2]
    strain = " ".join(organism + [qualifiers_dict.get("strain")[0]])
    # The taxonomy ID lives in the 'db_xref' entry shaped like 'taxon:1234'
    taxonomy_id = Utilities.safe_findall(r"\d+", [
        i for i in qualifiers_dict.get("db_xref")
        if i.split(":")[0].strip() == "taxon"
    ][0])
    return dict(assembly_file=d.get("assembly_file"),
                strain=strain,
                taxonomy_id=taxonomy_id,
                reference_accession_id=genbank_record.id,
                cds_number=cds_number,
                reference_bp=len(genbank_record),
                reference_description=genbank_record.description)
# Export the CARD association digest as TSV and prepare a stacked bar chart.
os.makedirs(card_digest_dir, exist_ok=True)
association_digest.reset_index().to_csv(os.path.join(
    card_digest_dir, "digest_card_{}_{}.tsv".format(value_col_name,
                                                    annotation_col_name)),
                                        sep="\t",
                                        index=False,
                                        header=True)
# Column-wise percentages (each column sums to 100)
association_digest_percentage = association_digest * 100 / association_digest.sum(
)
fig = plt.figure()
sns.set(style="whitegrid", font_scale=1)
# Shorten column labels to the bare sample names parsed out of the
# hard-coded coverage-file path template
export_df = association_digest.rename(
    columns={
        i: Utilities.safe_findall(
            "/data1/bio/projects/inicolaeva/klebsiella_infants/map_data/Statistics/(.+)_card_v3.0.1_coverage.tsv",
            i)
        for i in list(association_digest)
    }).transpose()
export_df.index.name = "sample_name"
# NOTE(review): stacked='True' is a truthy *string*, not the bool True —
# works, but probably unintended; confirm before changing
ax = export_df.plot(kind='bar', stacked='True', figsize=(20, 10))
ax.set_ylabel(value_col_name)
# Legend placed outside the axes on the right
legend = ax.legend(loc="center left",
                   shadow=True,
                   fontsize="x-small",
                   bbox_to_anchor=(1.04, 0.5),
                   borderaxespad=0)
image_file_name = os.path.join(
    card_digest_dir, "digest_card_{}_{}.png".format(value_col_name,
                                                    annotation_col_name))
# Annotate RefSeq-backed tree leaves from their GenBank assembly files and
# dump the result as a TSV for iTOL.
node_names = [
    j for j in [i.name for i in tree.find_clades()]
    if j is not None and j.startswith("GCF")
]
annotations_list = []
for node_name in node_names:
    # node_name = "GCF_005377825.1_ASM537782v1"
    genbank_file = os.path.join(genbank_dir,
                                "{}_genomic.gbff".format(node_name))
    seq_records = list(SeqIO.parse(genbank_file, "genbank"))
    # Only the first record's annotations are considered
    annotation_dict = {
        i: flatten_string(seq_records[0].annotations.get(i))
        for i in ["organism", "date", "comment"]
    }
    annotation_dict["comment"] = remove_maintenance_comments(
        annotation_dict["comment"])
    # NOTE(review): '[S|s]' also matches a literal '|' and ':*' means zero
    # or more colons — likely intended '[Ss]train:?'; confirm before changing
    annotation_dict["strain"] = Utilities.safe_findall(
        "[S|s]train:* ([^ ]+)", seq_records[0].description)
    # RefSeq accession is the 'GCF_' prefix up to the next underscore
    annotation_dict["refseq_id"] = Utilities.safe_findall(
        "GCF_[^_]+", node_name)
    # Assembly ID is whatever remains after removing the accession
    annotation_dict["assembly_id"] = node_name.replace(
        annotation_dict["refseq_id"], "").strip("_")
    annotations_list.append(annotation_dict)
annotations_df = pd.DataFrame(annotations_list)
Utilities.dump_tsv(
    annotations_df,
    "/data1/bio/projects/inicolaeva/klebsiella_infants/roary/newick/iTOL_collapsed_tree_annotation.tsv"
)
# Get the raw reads files raw_reads_files_dir = ProjectDescriber.RAW_DATA_DIR raw_reads_files_list = [ i for i in Utilities.scan_whole_dir(raw_reads_files_dir) if i.endswith("_001.fastq.gz") ] # Split them into the two groups STRANDS = ("R1", "R2") raw_reads_list = [] for raw_reads_files_pair in Utilities.get_most_similar_word_pairs( raw_reads_files_list): # Illumina file names have template '[sample]_[sequence]_[lane]_[strand]_[number].fastq.gz' # E.g: '336g_S1_L001_R1_001.fastq.gz' sample_name = Utilities.safe_findall( "(.+)_S[0-9]+_L[0-9]+_R[0-9]+_[0-9]+", os.path.basename(raw_reads_files_pair[0])) raw_reads_dict = dict(sample_name=sample_name) for raw_reads_file in raw_reads_files_pair: for reads_strand in STRANDS: if "_{}_".format(reads_strand) in os.path.splitext( os.path.basename(raw_reads_file))[0]: raw_reads_dict[reads_strand] = raw_reads_file if all([ raw_reads_dict.get(STRANDS[0]).replace("_{}_".format( STRANDS[0]), "_{}_".format(STRANDS[-1])) == raw_reads_dict.get( STRANDS[-1]) ] + [ raw_reads_dict.get(STRANDS[-1]).replace("_{}_".format( STRANDS[-1]), "_{}_".format(STRANDS[0])) == raw_reads_dict.get( STRANDS[0])
assembly_files = [ i for i in Utilities.scan_whole_dir(assembler_result_dir) if os.path.basename(i) == "contigs.fasta" ] assemblies_target_dir = "/data1/bio/projects/inicolaeva/klebsiella_infants/assemblies" sample_dirs = sorted( set([os.path.dirname(os.path.dirname(i)) for i in assembly_files])) _ = subprocess.getoutput("rm -rf {}".format(assemblies_target_dir)) os.makedirs(assemblies_target_dir, exist_ok=True) assemblies_annotations = [] for sample_dir in sample_dirs: sample_name = os.path.basename(sample_dir) sample_number = Utilities.safe_findall("([0-9]+)", sample_name) sample_assemblies = [i for i in assembly_files if i.startswith(sample_dir)] assemblies_annotation = dict() seq_records_processed = [] plasmid_counter = 0 assembly_target_file = os.path.join(assemblies_target_dir, "{}_genome.fna".format(sample_name)) for assembly_file_raw in sample_assemblies: for assembly_type in ASSEMBLY_TYPES: if os.path.dirname(assembly_file_raw).endswith(assembly_type): seq_records = sorted(list( SeqIO.parse(assembly_file_raw, "fasta")), key=lambda x: len(x), reverse=True) assemblies_annotation["sample_name"] = sample_name assemblies_annotation["{}_file".format(
def generate_genera_dict(keywords: list):
    """
    Extract the genus-like word (capitalized, five or more letters) from
    each keyword and build the digest keywords dict from them.

    :param keywords: list of keyword strings
    :return: keywords dict produced by DigestAssociationsKeeper
    """
    genera = []
    for keyword in keywords:
        genera.append(Utilities.safe_findall("([A-Z][a-z]{4,})", keyword))
    return DigestAssociationsKeeper.generate_keywords_dict(genera)
# Get the raw reads files
raw_reads_files_dir = "/data1/bio/190405_M01969_0041_000000000-C6B66/Conversion_shotgun/Klebsiella"
# Only keep FASTQ files lying directly in the target dir (no subdirectories)
raw_reads_files_list = [i for i in Utilities.scan_whole_dir(raw_reads_files_dir)
                        if os.path.normpath(os.path.dirname(i)) == raw_reads_files_dir
                        and i.endswith("_001.fastq.gz")]
# Split them into the two strand groups
raw_reads_dict = {
    i: sorted([j for j in raw_reads_files_list
               if "_{}_".format(i) in os.path.splitext(os.path.basename(j))[0]])
    for i in ("R1", "R2")}
# Combine the dict into the pandas.DataFrame object
raw_sampledata_df = pd.DataFrame.from_dict(raw_reads_dict)
# Are reads files corresponding to each other?
assert all((raw_sampledata_df["R1"].str.replace("_R1_", "_R2_") == raw_sampledata_df["R2"]).values.tolist() + (
    raw_sampledata_df["R2"].str.replace("_R2_", "_R1_") == raw_sampledata_df["R1"]).values.tolist())
# Get the sample names from reads file names.
# Raw string; 'S[0-9]+' also accepts single-/triple-digit sample numbers
# (the old 'S[0-9]{2}' silently yielded empty names for those), and '[12]'
# replaces '[1|2]', whose literal '|' was never intended.
raw_sampledata_df["sample_name"] = raw_sampledata_df["R1"].map(
    lambda x: Utilities.safe_findall(r"(.+)_S[0-9]+_R[12]_001\.fastq\.gz", os.path.basename(x)))
# Export sampledata
project_describer = ProjectDescriber()
raw_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "raw_reads.sampledata")
Utilities.dump_tsv(df=raw_sampledata_df, table_file=raw_sampledata_file, col_names=["sample_name", "R1", "R2"])
print(raw_sampledata_file)
# /data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata
# Create more detailed sampledata.
# Fix: the column must be named 'reads' — the export below selects
# col_names=["sample_name", "reads", "taxon"], and the old name
# 'reads_files' left that column missing from the output.
raw_sampledata_df["reads"] = raw_sampledata_df.loc[:, ["R1", "R2"]].apply(lambda x: ";".join(x), axis=1)
raw_sampledata_df["taxon"] = "Klebsiella pneumoniae"
pipeline_sampledata_file = os.path.join(project_describer.ROOT_DIR, "sample_data", "raw_reads_pipeline.sampledata")
Utilities.dump_tsv(df=raw_sampledata_df, table_file=pipeline_sampledata_file, col_names=["sample_name", "reads", "taxon"])