def analyze_tblastn_out(tblastn_out_path, orfs_fasta_path, in_fasta, btax_data, res_gtf_json,
                        num_threads=conf_constants.num_threads, work_dir="", save_alignments=False,
                        save_trees=False):
    """Compute per-ORF homology statistics and attach them to GTF-like records.

    For every ORF in ``orfs_fasta_path`` a worker task (``get_orf_stats``) is
    scheduled with the ORF's protein sequence, its nucleotide sequence cut from
    ``in_fasta`` and its tblastn homologs; the resulting stats are JSON-encoded
    into the ``"attribute"`` field of the matching ``res_gtf_json`` record.

    :param tblastn_out_path: path to the tblastn output readable by read_blast_out
    :param orfs_fasta_path: path to FASTA with the (translated) ORF sequences
    :param in_fasta: path to the nucleotide FASTA the ORFs were predicted from
    :param btax_data: dict form of BtaxInfo for the base taxon
    :param res_gtf_json: dict of GTF-like records keyed by ORF id (mutated and returned)
    :param num_threads: size of the worker process pool
    :param work_dir: directory for per-ORF temporary files
    :param save_alignments: keep per-ORF alignment FASTA files
    :param save_trees: keep per-ORF tree files
    :return: ``res_gtf_json`` with ``"attribute"`` filled for scored ORFs
    """
    in_fasta_dict = load_fasta_to_dict(fasta_path=in_fasta)
    btax_info = BtaxInfo.load_from_dict(btax_data)
    # Manager dict so worker processes can write their results back.
    orfs_stats = mp.Manager().dict()
    seq_ids_to_orgs = btax_info.fna_id
    tblatn_out_dict = read_blast_out(blast_out_path=tblastn_out_path)
    orfs_fasta_dict = load_fasta_to_dict(fasta_path=orfs_fasta_path)
    params_list = list()
    for seq_id in orfs_fasta_dict:
        chr_id, c1, c2, ori = parse_orf_id(seq_id)
        if ori == "+":
            orf_nucl_seq = in_fasta_dict[chr_id][c1 - 1:c2]
        else:
            # Minus-strand ORF: take the reverse complement of the slice.
            orf_nucl_seq = str(Seq(in_fasta_dict[chr_id][c1 - 1:c2]).reverse_complement())
        params_list.append({
            "function": get_orf_stats,
            "orf_id": seq_id,
            "orf_homologs_prot": {seq_id: orfs_fasta_dict[seq_id]},
            "orf_homologs_nucl": {seq_id: orf_nucl_seq},
            # NOTE(review): assumes every ORF id appears in the tblastn output;
            # a missing id would raise KeyError here - confirm upstream.
            "homologs_list": tblatn_out_dict[seq_id],
            "btax_data": btax_data,
            "seq_ids_to_orgs": seq_ids_to_orgs,
            "orfs_stats": orfs_stats,
            "work_dir": work_dir,
            "save_alignment": save_alignments,
            "save_tree": save_trees
        })
    pool = mp.Pool(num_threads)
    pool.map(worker, params_list)
    # list(map(worker, params_list))  # for debug
    pool.close()
    pool.join()
    # Drop the large intermediate dicts before post-processing.
    tblatn_out_dict = None
    orfs_fasta_dict = None
    eagle_logger.info("ORFs stats calculated")
    for orf_id in orfs_stats.keys():
        try:
            res_gtf_json[orf_id]["attribute"] = json.dumps(orfs_stats[orf_id])
        except KeyError:
            # ORF scored but absent from the GTF records - skip silently.
            pass
    orfs_stats = None
    return res_gtf_json
def nucl_by_prot_aln(self, nucl_fasta_dict=None, nucl_fasta_path=None):
    """Build the nucleotide alignment corresponding to this protein alignment.

    Each aligned protein is located inside the translation of its nucleotide
    sequence; the matching codon region is then gapped according to the
    protein alignment (via ``nucl_accord_prot``).

    :param nucl_fasta_dict: mapping seq_name -> nucleotide sequence
    :param nucl_fasta_path: FASTA path used when no dict is given
    :return: a new nucleotide sub-alignment, or None on error
    """
    def report_error(message):
        # Prefer the configured logger; fall back to plain printing.
        if self.logger:
            self.logger.error(message)
        else:
            print("ERROR: " + message)

    if self.aln_type.lower() not in ("protein", "prot", "p"):
        report_error("reference alignment type is not protein")
        return
    if not nucl_fasta_dict:
        if not nucl_fasta_path:
            report_error("no nucleotide sequences are input")
            return
        nucl_fasta_dict = load_fasta_to_dict(fasta_path=nucl_fasta_path)
    aligned_nucl = dict()
    for seq_name in self.seq_names:
        # Ungapped protein with stop codons as wildcards, used as a regex pattern.
        prot_pattern = re.sub("[-\.]", "", self[seq_name]).replace("*", ".")
        translated = str(Seq(nucl_fasta_dict[seq_name]).translate())
        hit = re.search(prot_pattern, translated)
        # Protein-space coordinates * 3 give the nucleotide coordinates.
        aligned_nucl[seq_name] = nucl_accord_prot(self[seq_name],
                                                  nucl_fasta_dict[seq_name][hit.start() * 3: hit.end() * 3])
    return self._sub_mult_aln(SeqsDict.load_from_dict(aligned_nucl),
                              aln_type="nucl",
                              aln_name="nucl_" + self.aln_name)
def run_blast_search(self, blast_type, query, db, out, num_threads=1, outfmt=7, max_hsps=100, **kwargs):
    """Run a BLAST+ search, optionally splitting the query FASTA between processes.

    With ``num_threads > 1`` (and ``split_input`` not disabled) the query file is
    split into up to ``num_threads`` chunks, one child process per chunk reruns
    this same method single-threaded, and the per-chunk outputs are concatenated
    into ``out``. Otherwise a single BLAST command line is built and executed.

    :param blast_type: BLAST+ program name, e.g. "blastn" or "tblastn"
    :param query: path to the query FASTA file
    :param db: path to the BLAST database
    :param out: path for the (joined) BLAST output
    :param num_threads: number of parallel worker processes
    :param outfmt: BLAST output format code (7 = tabular with comment lines)
    :param max_hsps: maximal number of HSPs per subject sequence
    :param kwargs: ``split_input`` (bool), ``remove_tmp`` (bool), ``word_size``
        (NOTE(review): must be a *string* - it is concatenated into the command
        line without str(); confirm callers pass str)
    """
    if num_threads > 1 and kwargs.get("split_input", True):
        if self.logger is not None:
            self.logger.info("splitting '%s' into %s parts" % (query, num_threads))
        else:
            print("INFO: splitting '%s' into %s parts" % (query, num_threads))
        if not os.path.exists(self.tmp_dir):
            os.makedirs(self.tmp_dir)
        query_dict = load_fasta_to_dict(fasta_path=query, dat_path=os.path.join(self.tmp_dir, ".%s.dat" % query))
        # Every chunk but possibly the last gets query_chunk_size records.
        query_chunk_size = len(query_dict) // num_threads + 1
        p_list = list()
        query_seqs = list(query_dict.keys())
        i = 0
        query_chunks_list = list()
        for i in range(num_threads):
            query_chunk_path = None
            query_chunk_path = os.path.join(self.tmp_dir, ("_%s" % i).join(os.path.splitext(os.path.basename(query))))
            # Each entry is [chunk FASTA path, chunk BLAST output path].
            query_chunks_list.append([query_chunk_path, query_chunk_path + ".bl"])
            if len(query_dict) == i*query_chunk_size:
                # All sequences already distributed - drop the unused chunk entry.
                del query_chunks_list[-1]
                continue
            elif (i+1) * query_chunk_size > len(query_dict):
                # Last chunk: take the remaining tail of sequences.
                query_dict.get_sample(query_seqs[i * query_chunk_size:]).dump(seqs_path=query_chunks_list[-1][0])
            else:
                query_dict.get_sample(query_seqs[i*query_chunk_size: (i+1)*query_chunk_size]).dump(seqs_path=query_chunks_list[-1][0])
            # Recurse single-threaded on the chunk in a child process.
            p = mp.Process(target=self.run_blast_search,
                           args=(blast_type, query_chunks_list[-1][0], db, query_chunks_list[-1][1],
                                 1, outfmt, max_hsps),
                           kwargs=kwargs)
            p.start()
            p_list.append(p)
        for p in p_list:
            p.join()
        join_files(in_files_list=list(map(lambda p: p[1], query_chunks_list)), out_file_path=out)
        if kwargs.get("remove_tmp", True):
            shutil.rmtree(self.tmp_dir)
    else:
        blast_search_cmd = os.path.join(self.inst_dir, blast_type) + \
                           " -query " + query + \
                           " -db " + db + \
                           " -out " + out + \
                           " -word_size " + kwargs.get("word_size", str(3)) + \
                           " -num_threads " + str(num_threads) + \
                           " -outfmt " + str(outfmt) + \
                           " -max_hsps " + str(max_hsps)
        if self.logger is not None:
            self.logger.info("run '%s' command" % blast_search_cmd)
        else:
            print("INFO: run '%s' command" % blast_search_cmd)
        # NOTE(review): shell=True with paths interpolated into the command
        # string - safe only for trusted inputs.
        subprocess.call(blast_search_cmd, shell=True)
def detect_seqs_type(fasta_path=None, fasta_dict=None, nuc_freq_thr=0.75):
    """Guess whether a set of sequences is nucleotide or protein.

    The fraction of a/c/g/t letters (case-insensitive, gaps removed) over the
    total length is compared against ``nuc_freq_thr``.

    Bug fixed: previously the counting only ran when the sequences were loaded
    from ``fasta_path``, so passing ``fasta_dict`` directly always returned
    None. Also guards against empty input (previously ZeroDivisionError).

    :param fasta_path: path to a FASTA file (used when no dict is given)
    :param fasta_dict: mapping seq_id -> sequence string
    :param nuc_freq_thr: minimal a/c/g/t fraction to call the set nucleotide
    :return: "nucl", "prot", or None when no sequence data is available
    """
    if not fasta_dict:
        if not fasta_path:
            return None
        fasta_dict = load_fasta_to_dict(fasta_path)
    seqs_list = list()
    summ_l = 0
    for seq_key in fasta_dict.keys():
        # Normalize case and strip alignment gaps before counting.
        seqs_list.append(fasta_dict[seq_key].lower().replace("-", ""))
        summ_l += len(seqs_list[-1])
    if summ_l == 0:
        # No residues at all - type cannot be determined.
        return None
    let_counts = Counter("".join(seqs_list))
    if float(let_counts.get("a", 0) + let_counts.get("c", 0) + let_counts.get("g", 0) +
             let_counts.get("t", 0)) / float(summ_l) >= nuc_freq_thr:
        return "nucl"
    return "prot"
def run_hmmscan(self, profiles_db, in_fasta, num_threads=4, out_path=None):
    """Shred the input sequences into overlapping chunks and hmmscan them.

    Long sequences are cut into 50 kb parts with 5 kb overlap, written to a
    temporary FASTA, scanned against ``profiles_db``, and the temporary
    directory is removed afterwards.

    :param profiles_db: path to the HMM profiles database
    :param in_fasta: path to the input FASTA file
    :param num_threads: value passed to hmmscan --cpu
    :param out_path: report path; derived from ``in_fasta`` when omitted
    """
    if not os.path.exists(self.tmp_dir):
        os.makedirs(self.tmp_dir)
    shredded_fasta_path = os.path.join(self.tmp_dir, os.path.basename(in_fasta))
    if not out_path:
        # Replace everything after the last "." anywhere in the path with
        # "hsr", or append ".hsr" when the path has no dot at all.
        stem, dot, _tail = in_fasta.rpartition(".")
        out_path = (stem if dot else in_fasta) + ".hsr"
    source_seqs = load_fasta_to_dict(fasta_path=in_fasta)
    shredded = shred_seqs(fasta_dict=source_seqs, part_l=50000, parts_ov=5000)
    chunks_to_scan = OrderedDict()
    for seq_id in shredded:
        # Chunk ids are the source id plus a 1-based part counter.
        for chunk_no, chunk_seq in enumerate(shredded[seq_id], 1):
            chunks_to_scan[seq_id + "_" + str(chunk_no)] = chunk_seq
    dump_fasta_dict(fasta_dict=chunks_to_scan, fasta_path=shredded_fasta_path)
    hmmscan_cmd = "%s --cpu %s %s %s > %s" % (os.path.join(self.inst_dir, "hmmscan"),
                                              num_threads, profiles_db,
                                              shredded_fasta_path, out_path)
    subprocess.call(hmmscan_cmd, shell=True)
    shutil.rmtree(self.tmp_dir, ignore_errors=True)
def prepare_family(family_name, family_data, bact_fam_f_path, db_dir, **kwargs):
    """Prepare all database artifacts for one bacterial family.

    Loads the 16S rRNA sequences of every strain, shortens sequence/organism
    ids, builds the rRNA alignment and tree, writes the alignment block TSV and
    FASTA, downloads genome files, builds the family BLAST database and HMM
    profiles, and appends the resulting family record to ``bact_fam_f_path``.

    Bug fixed: the id-shortening loop used to pop and insert keys of
    ``rRNA_seqs_dict`` while iterating its live ``.keys()`` view, which is
    undefined behavior in Python 3 (possible RuntimeError or re-visiting newly
    inserted keys); the keys are now snapshotted first.

    :param family_name: name of the family being prepared
    :param family_data: nested dict {genus: {species: {strain: info}}} (mutated)
    :param bact_fam_f_path: path of the JSON-fragments file appended to
    :param db_dir: directory where all artifacts are written
    """
    # TODO: refactor it
    special_keys = ("16S_rRNA_tree", "16S_rRNA_tsv", "16S_rRNA_fasta", "blastdb", "repr_profile")
    rRNA_seqs_dict = dict()  # {seq_id: seq}
    ids_to_org_dict = dict()  # {seq_id: bacterium_name}
    for genus in family_data.keys():
        for species in family_data[genus].keys():
            for strain in family_data[genus][species].keys():
                bacterium_rRNA_dict = load_fasta_to_dict(family_data[genus][species][strain]["16S_rRNA_file"])
                for rRNA_id in bacterium_rRNA_dict.keys():
                    # Keep only the accession part of the FASTA header id.
                    new_rRNA_id = rRNA_id.split(" ")[0].split("|")[1]
                    ids_to_org_dict[new_rRNA_id] = strain
                    rRNA_seqs_dict[new_rRNA_id] = bacterium_rRNA_dict[rRNA_id]
    red, reduced_orgs = reduce_seq_names(fasta_dict=dict(map(lambda x: (x, True), set(ids_to_org_dict.values()))),
                                         num_letters=7,
                                         num_words=2)
    rev_reduced_orgs = dict(map(lambda x: (x[1], x[0]), reduced_orgs.items()))
    comp_seq_id_dict = defaultdict(int)
    short_ids_dict = dict()
    # BUGFIX: iterate over a snapshot of the keys - the loop body pops the
    # original id and inserts the shortened one into the same dict.
    for seq_id in list(rRNA_seqs_dict.keys()):
        short_seq_id = rev_reduced_orgs[ids_to_org_dict[seq_id]] + "x" + \
            str(get_un_fix(un_num=comp_seq_id_dict[rev_reduced_orgs[ids_to_org_dict[seq_id]]], fix_len=2))
        comp_seq_id_dict[rev_reduced_orgs[ids_to_org_dict[seq_id]]] += 1
        short_ids_dict[short_seq_id] = rev_reduced_orgs[ids_to_org_dict[seq_id]] + "x" + seq_id
        ids_to_org_dict[short_ids_dict[short_seq_id]] = {"organism_name": ids_to_org_dict.pop(seq_id)}
        rRNA_seqs_dict[short_seq_id] = rRNA_seqs_dict.pop(seq_id)
    eagle_logger.info("%s rRNA loaded" % family_name)
    # TODO: follows
    ### This section will be upgraded with my own alignment method but now MUSCLE and hmmer 3 are used
    tmp_fam_dir = os.path.join(db_dir, family_name + "_tmp")
    rRNA_aln = construct_mult_aln(seq_dict=rRNA_seqs_dict,
                                  method="MUSCLE",
                                  aln_type="nucl",
                                  aln_name=family_name + "_rRNA_aln",
                                  tmp_dir=tmp_fam_dir,
                                  muscle_exec_path=conf_constants.muscle_exec_path,
                                  emboss_inst_dir=conf_constants.emboss_inst_dir,
                                  hmmer_inst_dir=conf_constants.hmmer_inst_dir,
                                  logger=eagle_logger)
    eagle_logger.info("%s rRNA alignment constructed" % family_name)
    rRNA_aln.short_to_full_seq_names = short_ids_dict  # May be errors: not tested
    # If I use my own alignment method: method="spec_pos"
    rRNA_aln.remove_paralogs(ids_to_org_dict, method="min_dist", inplace=True)
    rRNA_tree = build_tree_by_dist(rRNA_aln.get_distance_matrix(),
                                   full_seq_names=rRNA_aln.full_to_short_seq_names,
                                   tmp_dir=tmp_fam_dir,
                                   logger=eagle_logger)
    # TODO: write it for not only repr bacteria usage
    # fam_tax = {family_name: get_tree_from_dict(family_data, stop_level=3, special_keys=special_keys)}
    # rRNA_tree, removed_seqs = rRNA_tree.according_to_taxonomy(taxonomy=fam_tax)
    # rRNA_aln.remove_seqs(seqs_list=removed_seqs)
    ###
    family_data["16S_rRNA_tree"] = {"newick": rRNA_tree.newick,
                                    "full_seq_names": rRNA_tree.full_seq_names}
    family_data["16S_rRNA_tsv"] = os.path.join(db_dir, family_name + "_16S_rRNA.tsv")
    family_data["16S_rRNA_fasta"] = os.path.join(db_dir, family_name + "_16S_rRNA.fasta")
    rRNA_aln.get_blocks_tsv(tsv_path=family_data["16S_rRNA_tsv"],
                            fasta_path=family_data["16S_rRNA_fasta"],
                            meta_dict=ids_to_org_dict)
    remained_orgs = list(map(lambda seq_id: ids_to_org_dict[seq_id]["organism_name"], rRNA_aln.seq_names()))
    family_data = clean_btax_data(family_data, remained_orgs, stop_level=3, special_keys=special_keys)
    family_data = download_btax_files(key_prefix_pairs={"fna_file": "_genomic.fna.gz"},
                                      btax_data=family_data,
                                      download_dir=db_dir,
                                      logger=eagle_logger)
    family_data["fam_fna"], family_data["chr_id"] = get_btax_fna(fna_key="fna_file",
                                                                 btax_genomes=family_data,
                                                                 btax_name=family_name,
                                                                 db_dir=db_dir)
    family_data["blastdb"] = create_btax_blastdb(btax_fna_path=family_data["fam_fna"],
                                                 btax_name=family_name,
                                                 db_dir=db_dir,
                                                 blast_inst_dir=conf_constants.blast_inst_dir,
                                                 logger=eagle_logger)
    # repr_alns = <function that builds alignments for set of representative genes (returns dict = {aln_name: MultAln object})>
    # TODO: the source should be repr_alns
    family_data["repr_profile"] = generate_btax_profiles(source={"16S_rRNA": rRNA_aln},
                                                         db_dir=db_dir,
                                                         btax_name=family_name,
                                                         method="hmmer")
    # family_data["codon_usage"] = get_btax_cu(family_data)
    bact_fam_json_f = open(bact_fam_f_path, 'a')
    bact_fam_json_f.write(' "' + family_name + '": ' + json.dumps(family_data) + ",\n")
    bact_fam_json_f.close()
    eagle_logger.info("%s prepared" % family_name)
def get_btax_dict(genomes_list, btax_level, btc_profiles, db_dir, num_threads=None, build_tree=False,
                  config_path=None, **kwargs):
    """Group genomes into base taxa and build per-taxon alignments, profiles and trees.

    :param genomes_list: list of GenomeInfo-style dicts (falsy entries skipped)
    :param btax_level: taxonomy depth counted from the end of the taxonomy list
    :param btc_profiles: list of SeqProfileInfo-style dicts for the base-taxon-
        classifying (btc) sequences
    :param db_dir: output directory for matrices, alignments and profiles
    :param num_threads: worker count; falls back to conf_constants.num_threads
    :param build_tree: also compute per-taxon distance stats and reference trees
    :param config_path: optional config to refresh global constants from
    :param kwargs: ``aln_tmp_dir``, ``save_alignments``, plus options forwarded
        to construct_mult_aln (e.g. low_memory)
    :return: dict {btax_name: BtaxInfo-as-json}
    """
    if config_path:
        conf_constants.update_by_config(config_path=config_path)
        conf_constants_db.update_by_config(config_path=config_path)
    if not num_threads:
        num_threads = conf_constants.num_threads
    else:
        conf_constants.num_threads = num_threads
    btax_dict = defaultdict(BtaxInfo)
    btc_fasta_dict = defaultdict(dict)  # {btc profile name: {seq_id: seq}}
    seq_ids_to_orgs = dict()  # {btc seq_id: organism name}
    for genome_dict in genomes_list:
        if not genome_dict:
            continue
        genome_info = GenomeInfo.load_from_dict(genome_dict)
        if not genome_info.btc_seqs_id:
            continue
        btax_name = None
        try:
            btax_name = genome_info.taxonomy[-btax_level]
        except IndexError:
            # Taxonomy shorter than the requested level - fall back to its root.
            btax_name = genome_info.taxonomy[0]
        btax_dict[btax_name].genomes.append(genome_info.get_json())
        if btax_dict[btax_name].name is None:
            btax_dict[btax_name].name = btax_name
        btc_seqs_fasta_dict = load_fasta_to_dict(genome_info.btc_seqs_fasta)
        for btc_seq_id in genome_info.btc_seqs_id:
            seq_ids_to_orgs[btc_seq_id] = genome_info.org_name
            btc_fasta_dict[genome_info.btc_seqs_id[btc_seq_id]][btc_seq_id] = btc_seqs_fasta_dict[btc_seq_id]
    btc_profile_types = dict()
    for btc_profile_dict in btc_profiles:
        btc_profile_info = SeqProfileInfo.load_from_dict(btc_profile_dict)
        btc_profile_types[btc_profile_info.name] = btc_profile_info.seq_type
    btc_dist_dict = dict()
    btc_aln_dict = dict()
    short_to_full_seq_names = dict()
    for btc_profile_name in btc_fasta_dict:
        btc_mult_aln = construct_mult_aln(seq_dict=btc_fasta_dict[btc_profile_name],
                                          aln_type=btc_profile_types[btc_profile_name],
                                          aln_name=btc_profile_name + "_aln",
                                          tmp_dir=kwargs.get("aln_tmp_dir", "mult_aln_tmp"),
                                          method=conf_constants_db.btc_profile_aln_method,
                                          num_threads=num_threads,
                                          logger=eagle_logger,
                                          op=5.0,
                                          ep=0.5,
                                          **kwargs)  # low_memory can be set through kwargs
        # TODO: only the code from else block should be remained after moving 16S rRNA obtaining out from get_bacteria_from_ncbi
        if btc_profile_name == "16S_rRNA":
            # Strip NCBI "lcl|NC_/NZ_" prefixes before shortening the names.
            btc_mult_aln.short_to_full_seq_names = \
                reduce_seq_names({re.sub("lcl\|(N(C|Z)_)?", "", seq_name): seq_name
                                  for seq_name in btc_mult_aln},
                                 num_letters=10, num_words=1)[0]
        else:
            btc_mult_aln.short_to_full_seq_names = short_to_full_seq_names.copy()
        btc_mult_aln.remove_paralogs(seq_ids_to_orgs=seq_ids_to_orgs, inplace=True)
        btc_mult_aln.improve_aln(inplace=True)
        btc_dist_dict[btc_profile_name] = btc_mult_aln.get_distance_matrix()  # TODO: implement specific positions method
        short_to_full_seq_names.update(btc_mult_aln.short_to_full_seq_names)
        if kwargs.get("save_alignments", False):
            btc_mult_aln.dump_alignment(aln_fasta_path=os.path.join(db_dir, btc_mult_aln.aln_name + ".fasta"))
        btc_mult_aln.rename_seqs(seq_ids_to_orgs)
        btc_aln_dict[btc_profile_name] = deepcopy(btc_mult_aln)
    global_dist_matr = get_global_dist(btc_dist_dict, btc_profiles, seq_ids_to_orgs)
    global_dist_matr_path = os.path.join(db_dir, BACTERIA_GLOBAL_DIST_MATRIX)
    short_to_full_seq_names_path = os.path.join(db_dir, BACTERIA_SHORT_TO_FULL_ORG_NAMES)
    # dump() returns the short->full name map actually used in the phylip file.
    short_to_full_seq_names = global_dist_matr.dump(matrix_path=global_dist_matr_path, matr_format="phylip")
    with open(short_to_full_seq_names_path, "w") as short_to_full_org_names_f:
        json.dump(short_to_full_seq_names, short_to_full_org_names_f, indent=2)
    eagle_logger.info("base taxons standardisation started")
    btax_dict = standardize_btax(btax_dict=btax_dict, global_dist_matr=global_dist_matr)
    eagle_logger.info("base taxons standardisation finished")
    full_to_short_seq_names = {v: k for k, v in short_to_full_seq_names.items()}
    for btax_name in btax_dict:
        btax_orgs = set(GenomeInfo.load_from_dict(genome).org_name
                        for genome in btax_dict[btax_name].genomes)
        if build_tree:
            btax_dict[btax_name].mean_d = global_dist_matr[btax_orgs].mean_dist
            btax_dict[btax_name].median_d = global_dist_matr[btax_orgs].median_dist
            # A meaningful tree needs at least three organisms.
            if len(btax_orgs) > 2:
                btax_dict[btax_name].ref_tree_newick = build_tree_by_dist(global_dist_matr[btax_orgs],
                                                                          tree_name=btax_name + "_tree").newick
        btax_btc_aln_dict = dict()
        for btc_profile_name, btc_aln in btc_aln_dict.items():
            btax_btc_aln = btc_aln[btax_orgs].improve_aln(inplace=False)
            btax_btc_aln.aln_name = btax_name + "_" + btc_profile_name
            btax_btc_aln_dict[btc_profile_name] = deepcopy(btax_btc_aln)
        btax_dict[btax_name].repr_profiles = generate_btax_profiles(btax_btc_aln_dict,
                                                                    db_dir=db_dir,
                                                                    btax_name=btax_name,
                                                                    method="hmmer")
        btax_dict[btax_name].ref_tree_full_names = \
            {full_to_short_seq_names[btax_org]: btax_org for btax_org in btax_orgs}
        btax_dict[btax_name] = btax_dict[btax_name].get_json()
    return btax_dict
def get_orf_stats(orf_id, orf_homologs_prot, orf_homologs_nucl, homologs_list, btax_data,
                  seq_ids_to_orgs, orfs_stats, work_dir, **kwargs):
    """Compute homology statistics for one ORF and store them in ``orfs_stats``.

    Collects the nucleotide/protein sequences of the ORF's tblastn homologs,
    aligns them, and derives representation, distance, stop-codon, uniformity,
    Ka/Ks and tree-comparison statistics. Missing/uncomputable stats keep the
    sentinel value -1.0.

    Bug fixed: the guard for the mean-based distance stats tested
    ``btax_info.median_d`` although it divides by ``btax_info.mean_d`` (a
    copy-paste of the median guard), so those stats were wrongly skipped
    whenever the median distance was zero; it now tests ``mean_d``.

    :param orf_id: identifier of the ORF being scored
    :param orf_homologs_prot: {seq_id: protein seq}, pre-seeded with the ORF (mutated)
    :param orf_homologs_nucl: {seq_id: nucleotide seq}, pre-seeded with the ORF (mutated)
    :param homologs_list: tblastn hit dicts with subj_id/subj_start/subj_end
    :param btax_data: dict form of BtaxInfo for the base taxon
    :param seq_ids_to_orgs: {seq_id: organism name} (mutated: the ORF is added)
    :param orfs_stats: shared dict receiving {orf_id: stats}
    :param work_dir: directory for temporary alignment/tree files
    :param kwargs: ``save_alignment`` and ``save_tree`` flags
    """
    orf_stats = {
        "uniformity_std": -1.0,
        "phylo_diff": -1.0,
        "Ka/Ks": -1.0,
        "representation": 0.0,
        "relative_mean_btax_dist": -1.0,
        "relative_median_btax_dist": -1.0,
        "relative_mean_ORF_dist": -1.0,
        "relative_median_ORF_dist": -1.0,
        "stops_per_seq_median": -1.0,
        "seqs_with_stops_fract": -1.0,
    }
    if len(homologs_list) < 3:
        orfs_stats[orf_id] = orf_stats
        eagle_logger.warning("A few homologs number for ORF '%s'" % orf_id)
        return
    btax_info = BtaxInfo.load_from_dict(btax_data)
    seq_ids_to_orgs[orf_id] = "Input_Organism_X"
    btax_fna = load_fasta_to_dict(fasta_path=btax_info.btax_fna)
    for hom in homologs_list:
        # subj_start > subj_end means the hit is on the minus strand.
        if hom["subj_start"] <= hom["subj_end"]:
            nucl_seq = Seq(btax_fna[hom["subj_id"]][hom["subj_start"] - 1:hom["subj_end"]])
        else:
            nucl_seq = Seq(btax_fna[hom["subj_id"]][hom["subj_end"] - 1:hom["subj_start"]]).reverse_complement()
        try:
            orf_homologs_prot[hom["subj_id"]] = str(nucl_seq.translate())
        except TranslationError:
            # Untranslatable hit - skip it entirely.
            continue
        orf_homologs_nucl[hom["subj_id"]] = str(nucl_seq)
    btax_fna = None  # release the genome dict early
    eagle_logger.info("got homologs sequences for ORF '%s'" % orf_id)
    orf_mult_aln = construct_mult_aln(seq_dict=orf_homologs_prot,
                                      aln_name=orf_id.replace("|:", "_") + "_aln",
                                      aln_type="prot",
                                      method="MUSCLE",
                                      tmp_dir=os.path.join(work_dir, orf_id.replace("|:", "_") + "_aln_tmp"),
                                      logger=eagle_logger)
    eagle_logger.info("got multiple alignment for ORF '%s' homologs" % orf_id)
    orf_mult_aln.remove_paralogs(seq_ids_to_orgs=seq_ids_to_orgs, method="min_dist", inplace=True)
    # Fraction of btax genomes with a retained homolog (the ORF itself excluded).
    orf_stats["representation"] = float(len(orf_mult_aln) - 1) / float(len(btax_info.genomes))
    if len(orf_mult_aln.seq_names) < 4:
        orfs_stats[orf_id] = orf_stats
        eagle_logger.warning("A few homologs number for ORF '%s'" % orf_id)
        return
    orf_mult_aln.improve_aln(inplace=True)
    dist_matrix = orf_mult_aln.get_distance_matrix().replace_negative(inplace=False)
    btax_dist_matrix = dist_matrix[list(filter(lambda seq_name: seq_name != orf_id, dist_matrix.seq_names))]
    # BUGFIX: guard with mean_d (the divisor below), not median_d.
    if btax_info.mean_d > 0.0:
        orf_stats["relative_mean_btax_dist"] = btax_dist_matrix.mean_dist / btax_info.mean_d
        orf_stats["relative_mean_ORF_dist"] = dist_matrix[orf_id].mean() / btax_info.mean_d
    if btax_info.median_d > 0.0:
        orf_stats["relative_median_btax_dist"] = btax_dist_matrix.median_dist / btax_info.median_d
        orf_stats["relative_median_ORF_dist"] = dist_matrix[orf_id].median() / btax_info.median_d
    stops_stats = orf_mult_aln.stop_codons_stats()
    orf_stats["stops_per_seq_median"] = stops_stats["stops_per_seq_median"]
    orf_stats["seqs_with_stops_fract"] = stops_stats["seqs_with_stops_fract"]
    # Uniformity
    orf_stats["uniformity_std"] = orf_mult_aln.estimate_uniformity(
        cons_thr=conf_constants.cons_thr,
        window_l=conf_constants.unif_window_l,
        windows_step=conf_constants.unif_windows_step)
    if np.isnan(orf_stats["uniformity_std"]):
        orf_stats["uniformity_std"] = -1.0
    eagle_logger.info("got uniformity_std for ORF '%s'" % orf_id)
    # Ka/Ks
    orf_kaks = orf_mult_aln.calculate_KaKs_windows(nucl_seqs_dict=orf_homologs_nucl)
    if not pd.isna(orf_kaks):
        orf_stats["Ka/Ks"] = orf_kaks
        eagle_logger.info("got Ka/Ks for ORF '%s'" % orf_id)
    # Phylo
    phylo_tmp_dir = os.path.join(work_dir, orf_id.replace("|:", "_") + "_phylo_tmp")
    try:
        # Compare topologies on homologs only - drop the ORF itself.
        del orf_mult_aln[orf_id]
    except KeyError:
        pass
    orf_homs_tree = build_tree_by_dist(dist_matrix=orf_mult_aln.get_distance_matrix(),
                                       method="FastME",
                                       full_seq_names=dict((short_id, seq_ids_to_orgs[full_id])
                                                           for short_id, full_id in
                                                           orf_mult_aln.short_to_full_seq_names.items()),
                                       tree_name=orf_id.replace("|:", "_") + "_tree",
                                       tmp_dir=phylo_tmp_dir,
                                       logger=eagle_logger)
    orf_homs_tree.set_full_names(inplace=True)
    btax_tree = PhyloTree.load_tree_from_str(tree_str=btax_info.ref_tree_newick,
                                             full_seq_names=btax_info.ref_tree_full_names,
                                             tree_name="btax_tree",
                                             tmp_dir=phylo_tmp_dir,
                                             logger=eagle_logger)
    btax_tree.set_full_names(inplace=True)
    phylo_diff = compare_trees(phylo_tree1=orf_homs_tree, phylo_tree2=btax_tree, method="Robinson-Foulds")
    if not pd.isna(phylo_diff):
        orf_stats["phylo_diff"] = phylo_diff
        eagle_logger.info("got phylo_diff for ORF '%s'" % orf_id)
    orfs_stats[orf_id] = orf_stats
    eagle_logger.info("got ORF '%s' stats" % orf_id)
    if kwargs.get("save_alignment", False):
        orf_mult_aln.dump_alignment(os.path.join(work_dir, ORF_ALNS_DIR, orf_mult_aln.aln_name + ".fasta"))
    if kwargs.get("save_tree", False):
        orf_homs_tree.dump_tree(tree_path=os.path.join(work_dir, ORF_TREES_DIR,
                                                       orf_homs_tree.tree_name + ".nwk"))
def load_alignment(cls, aln_fasta_path, aln_type=None, aln_name=None, config_path=None, logger=None, **kwargs):
    """Alternate constructor: build an alignment object from an aligned FASTA file.

    :param aln_fasta_path: path to the aligned FASTA file
    :param aln_type: alignment type ("nucl"/"prot"), autodetected when None
    :param aln_name: name to assign to the loaded alignment
    :param config_path: optional configuration file path
    :param logger: optional logger instance
    :param kwargs: forwarded to load_fasta_to_dict
    :return: a new instance of ``cls`` wrapping the loaded sequences
    """
    aligned_seqs = load_fasta_to_dict(fasta_path=aln_fasta_path, **kwargs)
    return cls(mult_aln_dict=aligned_seqs,
               aln_type=aln_type,
               aln_name=aln_name,
               config_path=config_path,
               logger=logger)