if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0: with open(filePath) as infile: for line in infile: sampleName = ntpath.basename(filePath).split(".txt")[0] outfile.write(line.strip() + "\t" + sampleName + "\t" + args.cohort_name + "\n") # Eval spHMMs rpackages.importr('base') #packageNames = ('tidyverse','ggsci','ggpubr') #utils = rpackages.importr('utils') #utils.chooseCRANmirror(ind=1) #packnames_to_install = [x for x in packageNames if not rpackages.isinstalled(x)] #if len(packnames_to_install) > 0: # utils.install_packages(StrVector(packnames_to_install)) rpackages.importr('tidyverse') rpackages.importr('ggsci') rpackages.importr('ggpubr') hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs') os.makedirs(hp_hmm_directory,0o777,True) with open('EvaluateSpHMMs.R', 'r') as f: rStr = f.read() myfunc = STAP(rStr, "EvaluateSpHMM") myfunc.EvaluateSpHMM(allHMMResult, allBLASTResult, gene_pos_file, args.prot_family_name, float(args.F1_Thresh), hmm_directory, hp_hmm_directory) timeTaken = time.time() - startTime mins = int(timeTaken / 60) secs = int(timeTaken) % 60 print("\nTotal time taken : " + str(mins) + " mins " + str(secs) + " seconds")
def mbgcbuild(prot_alignment, prot_family_name, cohort_name,
              nucl_seq_directory, prot_seq_directory, seq_fmt, pair_fmt,
              r1_file_suffix, r2_file_suffix, tp_genes_nucl,
              blast_db_directory_map_file, blastn_search_directory,
              hmm_search_directory, f1_thresh, output_directory, cpu):
    """Run the build pipeline and return the high-performance spHMM directory.

    Steps, in order: translate the true-positive (TP) genes, align them with
    the protein family alignment using MUSCLE, generate spHMM files and gene
    interval positions, pre-process and translate the reads, run the HMM
    search and the BLASTN search, merge their per-sample results into two
    combined tables, and finally evaluate the spHMMs with the R function
    ``EvaluateSpHMM`` from ``EvaluateSpHMMs.R``.

    Args:
        prot_alignment: FASTA file with the protein family sequences.
        prot_family_name: Name forwarded to the spHMM generation, the HMM
            search and the R evaluation.
        cohort_name: Appended as the last column of every merged BLAST row
            and forwarded to the HMM search.
        nucl_seq_directory: Directory of nucleotide reads; it is replaced by
            the directory returned from ``PreProcessReadsPar``.
        prot_seq_directory: Directory of translated reads. When it is
            ``None`` or not an existing directory the reads are translated
            with ``TranseqReadsDir``.
        seq_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        pair_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        r1_file_suffix: Read-1 file suffix; ``None`` is treated as ``""``.
        r2_file_suffix: Read-2 file suffix; ``None`` is treated as ``""``.
        tp_genes_nucl: Nucleotide FASTA of the TP genes (translated and used
            as the BLAST query input).
        blast_db_directory_map_file: Optional mapping file forwarded to
            ``RunMakeDBandBlastN``; ``None`` is treated as ``""``.
        blastn_search_directory: Directory of BLASTN results. Defaults to
            ``<output>/build/blastn_result``. BLAST is only run when this
            directory does not exist yet.
        hmm_search_directory: Directory of HMM search results. Defaults to
            ``<output>/build/hmm_result``.
        f1_thresh: Threshold converted with ``float`` and passed to the R
            evaluation.
        output_directory: Root output directory; everything is written to
            its ``build`` sub-directory.
        cpu: Number of threads; 4 is used when ``None``.

    Returns:
        Path of the ``HiPer_spHMMs`` directory inside the build directory.

    Raises:
        SystemExit: With exit status 1 when any pipeline step fails.
    """
    # Function-scope imports keep this block self-contained.
    import sys
    import traceback

    try:
        CPU_THREADS = 4
        startTime = time.time()
        if cpu is not None:
            CPU_THREADS = int(cpu)

        # setup paths
        build_op_dir = output_directory + os.sep + "build"
        hmm_directory = os.path.join(build_op_dir, 'spHMMs')
        tp_genes_prot = build_op_dir + os.sep + "TPGenes.faa"
        alnOutput = os.path.join(build_op_dir, "TP_Homolog_Alignment.afa")
        gene_pos_file = os.path.join(build_op_dir, 'Gene_Interval_Pos.txt')
        gene_pos_file_aa = os.path.join(build_op_dir, 'Gene_Interval_Pos_AA.txt')
        if hmm_search_directory is None:
            hmm_search_directory = os.path.join(build_op_dir, 'hmm_result')
        allHMMResult = os.path.join(build_op_dir, "CombinedHmmSearch.txt")
        if blastn_search_directory is None:
            blastn_search_directory = os.path.join(build_op_dir, 'blastn_result')
        allBLASTResult = os.path.join(build_op_dir, "CombinedBLASTSearch.txt")

        # Create OP dirs
        os.makedirs(hmm_directory, 0o777, True)

        # Translate protein sequence
        runTranSeq(tp_genes_nucl, "1", tp_genes_prot)

        # Join true positives in the sample with the BGC proteins
        tmpFile = os.path.join(build_op_dir, "TP_Homolog.faa")
        joinedSeqs = []
        tpGeneSeqs = list(SeqIO.parse(tp_genes_prot, "fasta"))
        # Removing _1 added by TranSeq
        for seq in tpGeneSeqs:
            seq.id = seq.id[:-2]
            seq.description = ""
            joinedSeqs.append(seq)
        # Rewrite the translated TP genes with the cleaned identifiers.
        SeqIO.write(joinedSeqs, tp_genes_prot, "fasta")
        protAlnSeqs = list(SeqIO.parse(prot_alignment, "fasta"))
        joinedSeqs.extend(protAlnSeqs)
        SeqIO.write(joinedSeqs, tmpFile, "fasta")

        # MUSCLE align TP genes with markers
        runMUSCLE(tmpFile, alnOutput)

        # Gen spHMMs and interval pos
        # Extract spHMM coordinates from MUSCLE alignment
        hmmDict = gensphmmfiles(prot_family_name, alnOutput, tp_genes_prot,
                                hmm_directory, gene_pos_file, gene_pos_file_aa)

        if r1_file_suffix is None:
            r1_file_suffix = ""
        if r2_file_suffix is None:
            r2_file_suffix = ""

        # Preprocess synthetic reads
        nucl_seq_directory = PreProcessReadsPar(nucl_seq_directory,
                                                seq_fmt, pair_fmt,
                                                r1_file_suffix.strip(),
                                                r2_file_suffix.strip(),
                                                build_op_dir,
                                                CPU_THREADS)

        # Check if BLAST DB directory mapping file is provided or not
        if blast_db_directory_map_file is None:
            blast_db_directory_map_file = ""

        # Translate nucleotide seq
        # os.path.isdir(None) raises TypeError, so test for None explicitly.
        if prot_seq_directory is None or not os.path.isdir(prot_seq_directory):
            prot_seq_directory = TranseqReadsDir(build_op_dir, nucl_seq_directory, CPU_THREADS)

        # HMMER Search (skipped when a combined result already exists)
        if not os.path.exists(allHMMResult):
            os.makedirs(hmm_search_directory, 0o777, True)
            for hmmFileObj in hmmDict.values():
                hmmInterval = str(hmmFileObj.intervalStart) + "_" + str(hmmFileObj.intervalEnd)
                RunHMMDirectoryParallel(prot_seq_directory, hmmFileObj.hmmFile,
                                        cohort_name, prot_family_name, "30_10",
                                        hmmInterval, hmm_search_directory,
                                        CPU_THREADS)
            # Concatenate every non-empty *txt result into one table.
            with open(allHMMResult, 'w') as outfile:
                for subdir, dirs, files in os.walk(hmm_search_directory):
                    for file in files:
                        filePath = os.path.join(subdir, file)
                        if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0:
                            with open(filePath) as infile:
                                for line in infile:
                                    outfile.write(line)

        # BLAST Alignment (skipped when a combined result already exists)
        if not os.path.exists(allBLASTResult):
            # An existing directory is used as-is; BLAST only runs when the
            # directory has to be created.
            if not os.path.isdir(blastn_search_directory):
                print("Constructing BLAST Search Dir:" + blastn_search_directory)
                os.makedirs(blastn_search_directory, 0o777, True)
                RunMakeDBandBlastN(nucl_seq_directory, blast_db_directory_map_file,
                                   tp_genes_nucl, "blastn",
                                   "-max_target_seqs 10000 -perc_identity 90.0 -outfmt \"6 sseqid slen sstart send qseqid qlen qstart qend pident evalue\" ",
                                   blastn_search_directory, CPU_THREADS)
            with open(allBLASTResult, 'w') as outfile:
                outfile.write("sseqid\tslen\tsstart\tsend\tqseqid\tqlen\tqstart\tqend\tpident\tevalue\tSample\tsampleType\n")
                for subdir, dirs, files in os.walk(blastn_search_directory):
                    for file in files:
                        filePath = os.path.join(subdir, file)
                        if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0:
                            # The sample name depends only on the file, so
                            # compute it once per file rather than per line.
                            sampleName = os.path.basename(filePath).split(".txt")[0]
                            with open(filePath) as infile:
                                for line in infile:
                                    outfile.write(line.strip() + "\t" + sampleName + "\t" + cohort_name + "\n")

        # Eval spHMMs
        rpackages.importr('base')
        utils = rpackages.importr('utils')
        packageNames = ('tidyverse', 'ggsci', 'ggpubr', 'dplyr', 'ggplot2')
        packnames_to_install = [x for x in packageNames if not rpackages.isinstalled(x)]
        if packnames_to_install:
            utils.install_packages(StrVector(packnames_to_install))
        rpackages.importr('tidyverse')
        rpackages.importr('ggsci')
        rpackages.importr('ggpubr')
        rpackages.importr('dplyr')
        rpackages.importr('ggplot2')

        hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs')
        os.makedirs(hp_hmm_directory, 0o777, True)
        # The R script is looked up next to the createhmm module.
        module_dir = os.path.dirname(os.path.abspath(createhmm.__file__))
        print("\nR-script path : " + module_dir)
        r_script = os.path.join(module_dir, 'EvaluateSpHMMs.R')
        with open(r_script, 'r') as f:
            rStr = f.read()
        myfunc = STAP(rStr, "EvaluateSpHMM")
        myfunc.EvaluateSpHMM(allHMMResult, allBLASTResult, gene_pos_file,
                             prot_family_name, float(f1_thresh),
                             hmm_directory, hp_hmm_directory)

        timeTaken = time.time() - startTime
        mins = int(timeTaken / 60)
        secs = int(timeTaken) % 60
        print("\nTotal time taken : " + str(mins) + " mins " + str(secs) + " seconds")
        return hp_hmm_directory
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C and SystemExit
        # propagate; show the cause and report failure to the shell.
        print("Metabgc-build has failed. Please check your inputs and contact support on : https://github.com/donia-lab/MetaBGC")
        traceback.print_exc()
        sys.exit(1)
def mbgcbuild(prot_alignment, prot_family_name, cohort_name,
              nucl_seq_directory, prot_seq_directory, seq_fmt, pair_fmt,
              r1_file_suffix, r2_file_suffix, tp_genes_nucl,
              blastn_search_directory, hmm_search_directory, f1_thresh,
              output_directory, cpu):
    """Run the build pipeline and return the high-performance spHMM directory.

    Steps, in order: copy the protein alignment and generate spHMM files from
    it, translate the true-positive (TP) genes, align them with the protein
    family sequences using MUSCLE, derive the gene interval positions,
    pre-process and translate the reads, run the HMM search and the BLASTN
    search, merge their per-sample results into two combined tables, and
    evaluate the spHMMs with the R function ``EvaluateSpHMM``.

    Args:
        prot_alignment: FASTA file with the protein family alignment; it is
            copied into the spHMM directory before use.
        prot_family_name: Name forwarded to the spHMM generation, the HMM
            search and the R evaluation.
        cohort_name: Appended as the last column of every merged BLAST row
            and forwarded to the HMM search.
        nucl_seq_directory: Directory of nucleotide reads; it is replaced by
            the directory returned from ``PreProcessReadsPar``.
        prot_seq_directory: Directory of translated reads. When it is
            ``None`` or not an existing directory the reads are translated
            with ``TranseqReadsDir``.
        seq_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        pair_fmt: Forwarded unchanged to ``PreProcessReadsPar``.
        r1_file_suffix: Read-1 file suffix; ``None`` is treated as ``""``.
        r2_file_suffix: Read-2 file suffix; ``None`` is treated as ``""``.
        tp_genes_nucl: Nucleotide FASTA of the TP genes.
        blastn_search_directory: Directory of BLASTN results. Defaults to
            ``<output>/build/blastn_result``. BLAST is only run when this
            directory does not exist yet.
        hmm_search_directory: Directory of HMM search results. Defaults to
            ``<output>/build/hmm_result``.
        f1_thresh: Threshold converted with ``float`` and passed to the R
            evaluation.
        output_directory: Root output directory; everything is written to
            its ``build`` sub-directory.
        cpu: Number of threads; 4 is used when ``None``.

    Returns:
        Path of the ``HiPer_spHMMs`` directory inside the build directory.
    """
    startTime = time.time()
    # Default first: without it, cpu=None left CPU_THREADS unassigned and the
    # first use raised UnboundLocalError.
    CPU_THREADS = 4
    if cpu is not None:
        CPU_THREADS = int(cpu)

    # setup paths
    build_op_dir = output_directory + os.sep + "build"
    hmm_directory = os.path.join(build_op_dir, 'spHMMs')
    prot_aln_file = os.path.join(hmm_directory, ntpath.basename(prot_alignment))
    tp_genes_prot = build_op_dir + os.sep + "TPGenes.faa"
    alnOutput = os.path.join(build_op_dir, "tmp.afa")
    gene_pos_file = os.path.join(build_op_dir, 'Gene_Interval_Pos.txt')
    if hmm_search_directory is None:
        hmm_search_directory = os.path.join(build_op_dir, 'hmm_result')
    allHMMResult = hmm_search_directory + os.sep + "CombinedHmmSearch.txt"
    if blastn_search_directory is None:
        blastn_search_directory = os.path.join(build_op_dir, 'blastn_result')
    allBLASTResult = blastn_search_directory + os.sep + "CombinedBLASTSearch.txt"

    # Gen spHMMs and interval pos
    os.makedirs(hmm_directory, 0o777, True)
    copyfile(prot_alignment, prot_aln_file)
    hmmDict = gensphmmfiles(prot_family_name, prot_aln_file, hmm_directory)
    runTranSeq(tp_genes_nucl, "1", tp_genes_prot)
    tmpFile = os.path.join(build_op_dir, "tmp.fa")

    # Join true positives in the sample with the BGC proteins
    joinedSeqs = []
    tpGeneSeqs = list(SeqIO.parse(tp_genes_prot, "fasta"))
    # Removing _1 added by TranSeq
    for seq in tpGeneSeqs:
        seq.id = seq.id[:-2]
        seq.description = ""
        joinedSeqs.append(seq)
    protAlnSeqs = list(SeqIO.parse(prot_aln_file, "fasta"))
    joinedSeqs.extend(protAlnSeqs)
    SeqIO.write(joinedSeqs, tmpFile, "fasta")

    # MUSCLE align TP genes with markers
    runMUSCLE(tmpFile, alnOutput)

    # Extract spHMM coordinates from MUSCLE alignment
    gengeneposlist(prot_family_name, protAlnSeqs, hmmDict, alnOutput, gene_pos_file)

    if r1_file_suffix is None:
        r1_file_suffix = ""
    if r2_file_suffix is None:
        r2_file_suffix = ""

    # Preprocess synthetic reads
    nucl_seq_directory = PreProcessReadsPar(nucl_seq_directory, seq_fmt,
                                            pair_fmt, r1_file_suffix.strip(),
                                            r2_file_suffix.strip(),
                                            build_op_dir, CPU_THREADS)

    # Translate nucleotide seq
    # os.path.isdir(None) raises TypeError, so test for None explicitly.
    if prot_seq_directory is None or not os.path.isdir(prot_seq_directory):
        prot_seq_directory = TranseqReadsDir(build_op_dir, nucl_seq_directory, CPU_THREADS)

    # HMMER Search
    os.makedirs(hmm_search_directory, 0o777, True)
    for hmmFileObj in hmmDict.values():
        hmmInterval = str(hmmFileObj.intervalStart) + "_" + str(hmmFileObj.intervalEnd)
        RunHMMDirectory(prot_seq_directory, hmmFileObj.hmmFile, cohort_name,
                        prot_family_name, "30_10", hmmInterval,
                        hmm_search_directory, CPU_THREADS)
    # The combined file lives inside the directory being walked and matches
    # the *txt filter, so it must be skipped or the merge could read its own
    # growing output.
    combined_hmm_path = os.path.abspath(allHMMResult)
    with open(allHMMResult, 'w') as outfile:
        for subdir, dirs, files in os.walk(hmm_search_directory):
            for file in files:
                filePath = os.path.join(subdir, file)
                if os.path.abspath(filePath) == combined_hmm_path:
                    continue
                if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0:
                    with open(filePath) as infile:
                        for line in infile:
                            outfile.write(line)

    # BLAST Alignment
    # An existing directory is used as-is; BLAST only runs when the directory
    # has to be created.
    if not os.path.isdir(blastn_search_directory):
        os.makedirs(blastn_search_directory, 0o777, True)
        RunBLASTNDirectoryPar(nucl_seq_directory, tp_genes_nucl,
                              blastn_search_directory, CPU_THREADS)
    # Same self-inclusion hazard as for the HMM results above.
    combined_blast_path = os.path.abspath(allBLASTResult)
    with open(allBLASTResult, 'w') as outfile:
        outfile.write(
            "sseqid\tslen\tsstart\tsend\tqseqid\tqlen\tqstart\tqend\tpident\tevalue\tSample\tsampleType\n"
        )
        for subdir, dirs, files in os.walk(blastn_search_directory):
            for file in files:
                filePath = os.path.join(subdir, file)
                if os.path.abspath(filePath) == combined_blast_path:
                    continue
                if re.match(r".*txt$", file) and os.path.getsize(filePath) > 0:
                    # The sample name depends only on the file, so compute it
                    # once per file rather than once per line.
                    sampleName = ntpath.basename(filePath).split(".txt")[0]
                    with open(filePath) as infile:
                        for line in infile:
                            outfile.write(line.strip() + "\t" + sampleName +
                                          "\t" + cohort_name + "\n")

    # Eval spHMMs
    rpackages.importr('base')
    utils = rpackages.importr('utils')
    packageNames = ('tidyverse', 'ggsci', 'ggpubr', 'dplyr', 'ggplot2')
    packnames_to_install = [
        x for x in packageNames if not rpackages.isinstalled(x)
    ]
    if packnames_to_install:
        utils.install_packages(StrVector(packnames_to_install))
    rpackages.importr('tidyverse')
    rpackages.importr('ggsci')
    rpackages.importr('ggpubr')
    rpackages.importr('dplyr')
    rpackages.importr('ggplot2')

    hp_hmm_directory = os.path.join(build_op_dir, 'HiPer_spHMMs')
    os.makedirs(hp_hmm_directory, 0o777, True)
    # The R script path is built relative to sys.path[0].
    r_script = os.path.join(sys.path[0], 'metabgc', 'src', 'EvaluateSpHMMs.R')
    with open(r_script, 'r') as f:
        rStr = f.read()
    myfunc = STAP(rStr, "EvaluateSpHMM")
    myfunc.EvaluateSpHMM(allHMMResult, allBLASTResult, gene_pos_file,
                         prot_family_name, float(f1_thresh), hmm_directory,
                         hp_hmm_directory)

    timeTaken = time.time() - startTime
    mins = int(timeTaken / 60)
    secs = int(timeTaken) % 60
    print("\nTotal time taken : " + str(mins) + " mins " + str(secs) + " seconds")
    return hp_hmm_directory