def remapping(self):
    """Index the reference, remap the reads and post-process the alignments.

    The method checks that ``samtools`` and ``bwa`` can be located, then
    calls ``self.index()``, ``self.remap(cpu)`` and ``self.process_bam(cpu)``.
    The run is considered successful when ``<prefix>_unique_depth.tab`` in
    the output directory is non-empty; only then ``self.clean()`` is called.

    Raises:
        EnvironmentError: if ``samtools`` or ``bwa`` cannot be found.
    """
    for tool in ("samtools", "bwa"):
        if file.which(tool) is None:
            raise EnvironmentError(
                "Could not find executable: {}".format(tool))

    # Project helper; presumably makes sure the output directory exists
    # -- TODO confirm against its definition.
    file.isdir(self._outdir)

    # The thread count may arrive as a string straight from the command
    # line, so coerce it before comparing.
    try:
        requested = int(self._cpu)
    except (TypeError, ValueError):
        requested = 0

    if requested > 1:
        # Same policy as before: hand one thread less to the tools.
        cpu = requested - 1
    else:
        if requested < 1:
            logging.info("-n CPU should be a integer >=1 (default = 1)")
        # Previously `cpu` stayed unbound for invalid values and the next
        # call crashed; fall back to a single thread instead.
        cpu = 1

    self.index()
    self.remap(cpu)
    self.process_bam(cpu)

    depth_table = os.path.join(self._outdir,
                               self._prefix + "_unique_depth.tab")
    try:
        has_depth = os.path.getsize(depth_table) > 0
    except OSError:
        # A missing table means the run failed; report it through the
        # regular failure branch instead of raising.
        has_depth = False

    if has_depth:
        self.clean()
        logging.debug("Remapping and parsing done")
    else:
        logging.info("Error: remapping and parsing failed")
def main():
    """Command-line entry point for iqKM.

    Builds the argument parser, validates the input paths and starts either
    ``Workflow_iqkm`` (with ``--quantify``) or ``Workflow_identify``.
    When called without any command-line argument the help text is printed
    and nothing is run.

    The process exits with status 1 when the input genome, or the reads
    required by ``--quantify``, cannot be found.
    """
    parser = argparse.ArgumentParser(
        usage="iqkm -i metagenome -o out_dir --help_dir help_dir --fq fastq1 --rq fastq2 --meta --quantify",
        description="Workflow for KM assignment and/or quantification, on both contig and sample basis",
        add_help=False,
    )
    required = parser.add_argument_group("required arguments")
    optional = parser.add_argument_group("optional arguments")

    required.add_argument(
        "-i",
        "--input",
        dest="genome",
        help="Input genome/metagenomes, required",
        required=True,
    )
    required.add_argument(
        "-o",
        "--out_dir",
        dest="outdir",
        help="Output folder",
        required=True,
    )
    required.add_argument(
        "--help_dir",
        dest="help_dir",
        help="Folder containing Kofam HMM database and help files, refer to README.md for downloading",
        required=True,
    )
    optional.add_argument(
        "--fq",
        dest="fastq1",
        help="Input first or only read file (fastq or fastq.gz), required when '--quantify' is specified",
        required=False,
    )
    optional.add_argument("-h", "--help", action="help")
    optional.add_argument(
        "--rq",
        dest="fastq2",
        help="Input reverse read (fastq or fastq.gz format), optional",
        default=None,
    )
    optional.add_argument(
        "--prefix",
        dest="prefix",
        help="Prefix of output files, default: your input genome/metagenome file name without postfix",
        default=None,
    )
    optional.add_argument(
        "--db",
        dest="hmmdb",
        help="Kofam HMM database for KO assignment, default path='/help_dir/db/kofam.hmm', you can change it to your customised db",
        default=None,
    )
    optional.add_argument(
        "--com",
        dest="com",
        # Without a type the user-supplied threshold stayed a string.
        type=float,
        help="KM completeness threshold on contig basis (only KM with completeness above the threshold will be considered present), default = 66.67",
        default=66.67,
    )
    optional.add_argument(
        "--skip",
        action="store_true",
        help="Force skip steps if relevant output files have been found under designated directories, not recommanded if your input file is newer (default = False)",
        default=False,
    )
    optional.add_argument(
        "-q",
        "--quantify",
        action="store_true",
        help="Run both KM assignment and quantification (default = False, add '-q' or '--quantify' to enable)",
        default=False,
    )
    optional.add_argument(
        "-m",
        "--meta",
        action="store_true",
        help="Running in metagenome mode (prodigal -p meta; default = False)",
        default=False,
    )
    optional.add_argument(
        "-w",
        "--include_weights",
        dest="include_weights",
        # Any value, including "False", used to be a truthy string.
        type=_parse_bool,
        help="Include weights of each KO when doing KM assignment (default = True)",
        default=True,
    )
    optional.add_argument(
        "-n",
        "--threads",
        dest="cpu",
        # Without a type the user-supplied thread count stayed a string.
        type=int,
        help="Number of threads used for computation (default = 1)",
        default=1,
    )
    optional.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="Force reruning the whole pipeline, don't resume previous run (default = False)",
        default=False,
    )
    # NOTE(review): store_true combined with default=True means this flag
    # can never evaluate to False; left unchanged to keep the CLI stable.
    optional.add_argument(
        "-d",
        "--dist",
        action="store_true",
        help="Apply KM minimum distance threshold (default = True)",
        default=True,
    )
    optional.add_argument(
        "-g",
        "--genome_equivalent",
        dest="GE",
        help="Genome equivalent output generated from microbe-census, can be used for library-size normalization when doing quantification. Optional (default: None)",
        default=None,
    )

    if len(sys.argv) == 1:
        parser.print_help()
        return

    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S: ",
        level=logging.INFO,
    )
    args = parser.parse_args()
    logging.info("iqKM version {}".format(iqkm.version.__version__))

    if not file.exists(args.genome):
        logging.error(
            "Please provide the right path of input genome/metagenome file (fasta format)"
        )
        # Running the workflow without its input cannot succeed.
        sys.exit(1)

    # NOTE(review): the return contract of the project helper file.isdir is
    # not visible from here, so this check only reports, as before.
    if not file.isdir(args.help_dir):
        logging.error(
            "Please provide the right path for help_files, refer to README.md for download help_dir"
        )

    if args.quantify:
        logging.info("Running iqKM for both KM assignment and quantification")
        # --fq is optional for the parser, so it can be None here.
        if args.fastq1 is None or not file.exists(args.fastq1):
            logging.error(
                "Please provide the right path of raw reads file (fastq format) for KM quantification"
            )
            sys.exit(1)
        Workflow_iqkm(args.genome, args.fastq1, args.fastq2, args.hmmdb,
                      args.prefix, args.outdir, args.help_dir, args.GE,
                      args.meta, "hmmsearch", args.force, args.dist,
                      args.com, args.include_weights, args.cpu, "prodigal",
                      args.skip)
    else:
        logging.info("Running iqKM for KM assignment")
        Workflow_identify(args.genome, args.hmmdb, args.prefix, args.outdir,
                          args.help_dir, args.meta, "hmmsearch", args.force,
                          args.dist, args.com, args.include_weights,
                          args.cpu, "prodigal", args.skip)


def _parse_bool(value):
    """Turn a command-line token such as ``true``/``no``/``1`` into a bool."""
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ("true", "t", "yes", "y", "1"):
        return True
    if token in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value))
def __init__(self, fna, fsq1, fsq2, db, prefix, outdir, help_dir, GE,
             meta=False, ko_anno_tool="hmmsearch", force=False, dist=True,
             com=66.67, include_weights=True, cpu=1,
             gene_prediction_tool="prodigal", skip=False):
    """Run the whole KM assignment and quantification pipeline.

    Construction executes, in order: prodigal, read remapping, hmmsearch,
    KO parsing, KM assignment (sample level and per contig), the KM
    minimum-distance step and the abundance calculation. Each step except
    the last is rerun, skipped or resumed according to ``force``/``skip``
    and file freshness (see ``_needs_run``).

    Args:
        fna: input genome/metagenome passed to ``Prodigal``.
        fsq1: first (or only) reads file passed to ``Remapping``.
        fsq2: second reads file passed to ``Remapping`` (may be None).
        db: HMM database path; ``<help_dir>/db/kofam.hmm`` when None.
        prefix: output file prefix; derived from ``fna`` when None.
        outdir: root directory for all outputs.
        help_dir: directory holding ``db/`` and ``help_files/``.
        GE: value forwarded to ``KM_abd`` as its first argument.
        meta: forwarded to ``Prodigal``.
        ko_anno_tool: forwarded to ``ParseKo``, ``KM_dist`` and ``KM_abd``.
        force: rerun every step regardless of existing output.
        dist: forwarded to ``KM_dist`` and ``KM_abd``.
        com: forwarded to ``KM_dist`` and ``KM_abd``.
        include_weights: forwarded to ``sort_out_pathways``.
        cpu: forwarded to ``Remapping``, ``Hmmsearch`` and ``KM_dist``.
        gene_prediction_tool: forwarded to ``ParseKo``, ``KM_dist`` and
            ``KM_abd``.
        skip: skip a step whenever its output files already exist.
    """
    self._fna = fna
    self._fq1 = fsq1
    self._fq2 = fsq2
    self._hmmdb = db
    self._prefix = prefix
    self._GE = GE
    self._meta = meta
    self._cpu = cpu
    self._force = force
    self._dist = dist
    self._com = com
    self._include_weights = include_weights
    self._gene_predict_tool = gene_prediction_tool
    self._ko_anno_tool = ko_anno_tool
    self._outdir = outdir
    self._help_dir = help_dir
    self._skip = skip
    if self._prefix is None:
        # splitext keeps extension-less names intact; splitting on "." and
        # dropping the last piece produced an empty prefix for them.
        self._prefix = os.path.splitext(os.path.basename(self._fna))[0]

    rerun_hint = ", add '--force' if you want to rerun the computation"
    kegg = iqkm.give_pathways_weight

    # run prodigal
    # NOTE: file.isdir is a project helper called here for its side effect;
    # presumably it creates the directory when missing -- TODO confirm.
    logging.info("Running prodigal")
    file.isdir(os.path.join(self._outdir, "prodigal"))
    out_pep = os.path.join(self._outdir, "prodigal", self._prefix + ".pep")
    out_cds = os.path.join(self._outdir, "prodigal", self._prefix + ".cds")
    out_gff = os.path.join(self._outdir, "prodigal", self._prefix + ".gff")
    if self._needs_run(
            outputs=(out_cds, out_pep),
            is_stale=lambda: file.isnewer(self._fna, out_pep),
            skip_msg="Force skipping prodigal as user used '--skip'",
            missing_msg="Failed to skip prodigal as prodigal output are missing",
            step_msg="Running prodigal",
            fresh_msg="Skip prodigal because {} is newer than {}".format(
                out_pep, self._fna) + rerun_hint):
        cls_prod = Prodigal(self._fna, self._outdir, self._meta)
        cls_prod.run_prodigal(out_pep, out_cds, out_gff)

    # run bwa to remap reads to *.cds to quantify genes/KOs
    logging.info("Run remapping to quantify genes/KOs")
    # The closing parenthesis used to sit after self._outdir, so the helper
    # received the root directory plus a stray second argument.
    remap_dir = os.path.join(self._outdir, "out_remap")
    file.isdir(remap_dir)
    remap_out = os.path.join(remap_dir, self._prefix + "_unique.tab")
    if self._needs_run(
            outputs=(remap_out,),
            is_stale=lambda: (file.isnewer(out_cds, remap_out)
                              or file.isnewer(self._fq1, remap_out)),
            skip_msg="Force skipping bwa mapping as user used '--skip'",
            missing_msg="Failed to skip bwa mapping as mapping output is missing",
            step_msg="Run remapping to quantify genes/KOs",
            fresh_msg="Skip remapping because {} and {} is newer than {}".format(
                out_cds, self._fq1, remap_out) + rerun_hint):
        remap_cls = Remapping(out_cds, self._fq1, self._fq2, remap_dir,
                              self._prefix, self._cpu)
        remap_cls.remapping()

    # run hmmsearch
    logging.info("Running hmmsearch")
    file.isdir(os.path.join(self._outdir, "hmmsearch"))
    hmm_out = os.path.join(self._outdir, "hmmsearch",
                           self._prefix + "_hmmsearch.tbl")
    hmm_log = os.path.join(self._outdir, "hmmsearch",
                           self._prefix + "_hmmsearch.log")
    if self._hmmdb is None:
        self._hmmdb = os.path.join(self._help_dir, "db/kofam.hmm")
    if self._needs_run(
            outputs=(hmm_out,),
            is_stale=lambda: file.isnewer(out_pep, hmm_out),
            skip_msg="Force skipping hmmsearch as user used '--skip'",
            missing_msg="Failed to skip hmmsearch as hmmsearch output is missing",
            step_msg="Running hmmsearch",
            fresh_msg="Skip hmmsearch because {} is newer than {}".format(
                hmm_out, out_pep) + rerun_hint):
        hmm_cls = Hmmsearch(out_pep, self._cpu, self._outdir, self._hmmdb)
        hmm_cls.hmmsearch(hmm_out, hmm_log)

    # parse KO, the result is under <outdir>/KO_parsing
    logging.info("Parsing KO")
    file.isdir(os.path.join(self._outdir, "KO_parsing"))
    ko_output = os.path.join(self._outdir, "KO_parsing",
                             self._prefix + ".ko")
    write_ko = self._needs_run(
        outputs=(ko_output,),
        is_stale=lambda: file.isnewer(hmm_out, ko_output),
        skip_msg="Force skipping parsing KO as user used '--skip'",
        missing_msg="Failed to skip KO parsing as KO parsing output is missing",
        step_msg="Parsing KO",
        fresh_msg="Skip parsing KO because {} is newer than {}".format(
            ko_output, hmm_out) + rerun_hint)
    # The lookup tables are needed by later steps even when the .ko file
    # itself is not rewritten.
    parse_cls = ParseKo(self._ko_anno_tool, self._gene_predict_tool,
                        out_pep, hmm_out, self._outdir)
    if write_ko:
        parse_cls.write_out(ko_output)
    d_nuc_ko = parse_cls.parse_kohmm()
    d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]

    # Assigning KM
    logging.info("Assigning KM")
    file.isdir(os.path.join(self._outdir, "KM_assignment_unfiltered"))
    help_graphs = os.path.join(self._help_dir, "help_files/graphs.pkl")
    help_classes = os.path.join(self._help_dir,
                                "help_files/all_pathways_class.txt")
    help_names = os.path.join(self._help_dir,
                              "help_files/all_pathways_names.txt")
    graphs, pathway_names, pathway_classes = kegg.download_pathways(
        help_graphs, help_names, help_classes)
    kegg_output = os.path.join(self._outdir, "KM_assignment_unfiltered",
                               self._prefix + ".summary.kegg")

    # COMMON INFO
    using_graphs = copy.deepcopy(graphs)
    # Computed up front: the per-contig step below needs the weights even
    # when the sample-level step is skipped (it was unbound in that case).
    weights_of_KOs = kegg.get_weights_for_KOs(using_graphs)
    kegg_output_pathway = kegg_output + "_pathways.tsv"
    if self._needs_run(
            outputs=(kegg_output_pathway,),
            is_stale=lambda: file.isnewer(ko_output, kegg_output_pathway),
            skip_msg="Force skipping KM assignment as user used '--skip'",
            missing_msg="Failed to skip KM assignment as KM assignment output is missing",
            step_msg="Assigning KM",
            fresh_msg="Skip KM assignment because {} is newer than {}".format(
                kegg_output_pathway, ko_output) + rerun_hint):
        edges, dict_KO_by_contigs = kegg.get_list_items(ko_output)
        with open(kegg_output_pathway, "wt") as file_out_summary:
            kegg.set_headers(file_out_summary, False)
            kegg.sort_out_pathways(
                using_graphs, edges, pathway_names, pathway_classes, "",
                file_out_summary, weights_of_KOs, self._include_weights)

    # BY CONTIGS
    kegg_output_contig = kegg_output + "_contigs.tsv"
    if self._needs_run(
            outputs=(kegg_output_contig,),
            is_stale=lambda: file.isnewer(ko_output, kegg_output_contig),
            skip_msg="Force skipping KM assignment as user used '--skip'",
            missing_msg="Failed to skip KM assignment as KM assignment output is missing",
            step_msg="Assigning KM",
            fresh_msg="Skip KM assignment because {} is newer than {}".format(
                kegg_output_contig, ko_output) + rerun_hint):
        graphs, pathway_names, pathway_classes = kegg.download_pathways(
            help_graphs, help_names, help_classes)
        edges, dict_KO_by_contigs = kegg.get_list_items(ko_output)
        with open(kegg_output_contig, "wt") as file_out_summary:
            kegg.set_headers(file_out_summary, True)
            for contig in dict_KO_by_contigs:
                # Every contig starts from a fresh copy of the graphs.
                using_graphs = copy.deepcopy(graphs)
                edges = dict_KO_by_contigs[contig]
                kegg.sort_out_pathways(
                    using_graphs, edges, pathway_names, pathway_classes,
                    contig, file_out_summary, weights_of_KOs,
                    self._include_weights)

    # calculate the minimum dist, and apply dist and com threshold (or not)
    # on contig basis, apply com threshold on sample basis
    logging.info("Calculating minimum distance within each KM")
    file.isdir(os.path.join(self._outdir, "KM_assignment_filtered"))
    out_dist = os.path.join(self._outdir, "KM_assignment_filtered",
                            self._prefix + "_km_on_contig.tsv")
    out_count = os.path.join(self._outdir, "KM_assignment_filtered",
                             self._prefix + "_km_sample_count.tsv")
    if self._needs_run(
            outputs=(out_count, out_dist),
            is_stale=lambda: file.isnewer(kegg_output_contig, out_count),
            skip_msg="Force skipping KM minimum distance calculation as user used '--skip'",
            missing_msg="Failed to skip KM minimum distance calculation as output is missing",
            step_msg="Calculating minimum distance within each KM",
            fresh_msg="Skip KM minimum distance calculation because {} is newer than {}".format(
                out_count, kegg_output_contig) + rerun_hint):
        km = KM_dist(kegg_output_contig, self._com, self._ko_anno_tool,
                     self._gene_predict_tool, hmm_out, out_pep, self._cpu,
                     self._dist, self._outdir, self._help_dir)
        km.km_dist(d_ko_position, out_dist, out_count)

    # calculate normalized (with GE) or non-normalized KM abundance, both
    # on contig and sample basis; this step always runs
    logging.info("Calculating KM abundance")
    file.isdir(os.path.join(self._outdir, "out_abundance"))
    file.isdir(os.path.join(self._outdir, "out_abundance", "ko_abd"))
    file.isdir(os.path.join(self._outdir, "out_abundance", "km_abd_sample"))
    file.isdir(os.path.join(self._outdir, "out_abundance", "km_abd_contig"))
    output_ko = os.path.join(self._outdir, "out_abundance", "ko_abd",
                             self._prefix + "_ko_abd.tsv")
    output_km_contig = os.path.join(self._outdir, "out_abundance",
                                    "km_abd_contig",
                                    self._prefix + "_km_contig_abd.tsv")
    out_km_sample = os.path.join(self._outdir, "out_abundance",
                                 "km_abd_sample",
                                 self._prefix + "_km_sample_abd.tsv")
    abd_cls = KM_abd(self._GE, remap_out, kegg_output_contig, self._com,
                     self._ko_anno_tool, self._gene_predict_tool, hmm_out,
                     out_pep, self._dist, self._outdir, self._help_dir)
    abd_cls.km_abd(d_nuc_ko, d_ko_position, d_position_gene, output_ko,
                   output_km_contig, out_km_sample)


def _needs_run(self, outputs, is_stale, skip_msg, missing_msg, step_msg,
               fresh_msg):
    """Decide whether one pipeline step has to be (re)computed.

    Args:
        outputs: files that must all exist for ``--skip`` to apply.
        is_stale: zero-argument callable, evaluated only in resume mode,
            returning a truthy value when the step must be rerun.
        skip_msg: logged when the step is skipped because of ``--skip``.
        missing_msg: logged when ``--skip`` cannot be honoured.
        step_msg: step banner logged again after ``missing_msg``.
        fresh_msg: logged when resume mode finds the step up to date.

    Returns:
        True when the step must run, False otherwise.
    """
    if self._force:
        return True
    if self._skip:
        if all(file.exists(path) for path in outputs):
            logging.info(skip_msg)
            return False
        logging.info(missing_msg)
        logging.info(step_msg)
        return True
    if is_stale():
        return True
    logging.info(fresh_msg)
    return False
def __init__(self, ffn, faa, db, fp, gene_predict_tool, prefix,
             ko_anno_tool="hmmsearch", force=False, dist=True, com=66.67,
             include_weights=True, cpu=1, outdir="./out"):
    """Run the KM assignment pipeline on predicted genes.

    Construction executes, in order: hmmsearch, KO parsing, KM assignment
    (sample level and per contig) and the KM minimum-distance step. Every
    step but the last is rerun when ``force`` is set or when
    ``file.isnewer(input, output)`` reports that it is needed.

    Args:
        ffn: stored on the instance; not used by this workflow.
        faa: protein file passed to ``Hmmsearch``.
        db: HMM database path; ``<package>/../db/kofam.hmm`` when None.
        fp: gene prediction file passed to ``ParseKo`` and ``KM_dist``;
            falls back to ``faa`` when None and the tool is "prodigal".
        gene_predict_tool: forwarded to ``ParseKo`` and ``KM_dist``.
        prefix: output file prefix; derived from ``faa`` when None.
        ko_anno_tool: forwarded to ``ParseKo`` and ``KM_dist``.
        force: rerun every step regardless of existing output.
        dist: forwarded to ``KM_dist``.
        com: forwarded to ``KM_dist``.
        include_weights: forwarded to ``sort_out_pathways``.
        cpu: forwarded to ``Hmmsearch`` and ``KM_dist``.
        outdir: root directory for all outputs.
    """
    self._ffn = ffn
    self._faa = faa
    self._hmmdb = db
    self._fp = fp
    self._prefix = prefix
    self._cpu = cpu
    self._force = force
    self._dist = dist
    self._com = com
    self._include_weights = include_weights
    self._gene_predict_tool = gene_predict_tool
    self._ko_anno_tool = ko_anno_tool
    self._outdir = outdir
    if self._prefix is None:
        # splitext keeps extension-less names intact; splitting on "." and
        # dropping the last piece produced an empty prefix for them.
        self._prefix = os.path.splitext(os.path.basename(self._faa))[0]
    if self._fp is None:
        if self._gene_predict_tool == "prodigal":
            self._fp = self._faa
        else:
            logging.error(
                "Please provide gene prediction file (prokka output *.gff)")

    pkg_dir = os.path.dirname(os.path.abspath(__file__))
    rerun_hint = ", add '--force' if you want to rerun the computation"
    kegg = iqkm.give_pathways_weight

    # run hmmsearch
    # NOTE: file.isdir is a project helper called here for its side effect;
    # presumably it creates the directory when missing -- TODO confirm.
    logging.info("Running hmmsearch")
    file.isdir(os.path.join(self._outdir, "hmmsearch"))
    hmm_out = os.path.join(self._outdir, "hmmsearch",
                           self._prefix + "_hmmsearch.tbl")
    hmm_log = os.path.join(self._outdir, "hmmsearch",
                           self._prefix + "_hmmsearch.log")
    if self._hmmdb is None:
        self._hmmdb = os.path.join(pkg_dir, "../db/kofam.hmm")
    if self._force or file.isnewer(self._faa, hmm_out):
        hmm_cls = Hmmsearch(self._faa, self._cpu, self._outdir,
                            self._hmmdb)
        hmm_cls.hmmsearch(hmm_out, hmm_log)
    else:
        logging.info(
            "Skip hmmsearch because {} is newer than {}".format(
                hmm_out, self._faa) + rerun_hint)

    # parse KO, the result is under <outdir>/ko_parsing
    logging.info("Parsing KO")
    file.isdir(os.path.join(self._outdir, "ko_parsing"))
    ko_output = os.path.join(self._outdir, "ko_parsing",
                             self._prefix + ".ko")
    write_ko = self._force or file.isnewer(hmm_out, ko_output)
    if not write_ko:
        logging.info(
            "Skip parsing KO because {} is newer than {}".format(
                ko_output, hmm_out) + rerun_hint)
    parse_cls = ParseKo(self._ko_anno_tool, self._gene_predict_tool,
                        self._fp, hmm_out, self._outdir)
    if write_ko:
        parse_cls.write_out(ko_output)
    # The result is not used here; the call is kept in case parseKo relies
    # on state it sets up -- TODO confirm and drop if not.
    parse_cls.parse_kohmm()
    d_ko_position = (parse_cls.parseKo())[1]

    # Assigning KM
    logging.info("Assigning KM")
    file.isdir(os.path.join(self._outdir, "KM_assignment_unfiltered"))
    help_graphs = os.path.join(pkg_dir, "../help_files/graphs.pkl")
    help_classes = os.path.join(pkg_dir,
                                "../help_files/all_pathways_class.txt")
    help_names = os.path.join(pkg_dir,
                              "../help_files/all_pathways_names.txt")
    graphs, pathway_names, pathway_classes = kegg.download_pathways(
        help_graphs, help_names, help_classes)
    kegg_output = os.path.join(self._outdir, "KM_assignment_unfiltered",
                               self._prefix + ".summary.kegg")

    # COMMON INFO
    using_graphs = copy.deepcopy(graphs)
    # Computed up front: the per-contig step below needs the weights even
    # when the sample-level step is skipped (it was unbound in that case).
    weights_of_KOs = kegg.get_weights_for_KOs(using_graphs)
    kegg_output_pathway = kegg_output + "_pathways.tsv"
    if self._force or file.isnewer(ko_output, kegg_output_pathway):
        edges, dict_KO_by_contigs = kegg.get_list_items(ko_output)
        with open(kegg_output_pathway, "wt") as file_out_summary:
            kegg.set_headers(file_out_summary, False)
            kegg.sort_out_pathways(
                using_graphs, edges, pathway_names, pathway_classes, "",
                file_out_summary, weights_of_KOs, self._include_weights)
    else:
        logging.info(
            "Skip KM assignment because {} is newer than {}".format(
                kegg_output_pathway, ko_output) + rerun_hint)

    # BY CONTIGS
    kegg_output_contig = kegg_output + "_contigs.tsv"
    if self._force or file.isnewer(ko_output, kegg_output_contig):
        graphs, pathway_names, pathway_classes = kegg.download_pathways(
            help_graphs, help_names, help_classes)
        edges, dict_KO_by_contigs = kegg.get_list_items(ko_output)
        with open(kegg_output_contig, "wt") as file_out_summary:
            kegg.set_headers(file_out_summary, True)
            for contig in dict_KO_by_contigs:
                # Every contig starts from a fresh copy of the graphs.
                using_graphs = copy.deepcopy(graphs)
                edges = dict_KO_by_contigs[contig]
                kegg.sort_out_pathways(
                    using_graphs, edges, pathway_names, pathway_classes,
                    contig, file_out_summary, weights_of_KOs,
                    self._include_weights)
    else:
        logging.info(
            "Skip KM assignment because {} is newer than {}".format(
                kegg_output_contig, ko_output) + rerun_hint)

    # calculate the minimum dist, and apply dist threshold or not
    logging.info("Calculating minimum distance within each KM")
    km = KM_dist(kegg_output_contig, self._com, self._ko_anno_tool,
                 self._gene_predict_tool, hmm_out, self._fp, self._cpu,
                 self._dist, self._outdir)
    # This tail used to read from an `args` object that is never defined in
    # this scope; it now uses the instance attributes and reuses the KO
    # positions parsed above instead of parsing a second time.
    file.isdir(os.path.join(self._outdir, "KM_assignment_filtered"))
    out_dist = os.path.join(self._outdir, "KM_assignment_filtered",
                            self._prefix + "_dist.tsv")
    km.km_dist(d_ko_position, out_dist)
def __init__(
    self,
    ffn,
    fsq1,
    fsq2,
    faa,
    db,
    fp,
    gene_predict_tool,
    prefix,
    GE,
    ko_anno_tool="hmmsearch",
    force=False,
    dist=True,
    com=66.67,
    include_weights=True,
    cpu=1,
    outdir="./out",
):
    """Run KM assignment and quantification on predicted genes.

    Construction executes, in order: read remapping, hmmsearch, KO parsing,
    KM assignment (sample level and per contig) and the abundance
    calculation. Every step but the last is rerun when ``force`` is set or
    when ``file.isnewer(input, output)`` reports that it is needed.

    Args:
        ffn: gene nucleotide sequences passed to ``Remapping``.
        fsq1: first (or only) reads file passed to ``Remapping``.
        fsq2: second reads file passed to ``Remapping`` (may be None).
        faa: protein file passed to ``Hmmsearch``.
        db: HMM database path; ``<package>/../db/kofam.hmm`` when None.
        fp: gene prediction file passed to ``ParseKo`` and ``KM_abd``;
            falls back to ``faa`` when None and the tool is "prodigal".
        gene_predict_tool: forwarded to ``ParseKo`` and ``KM_abd``.
        prefix: output file prefix; derived from ``faa`` when None.
        GE: value forwarded to ``KM_abd`` as its first argument.
        ko_anno_tool: forwarded to ``ParseKo`` and ``KM_abd``.
        force: rerun every step regardless of existing output.
        dist: forwarded to ``KM_abd``.
        com: forwarded to ``KM_abd``.
        include_weights: forwarded to ``sort_out_pathways``.
        cpu: forwarded to ``Remapping`` and ``Hmmsearch``.
        outdir: root directory for all outputs.
    """
    self._ffn = ffn
    self._fq1 = fsq1
    self._fq2 = fsq2
    self._faa = faa
    self._hmmdb = db
    self._fp = fp
    self._prefix = prefix
    self._GE = GE
    self._cpu = cpu
    self._force = force
    self._dist = dist
    self._com = com
    self._include_weights = include_weights
    self._gene_predict_tool = gene_predict_tool
    self._ko_anno_tool = ko_anno_tool
    self._outdir = outdir
    if self._prefix is None:
        # splitext keeps extension-less names intact; splitting on "." and
        # dropping the last piece produced an empty prefix for them.
        self._prefix = os.path.splitext(os.path.basename(self._faa))[0]
    if self._fp is None:
        if self._gene_predict_tool == "prodigal":
            self._fp = self._faa
        else:
            logging.error(
                "Please provide gene prediction file (prokka output *.gff)")

    pkg_dir = os.path.dirname(os.path.abspath(__file__))
    rerun_hint = ", add '--force' if you want to rerun the computation"
    kegg = iqkm.give_pathways_weight

    # run bwa to remap reads to *.ffn to quantify genes/KOs
    # NOTE: file.isdir is a project helper called here for its side effect;
    # presumably it creates the directory when missing -- TODO confirm.
    logging.info("Run remapping to quantify genes/KOs")
    # The closing parenthesis used to sit after self._outdir, so the helper
    # received the root directory plus a stray second argument.
    remap_dir = os.path.join(self._outdir, "out_remap")
    file.isdir(remap_dir)
    remap_out = os.path.join(remap_dir, self._prefix + "_unique.tab")
    if (self._force or file.isnewer(self._ffn, remap_out)
            or file.isnewer(self._fq1, remap_out)):
        remap_cls = Remapping(self._ffn, self._fq1, self._fq2, remap_dir,
                              self._prefix, self._cpu)
        remap_cls.remapping()
    else:
        logging.info(
            "Skip remapping because {} and {} is newer than {}".format(
                self._ffn, self._fq1, remap_out) + rerun_hint)

    # run hmmsearch
    logging.info("Running hmmsearch")
    file.isdir(os.path.join(self._outdir, "hmmsearch"))
    hmm_out = os.path.join(self._outdir, "hmmsearch",
                           self._prefix + "_hmmsearch.tbl")
    hmm_log = os.path.join(self._outdir, "hmmsearch",
                           self._prefix + "_hmmsearch.log")
    if self._hmmdb is None:
        self._hmmdb = os.path.join(pkg_dir, "../db/kofam.hmm")
    if self._force or file.isnewer(self._faa, hmm_out):
        hmm_cls = Hmmsearch(self._faa, self._cpu, self._outdir,
                            self._hmmdb)
        hmm_cls.hmmsearch(hmm_out, hmm_log)
    else:
        logging.info(
            "Skip hmmsearch because {} is newer than {}".format(
                hmm_out, self._faa) + rerun_hint)

    # parse KO, the result is under <outdir>/ko_parsing
    logging.info("Parsing KO")
    file.isdir(os.path.join(self._outdir, "ko_parsing"))
    ko_output = os.path.join(self._outdir, "ko_parsing",
                             self._prefix + ".ko")
    write_ko = self._force or file.isnewer(hmm_out, ko_output)
    if not write_ko:
        logging.info(
            "Skip parsing KO because {} is newer than {}".format(
                ko_output, hmm_out) + rerun_hint)
    # The lookup tables are needed by the abundance step even when the .ko
    # file itself is not rewritten.
    parse_cls = ParseKo(self._ko_anno_tool, self._gene_predict_tool,
                        self._fp, hmm_out, self._outdir)
    if write_ko:
        parse_cls.write_out(ko_output)
    d_nuc_ko = parse_cls.parse_kohmm()
    d_ko_position, d_position_gene = (parse_cls.parseKo())[1:]

    # Assigning KM
    logging.info("Assigning KM")
    file.isdir(os.path.join(self._outdir, "KM_assignment_unfiltered"))
    help_graphs = os.path.join(pkg_dir, "../help_files/graphs.pkl")
    help_classes = os.path.join(pkg_dir,
                                "../help_files/all_pathways_class.txt")
    help_names = os.path.join(pkg_dir,
                              "../help_files/all_pathways_names.txt")
    graphs, pathway_names, pathway_classes = kegg.download_pathways(
        help_graphs, help_names, help_classes)
    kegg_output = os.path.join(self._outdir, "KM_assignment_unfiltered",
                               self._prefix + ".summary.kegg")

    # COMMON INFO
    using_graphs = copy.deepcopy(graphs)
    # Computed up front: the per-contig step below needs the weights even
    # when the sample-level step is skipped (it was unbound in that case).
    weights_of_KOs = kegg.get_weights_for_KOs(using_graphs)
    kegg_output_pathway = kegg_output + "_pathways.tsv"
    if self._force or file.isnewer(ko_output, kegg_output_pathway):
        edges, dict_KO_by_contigs = kegg.get_list_items(ko_output)
        with open(kegg_output_pathway, "wt") as file_out_summary:
            kegg.set_headers(file_out_summary, False)
            kegg.sort_out_pathways(
                using_graphs, edges, pathway_names, pathway_classes, "",
                file_out_summary, weights_of_KOs, self._include_weights)
    else:
        logging.info(
            "Skip KM assignment because {} is newer than {}".format(
                kegg_output_pathway, ko_output) + rerun_hint)

    # BY CONTIGS
    kegg_output_contig = kegg_output + "_contigs.tsv"
    if self._force or file.isnewer(ko_output, kegg_output_contig):
        graphs, pathway_names, pathway_classes = kegg.download_pathways(
            help_graphs, help_names, help_classes)
        edges, dict_KO_by_contigs = kegg.get_list_items(ko_output)
        with open(kegg_output_contig, "wt") as file_out_summary:
            kegg.set_headers(file_out_summary, True)
            for contig in dict_KO_by_contigs:
                # Every contig starts from a fresh copy of the graphs.
                using_graphs = copy.deepcopy(graphs)
                edges = dict_KO_by_contigs[contig]
                kegg.sort_out_pathways(
                    using_graphs, edges, pathway_names, pathway_classes,
                    contig, file_out_summary, weights_of_KOs,
                    self._include_weights)
    else:
        logging.info(
            "Skip KM assignment because {} is newer than {}".format(
                kegg_output_contig, ko_output) + rerun_hint)

    # calculate the minimum dist within each KM, normalized KM abundance
    # (with GE) or non-normalized KM abundance, both on contig and sample
    # basis; this step always runs
    logging.info("Calculating minimum distance within KM and KM abundance")
    file.isdir(os.path.join(self._outdir, "out_abundance"))
    file.isdir(os.path.join(self._outdir, "out_abundance", "ko_abd"))
    file.isdir(os.path.join(self._outdir, "out_abundance", "km_abd_sample"))
    file.isdir(os.path.join(self._outdir, "out_abundance", "km_abd_contig"))
    output_ko = os.path.join(self._outdir, "out_abundance", "ko_abd",
                             self._prefix + "_ko_abd.tsv")
    output_km_contig = os.path.join(self._outdir, "out_abundance",
                                    "km_abd_contig",
                                    self._prefix + "_km_contig_abd.tsv")
    out_km_sample = os.path.join(self._outdir, "out_abundance",
                                 "km_abd_sample",
                                 self._prefix + "_km_sample_abd.tsv")
    abd_cls = KM_abd(self._GE, remap_out, kegg_output_contig, self._com,
                     self._ko_anno_tool, self._gene_predict_tool, hmm_out,
                     self._fp, self._dist, self._outdir)
    abd_cls.km_abd(d_nuc_ko, d_ko_position, d_position_gene, output_ko,
                   output_km_contig, out_km_sample)