def ensure_dir(d):
    """
    Create the directory path ``d`` (including missing parents) if it does
    not exist yet.

    An OSError raised while creating the directory is caught and converted
    into a descriptive message, which is returned instead of re-raising the
    error.

    :type d: str
    :param d: It is the full path to a directory.
    :rtype: str or None
    :return: None when the path already exists or was created successfully;
             otherwise a message describing why it could not be created.
    """
    if not os.path.exists(d):
        try:
            os.makedirs(d)
        except OSError as oe:
            # The error code lives on the caught exception. The previous
            # check used ``os.errno``, which is not part of the ``os`` API
            # (and is absent on Python 3.7+), so this handler itself failed.
            # ENOENT: No such file or directory (should not normally happen
            # with os.makedirs, which creates intermediate directories).
            if oe.errno == errno.ENOENT:
                msg = twdd("""One or more directories in the path ({}) do not exist. If
                           you are specifying a new directory for output, please ensure
                           all other directories in the path currently exist.""")
                return msg.format(d)
            else:
                msg = twdd("""An error occurred trying to create the output directory
                           ({}) with message: {}""")
                return msg.format(d, oe.strerror)
    return None
def ensure_dir(d):
    """
    Create the directory path ``d`` (including missing parents) if it does
    not exist yet.

    An OSError raised while creating the directory is caught and converted
    into a descriptive message, which is returned instead of re-raising the
    error.

    :type d: str
    :param d: It is the full path to a directory.
    :rtype: str or None
    :return: None when the path already exists or was created successfully;
             otherwise a message describing why it could not be created.
    """
    if not os.path.exists(d):
        try:
            os.makedirs(d)
        except OSError as oe:
            # Read the error code from the caught exception. ``os.errno`` is
            # not part of the ``os`` API (and is absent on Python 3.7+), so
            # the old comparison broke the handler instead of classifying
            # the failure.
            # ENOENT: No such file or directory (should not normally happen
            # with os.makedirs, which creates intermediate directories).
            if oe.errno == errno.ENOENT:
                msg = twdd(
                    """One or more directories in the path ({}) do not exist. If
                           you are specifying a new directory for output, please ensure
                           all other directories in the path currently exist."""
                )
                return msg.format(d)
            else:
                msg = twdd(
                    """An error occurred trying to create the output directory
                           ({}) with message: {}""")
                return msg.format(d, oe.strerror)
    return None
def setUp(self): self.krepA = prep_clark_input( io.StringIO( twdd(u"""\ Name,TaxID,Lineage,Count,Proportion_All(%),Proportion_Classified(%) Achromobacter xylosoxidans,85698,Bacteria;Proteobacteria;Betaproteobacteria;Burkholderiales;Alcaligenaceae;Achromobacter,82,0.00142317,0.124620061 Acinetobacter baumannii,470,Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Moraxellaceae;Acinetobacter,356,0.00617862,0.541033435 Actinomyces cardiffensis,181487,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,1,1.74E-05,0.001519757 Actinomyces dentalis,272548,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,15,0.000260335,0.022796353 Actinomyces georgiae,52768,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,5,8.68E-05,0.007598784 Actinomyces gerencseriae,52769,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,12,0.000208268,0.018237082 Actinomyces israelii,1659,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,93,0.00161408,0.141337386 Actinomyces johnsonii,544581,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,1,1.74E-05,0.001519757 Actinomyces massiliensis,461393,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,12,0.000208268,0.018237082 Actinomyces meyeri,52773,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,81,0.00140581,0.123100304 UNKNOWN,UNKNOWN,UNKNOWN,658,92.0161,- """))) self.krepB = prep_clark_input( io.StringIO( twdd(u"""\ Name,TaxID,Lineage,Count,Proportion_All(%),Proportion_Classified(%) Achromobacter xylosoxidans,85698,Bacteria;Proteobacteria;Betaproteobacteria;Burkholderiales;Alcaligenaceae;Achromobacter,10,0.00142317,0.003241491 Acinetobacter baumannii,470,Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Moraxellaceae;Acinetobacter,200,0.00617862,0.064829822 
Actinomyces viscosus,1656,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,5,8.68E-05,0.001620746 Aggregatibacter actinomycetemcomitans,714,Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Aggregatibacter,212,0.0036794,0.068719611 Aggregatibacter aphrophilus,732,Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Aggregatibacter,2630,0.0456454,0.852512156 Aggregatibacter segnis,739,Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Aggregatibacter,1,1.74E-05,0.000324149 Agrobacterium fabrum,1176649,Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Agrobacterium,1,1.74E-05,0.000324149 Agrobacterium tumefaciens,358,Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Agrobacterium,9,0.000156201,0.002917342 Alloscardovia omnicolens,419015,Bacteria;Actinobacteria;Actinobacteria;Bifidobacteriales;Bifidobacteriaceae;Alloscardovia,1,1.74E-05,0.000324149 Anaerococcus prevotii,33034,Bacteria;Firmicutes;Tissierellia;Tissierellales;Peptoniphilaceae;Anaerococcus,3,5.21E-05,0.000972447 Arsenicicoccus sp. 
oral taxon 190,1658671,Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Intrasporangiaceae;Arsenicicoccus,5,8.68E-05,0.001620746 Atopobium parvulum,1382,Bacteria;Actinobacteria;Coriobacteriia;Coriobacteriales;Atopobiaceae;Atopobium,7,0.00012149,0.002269044 Atopobium rimae,1383,Bacteria;Actinobacteria;Coriobacteriia;Coriobacteriales;Atopobiaceae;Atopobium,1,1.74E-05,0.000324149 UNKNOWN,UNKNOWN,UNKNOWN,3085,92.0161,- """))) # parse the sample reports self.taxa = OrderedDict() self.sample_counts = OrderedDict() countsA, taxaA = cb.parse_clark_abundance_tbl(self.krepA) countsB, taxaB = cb.parse_clark_abundance_tbl(self.krepB) self.taxa.update(taxaA) self.taxa.update(taxaB) self.sample_counts["A"] = countsA self.sample_counts["B"] = countsB # create the BIOM table from the sample counts and taxa self.biomT = cb.create_biom_table(self.sample_counts, self.taxa)
def main():
    """
    Command-line entry point: build a BIOM table from kraken-report files.

    Reads the parsed program options, collects the report paths given
    directly and/or found in the reports folder, parses them, attaches
    sample metadata, and writes the resulting BIOM table (plus an optional
    OTU file). Exits through ``sys.exit`` with an error message when the
    max/min ranks are out of order or the OTU file cannot be written.
    """
    args = handle_program_options()

    # Fall back to the JSON flavour when HDF5 output was requested but the
    # h5py library is unavailable.
    if args.fmt == 'hdf5' and not HAVE_H5PY:
        args.fmt = 'json'
        msg = """\
        Library 'h5py' not found, unable to write BIOM 2.x (HDF5) files.
        Defaulting to BIOM 1.0 (JSON)."""
        print(twdd(msg))

    if ranks.index(args.max) > ranks.index(args.min):
        msg = "ERROR: Max and Min ranks are out of order: {} < {}"
        sys.exit(msg.format(args.max, args.min))

    # Work on a copy so the parsed arguments are not mutated in place.
    reports = list(args.kraken_reports)
    if args.kraken_reports_fp:
        # Only regular files can be parsed as reports; sorting makes the
        # sample order independent of the filesystem's listing order.
        folder = Path(args.kraken_reports_fp)
        reports += sorted(str(p) for p in folder.glob('*') if p.is_file())

    # load all kraken-report files and parse them
    sample_counts, taxa = process_samples(reports,
                                          max_rank=args.max,
                                          min_rank=args.min)

    # Make sample metadata. Reads the given file or
    # makes simple dummy metadata.
    sample_metadata = process_metadata(sample_counts, args.metadata)

    # create new BIOM table from sample counts and taxon ids
    # add taxonomy strings to row (taxon) metadata
    biomT = create_biom_table(sample_counts, taxa, sample_metadata)

    out_fp = write_biom(biomT, args.output_fp, args.fmt, args.gzip)

    if args.otu_fp:
        try:
            write_otu_file(list(taxa), args.otu_fp)
        except RuntimeError as err:
            msg = "ERROR creating OTU file: \n\t{}"
            sys.exit(msg.format(err))

    if args.verbose:
        # Blank separator line (previously a no-op ``"".format(out_fp)``).
        print()
        table_str = """\
        BIOM-format table written to: {out_fp}
        Table contains {rows} rows (OTUs) and {cols} columns (Samples)
        and is {density:.1%} dense.""".format(out_fp=out_fp,
                                              rows=biomT.shape[0],
                                              cols=biomT.shape[1],
                                              density=biomT.get_table_density())
        print(twdd(table_str))
def setUp(self): self.crepA = twdd(u"""\ Name,TaxID,Lineage,Count,Proportion_All(%),Proportion_Classified(%) Achromobacter xylosoxidans,85698,Bacteria;Proteobacteria;Betaproteobacteria;Burkholderiales;Alcaligenaceae;Achromobacter,82,0.00142317,0.124620061 Acinetobacter baumannii,470,Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Moraxellaceae;Acinetobacter,356,0.00617862,0.541033435 Actinomyces cardiffensis,181487,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,1,1.74E-05,0.001519757 Actinomyces dentalis,272548,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,15,0.000260335,0.022796353 Actinomyces georgiae,52768,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,5,8.68E-05,0.007598784 Actinomyces gerencseriae,52769,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,12,0.000208268,0.018237082 Actinomyces israelii,1659,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,93,0.00161408,0.141337386 Actinomyces johnsonii,544581,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,1,1.74E-05,0.001519757 Actinomyces massiliensis,461393,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,12,0.000208268,0.018237082 Actinomyces meyeri,52773,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,81,0.00140581,0.123100304 UNKNOWN,UNKNOWN,UNKNOWN,658,92.0161,- """).encode("utf-8") self.crepB = twdd(u"""\ Name,TaxID,Lineage,Count,Proportion_All(%),Proportion_Classified(%) Achromobacter xylosoxidans,85698,Bacteria;Proteobacteria;Betaproteobacteria;Burkholderiales;Alcaligenaceae;Achromobacter,10,0.00142317,0.003241491 Acinetobacter baumannii,470,Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Moraxellaceae;Acinetobacter,200,0.00617862,0.064829822 Actinomyces 
viscosus,1656,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,5,8.68E-05,0.001620746 Aggregatibacter actinomycetemcomitans,714,Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Aggregatibacter,212,0.0036794,0.068719611 Aggregatibacter aphrophilus,732,Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Aggregatibacter,2630,0.0456454,0.852512156 Aggregatibacter segnis,739,Bacteria;Proteobacteria;Gammaproteobacteria;Pasteurellales;Pasteurellaceae;Aggregatibacter,1,1.74E-05,0.000324149 Agrobacterium fabrum,1176649,Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Agrobacterium,1,1.74E-05,0.000324149 Agrobacterium tumefaciens,358,Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Rhizobiaceae;Agrobacterium,9,0.000156201,0.002917342 Alloscardovia omnicolens,419015,Bacteria;Actinobacteria;Actinobacteria;Bifidobacteriales;Bifidobacteriaceae;Alloscardovia,1,1.74E-05,0.000324149 Anaerococcus prevotii,33034,Bacteria;Firmicutes;Tissierellia;Tissierellales;Peptoniphilaceae;Anaerococcus,3,5.21E-05,0.000972447 Arsenicicoccus sp. 
oral taxon 190,1658671,Bacteria;Actinobacteria;Actinobacteria;Micrococcales;Intrasporangiaceae;Arsenicicoccus,5,8.68E-05,0.001620746 Atopobium parvulum,1382,Bacteria;Actinobacteria;Coriobacteriia;Coriobacteriales;Atopobiaceae;Atopobium,7,0.00012149,0.002269044 Atopobium rimae,1383,Bacteria;Actinobacteria;Coriobacteriia;Coriobacteriales;Atopobiaceae;Atopobium,1,1.74E-05,0.000324149 UNKNOWN,UNKNOWN,UNKNOWN,3085,92.0161,- """).encode("utf-8") # create temp files containing the above clark results tempf_crepA = tempfile.NamedTemporaryFile(delete=False) tempf_crepA.write(self.crepA) tempf_crepA.close() tempf_crepB = tempfile.NamedTemporaryFile(delete=False) tempf_crepB.write(self.crepB) tempf_crepB.close() self.fps = [tempf_crepA.name, tempf_crepB.name] self.fnames = [osp.split(fp)[1] for fp in self.fps] self.sample_counts, self.taxa = cb.process_samples(self.fps, store_pct=False)
def get(self, request):
    """
    Render the SPARQL search page for a GET request.

    The query is taken from the ``query`` GET parameter, falling back to a
    built-in example query. Any exception raised while running the query is
    reported to the template through ``context["err"]`` and both result
    lists are then left empty.
    """
    # PREFIX P: <{OM.iri}>
    fallback_query = twdd(f"""
    # example query: select all possible tags

    PREFIX P: <https://ackrep.org/draft/ocse-prototype01#>
    SELECT ?entity
    WHERE {{
      ?entity rdf:type ?type.
      ?type rdfs:subClassOf* P:OCSE_Entity.
    }}
    """)

    sparql_src = request.GET.get("query", fallback_query)
    context = {"query": sparql_src}

    try:
        translated = core.AOM.run_sparql_query_and_translate_result(sparql_src)
        ackrep_entities, onto_entities = translated
    except Exception as exc:
        # Show the failure on the page instead of propagating it.
        context["err"] = f"The following error occurred: {str(exc)}"
        ackrep_entities, onto_entities = [], []

    context["ackrep_entities"] = ackrep_entities
    context["onto_entities"] = onto_entities

    # this could be used for further options
    context["c"] = util.Container()

    return TemplateResponse(request, "ackrep_web/search_sparql.html", context)
def main():
    """
    Command-line entry point: build a BIOM table from kraken-report files.

    Reads the parsed program options, collects the report paths given
    directly and/or found in the reports folder, parses them, and writes
    the resulting BIOM table (plus an optional OTU file). Exits through
    ``sys.exit`` with an error message when the max/min ranks are out of
    order or the OTU file cannot be written.
    """
    args = handle_program_options()

    # Fall back to the JSON flavour when HDF5 output was requested but the
    # h5py library is unavailable.
    if args.fmt == 'hdf5' and not HAVE_H5PY:
        args.fmt = 'json'
        msg = """\
        Library 'h5py' not found, unable to write BIOM 2.x (HDF5) files.
        Defaulting to BIOM 1.0 (JSON)."""
        print(twdd(msg))

    if ranks.index(args.max) > ranks.index(args.min):
        msg = "ERROR: Max and Min ranks are out of order: {} < {}"
        sys.exit(msg.format(args.max, args.min))

    # Work on a copy so the parsed arguments are not mutated in place.
    reports = list(args.kraken_reports)
    if args.kraken_reports_fp:
        # Only regular files can be parsed as reports; sorting makes the
        # sample order independent of the filesystem's listing order.
        folder = Path(args.kraken_reports_fp)
        reports += sorted(str(p) for p in folder.glob('*') if p.is_file())

    # load all kraken-report files and parse them
    sample_counts, taxa = process_samples(reports,
                                          max_rank=args.max,
                                          min_rank=args.min)

    # create new BIOM table from sample counts and taxon ids
    # add taxonomy strings to row (taxon) metadata
    biomT = create_biom_table(sample_counts, taxa)

    out_fp = write_biom(biomT, args.output_fp, args.fmt, args.gzip)

    if args.otu_fp:
        try:
            write_otu_file(list(taxa), args.otu_fp)
        except RuntimeError as err:
            msg = "ERROR creating OTU file: \n\t{}"
            sys.exit(msg.format(err))

    if args.verbose:
        # Blank separator line (previously a no-op ``"".format(out_fp)``).
        print()
        table_str = """\
        BIOM-format table written to: {out_fp}
        Table contains {rows} rows (OTUs) and {cols} columns (Samples)
        and is {density:.1%} dense.""".format(out_fp=out_fp,
                                              rows=biomT.shape[0],
                                              cols=biomT.shape[1],
                                              density=biomT.get_table_density())
        print(twdd(table_str))
def main():
    """
    Command-line entry point: build a BIOM table from CLARK abundance tables.

    Reads the parsed program options, parses every abundance table listed in
    ``args.clark_abd_tbls``, and writes the resulting BIOM table (plus an
    optional OTU file). Exits through ``sys.exit`` with an error message
    when the OTU file cannot be written.
    """
    args = handle_program_options()

    # Fall back to the JSON flavour when HDF5 output was requested but the
    # h5py library is unavailable.
    if args.fmt == 'hdf5' and not HAVE_H5PY:
        args.fmt = 'json'
        msg = """\
        Library 'h5py' not found, unable to write BIOM 2.x (HDF5) files.
        Defaulting to BIOM 1.0 (JSON)."""
        print(twdd(msg))

    # load all abundance table files and parse them
    sample_counts, taxa = process_samples(args.clark_abd_tbls,
                                          store_pct=args.store_pct)

    # create new BIOM table from sample counts and taxon ids
    # add taxonomy strings to row (taxon) metadata
    biomT = create_biom_table(sample_counts, taxa)

    out_fp = write_biom(biomT, args.output_fp, args.fmt, args.gzip)

    if args.otu_fp:
        try:
            write_otu_file(list(taxa), args.otu_fp)
        except RuntimeError as err:
            msg = "ERROR creating OTU file: \n\t{}"
            sys.exit(msg.format(err))

    if args.verbose:
        # Blank separator line (previously a no-op ``"".format(out_fp)``).
        print()
        table_str = """\
        BIOM-format table written to: {out_fp}
        Table contains {rows} rows (OTUs) and {cols} columns (Samples)
        and is {density:.1%} dense.""".format(out_fp=out_fp,
                                              rows=biomT.shape[0],
                                              cols=biomT.shape[1],
                                              density=biomT.get_table_density())
        print(twdd(table_str))
def setUp(self): self.sample_clark_rep = prep_clark_input( io.StringIO( twdd(u"""\ Name,TaxID,Lineage,Count,Proportion_All(%),Proportion_Classified(%) Achromobacter xylosoxidans,85698,Bacteria;Proteobacteria;Betaproteobacteria;Burkholderiales;Alcaligenaceae;Achromobacter,82,0.00142317,0.124620061 Acinetobacter baumannii,470,Bacteria;Proteobacteria;Gammaproteobacteria;Pseudomonadales;Moraxellaceae;Acinetobacter,356,0.00617862,0.541033435 Actinomyces cardiffensis,181487,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,1,1.74E-05,0.001519757 Actinomyces dentalis,272548,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,15,0.000260335,0.022796353 Actinomyces georgiae,52768,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,5,8.68E-05,0.007598784 Actinomyces gerencseriae,52769,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,12,0.000208268,0.018237082 Actinomyces israelii,1659,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,93,0.00161408,0.141337386 Actinomyces johnsonii,544581,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,1,1.74E-05,0.001519757 Actinomyces massiliensis,461393,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,12,0.000208268,0.018237082 Actinomyces meyeri,52773,Bacteria;Actinobacteria;Actinobacteria;Actinomycetales;Actinomycetaceae;Actinomyces,81,0.00140581,0.123100304 UNKNOWN,UNKNOWN,UNKNOWN,658,92.0161,- """)))
def setUp(self): self.krepA = prep_kraken_input(io.StringIO(twdd(u"""\ 100.00 6783846 6783846 U 0 unclassified 0.00 130 18 - 1 root 0.00 105 0 - 131567 cellular organisms 0.00 105 0 D 2 Bacteria 0.00 62 30 P 1239 Firmicutes 0.00 29 0 C 91061 Bacilli 0.00 29 0 O 186826 Lactobacillales 0.00 20 0 F 1300 Streptococcaceae 0.00 20 8 G 1301 Streptococcus 0.00 5 4 S 1304 Streptococcus salivarius 0.00 1 1 - 1048332 Streptococcus salivarius CCHSS3 0.00 3 0 S 1303 Streptococcus oralis 0.00 3 3 - 927666 Streptococcus oralis Uo5 0.00 2 2 S 1318 Streptococcus parasanguinis 0.00 1 0 S 1305 Streptococcus sanguinis 0.00 1 1 - 388919 Streptococcus sanguinis SK36 0.00 1 1 S 1308 Streptococcus thermophilus 0.00 9 0 F 81852 Enterococcaceae 0.00 9 0 G 1350 Enterococcus 0.00 9 0 S 37734 Enterococcus casseliflavus 0.00 9 9 - 565655 Enterococcus casseliflavus EC20 0.00 1 0 C 186801 Clostridia 0.00 1 1 O 186802 Clostridiales 0.00 40 0 P 1224 Proteobacteria 0.00 20 2 C 1236 Gammaproteobacteria 0.00 11 1 O 135622 Alteromonadales 0.00 10 0 F 72275 Alteromonadaceae 0.00 10 0 G 226 Alteromonas 0.00 10 0 S 28108 Alteromonas macleodii 0.00 10 10 - 1300257 Alteromonas macleodii str. 
'Ionian Sea U8' 0.00 7 0 O 91347 Enterobacteriales 0.00 7 6 F 543 Enterobacteriaceae 0.00 1 0 G 561 Escherichia 0.00 1 1 S 562 Escherichia coli 0.00 20 5 C 28216 Betaproteobacteria 0.00 15 0 O 80840 Burkholderiales 0.00 11 0 F 119060 Burkholderiaceae 0.00 11 0 G 48736 Ralstonia 0.00 11 10 S 329 Ralstonia pickettii 0.00 1 1 - 428406 Ralstonia pickettii 12D 0.00 4 0 F 80864 Comamonadaceae 0.00 2 2 G 12916 Acidovorax 0.00 2 0 G 201096 Alicycliphilus 0.00 2 0 S 179636 Alicycliphilus denitrificans 0.00 2 2 - 596154 Alicycliphilus denitrificans K601 0.00 3 0 P 201174 Actinobacteria 0.00 3 0 C 1760 Actinobacteria 0.00 2 0 - 84998 Coriobacteridae 0.00 2 0 O 84999 Coriobacteriales 0.00 2 0 - 255727 Coriobacterineae 0.00 2 0 F 84107 Coriobacteriaceae 0.00 2 0 G 1380 Atopobium 0.00 2 0 S 1382 Atopobium parvulum 0.00 2 2 - 521095 Atopobium parvulum DSM 20469 0.00 1 0 - 85003 Actinobacteridae 0.00 1 0 O 2037 Actinomycetales 0.00 1 0 - 85009 Propionibacterineae 0.00 1 0 F 31957 Propionibacteriaceae 0.00 1 0 G 1743 Propionibacterium 0.00 1 1 S 1747 Propionibacterium acnes 0.00 20 0 D 2157 Archaea 0.00 20 7 P 28890 Euryarchaeota 0.00 8 0 C 183963 Halobacteria 0.00 8 0 O 1644060 Natrialbales 0.00 8 0 F 1644061 Natrialbaceae 0.00 8 0 G 29287 Natronococcus 0.00 8 0 S 29288 Natronococcus occultus 0.00 8 8 - 694430 Natronococcus occultus SP4 0.00 4 0 C 224756 Methanomicrobia 0.00 4 0 O 94695 Methanosarcinales 0.00 4 0 F 2206 Methanosarcinaceae 0.00 3 0 G 101191 Methanomethylovorans 0.00 3 0 S 101192 Methanomethylovorans hollandica 0.00 3 3 - 867904 Methanomethylovorans hollandica DSM 15978 0.00 1 0 G 2207 Methanosarcina 0.00 1 0 S 2214 Methanosarcina acetivorans 0.00 1 1 - 188937 Methanosarcina acetivorans C2A 0.00 1 0 C 183939 Methanococci 0.00 1 0 O 2182 Methanococcales 0.00 1 0 F 2183 Methanococcaceae 0.00 1 0 G 2184 Methanococcus 0.00 1 0 S 2188 Methanococcus voltae 0.00 1 1 - 456320 Methanococcus voltae A3 0.00 7 0 D 10239 Viruses 0.00 2 0 - 29258 ssDNA viruses 0.00 2 0 F 10841 
Microviridae 0.00 2 0 G 10842 Microvirus 0.00 2 2 S 374840 Enterobacteria phage phiX174 sensu lato 0.00 2 0 - 35237 dsDNA viruses, no RNA stage 0.00 2 0 F 10482 Polydnaviridae 0.00 2 0 G 10483 Ichnovirus 0.00 2 2 S 265522 Hyposoter fugitivus ichnovirus 0.00 2 0 - 439488 ssRNA viruses 0.00 2 0 - 35278 ssRNA positive-strand viruses, no DNA stage 0.00 2 0 F 11018 Togaviridae 0.00 2 0 G 11019 Alphavirus 0.00 2 0 - 177872 VEEV complex 0.00 2 2 S 11036 Venezuelan equine encephalitis virus 0.00 1 0 - 35268 Retro-transcribing viruses 0.00 1 0 F 11632 Retroviridae 0.00 1 0 - 35276 unclassified Retroviridae 0.00 1 0 - 206037 Human endogenous retroviruses 0.00 1 0 S 45617 Human endogenous retrovirus K 0.00 1 1 - 166122 Human endogenous retrovirus K113 """))) self.krepB = prep_kraken_input(io.StringIO(twdd(u"""\ 100.00 6783846 6783846 U 0 unclassified 0.00 130 18 - 1 root 0.00 105 0 - 131567 cellular organisms 0.00 105 0 D 2 Bacteria 0.00 62 30 P 1239 Firmicutes 0.00 29 0 C 91061 Bacilli 0.00 29 0 O 186826 Lactobacillales 0.00 20 0 F 1300 Streptococcaceae 0.00 20 8 G 1301 Streptococcus 0.00 5 4 S 1304 Streptococcus salivarius 0.00 1 1 - 1048332 Streptococcus salivarius CCHSS3 0.00 3 0 S 1303 Streptococcus oralis 0.00 3 3 - 927666 Streptococcus oralis Uo5 0.00 2 2 S 1318 Streptococcus parasanguinis 0.00 1 0 S 1305 Streptococcus sanguinis 0.00 1 1 - 388919 Streptococcus sanguinis SK36 0.00 1 1 S 1308 Streptococcus thermophilus 0.00 9 0 F 81852 Enterococcaceae 0.00 9 0 G 1350 Enterococcus 0.00 9 0 S 37734 Enterococcus casseliflavus 0.00 9 9 - 565655 Enterococcus casseliflavus EC20 0.00 2 0 C 909932 Negativicutes 0.00 2 0 O 909929 Selenomonadales 0.00 2 0 F 31977 Veillonellaceae 0.00 2 0 G 29465 Veillonella 0.00 2 0 S 29466 Veillonella parvula 0.00 2 2 - 479436 Veillonella parvula DSM 2008 0.00 20 0 D 2157 Archaea 0.00 20 7 P 28890 Euryarchaeota 0.00 8 0 C 183963 Halobacteria 0.00 8 0 O 1644060 Natrialbales 0.00 8 0 F 1644061 Natrialbaceae 0.00 8 0 G 29287 Natronococcus 0.00 8 0 S 
29288 Natronococcus occultus 0.00 8 8 - 694430 Natronococcus occultus SP4 0.00 7 0 D 10239 Viruses 0.00 2 0 - 29258 ssDNA viruses 0.00 2 0 F 10841 Microviridae 0.00 2 0 G 10842 Microvirus 0.00 2 2 S 374840 Enterobacteria phage phiX174 sensu lato 0.00 2 0 - 35237 dsDNA viruses, no RNA stage 0.00 2 0 F 10482 Polydnaviridae 0.00 2 0 G 10483 Ichnovirus 0.00 2 2 S 265522 Hyposoter fugitivus ichnovirus """))) # parse the sample reports self.taxa = OrderedDict() self.sample_counts = OrderedDict() countsA, taxaA = kb.parse_kraken_report(self.krepA, max_rank="O", min_rank="S") countsB, taxaB = kb.parse_kraken_report(self.krepB, max_rank="O", min_rank="S") self.taxa.update(taxaA) self.taxa.update(taxaB) self.sample_counts["A"] = countsA self.sample_counts["B"] = countsB # Make the dummy metadata for the samples. self.metadata =kb.process_metadata(self.sample_counts ,None) # create the BIOM table from the sample counts and taxa self.biomT = kb.create_biom_table(self.sample_counts, self.taxa,self.metadata)
def setUp(self): self.krepA = twdd(u"""\ 100.00 6783846 6783846 U 0 unclassified 0.00 130 18 - 1 root 0.00 105 0 - 131567 cellular organisms 0.00 105 0 D 2 Bacteria 0.00 62 30 P 1239 Firmicutes 0.00 29 0 C 91061 Bacilli 0.00 29 0 O 186826 Lactobacillales 0.00 20 0 F 1300 Streptococcaceae 0.00 20 8 G 1301 Streptococcus 0.00 5 4 S 1304 Streptococcus salivarius 0.00 1 1 - 1048332 Streptococcus salivarius CCHSS3 0.00 3 0 S 1303 Streptococcus oralis 0.00 3 3 - 927666 Streptococcus oralis Uo5 0.00 2 2 S 1318 Streptococcus parasanguinis 0.00 1 0 S 1305 Streptococcus sanguinis 0.00 1 1 - 388919 Streptococcus sanguinis SK36 0.00 1 1 S 1308 Streptococcus thermophilus 0.00 9 0 F 81852 Enterococcaceae 0.00 9 0 G 1350 Enterococcus 0.00 9 0 S 37734 Enterococcus casseliflavus 0.00 9 9 - 565655 Enterococcus casseliflavus EC20 0.00 1 0 C 186801 Clostridia 0.00 1 1 O 186802 Clostridiales 0.00 40 0 P 1224 Proteobacteria 0.00 20 2 C 1236 Gammaproteobacteria 0.00 11 1 O 135622 Alteromonadales 0.00 10 0 F 72275 Alteromonadaceae 0.00 10 0 G 226 Alteromonas 0.00 10 0 S 28108 Alteromonas macleodii 0.00 10 10 - 1300257 Alteromonas macleodii str. 
'Ionian Sea U8' 0.00 7 0 O 91347 Enterobacteriales 0.00 7 6 F 543 Enterobacteriaceae 0.00 1 0 G 561 Escherichia 0.00 1 1 S 562 Escherichia coli 0.00 20 5 C 28216 Betaproteobacteria 0.00 15 0 O 80840 Burkholderiales 0.00 11 0 F 119060 Burkholderiaceae 0.00 11 0 G 48736 Ralstonia 0.00 11 10 S 329 Ralstonia pickettii 0.00 1 1 - 428406 Ralstonia pickettii 12D 0.00 4 0 F 80864 Comamonadaceae 0.00 2 2 G 12916 Acidovorax 0.00 2 0 G 201096 Alicycliphilus 0.00 2 0 S 179636 Alicycliphilus denitrificans 0.00 2 2 - 596154 Alicycliphilus denitrificans K601 0.00 3 0 P 201174 Actinobacteria 0.00 3 0 C 1760 Actinobacteria 0.00 2 0 - 84998 Coriobacteridae 0.00 2 0 O 84999 Coriobacteriales 0.00 2 0 - 255727 Coriobacterineae 0.00 2 0 F 84107 Coriobacteriaceae 0.00 2 0 G 1380 Atopobium 0.00 2 0 S 1382 Atopobium parvulum 0.00 2 2 - 521095 Atopobium parvulum DSM 20469 0.00 1 0 - 85003 Actinobacteridae 0.00 1 0 O 2037 Actinomycetales 0.00 1 0 - 85009 Propionibacterineae 0.00 1 0 F 31957 Propionibacteriaceae 0.00 1 0 G 1743 Propionibacterium 0.00 1 1 S 1747 Propionibacterium acnes 0.00 20 0 D 2157 Archaea 0.00 20 7 P 28890 Euryarchaeota 0.00 8 0 C 183963 Halobacteria 0.00 8 0 O 1644060 Natrialbales 0.00 8 0 F 1644061 Natrialbaceae 0.00 8 0 G 29287 Natronococcus 0.00 8 0 S 29288 Natronococcus occultus 0.00 8 8 - 694430 Natronococcus occultus SP4 0.00 4 0 C 224756 Methanomicrobia 0.00 4 0 O 94695 Methanosarcinales 0.00 4 0 F 2206 Methanosarcinaceae 0.00 3 0 G 101191 Methanomethylovorans 0.00 3 0 S 101192 Methanomethylovorans hollandica 0.00 3 3 - 867904 Methanomethylovorans hollandica DSM 15978 0.00 1 0 G 2207 Methanosarcina 0.00 1 0 S 2214 Methanosarcina acetivorans 0.00 1 1 - 188937 Methanosarcina acetivorans C2A 0.00 1 0 C 183939 Methanococci 0.00 1 0 O 2182 Methanococcales 0.00 1 0 F 2183 Methanococcaceae 0.00 1 0 G 2184 Methanococcus 0.00 1 0 S 2188 Methanococcus voltae 0.00 1 1 - 456320 Methanococcus voltae A3 0.00 7 0 D 10239 Viruses 0.00 2 0 - 29258 ssDNA viruses 0.00 2 0 F 10841 
Microviridae 0.00 2 0 G 10842 Microvirus 0.00 2 2 S 374840 Enterobacteria phage phiX174 sensu lato 0.00 2 0 - 35237 dsDNA viruses, no RNA stage 0.00 2 0 F 10482 Polydnaviridae 0.00 2 0 G 10483 Ichnovirus 0.00 2 2 S 265522 Hyposoter fugitivus ichnovirus 0.00 2 0 - 439488 ssRNA viruses 0.00 2 0 - 35278 ssRNA positive-strand viruses, no DNA stage 0.00 2 0 F 11018 Togaviridae 0.00 2 0 G 11019 Alphavirus 0.00 2 0 - 177872 VEEV complex 0.00 2 2 S 11036 Venezuelan equine encephalitis virus 0.00 1 0 - 35268 Retro-transcribing viruses 0.00 1 0 F 11632 Retroviridae 0.00 1 0 - 35276 unclassified Retroviridae 0.00 1 0 - 206037 Human endogenous retroviruses 0.00 1 0 S 45617 Human endogenous retrovirus K 0.00 1 1 - 166122 Human endogenous retrovirus K113 """).encode("utf-8") self.krepB = twdd(u"""\ 100.00 6783846 6783846 U 0 unclassified 0.00 130 18 - 1 root 0.00 105 0 - 131567 cellular organisms 0.00 105 0 D 2 Bacteria 0.00 62 30 P 1239 Firmicutes 0.00 29 0 C 91061 Bacilli 0.00 29 0 O 186826 Lactobacillales 0.00 20 0 F 1300 Streptococcaceae 0.00 20 8 G 1301 Streptococcus 0.00 5 4 S 1304 Streptococcus salivarius 0.00 1 1 - 1048332 Streptococcus salivarius CCHSS3 0.00 3 0 S 1303 Streptococcus oralis 0.00 3 3 - 927666 Streptococcus oralis Uo5 0.00 2 2 S 1318 Streptococcus parasanguinis 0.00 1 0 S 1305 Streptococcus sanguinis 0.00 1 1 - 388919 Streptococcus sanguinis SK36 0.00 1 1 S 1308 Streptococcus thermophilus 0.00 9 0 F 81852 Enterococcaceae 0.00 9 0 G 1350 Enterococcus 0.00 9 0 S 37734 Enterococcus casseliflavus 0.00 9 9 - 565655 Enterococcus casseliflavus EC20 0.00 1 0 C 186801 Clostridia 0.00 1 1 O 186802 Clostridiales 0.00 40 0 P 1224 Proteobacteria 0.00 20 2 C 1236 Gammaproteobacteria 0.00 11 1 O 135622 Alteromonadales 0.00 10 0 F 72275 Alteromonadaceae 0.00 10 0 G 226 Alteromonas 0.00 10 0 S 28108 Alteromonas macleodii 0.00 10 10 - 1300257 Alteromonas macleodii str. 
'Ionian Sea U8' 0.00 7 0 O 91347 Enterobacteriales 0.00 7 6 F 543 Enterobacteriaceae 0.00 1 0 G 561 Escherichia 0.00 1 1 S 562 Escherichia coli 0.00 20 5 C 28216 Betaproteobacteria 0.00 15 0 O 80840 Burkholderiales 0.00 11 0 F 119060 Burkholderiaceae 0.00 11 0 G 48736 Ralstonia 0.00 11 10 S 329 Ralstonia pickettii 0.00 1 1 - 428406 Ralstonia pickettii 12D 0.00 4 0 F 80864 Comamonadaceae 0.00 2 2 G 12916 Acidovorax 0.00 2 0 G 201096 Alicycliphilus 0.00 2 0 S 179636 Alicycliphilus denitrificans 0.00 2 2 - 596154 Alicycliphilus denitrificans K601 0.00 3 0 P 201174 Actinobacteria 0.00 3 0 C 1760 Actinobacteria 0.00 2 0 - 84998 Coriobacteridae 0.00 2 0 O 84999 Coriobacteriales 0.00 2 0 - 255727 Coriobacterineae 0.00 2 0 F 84107 Coriobacteriaceae 0.00 2 0 G 1380 Atopobium 0.00 2 0 S 1382 Atopobium parvulum 0.00 2 2 - 521095 Atopobium parvulum DSM 20469 0.00 1 0 - 85003 Actinobacteridae 0.00 1 0 O 2037 Actinomycetales 0.00 1 0 - 85009 Propionibacterineae 0.00 1 0 F 31957 Propionibacteriaceae 0.00 1 0 G 1743 Propionibacterium 0.00 1 1 S 1747 Propionibacterium acnes 0.00 20 0 D 2157 Archaea 0.00 20 7 P 28890 Euryarchaeota 0.00 8 0 C 183963 Halobacteria 0.00 8 0 O 1644060 Natrialbales 0.00 8 0 F 1644061 Natrialbaceae 0.00 8 0 G 29287 Natronococcus 0.00 8 0 S 29288 Natronococcus occultus 0.00 8 8 - 694430 Natronococcus occultus SP4 0.00 4 0 C 224756 Methanomicrobia 0.00 4 0 O 94695 Methanosarcinales 0.00 4 0 F 2206 Methanosarcinaceae 0.00 3 0 G 101191 Methanomethylovorans 0.00 3 0 S 101192 Methanomethylovorans hollandica 0.00 3 3 - 867904 Methanomethylovorans hollandica DSM 15978 0.00 1 0 G 2207 Methanosarcina 0.00 1 0 S 2214 Methanosarcina acetivorans 0.00 1 1 - 188937 Methanosarcina acetivorans C2A 0.00 1 0 C 183939 Methanococci 0.00 1 0 O 2182 Methanococcales 0.00 1 0 F 2183 Methanococcaceae 0.00 1 0 G 2184 Methanococcus 0.00 1 0 S 2188 Methanococcus voltae 0.00 1 1 - 456320 Methanococcus voltae A3 0.00 7 0 D 10239 Viruses 0.00 2 0 - 29258 ssDNA viruses 0.00 2 0 F 10841 
Microviridae 0.00 2 0 G 10842 Microvirus 0.00 2 2 S 374840 Enterobacteria phage phiX174 sensu lato 0.00 2 0 - 35237 dsDNA viruses, no RNA stage 0.00 2 0 F 10482 Polydnaviridae 0.00 2 0 G 10483 Ichnovirus 0.00 2 2 S 265522 Hyposoter fugitivus ichnovirus 0.00 2 0 - 439488 ssRNA viruses 0.00 2 0 - 35278 ssRNA positive-strand viruses, no DNA stage 0.00 2 0 F 11018 Togaviridae 0.00 2 0 G 11019 Alphavirus 0.00 2 0 - 177872 VEEV complex 0.00 2 2 S 11036 Venezuelan equine encephalitis virus """).encode("utf-8") # create temp files containing the above kraken results tempf_krepA = tempfile.NamedTemporaryFile() tempf_krepA.write(self.krepA) tempf_krepB = tempfile.NamedTemporaryFile() tempf_krepB.write(self.krepB) self.fps = [tempf_krepA.name, tempf_krepB.name] self.fnames = [osp.split(fp)[1] for fp in self.fps] self.sample_counts, self.taxa = kb.process_samples(self.fps, max_rank="O", min_rank="S")
def setUp(self): self.sample_kraken_rep = prep_kraken_input(io.StringIO(twdd(u"""\ 100.00 6783846 6783846 U 0 unclassified 0.00 130 18 - 1 root 0.00 105 0 - 131567 cellular organisms 0.00 105 0 D 2 Bacteria 0.00 62 30 P 1239 Firmicutes 0.00 29 0 C 91061 Bacilli 0.00 29 0 O 186826 Lactobacillales 0.00 20 0 F 1300 Streptococcaceae 0.00 20 8 G 1301 Streptococcus 0.00 5 4 S 1304 Streptococcus salivarius 0.00 1 1 - 1048332 Streptococcus salivarius CCHSS3 0.00 3 0 S 1303 Streptococcus oralis 0.00 3 3 - 927666 Streptococcus oralis Uo5 0.00 2 2 S 1318 Streptococcus parasanguinis 0.00 1 0 S 1305 Streptococcus sanguinis 0.00 1 1 - 388919 Streptococcus sanguinis SK36 0.00 1 1 S 1308 Streptococcus thermophilus 0.00 9 0 F 81852 Enterococcaceae 0.00 9 0 G 1350 Enterococcus 0.00 9 0 S 37734 Enterococcus casseliflavus 0.00 9 9 - 565655 Enterococcus casseliflavus EC20 0.00 2 0 C 909932 Negativicutes 0.00 2 0 O 909929 Selenomonadales 0.00 2 0 F 31977 Veillonellaceae 0.00 2 0 G 29465 Veillonella 0.00 2 0 S 29466 Veillonella parvula 0.00 2 2 - 479436 Veillonella parvula DSM 2008 0.00 1 0 C 186801 Clostridia 0.00 1 1 O 186802 Clostridiales 0.00 40 0 P 1224 Proteobacteria 0.00 20 2 C 1236 Gammaproteobacteria 0.00 11 1 O 135622 Alteromonadales 0.00 10 0 F 72275 Alteromonadaceae 0.00 10 0 G 226 Alteromonas 0.00 10 0 S 28108 Alteromonas macleodii 0.00 10 10 - 1300257 Alteromonas macleodii str. 
'Ionian Sea U8' 0.00 7 0 O 91347 Enterobacteriales 0.00 7 6 F 543 Enterobacteriaceae 0.00 1 0 G 561 Escherichia 0.00 1 1 S 562 Escherichia coli 0.00 20 5 C 28216 Betaproteobacteria 0.00 15 0 O 80840 Burkholderiales 0.00 11 0 F 119060 Burkholderiaceae 0.00 11 0 G 48736 Ralstonia 0.00 11 10 S 329 Ralstonia pickettii 0.00 1 1 - 428406 Ralstonia pickettii 12D 0.00 4 0 F 80864 Comamonadaceae 0.00 2 2 G 12916 Acidovorax 0.00 2 0 G 201096 Alicycliphilus 0.00 2 0 S 179636 Alicycliphilus denitrificans 0.00 2 2 - 596154 Alicycliphilus denitrificans K601 0.00 3 0 P 201174 Actinobacteria 0.00 3 0 C 1760 Actinobacteria 0.00 2 0 - 84998 Coriobacteridae 0.00 2 0 O 84999 Coriobacteriales 0.00 2 0 - 255727 Coriobacterineae 0.00 2 0 F 84107 Coriobacteriaceae 0.00 2 0 G 1380 Atopobium 0.00 2 0 S 1382 Atopobium parvulum 0.00 2 2 - 521095 Atopobium parvulum DSM 20469 0.00 1 0 - 85003 Actinobacteridae 0.00 1 0 O 2037 Actinomycetales 0.00 1 0 - 85009 Propionibacterineae 0.00 1 0 F 31957 Propionibacteriaceae 0.00 1 0 G 1743 Propionibacterium 0.00 1 1 S 1747 Propionibacterium acnes 0.00 20 0 D 2157 Archaea 0.00 20 7 P 28890 Euryarchaeota 0.00 8 0 C 183963 Halobacteria 0.00 8 0 O 1644060 Natrialbales 0.00 8 0 F 1644061 Natrialbaceae 0.00 8 0 G 29287 Natronococcus 0.00 8 0 S 29288 Natronococcus occultus 0.00 8 8 - 694430 Natronococcus occultus SP4 0.00 4 0 C 224756 Methanomicrobia 0.00 4 0 O 94695 Methanosarcinales 0.00 4 0 F 2206 Methanosarcinaceae 0.00 3 0 G 101191 Methanomethylovorans 0.00 3 0 S 101192 Methanomethylovorans hollandica 0.00 3 3 - 867904 Methanomethylovorans hollandica DSM 15978 0.00 1 0 G 2207 Methanosarcina 0.00 1 0 S 2214 Methanosarcina acetivorans 0.00 1 1 - 188937 Methanosarcina acetivorans C2A 0.00 1 0 C 183939 Methanococci 0.00 1 0 O 2182 Methanococcales 0.00 1 0 F 2183 Methanococcaceae 0.00 1 0 G 2184 Methanococcus 0.00 1 0 S 2188 Methanococcus voltae 0.00 1 1 - 456320 Methanococcus voltae A3 0.00 7 0 D 10239 Viruses 0.00 2 0 - 29258 ssDNA viruses 0.00 2 0 F 10841 
Microviridae 0.00 2 0 G 10842 Microvirus 0.00 2 2 S 374840 Enterobacteria phage phiX174 sensu lato 0.00 2 0 - 35237 dsDNA viruses, no RNA stage 0.00 2 0 F 10482 Polydnaviridae 0.00 2 0 G 10483 Ichnovirus 0.00 2 2 S 265522 Hyposoter fugitivus ichnovirus 0.00 2 0 - 439488 ssRNA viruses 0.00 2 0 - 35278 ssRNA positive-strand viruses, no DNA stage 0.00 2 0 F 11018 Togaviridae 0.00 2 0 G 11019 Alphavirus 0.00 2 0 - 177872 VEEV complex 0.00 2 2 S 11036 Venezuelan equine encephalitis virus 0.00 1 0 - 35268 Retro-transcribing viruses 0.00 1 0 F 11632 Retroviridae 0.00 1 0 - 35276 unclassified Retroviridae 0.00 1 0 - 206037 Human endogenous retroviruses 0.00 1 0 S 45617 Human endogenous retrovirus K 0.00 1 1 - 166122 Human endogenous retrovirus K113 0.00 3 1 D 2759 Eukaryota 0.00 1 0 P 3041 Chlorophyta 0.00 1 1 C 75966 Trebouxiophyceae 0.00 1 0 - 33682 Euglenozoa 0.00 1 0 O 5653 Kinetoplastida 0.00 1 0 F 5654 Trypanosomatidae 0.00 1 0 G 5690 Trypanosoma 0.00 1 0 - 47570 Schizotrypanum 0.00 1 0 S 5693 Trypanosoma cruzi 0.00 1 1 - 353153 Trypanosoma cruzi strain CL Brener """)))
def handle_program_options(argv=None):
    """Build the kraken-biom command-line parser and parse the arguments.

    :type argv: list of str or None
    :param argv: Argument strings to parse. When None (the default),
                 argparse reads the arguments from ``sys.argv[1:]``, which is
                 the behavior callers that pass nothing have always had.
    :rtype: argparse.Namespace
    :return: The parsed options: kraken_reports, kraken_reports_fp, max, min,
             output_fp, otu_fp, fmt, gzip and verbose.
    """
    descr = """\
    Create BIOM-format tables (http://biom-format.org) from Kraken output
    (http://ccb.jhu.edu/software/kraken/).

    The program takes as input, one or more files output from the kraken-report
    tool. Each file is parsed and the counts for each OTU (operational taxonomic
    unit) are recorded, along with database ID (e.g. NCBI), and lineage. The
    extracted data are then stored in a BIOM table where each count is linked
    to the Sample and OTU it belongs to. Sample IDs are extracted from the input
    filenames (everything up to the '.').

    OTUs are defined by the --max and --min arguments. By default these are
    set to Order and Species respectively. This means that counts assigned
    directly to an Order, Family, or Genus are recorded under the associated
    OTU ID, and counts assigned at or below the Species level are assigned to
    the OTU ID for the species. Setting a minimum rank below Species is not
    yet available.

    The BIOM format currently has two major versions. Version 1.0 uses the
    JSON (JavaScript Object Notation) format as a base. Version 2.x uses the
    HDF5 (Hierarchical Data Format v5) as a base. The output format can be
    specified with the --fmt option. Note that a tab-separated (tsv) output
    format is also available. The resulting file will not contain most of the
    metadata, but can be opened by spreadsheet programs.

    Version 2 of the BIOM format is used by default for output, but requires the
    Python library 'h5py'. If the library is not installed, kraken-biom will
    automatically switch to using version 1.0. Note that the output can
    optionally be compressed with gzip (--gzip) for version 1.0 and TSV files.
    Version 2 files are automatically compressed.

    Usage examples
    --------------
    1. Basic usage with default parameters:

    $ kraken-biom.py S1.txt S2.txt

      This produces a compressed BIOM 2.1 file: table.biom

    2. BIOM v1.0 output:

    $ kraken-biom.py S1.txt S2.txt --fmt json

      Produces a BIOM 1.0 file: table.biom

    3. Compressed TSV output:

    $ kraken-biom.py S1.txt S2.txt --fmt tsv --gzip -o table.tsv

      Produces a TSV file: table.tsv.gz

    4. Change the max and min OTU levels to Class and Genus:

    $ kraken-biom.py S1.txt S2.txt --max C --min G

    Program arguments
    -----------------"""

    # RawDescriptionHelpFormatter keeps the description's own line breaks, so
    # the usage examples above are printed as laid out here.
    parser = argparse.ArgumentParser(
        description=twdd(descr),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Help texts are built from adjacent string literals rather than
    # backslash continuations inside one literal, so neither source
    # indentation nor stray backslashes can end up in the help output.
    parser.add_argument('kraken_reports', nargs='*',
                        help="Results files from the kraken-report tool.")
    parser.add_argument('-k', '--kraken_reports_fp', metavar="REPORTS_FP",
                        help="Folder containing kraken reports")
    parser.add_argument('--max', default="O", choices=ranks[:-1],
                        help="Assigned reads will be recorded only if "
                             "they are at or below max rank. Default: O.")
    parser.add_argument('--min', default="S", choices=ranks[:-1],
                        help="Reads assigned at and below min rank "
                             "will be recorded as being assigned to the "
                             "min rank level. Default: S.")
    parser.add_argument('-o', '--output_fp', default="table.biom",
                        help="Path to the BIOM-format file. By default, the "
                             "table will be in the HDF5 BIOM 2.x format. "
                             "Users can output to a different format using "
                             "the --fmt option. The output can also be "
                             "gzipped using the --gzip option. "
                             "Default path is: ./table.biom")
    parser.add_argument('--otu_fp',
                        help="Create a file containing just the (NCBI) OTU "
                             "IDs for use with a service such as phyloT "
                             "(http://phylot.biobyte.de/) to generate a "
                             "phylogenetic tree for use in downstream "
                             "analysis such as UniFrac, "
                             "iTol (itol.embl.de), or PhyloToAST "
                             "(phylotoast.org).")
    parser.add_argument('--fmt', default="hdf5",
                        choices=["hdf5", "json", "tsv"],
                        help="Set the output format of the BIOM table. "
                             "Default is HDF5.")
    parser.add_argument('--gzip', action='store_true',
                        help="Compress the output BIOM table with gzip. "
                             "HDF5 BIOM (v2.x) files are internally "
                             "compressed by default, so this option "
                             "is not needed when specifying --fmt hdf5.")
    parser.add_argument('--version', action='version',
                        version="kraken-biom version {}, {}".format(
                            __version__, __url__))
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Prints status messages during program "
                             "execution.")

    return parser.parse_args(argv)
def handle_program_options(argv=None):
    """Build the kraken-biom command-line parser and parse the arguments.

    :type argv: list of str or None
    :param argv: Argument strings to parse. When None (the default),
                 argparse reads the arguments from ``sys.argv[1:]``, which is
                 the behavior callers that pass nothing have always had.
    :rtype: argparse.Namespace
    :return: The parsed options: kraken_reports, kraken_reports_fp, max, min,
             output_fp, otu_fp, fmt, gzip and verbose.
    """
    descr = """\
    Create BIOM-format tables (http://biom-format.org) from Kraken output
    (http://ccb.jhu.edu/software/kraken/).

    The program takes as input, one or more files output from the kraken-report
    tool. Each file is parsed and the counts for each OTU (operational taxonomic
    unit) are recorded, along with database ID (e.g. NCBI), and lineage. The
    extracted data are then stored in a BIOM table where each count is linked
    to the Sample and OTU it belongs to. Sample IDs are extracted from the input
    filenames (everything up to the '.').

    OTUs are defined by the --max and --min arguments. By default these are
    set to Order and Species respectively. This means that counts assigned
    directly to an Order, Family, or Genus are recorded under the associated
    OTU ID, and counts assigned at or below the Species level are assigned to
    the OTU ID for the species. Setting a minimum rank below Species is not
    yet available.

    The BIOM format currently has two major versions. Version 1.0 uses the
    JSON (JavaScript Object Notation) format as a base. Version 2.x uses the
    HDF5 (Hierarchical Data Format v5) as a base. The output format can be
    specified with the --fmt option. Note that a tab-separated (tsv) output
    format is also available. The resulting file will not contain most of the
    metadata, but can be opened by spreadsheet programs.

    Version 2 of the BIOM format is used by default for output, but requires the
    Python library 'h5py'. If the library is not installed, kraken-biom will
    automatically switch to using version 1.0. Note that the output can
    optionally be compressed with gzip (--gzip) for version 1.0 and TSV files.
    Version 2 files are automatically compressed.

    Usage examples
    --------------
    1. Basic usage with default parameters:

    $ kraken-biom S1.txt S2.txt

      This produces a compressed BIOM 2.1 file: table.biom

    2. BIOM v1.0 output:

    $ kraken-biom S1.txt S2.txt --fmt json

      Produces a BIOM 1.0 file: table.biom

    3. Compressed TSV output:

    $ kraken-biom S1.txt S2.txt --fmt tsv --gzip -o table.tsv

      Produces a TSV file: table.tsv.gz

    4. Change the max and min OTU levels to Class and Genus:

    $ kraken-biom S1.txt S2.txt --max C --min G

    Program arguments
    -----------------"""

    # RawDescriptionHelpFormatter keeps the description's own line breaks, so
    # the usage examples above are printed as laid out here.
    parser = argparse.ArgumentParser(
        description=twdd(descr),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Help texts are built from adjacent string literals rather than
    # backslash continuations inside one literal, so neither source
    # indentation nor stray backslashes can end up in the help output.
    parser.add_argument(
        'kraken_reports', nargs='*',
        help="Results files from the kraken-report tool.")
    parser.add_argument(
        '-k', '--kraken_reports_fp', metavar="REPORTS_FP",
        help="Folder containing kraken reports")
    parser.add_argument(
        '--max', default="O", choices=ranks[:-1],
        help="Assigned reads will be recorded only if "
             "they are at or below max rank. Default: O.")
    parser.add_argument(
        '--min', default="S", choices=ranks[:-1],
        help="Reads assigned at and below min rank "
             "will be recorded as being assigned to the "
             "min rank level. Default: S.")
    parser.add_argument(
        '-o', '--output_fp', default="table.biom",
        help="Path to the BIOM-format file. By default, the "
             "table will be in the HDF5 BIOM 2.x format. Users can "
             "output to a different format using the --fmt option. "
             "The output can also be gzipped using the --gzip "
             "option. Default path is: ./table.biom")
    parser.add_argument(
        '--otu_fp',
        help="Create a file containing just the (NCBI) OTU IDs "
             "for use with a service such as phyloT "
             "(http://phylot.biobyte.de/) to generate a phylogenetic "
             "tree for use in downstream analysis such as UniFrac, "
             "iTol (itol.embl.de), or PhyloToAST (phylotoast.org).")
    parser.add_argument(
        '--fmt', default="hdf5", choices=["hdf5", "json", "tsv"],
        help="Set the output format of the BIOM table. "
             "Default is HDF5.")
    parser.add_argument(
        '--gzip', action='store_true',
        help="Compress the output BIOM table with gzip. "
             "HDF5 BIOM (v2.x) files are internally "
             "compressed by default, so this option "
             "is not needed when specifying --fmt hdf5.")
    parser.add_argument(
        '--version', action='version',
        version="kraken-biom version {}, {}".format(__version__, __url__))
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help="Prints status messages during program "
             "execution.")

    return parser.parse_args(argv)
def handle_program_options(argv=None):
    """Build the clark-biom command-line parser and parse the arguments.

    :type argv: list of str or None
    :param argv: Argument strings to parse. When None (the default),
                 argparse reads the arguments from ``sys.argv[1:]``, which is
                 the behavior callers that pass nothing have always had.
    :rtype: argparse.Namespace
    :return: The parsed options: clark_abd_tbls, output_fp, otu_fp, fmt,
             store_pct, gzip and verbose.
    """
    descr = """\
    Create BIOM-format tables (http://biom-format.org) from CLARK output
    (http://clark.cs.ucr.edu/).

    The program takes as input, one or more files output from CLARK's
    estimate_abundance tool. Each file is parsed and the counts for each OTU
    (operational taxonomic unit) are recorded, along with database ID (e.g.
    NCBI), and lineage. The extracted data are then stored in a BIOM table
    where each count is linked to the Sample and OTU it belongs to. Sample IDs
    are extracted from the input filenames (everything up to the '.'
    preceeding the extension).

    The BIOM format currently has two major versions. Version 1.0 uses the
    JSON (JavaScript Object Notation) format as a base. Version 2.x uses the
    HDF5 (Hierarchical Data Format v5) as a base. The output format can be
    specified with the --fmt option. Note that a tab-separated (tsv) output
    format is also available. The resulting file will not contain most of the
    metadata, but can be opened by spreadsheet programs.

    Version 2 of the BIOM format is used by default for output, but requires the
    Python library 'h5py'. If the library is not installed, clark-biom will
    automatically switch to using version 1.0. Note that the output can
    optionally be compressed with gzip (--gzip) for version 1.0 and TSV files.
    Version 2 files are automatically compressed.

    Currently the taxonomy for each OTU ID is stored as row metadata in the BIOM
    table using the seven-level format used by QIIME and metaphlan: k__K, p__P,
    ... s__S. If you would like another format supported, please file an issue
    or send a pull request (note the contribution guidelines).

    Usage examples
    --------------
    1. Basic usage with default parameters::

        $ clark-biom S1.csv S2.csv

      This produces a compressed BIOM 2.1 file: table.biom with sample IDs: S1, S2.

    2. Process multiple samples from multiple groups::

        $ clark-biom groupA/*.csv groupB/*.csv -o groupsAB.biom

    3. BIOM v1.0 output::

        $ clark-biom S1.csv S2.csv --fmt json

      Produces a BIOM 1.0 file: table.biom

    4. Compressed TSV output::

        $ clark-biom S1.csv S2.csv --fmt tsv --gzip -o table.tsv

      Produces a TSV file: table.tsv.gz

    Program arguments
    -----------------"""

    # RawDescriptionHelpFormatter keeps the description's own line breaks, so
    # the usage examples above are printed as laid out here.
    parser = argparse.ArgumentParser(
        description=twdd(descr),
        formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument('clark_abd_tbls', nargs='+', metavar="TABLE-FILE",
                        help="Result file from estimate_abundance.sh.")
    # Every fragment except the last ends with a space: adjacent literals are
    # joined with nothing in between, and a missing space here previously
    # rendered as "--gzipoption".
    parser.add_argument('-o', '--output_fp', default="table.biom",
                        metavar="COMBINED-OUTPUT-FILE",
                        help="Path to the BIOM-format file. By default, the "
                             "table will be in the HDF5 BIOM 2.x format. "
                             "Users can output to a different format using "
                             "the --fmt option. The output can also be "
                             "gzipped using the --gzip "
                             "option. Default path is: ./table.biom")
    parser.add_argument('--otu-fp', dest="otu_fp", metavar="OTU-FILE",
                        help="Create a file containing just (NCBI) OTU IDs "
                             "for use with a service such as phyloT "
                             "(http://phylot.biobyte.de/) to generate "
                             "phylogenetic trees for use in downstream "
                             "analysis such as UniFrac, iTol "
                             "(itol.embl.de), or PhyloToAST "
                             "(phylotoast.org).")
    parser.add_argument('--fmt', default="hdf5",
                        choices=["hdf5", "json", "tsv"],
                        help="Set the output format of the BIOM table. "
                             "Default is HDF5.")
    parser.add_argument('--store-pct', dest="store_pct", action='store_true',
                        help="Record the relative abundances "
                             "('Proportion_Classified' column) instead of "
                             "the raw count ('Count' column) data.")
    parser.add_argument('--gzip', action='store_true',
                        help="Compress the output BIOM table with gzip. "
                             "HDF5 BIOM (v2.x) files are internally "
                             "compressed by default, so this option "
                             "is not needed when specifying --fmt hdf5.")
    parser.add_argument('--version', action='version',
                        version="clark-biom version {}, {}".format(
                            __version__, __url__))
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Prints status messages during program "
                             "execution.")

    return parser.parse_args(argv)