コード例 #1
0
# (report label, LCA record attribute) for every taxonomic level, in the
# order in which the levels are reported.  Driving all summaries from this
# single table keeps the per-level handling identical for every level.
_LCA_LEVELS = (
    ("domain", "domain"),
    ("kingdom", "kingdom"),
    ("kingdom+", "kingdom_plus"),
    ("phylum", "phylum"),
    ("phylum+", "phylum_plus"),
    ("class", "_class"),
    ("class+", "_class_plus"),
    ("order", "order"),
    ("order+", "order_plus"),
    ("family", "family"),
    ("family+", "family_plus"),
    ("genus", "genus"),
    ("genus+", "genus_plus"),
    ("species", "species"),
    ("species+", "species_plus"),
)


def _count_unmapped(counter, label):
    """Add one to the ``<label>_unmapped`` attribute of *counter*.

    A "+" in *label* is spelled "_plus" in the attribute name, so
    "kingdom+" is tallied as ``kingdom_plus_unmapped``.
    """
    name = label.replace("+", "_plus") + "_unmapped"
    # same read-then-assign sequence as ``counter.<name> += 1``
    setattr(counter, name, getattr(counter, name) + 1)


def _write_taxonomy_map(options):
    """Write every distinct taxonomy path found on stdin as one tsv row."""
    # column names are joined with bare tabs so that no padding whitespace
    # ends up inside the header fields
    options.stdout.write("\t".join([
        "Domain", "kingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]) + "\n")

    seen = set()
    for lca in LCA.iterate(options.stdin):
        # if bacteria or archaea the kingdom will be the domain
        if lca.domain in ("Bacteria", "Archaea"):
            kingdom = lca.domain
        else:
            kingdom = lca.kingdom

        hierarchy = (lca.domain, kingdom, lca.phylum, lca._class, lca.order,
                     lca.family, lca.genus, lca.species)
        # a set of tuples gives constant-time duplicate detection
        if hierarchy in seen:
            continue
        seen.add(hierarchy)
        options.stdout.write("\t".join(hierarchy) + "\n")


def _write_level_counts(options):
    """Write, per level, the number of distinct taxa and of assigned reads.

    The output is a header line and a single data line.  The first half
    of the columns ("n<level>") holds the number of distinct taxa seen at
    each level, the second half ("nseq<level>") the number of reads that
    have an assignment other than "NA" at that level.
    """
    labels = [label for label, _ in _LCA_LEVELS]
    distinct = dict((label, set()) for label in labels)
    nreads = dict((label, 0) for label in labels)

    c = E.Counter()
    for lca in LCA.iterate(options.stdin):
        for label, attribute in _LCA_LEVELS:
            taxon = getattr(lca, attribute)
            if taxon != "NA":
                nreads[label] += 1
                distinct[label].add(taxon)
            else:
                _count_unmapped(c, label)

    # header and values are built from the same label list so the two
    # lines always have matching columns
    header = ["n" + label for label in labels] + \
        ["nseq" + label for label in labels]
    values = [len(distinct[label]) for label in labels] + \
        [nreads[label] for label in labels]

    options.stdout.write("\t".join(header) + "\n")
    options.stdout.write("\t".join(map(str, values)) + "\n")

    E.info(c)


def _write_taxa_counts(options):
    """Write count, proportion and reads-per-million for every taxon.

    Proportion and rpm of a taxon are relative to the number of reads
    that have an assignment other than "NA" at the taxon's level.
    """
    taxa_counts = dict((label, collections.defaultdict(int))
                       for label, _ in _LCA_LEVELS)
    unmapped = collections.defaultdict(int)
    total = 0

    c = E.Counter()
    for lca in LCA.iterate(options.stdin):
        total += 1
        for label, attribute in _LCA_LEVELS:
            taxon = getattr(lca, attribute)
            if taxon != "NA":
                taxa_counts[label][taxon] += 1
            else:
                _count_unmapped(c, label)
                unmapped[label] += 1

    options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
    for level, taxa_count in sorted(taxa_counts.items()):
        # a level only has taxa if at least one read was assigned at that
        # level, so total_level is non-zero whenever the inner loop runs
        total_level = total - unmapped[level]
        for taxa, count in sorted(taxa_count.items()):
            options.stdout.write("\t".join([
                level, taxa,
                str(count),
                "{:.8}".format(float(count) / total_level),
                "{:.8}".format(float(count) / (float(total_level) / 1000000))
            ]) + "\n")

    E.info(c)


def _write_individual(options):
    """Write one row per read listing its assignment at every level."""
    options.stdout.write(
        "\t".join(["id"] + [label for label, _ in _LCA_LEVELS]) + "\n")
    for lca in LCA.iterate(options.stdin):
        row = [lca.identifier]
        row.extend(getattr(lca, attribute) for _, attribute in _LCA_LEVELS)
        options.stdout.write("\t".join(row) + "\n")


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    With ``--output-map`` only the taxonomy map is written and
    ``--summarise`` is ignored.  Otherwise the table selected with
    ``--summarise`` is written; without either option nothing is output.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--summarise",
                      dest="summarise",
                      type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map",
                      dest="output_map",
                      action="store_true",
                      help="ouput map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.output_map:
        # only output the mapping file - do not summarise, regardless
        # of the other options given
        _write_taxonomy_map(options)
    elif options.summarise == "level-counts":
        _write_level_counts(options)
    elif options.summarise == "taxa-counts":
        _write_taxa_counts(options)
    elif options.summarise == "individual":
        _write_individual(options)

    # write footer and output benchmark information.
    # every branch falls through to here so that E.Stop() always runs
    E.Stop()
コード例 #2
0
ファイル: lca2table.py プロジェクト: CGATOxford/cgat
# (report label, LCA record attribute) for every taxonomic level, in the
# order in which the levels are reported.  Driving all summaries from this
# single table keeps the per-level handling identical for every level.
_LCA_LEVELS = (
    ("domain", "domain"),
    ("kingdom", "kingdom"),
    ("kingdom+", "kingdom_plus"),
    ("phylum", "phylum"),
    ("phylum+", "phylum_plus"),
    ("class", "_class"),
    ("class+", "_class_plus"),
    ("order", "order"),
    ("order+", "order_plus"),
    ("family", "family"),
    ("family+", "family_plus"),
    ("genus", "genus"),
    ("genus+", "genus_plus"),
    ("species", "species"),
    ("species+", "species_plus"),
)


def _count_unmapped(counter, label):
    """Add one to the ``<label>_unmapped`` attribute of *counter*.

    A "+" in *label* is spelled "_plus" in the attribute name, so
    "kingdom+" is tallied as ``kingdom_plus_unmapped``.
    """
    name = label.replace("+", "_plus") + "_unmapped"
    # same read-then-assign sequence as ``counter.<name> += 1``
    setattr(counter, name, getattr(counter, name) + 1)


def _write_taxonomy_map(options):
    """Write every distinct taxonomy path found on stdin as one tsv row."""
    # column names are joined with bare tabs so that no padding whitespace
    # ends up inside the header fields
    options.stdout.write("\t".join([
        "Domain", "kingdom", "phylum", "class", "order", "family", "genus",
        "species"
    ]) + "\n")

    seen = set()
    for lca in LCA.iterate(options.stdin):
        # if bacteria or archaea the kingdom will be the domain
        if lca.domain in ("Bacteria", "Archaea"):
            kingdom = lca.domain
        else:
            kingdom = lca.kingdom

        hierarchy = (lca.domain, kingdom, lca.phylum, lca._class, lca.order,
                     lca.family, lca.genus, lca.species)
        # a set of tuples gives constant-time duplicate detection
        if hierarchy in seen:
            continue
        seen.add(hierarchy)
        options.stdout.write("\t".join(hierarchy) + "\n")


def _write_level_counts(options):
    """Write, per level, the number of distinct taxa and of assigned reads.

    The output is a header line and a single data line.  The first half
    of the columns ("n<level>") holds the number of distinct taxa seen at
    each level, the second half ("nseq<level>") the number of reads that
    have an assignment other than "NA" at that level.
    """
    labels = [label for label, _ in _LCA_LEVELS]
    distinct = dict((label, set()) for label in labels)
    nreads = dict((label, 0) for label in labels)

    c = E.Counter()
    for lca in LCA.iterate(options.stdin):
        for label, attribute in _LCA_LEVELS:
            taxon = getattr(lca, attribute)
            if taxon != "NA":
                nreads[label] += 1
                distinct[label].add(taxon)
            else:
                _count_unmapped(c, label)

    # header and values are built from the same label list so the two
    # lines always have matching columns
    header = ["n" + label for label in labels] + \
        ["nseq" + label for label in labels]
    values = [len(distinct[label]) for label in labels] + \
        [nreads[label] for label in labels]

    options.stdout.write("\t".join(header) + "\n")
    options.stdout.write("\t".join(map(str, values)) + "\n")

    E.info(c)


def _write_taxa_counts(options):
    """Write count, proportion and reads-per-million for every taxon.

    Proportion and rpm of a taxon are relative to the number of reads
    that have an assignment other than "NA" at the taxon's level.
    """
    taxa_counts = dict((label, collections.defaultdict(int))
                       for label, _ in _LCA_LEVELS)
    unmapped = collections.defaultdict(int)
    total = 0

    c = E.Counter()
    for lca in LCA.iterate(options.stdin):
        total += 1
        for label, attribute in _LCA_LEVELS:
            taxon = getattr(lca, attribute)
            if taxon != "NA":
                taxa_counts[label][taxon] += 1
            else:
                _count_unmapped(c, label)
                unmapped[label] += 1

    options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
    for level, taxa_count in sorted(taxa_counts.items()):
        # a level only has taxa if at least one read was assigned at that
        # level, so total_level is non-zero whenever the inner loop runs
        total_level = total - unmapped[level]
        for taxa, count in sorted(taxa_count.items()):
            options.stdout.write("\t".join([
                level, taxa,
                str(count),
                "{:.8}".format(float(count) / total_level),
                "{:.8}".format(float(count) / (float(total_level) / 1000000))
            ]) + "\n")

    E.info(c)


def _write_individual(options):
    """Write one row per read listing its assignment at every level."""
    options.stdout.write(
        "\t".join(["id"] + [label for label, _ in _LCA_LEVELS]) + "\n")
    for lca in LCA.iterate(options.stdin):
        row = [lca.identifier]
        row.extend(getattr(lca, attribute) for _, attribute in _LCA_LEVELS)
        options.stdout.write("\t".join(row) + "\n")


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    With ``--output-map`` only the taxonomy map is written and
    ``--summarise`` is ignored.  Otherwise the table selected with
    ``--summarise`` is written; without either option nothing is output.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--summarise", dest="summarise", type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map", dest="output_map", action="store_true",
                      help="ouput map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.output_map:
        # only output the mapping file - do not summarise, regardless
        # of the other options given
        _write_taxonomy_map(options)
    elif options.summarise == "level-counts":
        _write_level_counts(options)
    elif options.summarise == "taxa-counts":
        _write_taxa_counts(options)
    elif options.summarise == "individual":
        _write_individual(options)

    # write footer and output benchmark information.
    # every branch falls through to here so that E.Stop() always runs
    E.Stop()
コード例 #3
0
ファイル: lca2table.py プロジェクト: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    LCA records are read from ``options.stdin`` with ``LCA.iterate`` and a
    tab-separated table is written to ``options.stdout``.  The layout is
    selected with ``--summarise``:

    level-counts
        a single row holding, for every level, the number of distinct
        taxa (``n<level>``) followed by the number of records whose
        value at that level is not "NA" (``nseq<level>``).
    taxa-counts
        one row per (level, taxon) with its count and its proportion of
        the records that have a value other than "NA" at that level.
        Levels are written in rank order.
    individual
        one row per record: its identifier followed by its taxon at
        every level.

    If ``--summarise`` is not given no table is produced.
    """

    if argv is None:
        argv = sys.argv

    ## setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s",
        "--summarise",
        dest="summarise",
        type="choice",
        choices=("level-counts", "taxa-counts", "individual"),
        help="summarise the taxa counts i.e. no. phyla, classes etc")

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ## One entry per reported level, in output order:
    ## (column label, LCA attribute, E.Counter field for "NA" records).
    ## Driving every mode from this single table keeps headers, values
    ## and counters aligned.
    levels = []
    for rank in ("kingdom", "phylum", "class", "order", "family", "genus",
                 "species", "subspecies"):
        ## the LCA record exposes the class rank as ``_class``
        attribute = "_class" if rank == "class" else rank
        levels.append((rank, attribute, rank + "_unmapped"))
        levels.append((rank + "+", attribute + "_plus",
                       rank + "_plus_unmapped"))

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        nreads = collections.defaultdict(int)

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            for label, attribute, unmapped_field in levels:
                taxon = getattr(lca, attribute)
                if taxon != "NA":
                    nreads[label] += 1
                    level_counts[label].add(taxon)
                else:
                    ## same effect as ``c.<unmapped_field> += 1``
                    setattr(c, unmapped_field, getattr(c, unmapped_field) + 1)

        options.stdout.write("\t".join(
            ["n" + label for label, _, _ in levels] +
            ["nseq" + label for label, _, _ in levels]) + "\n")

        ## one value per header column: distinct taxa first, then reads
        ntaxa = [len(level_counts[label]) for label, _, _ in levels]
        nseqs = [nreads[label] for label, _, _ in levels]
        options.stdout.write("\t".join(map(str, ntaxa + nseqs)) + "\n")

    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {
            label: collections.defaultdict(int) for label, _, _ in levels
        }

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            for label, attribute, unmapped_field in levels:
                taxon = getattr(lca, attribute)
                if taxon != "NA":
                    taxa_counts[label][taxon] += 1
                else:
                    setattr(c, unmapped_field, getattr(c, unmapped_field) + 1)
                    unmapped[label] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\n")
        for label, _, _ in levels:
            ## a level only has rows if at least one record was counted
            ## for it, so total_level is never zero in the division below
            total_level = total - unmapped[label]
            for taxa, count in taxa_counts[label].items():
                options.stdout.write("\t".join(
                    [label, taxa,
                     str(count),
                     str(float(count) / total_level)]) + "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write(
            "\t".join(["id"] + [label for label, _, _ in levels]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join(
                [lca.identifier] +
                [getattr(lca, attribute) for _, attribute, _ in levels]
            ) + "\n")

    ## write footer and output benchmark information.
    E.Stop()
コード例 #4
0
ファイル: lca2table.py プロジェクト: Charlie-George/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    LCA records are read from ``options.stdin`` with ``LCA.iterate`` and a
    tab-separated table is written to ``options.stdout``.  The layout is
    selected with ``--summarise``:

    level-counts
        a single row holding, for every level, the number of distinct
        taxa (``n<level>``) followed by the number of records whose
        value at that level is not "NA" (``nseq<level>``).
    taxa-counts
        one row per (level, taxon) with its count and its proportion of
        the records that have a value other than "NA" at that level.
        Levels are written in rank order.
    individual
        one row per record: its identifier followed by its taxon at
        every level.

    If ``--summarise`` is not given no table is produced.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--summarise", dest="summarise", type="choice",
        choices=("level-counts", "taxa-counts", "individual"),
        help="summarise the taxa counts i.e. no. phyla, classes etc")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # One entry per reported level, in output order:
    # (column label, LCA attribute, E.Counter field for "NA" records).
    # Driving every mode from this single table keeps headers, values
    # and counters aligned.
    levels = []
    for rank in ("kingdom", "phylum", "class", "order", "family", "genus",
                 "species", "subspecies"):
        # the LCA record exposes the class rank as ``_class``
        attribute = "_class" if rank == "class" else rank
        levels.append((rank, attribute, rank + "_unmapped"))
        levels.append((rank + "+", attribute + "_plus",
                       rank + "_plus_unmapped"))

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        nreads = collections.defaultdict(int)

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            for label, attribute, unmapped_field in levels:
                taxon = getattr(lca, attribute)
                if taxon != "NA":
                    nreads[label] += 1
                    level_counts[label].add(taxon)
                else:
                    # same effect as ``c.<unmapped_field> += 1``
                    setattr(c, unmapped_field, getattr(c, unmapped_field) + 1)

        options.stdout.write("\t".join(
            ["n" + label for label, _, _ in levels] +
            ["nseq" + label for label, _, _ in levels]) + "\n")

        # one value per header column: distinct taxa first, then reads
        ntaxa = [len(level_counts[label]) for label, _, _ in levels]
        nseqs = [nreads[label] for label, _, _ in levels]
        options.stdout.write("\t".join(map(str, ntaxa + nseqs)) + "\n")

    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {
            label: collections.defaultdict(int) for label, _, _ in levels
        }

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            for label, attribute, unmapped_field in levels:
                taxon = getattr(lca, attribute)
                if taxon != "NA":
                    taxa_counts[label][taxon] += 1
                else:
                    setattr(c, unmapped_field, getattr(c, unmapped_field) + 1)
                    unmapped[label] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\n")
        for label, _, _ in levels:
            # a level only has rows if at least one record was counted
            # for it, so total_level is never zero in the division below
            total_level = total - unmapped[label]
            for taxa, count in taxa_counts[label].items():
                options.stdout.write(
                    "\t".join([label, taxa, str(count),
                               str(float(count) / total_level)]) + "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write(
            "\t".join(["id"] + [label for label, _, _ in levels]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join(
                [lca.identifier] +
                [getattr(lca, attribute) for _, attribute, _ in levels]
            ) + "\n")

    # write footer and output benchmark information.
    E.Stop()