def filter_by_sample_pct(otus, nsamples, pct, phyl_level):
    """
    Partition OTUs by how widely their phylogenetic group is spread across
    samples.

    OTUs are grouped by their taxonomy as returned by
    ``util.split_phylogeny`` for ``phyl_level``. For each group, the distinct
    sample names found in the sequence IDs of all of its OTUs are pooled (the
    sample name is the part of a sequence ID before the first underscore) and
    their count is divided by ``nsamples``. Every OTU belonging to a group
    whose ratio is at least ``pct`` goes to the first dict, all others to the
    second.

    :type otus: dict
    :param otus: {otuid: [taxonomy, [sequence IDs]]}
    :type nsamples: int
    :param nsamples: The total number of samples in the data set
    :type pct: float
    :param pct: The cutoff for inclusion in the filtered set of OTUs. It is
                compared directly against (samples containing the group /
                nsamples), so it must be expressed on the same scale as that
                ratio.
    :type phyl_level: str
    :param phyl_level: The phylogenetic level (e.g. family, group, etc...) at
                       which to combine OTU counts for thresholding. One of
                       the following: ['k','p','c','o','f','g','s']. Any
                       other value falls back to 's'.

    :rtype: tuple
    :return: ``(above, below)``. ``above`` maps each retained OTU ID to its
             unchanged entry from ``otus``. ``below`` maps each rejected OTU
             ID to ``[sample ratio, '', taxonomy, [sequence IDs]]``.
    """
    if phyl_level not in ('k', 'p', 'c', 'o', 'f', 'g', 's'):
        # unrecognised level: fall back to the most specific one
        phyl_level = 's'
    nsamples = float(nsamples)

    # Resolve every OTU's phylogenetic group once; both the pooling pass and
    # the partitioning pass below reuse this instead of re-splitting the
    # taxonomy string a second time.
    otu_phyl = {
        otuid: util.split_phylogeny(otus[otuid][0], phyl_level)
        for otuid in otus
    }

    # pool the distinct samples in which each phylogenetic group occurs
    samples_by_phyl = defaultdict(set)
    for otuid in otus:
        samples_by_phyl[otu_phyl[otuid]].update(
            seqid.split('_')[0] for seqid in otus[otuid][1])

    # ratio of samples containing each group
    sample_ratio = {
        phyl: len(samples) / nsamples
        for phyl, samples in samples_by_phyl.items()
    }

    # separate OTUs
    above = {}
    below = {}
    for otuid in otus:
        ratio = sample_ratio[otu_phyl[otuid]]
        if ratio >= pct:
            above[otuid] = otus[otuid]
        else:
            # second slot is deliberately left as an empty string
            below[otuid] = [ratio, '', otus[otuid][0], otus[otuid][1]]

    return above, below
def filter_by_sample_pct(otus, nsamples, pct, phyl_level):
    """
    Partition OTUs by how widely their phylogenetic group is spread across
    samples.

    OTUs are grouped by their taxonomy as returned by
    ``util.split_phylogeny`` for ``phyl_level``. For each group, the distinct
    sample names found in the sequence IDs of all of its OTUs are pooled (the
    sample name is the part of a sequence ID before the first underscore) and
    their count is divided by ``nsamples``. Every OTU belonging to a group
    whose ratio is at least ``pct`` goes to the first dict, all others to the
    second.

    :type otus: dict
    :param otus: {otuid: [taxonomy, [sequence IDs]]}
    :type nsamples: int
    :param nsamples: The total number of samples in the data set
    :type pct: float
    :param pct: The cutoff for inclusion in the filtered set of OTUs. It is
                compared directly against (samples containing the group /
                nsamples), so it must be expressed on the same scale as that
                ratio.
    :type phyl_level: str
    :param phyl_level: The phylogenetic level (e.g. family, group, etc...) at
                       which to combine OTU counts for thresholding. One of
                       the following: ['k','p','c','o','f','g','s']. Any
                       other value falls back to 's'.

    :rtype: tuple
    :return: ``(above, below)``. ``above`` maps each retained OTU ID to its
             unchanged entry from ``otus``. ``below`` maps each rejected OTU
             ID to ``[sample ratio, "", taxonomy, [sequence IDs]]``.
    """
    if phyl_level not in ("k", "p", "c", "o", "f", "g", "s"):
        # unrecognised level: fall back to the most specific one
        phyl_level = "s"
    nsamples = float(nsamples)

    # Resolve every OTU's phylogenetic group once; both the pooling pass and
    # the partitioning pass below reuse this instead of re-splitting the
    # taxonomy string a second time.
    otu_phyl = {otuid: util.split_phylogeny(otus[otuid][0], phyl_level) for otuid in otus}

    # pool the distinct samples in which each phylogenetic group occurs
    samples_by_phyl = defaultdict(set)
    for otuid in otus:
        samples_by_phyl[otu_phyl[otuid]].update(seqid.split("_")[0] for seqid in otus[otuid][1])

    # ratio of samples containing each group
    sample_ratio = {phyl: len(samples) / nsamples for phyl, samples in samples_by_phyl.items()}

    # separate OTUs
    above = {}
    below = {}
    for otuid in otus:
        ratio = sample_ratio[otu_phyl[otuid]]
        if ratio >= pct:
            above[otuid] = otus[otuid]
        else:
            # second slot is deliberately left as an empty string
            below[otuid] = [ratio, "", otus[otuid][0], otus[otuid][1]]

    return above, below
    def test_split_phylogeny(self):
        """
        Check ut.split_phylogeny() at every supported level code.

        A single seven-rank lineage is passed to ut.split_phylogeny() once per
        level code ('k' through 's'); the result must equal the lineage cut
        off after that rank, with the ranks still joined by "; ".

        :return: None. A mismatch at any level raises through assertEqual
                 with a message naming the failing level.
        """
        p1 = "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Veillonella; s__denticariosi"

        # (level code, expected result) pairs, checked in rank order. A table
        # keeps one assertion per level without a per-level `if` branch.
        expected_by_level = (
            ("k", "k__Bacteria"),
            ("p", "k__Bacteria; p__Firmicutes"),
            ("c", "k__Bacteria; p__Firmicutes; c__Clostridia"),
            ("o", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales"),
            ("f", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae"),
            ("g", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Veillonella"),
            ("s", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Veillonella; s__denticariosi"),
        )

        for lvl, expected in expected_by_level:
            self.assertEqual(
                ut.split_phylogeny(p1, lvl), expected,
                msg="Error. Identification failed at level '{0}'.".format(lvl)
                )
    def test_split_phylogeny(self):
        """
        Check ut.split_phylogeny() at every supported level code.

        A single seven-rank lineage is passed to ut.split_phylogeny() once per
        level code ('k' through 's'); the result must equal the lineage cut
        off after that rank, with the ranks still joined by "; ".

        :return: None. A mismatch at any level raises through assertEqual
                 with a message naming the failing level.
        """
        p1 = "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Veillonella; s__denticariosi"

        # (level code, expected result) pairs, checked in rank order. A table
        # keeps one assertion per level without a per-level `if` branch.
        expected_by_level = (
            ("k", "k__Bacteria"),
            ("p", "k__Bacteria; p__Firmicutes"),
            ("c", "k__Bacteria; p__Firmicutes; c__Clostridia"),
            ("o", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales"),
            ("f", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae"),
            ("g", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Veillonella"),
            ("s", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Veillonella; s__denticariosi"),
        )

        for lvl, expected in expected_by_level:
            self.assertEqual(
                ut.split_phylogeny(p1, lvl),
                expected,
                msg="Error. Identification failed at level '{0}'.".format(lvl))
def main():
    """
    Command-line entry point: prune OTUs and write kept/removed OTU files.

    Steps, in order:

    1. Read options via handle_program_options() and exit with an error
       message if either input file cannot be opened.
    2. Load OTUs with gather_otus_samples() and attach a taxonomy to each
       one with assign_taxonomy().
    3. Split the OTUs with filter_by_sample_pct(), then run
       filter_by_sequence_pct() separately on each half.
    4. Write the kept OTUs to ``args.output_pruned_otus_fn`` and the removed
       OTUs (with the recorded percentages) to
       ``args.output_removed_otus_fn``.
    5. If ``args.verbose`` is set, print a summary of what was kept and
       removed.

    :return: None. Exits through sys.exit() when an input file is unreadable.
    """
    args = handle_program_options()

    def require_readable(path, message):
        # Open and immediately close the file so an unreadable input is
        # reported up front; `message` receives the IOError via str.format.
        try:
            with open(path):
                pass
        except IOError as ioe:
            sys.exit(message.format(ioe))

    require_readable(
        args.seqs_otus_fn,
        '\nError with output file from pick OTUs step:{}\n')
    require_readable(
        args.id_to_taxonomy_fn,
        '\nError with file mapping seqences to asssigned taxonomy:{}\n')

    seqs_otus, nsamples, nseqs = gather_otus_samples(args.seqs_otus_fn)
    otu_taxa = assign_taxonomy(seqs_otus.keys(), args.id_to_taxonomy_fn)

    # {otuid: (taxonomy, [sequence IDs])}; .items() works on Python 2 and 3
    otus = {
        otuid: (otu_taxa[otuid], seqids)
        for otuid, seqids in seqs_otus.items()
    }

    above, below = filter_by_sample_pct(otus, nsamples,
                                        args.percent_of_samples,
                                        args.phylogenetic_level)

    # OTUs that passed the sample filter must also pass the sequence filter.
    above, below2 = filter_by_sequence_pct(above, nseqs,
                                           args.percent_of_sequences,
                                           args.phylogenetic_level)

    # OTUs that failed the sample filter are re-tested on the sequence
    # filter; [2:] strips the two percentage slots, leaving
    # [taxonomy, [sequence IDs]].
    # NOTE(review): because above2 is merged back into `above`, the final
    # kept/removed split is exactly the sequence filter's split; the sample
    # filter only decides which record a removed OTU carries. Confirm that
    # this is the intended rule.
    above2, below3 = filter_by_sequence_pct(
        {boid: below[boid][2:] for boid in below},
        nseqs, args.percent_of_sequences, args.phylogenetic_level)
    # keep the sample-filter record for OTUs that failed both filters
    below = {boid: below[boid] for boid in below3}
    below.update(below2)
    above.update(above2)

    with open(args.output_pruned_otus_fn, 'w') as outF:
        for otuid, item in above.items():
            outF.write('{0}\t{1}\n'.format(otuid, '\t'.join(item[1])))

    with open(args.output_removed_otus_fn, 'w') as outF:
        outF.write('OTU ID\tSample%\tSeq %\tSequence IDs\n')
        for oid, item in below.items():
            # Slot 0 feeds the "Sample%" column (filter_by_sample_pct stores
            # its ratio there); slot 1 feeds the "Seq %" column, presumably
            # filled by filter_by_sequence_pct -- verify against that
            # function. An empty slot is written as five spaces.
            sample_col = (
                '{0:.4f}'.format(item[0]) if item[0] != '' else '     ')
            seq_col = (
                '{0:.2G}'.format(item[1]) if item[1] != '' else '     ')
            outF.write('{0}\t{1}\t{2}\t{3}\n'.format(
                oid, sample_col, seq_col, '\t'.join(item[3])))

    if args.verbose:
        def percent(part, whole):
            # Empty input would otherwise raise ZeroDivisionError here.
            return part / float(whole) * 100 if whole else 0.0

        level = args.phylogenetic_level

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Input: \t{} total samples'.format(nsamples))
        print('\t{} total sequences\n'.format(nseqs))
        print('From a total of {} input otus'.format(len(otus)))
        print('{} otus remain '.format(len(above)))
        print('{} otus removed'.format(len(below)))

        phyl_map = {
            'k': 'kingdoms',
            'p': 'phyla',
            'c': 'classes',
            'o': 'orders',
            'f': 'families',
            'g': 'genera',
            's': 'species'
        }
        phyls = {util.split_phylogeny(otus[oid][0], level) for oid in otus}
        print('\nFrom the {} total {}'.format(len(phyls), phyl_map[level]))
        phyl_above = {
            util.split_phylogeny(otus[aoid][0], level) for aoid in above
        }
        phyl_below = {
            util.split_phylogeny(otus[boid][0], level) for boid in below
        }
        # sequence IDs sit at index 1 in `above` entries, index 3 in `below`
        above_abundance = sum(len(item[1]) for item in above.values())
        below_abundance = sum(len(below[boid][3]) for boid in below)
        report = ('{0} {1} ({2:.4G}%) were {3}, and account for {4:.4G}% of' +
                  ' all sequence data ({5} sequences)')
        print(report.format(len(phyl_above), phyl_map[level],
                            percent(len(phyl_above), len(phyls)), 'kept',
                            percent(above_abundance, nseqs),
                            above_abundance))
        print(report.format(len(phyl_below), phyl_map[level],
                            percent(len(phyl_below), len(phyls)),
                            'removed', percent(below_abundance, nseqs),
                            below_abundance))
def main():
    """
    Command-line entry point: prune OTUs and write kept/removed OTU files.

    Steps, in order:

    1. Read options via handle_program_options() and exit with an error
       message if either input file cannot be opened.
    2. Load OTUs with gather_otus_samples() and attach a taxonomy to each
       one with assign_taxonomy().
    3. Split the OTUs with filter_by_sample_pct(), then run
       filter_by_sequence_pct() separately on each half.
    4. Write the kept OTUs to ``args.output_pruned_otus_fn`` and the removed
       OTUs (with the recorded percentages) to
       ``args.output_removed_otus_fn``.
    5. If ``args.verbose`` is set, print a summary of what was kept and
       removed.

    :return: None. Exits through sys.exit() when an input file is unreadable.
    """
    args = handle_program_options()

    def require_readable(path, message):
        # Open and immediately close the file so an unreadable input is
        # reported up front; `message` receives the IOError via str.format.
        try:
            with open(path):
                pass
        except IOError as ioe:
            sys.exit(message.format(ioe))

    require_readable(args.seqs_otus_fn, "\nError with output file from pick OTUs step:{}\n")
    require_readable(args.id_to_taxonomy_fn, "\nError with file mapping seqences to asssigned taxonomy:{}\n")

    seqs_otus, nsamples, nseqs = gather_otus_samples(args.seqs_otus_fn)
    otu_taxa = assign_taxonomy(seqs_otus.keys(), args.id_to_taxonomy_fn)

    # {otuid: (taxonomy, [sequence IDs])}; .items() works on Python 2 and 3
    otus = {otuid: (otu_taxa[otuid], seqids) for otuid, seqids in seqs_otus.items()}

    above, below = filter_by_sample_pct(otus, nsamples, args.percent_of_samples, args.phylogenetic_level)

    # OTUs that passed the sample filter must also pass the sequence filter.
    above, below2 = filter_by_sequence_pct(above, nseqs, args.percent_of_sequences, args.phylogenetic_level)

    # OTUs that failed the sample filter are re-tested on the sequence
    # filter; [2:] strips the two percentage slots, leaving
    # [taxonomy, [sequence IDs]].
    # NOTE(review): because above2 is merged back into `above`, the final
    # kept/removed split is exactly the sequence filter's split; the sample
    # filter only decides which record a removed OTU carries. Confirm that
    # this is the intended rule.
    above2, below3 = filter_by_sequence_pct(
        {boid: below[boid][2:] for boid in below}, nseqs, args.percent_of_sequences, args.phylogenetic_level
    )
    # keep the sample-filter record for OTUs that failed both filters
    below = {boid: below[boid] for boid in below3}
    below.update(below2)
    above.update(above2)

    with open(args.output_pruned_otus_fn, "w") as outF:
        for otuid, item in above.items():
            outF.write("{0}\t{1}\n".format(otuid, "\t".join(item[1])))

    with open(args.output_removed_otus_fn, "w") as outF:
        outF.write("OTU ID\tSample%\tSeq %\tSequence IDs\n")
        for oid, item in below.items():
            # Slot 0 feeds the "Sample%" column (filter_by_sample_pct stores
            # its ratio there); slot 1 feeds the "Seq %" column, presumably
            # filled by filter_by_sequence_pct -- verify against that
            # function. An empty slot is written as five spaces.
            sample_col = "{0:.4f}".format(item[0]) if item[0] != "" else "     "
            seq_col = "{0:.2G}".format(item[1]) if item[1] != "" else "     "
            outF.write("{0}\t{1}\t{2}\t{3}\n".format(oid, sample_col, seq_col, "\t".join(item[3])))

    if args.verbose:

        def percent(part, whole):
            # Empty input would otherwise raise ZeroDivisionError here.
            return part / float(whole) * 100 if whole else 0.0

        level = args.phylogenetic_level

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Input: \t{} total samples".format(nsamples))
        print("\t{} total sequences\n".format(nseqs))
        print("From a total of {} input otus".format(len(otus)))
        print("{} otus remain ".format(len(above)))
        print("{} otus removed".format(len(below)))

        phyl_map = {
            "k": "kingdoms",
            "p": "phyla",
            "c": "classes",
            "o": "orders",
            "f": "families",
            "g": "genera",
            "s": "species",
        }
        phyls = {util.split_phylogeny(otus[oid][0], level) for oid in otus}
        print("\nFrom the {} total {}".format(len(phyls), phyl_map[level]))
        phyl_above = {util.split_phylogeny(otus[aoid][0], level) for aoid in above}
        phyl_below = {util.split_phylogeny(otus[boid][0], level) for boid in below}
        # sequence IDs sit at index 1 in `above` entries, index 3 in `below`
        above_abundance = sum(len(item[1]) for item in above.values())
        below_abundance = sum(len(below[boid][3]) for boid in below)
        report = "{0} {1} ({2:.4G}%) were {3}, and account for {4:.4G}% of" + " all sequence data ({5} sequences)"
        print(
            report.format(
                len(phyl_above),
                phyl_map[level],
                percent(len(phyl_above), len(phyls)),
                "kept",
                percent(above_abundance, nseqs),
                above_abundance,
            )
        )
        print(
            report.format(
                len(phyl_below),
                phyl_map[level],
                percent(len(phyl_below), len(phyls)),
                "removed",
                percent(below_abundance, nseqs),
                below_abundance,
            )
        )