Exemple #1
0
def tax_connect(feature, taxmap):
    """Return `feature` re-joined with a stratum looked up in `taxmap`.

    The feature string is split with ``util.fsplit`` into
    ``(feature, name, stratum)``. The lookup key is the feature id when the
    row has no stratum or its stratum equals ``c_unclassified``; otherwise
    the existing stratum itself is used as the key. A key missing from
    `taxmap` yields ``c_unclassified``.

    Args:
        feature: Row header string accepted by ``util.fsplit``.
        taxmap: Mapping supporting ``.get(key, default)``.

    Returns:
        The result of ``util.fjoin(feature, name, new_stratum)``.
    """
    feature, name, stratum = util.fsplit(feature)
    # Choose what to look up: the feature id for unstratified/unclassified
    # rows, the current stratum for rows that already carry one.
    if stratum is None or stratum == c_unclassified:
        key = feature
    else:
        key = stratum
    stratum2 = taxmap.get(key, c_unclassified)
    return util.fjoin(feature, name, stratum2)
Exemple #2
0
def regroup(table, map_feature_groups, function, precision, ungrouped=False):
    """Collapse the rows of `table` into groups, in place, and report stats.

    Each row header is split with ``util.fsplit``; rows whose feature appears
    in `map_feature_groups` are assigned to every listed group. Rows with a
    stratum are assigned to the stratified variant of the group name (built
    with ``util.fjoin(group, stratum=stratum)``). The values of all rows
    mapped to one group name are combined column by column with the function
    selected from ``c_funcmap`` and rounded to `precision` digits.

    Args:
        table: Object with ``rowheads``, ``colheads`` and ``data`` attributes;
            ``rowheads`` and ``data`` are overwritten with the grouped result.
        map_feature_groups: Mapping from feature id to an iterable of group
            names.
        function: Key into ``c_funcmap`` selecting the collapsing function.
        precision: Number of digits passed to ``round``.
        ungrouped: When true, features absent from `map_feature_groups` are
            collected under ``util.c_ungrouped`` instead of being dropped.

    Returns:
        None. A one-line summary is printed to ``sys.stderr``.
    """
    function = c_funcmap[function]
    seen_before = set()
    feature_counts = {}
    # index of new group names to old table rows
    mapping = {}

    for i, rowhead in enumerate(table.rowheads):
        feature, name, stratum = util.fsplit(rowhead)
        feature_counts.setdefault(feature, 0)
        # decide which groups to use
        if feature in map_feature_groups:
            groups = map_feature_groups[feature]
        elif ungrouped:
            groups = [util.c_ungrouped]
        else:
            groups = []
        # track grouping
        for group in groups:
            # Count group memberships only on the first row seen for a
            # feature, so its other rows do not inflate the count.
            if feature not in seen_before and group != util.c_ungrouped:
                feature_counts[feature] += 1
            # account for stratified feature
            groupname = group
            if stratum is not None:
                groupname = util.fjoin(groupname, stratum=stratum)
            mapping.setdefault(groupname, []).append(i)
        # we have processed an instance of this feature
        seen_before.add(feature)

    # rebuild table
    groupnames = util.fsort(mapping.keys())
    n_cols = len(table.colheads)
    groupdata = []
    for groupname in groupnames:
        oldrow_index = mapping[groupname]
        # one block of values per column, gathered across the grouped rows
        newrow = [
            [float(table.data[i][j]) for i in oldrow_index]
            for j in range(n_cols)
        ]
        # collapse groups
        groupdata.append(
            [round(function(block), precision) for block in newrow])
    table.rowheads = groupnames
    table.data = groupdata

    # report
    n = len(feature_counts)
    counts = list(feature_counts.values())
    # kept separate from the `ungrouped` flag parameter to avoid shadowing it
    n_ungrouped = counts.count(0)
    grouped_total = n - n_ungrouped
    grouped_multi = grouped_total - counts.count(1)
    # An empty table has n == 0; report 0.0% rather than dividing by zero.
    pct_total = 100 * grouped_total / float(n) if n else 0.0
    pct_multi = 100 * grouped_multi / float(n) if n else 0.0
    print(
        "Original Feature Count: %d; Grouped 1+ times: %d (%.1f%%); Grouped 2+ times: %d (%.1f%%)"
        % (n, grouped_total, pct_total, grouped_multi, pct_multi),
        file=sys.stderr,
    )
Exemple #3
0
def main():
    """Re-stratify an input table using an inferred taxonomic map.

    Steps, as performed below:
      1. Load the table named by ``args.input``.
      2. Build a taxmap with ``build_taxmap`` from the row headers, the
         requested level and a data file (``args.dev`` if given, otherwise
         ``databases[args.resolution]``).
      3. Drop taxmap entries whose target accounts for less than
         ``args.threshold`` of all entries.
      4. Index old rows under new row headers according to ``args.mode``.
      5. Sum the old rows behind each new header and write the table to
         ``args.output``.
      6. Print a summary of how many stratified rows received a stratum
         other than ``c_unclassified``.

    Progress and the summary are written to ``sys.stderr``.
    """
    args = get_args()
    tbl = util.Table(args.input)
    # build the taxmap
    print("Building taxonomic map for input table", file=sys.stderr)
    p_datafile = args.dev if args.dev is not None else databases[
        args.resolution]
    taxmap = build_taxmap(tbl.rowheads, args.level, p_datafile)
    # refine the taxmap: keep only targets whose share of all entries
    # reaches the threshold
    counts = {}
    for new in taxmap.values():
        counts[new] = counts.get(new, 0) + 1
    total = float(sum(counts.values()))
    # `counts` is empty exactly when `taxmap` is empty, so no division
    # happens when total == 0
    count = {k: v / total for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items() if count[new] >= args.threshold
    }
    # reindex the table
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(tbl.rowheads)
    index = {}
    for i, rowhead in enumerate(tbl.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        elif stratum == c_unclassified and args.mode == c_umode:
            # in unclassified mode, make a new row for the total...
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        elif stratum is not None and args.mode == c_smode:
            # NOTE(review): c_smode is presumably the "stratified" mode;
            # existing strata are remapped through the taxmap -- confirm.
            index.setdefault(new_rowhead, []).append(i)
        # rows matching none of the cases above are dropped from the output
    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        # sum all old rows that were indexed under this new row header
        newrow = [0 for _ in tbl.colheads]
        for i in index[rowhead]:
            oldrow = map(float, tbl.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    tbl.rowheads = rowheads2
    tbl.data = data2
    # output
    print("Writing new table", file=sys.stderr)
    tbl.write(args.output, unfloat=True)
    # report on performance
    success, total = 0, 0
    for rowhead in tbl.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    # With no stratified rows in the output, total is 0; report 0.0% instead
    # of raising ZeroDivisionError after the table was already written.
    percent = round(100 * success / float(total), 1) if total else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )