def tax_connect(feature, taxmap):
    """Return *feature* re-joined with a stratum looked up in *taxmap*.

    The rowhead is split with ``util.fsplit`` into (feature, name, stratum).
    If the stratum is missing or equal to ``c_unclassified``, the feature id
    is used as the lookup key; otherwise the existing stratum is used.
    Keys absent from *taxmap* yield ``c_unclassified``.
    """
    feature_id, label, stratum = util.fsplit(feature)
    already_classified = stratum is not None and stratum != c_unclassified
    lookup_key = stratum if already_classified else feature_id
    new_stratum = taxmap.get(lookup_key, c_unclassified)
    return util.fjoin(feature_id, label, new_stratum)
def regroup(table, map_feature_groups, function, precision, ungrouped=False):
    """Collapse the rows of *table* into groups, in place, and report coverage.

    Args:
        table: object exposing ``rowheads``, ``colheads`` and ``data``
            (``data[i][j]`` must be convertible with ``float``). Its
            ``rowheads`` and ``data`` attributes are overwritten.
        map_feature_groups: mapping from a feature id to an iterable of the
            group names that feature belongs to.
        function: key into ``c_funcmap`` selecting the callable used to
            collapse the list of values gathered for each group and column.
        precision: number of digits passed to ``round`` for collapsed values.
        ungrouped: when true, features missing from *map_feature_groups* are
            assigned to ``util.c_ungrouped`` instead of being dropped.

    A one-line summary of how many features were grouped is written to
    ``sys.stderr``. Percentages are reported as 0.0 when the table has no
    features, rather than dividing by zero.
    """
    function = c_funcmap[function]
    seen_before = set()
    feature_counts = {}
    # index of new group names to old table rows
    mapping = {}
    for i, rowhead in enumerate(table.rowheads):
        feature, _name, stratum = util.fsplit(rowhead)
        feature_counts.setdefault(feature, 0)
        # decide which groups to use
        if feature in map_feature_groups:
            groups = map_feature_groups[feature]
        elif ungrouped:
            groups = [util.c_ungrouped]
        else:
            groups = []
        # only the first row seen for a feature contributes to its group
        # count, so stratified repeats of the feature are not double counted
        first_instance = feature not in seen_before
        for group in groups:
            if first_instance and group != util.c_ungrouped:
                feature_counts[feature] += 1
            # account for stratified feature
            groupname = group
            if stratum is not None:
                groupname = util.fjoin(groupname, stratum=stratum)
            mapping.setdefault(groupname, []).append(i)
        # we have processed an instance of this feature
        seen_before.add(feature)

    # rebuild table
    groupnames = util.fsort(mapping.keys())
    n_cols = len(table.colheads)
    groupdata = []
    for groupname in groupnames:
        row_indices = mapping[groupname]
        # one list of member values per column
        blocks = [
            [float(table.data[i][j]) for i in row_indices]
            for j in range(n_cols)
        ]
        # collapse groups
        groupdata.append([round(function(block), precision) for block in blocks])
    table.rowheads = groupnames
    table.data = groupdata

    # report
    counts = list(feature_counts.values())
    n = len(counts)
    grouped_total = n - counts.count(0)
    grouped_multi = grouped_total - counts.count(1)

    def percent(part):
        # an empty table has nothing to report; avoid ZeroDivisionError
        return 100 * part / float(n) if n else 0.0

    print(
        "Original Feature Count: %d; Grouped 1+ times: %d (%.1f%%); Grouped 2+ times: %d (%.1f%%)"
        % (
            n,
            grouped_total,
            percent(grouped_total),
            grouped_multi,
            percent(grouped_multi),
        ),
        file=sys.stderr,
    )
def main():
    """Infer taxonomy for the rows of the input table and write the result.

    Steps: load the table named by ``args.input``; build a taxonomic map with
    ``build_taxmap``; drop map entries whose target accounts for less than
    ``args.threshold`` of all entries; re-index rows according to
    ``args.mode``; sum the original rows that share a new rowhead; write the
    table to ``args.output``; and print a summary to ``sys.stderr``.

    The summary percentage is reported as 0.0 when the rebuilt table contains
    no stratified rows, rather than dividing by zero.
    """
    args = get_args()
    tbl = util.Table(args.input)

    # build the taxmap
    print("Building taxonomic map for input table", file=sys.stderr)
    p_datafile = args.dev if args.dev is not None else databases[args.resolution]
    taxmap = build_taxmap(tbl.rowheads, args.level, p_datafile)

    # refine the taxmap: keep only targets that make up at least
    # args.threshold of all map entries
    counts = {}
    for new in taxmap.values():
        counts[new] = counts.get(new, 0) + 1
    n_entries = float(sum(counts.values()))
    fraction = {k: v / n_entries for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items()
        if fraction[new] >= args.threshold
    }

    # reindex the table
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(tbl.rowheads)
    # new rowhead -> indices of the original rows that feed it
    index = {}
    for i, rowhead in enumerate(tbl.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        elif stratum == c_unclassified and args.mode == c_umode:
            # in unclassified mode, make a new row for the total...
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)

    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        # column-wise sum of every original row mapped to this rowhead
        newrow = [0 for _ in tbl.colheads]
        for i in index[rowhead]:
            newrow = [a + float(b) for a, b in zip(newrow, tbl.data[i])]
        data2.append(newrow)
    tbl.rowheads = rowheads2
    tbl.data = data2

    # output
    print("Writing new table", file=sys.stderr)
    tbl.write(args.output, unfloat=True)

    # report on performance
    success, total = 0, 0
    for rowhead in tbl.rowheads:
        _feature, _name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    # a table with no stratified rows would otherwise divide by zero here,
    # after the output has already been written
    percent = round(100 * success / float(total), 1) if total else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )