def regroup( table, map_feature_groups, function, precision, ungrouped=False ):
    """Collapse the rows of a table into feature groups, in place.

    Args:
        table: table object with ``rowheads``, ``colheads``, and ``data``
            attributes (row-major values).
        map_feature_groups: dict mapping a feature id to the list of group
            names it belongs to.
        function: key into the module-level ``c_funcmap`` selecting the
            collapsing function (e.g. sum/mean) applied per column.
        precision: number of decimals to round collapsed values to, or
            ``None`` to skip rounding.
        ungrouped: when True, features absent from ``map_feature_groups``
            are assigned to the special ``util.c_ungrouped`` group instead
            of being dropped.

    Side effects:
        Rewrites ``table.rowheads`` and ``table.data``; prints a grouping
        summary to stderr.
    """
    function = c_funcmap[function]
    seen_before = {}
    feature_counts = {}
    # index of new group names to old table rows
    mapping = {}
    for i, rowhead in enumerate( table.rowheads ):
        # NOTE(review): assumes util.fsplit yields (feature, name, stratum) — per its other callers in this file
        feature, name, stratum = util.fsplit( rowhead )
        if feature not in feature_counts:
            feature_counts[feature] = 0
        # decide which groups to use
        if feature in map_feature_groups:
            groups = map_feature_groups[feature]
        elif ungrouped:
            groups = [util.c_ungrouped]
        else:
            groups = []
        # track grouping
        for group in groups:
            # only count real groups, and only once per feature
            if feature not in seen_before and group != util.c_ungrouped:
                feature_counts[feature] += 1
            # account for stratified feature
            groupname = group
            if stratum is not None:
                groupname = util.fjoin( groupname, stratum=stratum )
            mapping.setdefault( groupname, [] ).append( i )
        # we have processed an instance of this feature
        seen_before[feature] = 1
    # rebuild table
    groupnames = util.fsort( mapping.keys( ) )
    groupdata = []
    for groupname in groupnames:
        oldrow_index = mapping[groupname]
        # one value-list per column, filled from all member rows
        newrow = [[] for j in range( len( table.colheads ) )]
        for i in oldrow_index:
            for j in range( len( table.colheads ) ):
                newrow[j].append( float( table.data[i][j] ) )
        # collapse groups
        newrow = [function( block ) for block in newrow]
        if precision is not None:
            newrow = [round( k, precision ) for k in newrow]
        groupdata.append( newrow )
    table.rowheads = groupnames
    table.data = groupdata
    # report
    n = len( feature_counts )
    # FIX: materialize values once; do not shadow the `ungrouped` parameter
    values = list( feature_counts.values( ) )
    n_ungrouped = values.count( 0 )
    grouped_total = n - n_ungrouped
    grouped_multi = grouped_total - values.count( 1 )
    # FIX: guard against ZeroDivisionError when the table had no features
    denom = float( n ) if n else 1.0
    print( "Original Feature Count: %d; Grouped 1+ times: %d (%.1f%%); Grouped 2+ times: %d (%.1f%%)" % \
           ( n,
             grouped_total,
             100 * grouped_total / denom,
             grouped_multi,
             100 * grouped_multi / denom,
             ), file=sys.stderr )
def main():
    """Infer taxonomy for the rows of a gene-family table and rewrite it.

    Pipeline: load the input table, build a uniref->taxon map from the
    selected database, drop rare taxa below ``args.threshold``, reindex
    rows according to the selected mode (totals / unclassified /
    stratified), rebuild and write the table, then report how many
    stratifications were successfully mapped.
    """
    args = get_args()
    T = util.Table(args.input)
    # build the taxmap (uniref -> taxon mapping)
    print("Building taxonomic map for input table", file=sys.stderr)
    if args.devdb is not None:
        p_datafile = args.devdb
    elif args.database in databases:
        p_datafile = databases[args.database]
    else:
        sys.exit(
            "Must specify a valid database (from utility mapping or --devdb)")
    taxmap = build_taxmap(T.rowheads, args.level, args.lca_choice, p_datafile)
    # refine the taxmap (remove rare taxa): convert raw counts to
    # relative frequencies, then keep only mappings above the threshold
    counts = Counter(taxmap.values())
    total = float(sum(counts.values()))
    counts = {k: v / total for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items() if counts[new] >= args.threshold
    }
    # reindex the table (which rows to keep, which rows to pseudo-stratify)
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(T.rowheads)
    index = {}
    for i, rowhead in enumerate(T.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        # in unclassified mode, make a new row for the total...
        elif stratum == c_unclassified and args.mode == c_umode:
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        # update strata in stratified mode
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)
    # rebuild the table: each new row is the elementwise sum of its old rows
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        newrow = [0 for k in T.colheads]
        for i in index[rowhead]:
            oldrow = map(float, T.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    T.rowheads = rowheads2
    T.data = data2
    print("Writing new table", file=sys.stderr)
    T.write(args.output, unfloat=True)
    # report on performance
    success, total = 0, 0
    for rowhead in T.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    # FIX: avoid ZeroDivisionError when the table has no stratified rows
    percent = round(100 * success / float(total), 1) if total else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )
def main():
    """Infer taxonomy for the rows of a gene-family table and rewrite it.

    Pipeline: load the input table, build a uniref->taxon map, drop taxa
    whose relative frequency falls below ``args.threshold``, reindex the
    rows according to the selected mode (totals / unclassified /
    stratified), rebuild and write the table, then report how many
    stratifications mapped successfully.
    """
    args = get_args()
    tbl = util.Table(args.input)
    # build the taxmap
    print("Building taxonomic map for input table", file=sys.stderr)
    p_datafile = args.dev if args.dev is not None else databases[
        args.resolution]
    taxmap = build_taxmap(tbl.rowheads, args.level, p_datafile)
    # refine the taxmap: convert raw counts to relative frequencies,
    # then keep only mappings at or above the threshold
    counts = {}
    for old, new in taxmap.items():
        counts[new] = counts.get(new, 0) + 1
    total = float(sum(counts.values()))
    # FIX: consistent naming — reuse `counts` instead of a near-duplicate `count`
    counts = {k: v / total for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items() if counts[new] >= args.threshold
    }
    # reindex the table
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(tbl.rowheads)
    index = {}
    for i, rowhead in enumerate(tbl.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        elif stratum == c_unclassified and args.mode == c_umode:
            # in unclassified mode, make a new row for the total...
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)
    # rebuild the table: each new row is the elementwise sum of its old rows
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        newrow = [0 for k in tbl.colheads]
        for i in index[rowhead]:
            oldrow = map(float, tbl.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    tbl.rowheads = rowheads2
    tbl.data = data2
    # output
    print("Writing new table", file=sys.stderr)
    tbl.write(args.output, unfloat=True)
    # report on performance
    success, total = 0, 0
    for rowhead in tbl.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    # FIX: avoid ZeroDivisionError when the table has no stratified rows
    percent = round(100 * success / float(total), 1) if total else 0.0
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=percent,
        ),
        file=sys.stderr,
    )
def join_gene_tables(gene_tables, output, verbose=None):
    """ Join the gene tables to a single gene table

    Args:
        gene_tables: iterable of input gene-table file paths; each file may
            carry a header row naming one or more sample columns.
        output: path of the joined table to write.
        verbose: when truthy, print each file as it is read.

    Per-gene values are accumulated as delimiter-joined strings; genes
    missing from some samples are zero-filled so every row has one value
    per sample column. Exits via sys.exit if the output cannot be opened.
    """
    gene_table_data = {}
    start_column_id = ""
    samples = []
    file_basenames = []
    index = 0
    for gene_table in gene_tables:
        if verbose:
            print("Reading file: " + gene_table)
        lines = util.process_gene_table_with_header(
            gene_table, allow_for_missing_header=True)
        header = next(lines)
        # get the basename of the file
        file_basename = '.'.join(os.path.basename(gene_table).split('.')[:-1])
        file_basenames.append(file_basename)
        if header:
            header_info = header.split(GENE_TABLE_DELIMITER)
            if not start_column_id:
                start_column_id = header_info[0]
            # allow for multiple samples
            sample_names = header_info[1:]
        else:
            # if there is no header in the file then use the file name as the sample name
            sample_names = [file_basename]
        for line in lines:
            data = line.split(GENE_TABLE_DELIMITER)
            try:
                gene = data[0]
                # if the header names multiple samples, merge all samples
                # this prevents extra columns from being included in some rows
                # this requires files containing multiple samples to include a header
                data_points = data[1:len(sample_names) + 1]
            except IndexError:
                gene = ""
            if gene:
                current_data = gene_table_data.get(gene, "")
                fill = index - current_data.count(GENE_TABLE_DELIMITER)
                if fill > 0:
                    # fill in zeros for samples without data then add data point
                    gene_table_data[
                        gene] = current_data + GENE_TABLE_DELIMITER.join(
                            ["0"] * fill
                        ) + GENE_TABLE_DELIMITER + GENE_TABLE_DELIMITER.join(
                            data_points) + GENE_TABLE_DELIMITER
                elif fill < 0:
                    # add data point to other data point from the same sample
                    current_data_points = current_data.split(
                        GENE_TABLE_DELIMITER)
                    for i, point in enumerate(data_points):
                        # write into the slots for this file's samples,
                        # counting back from the trailing empty field
                        store_index = len(data_points) * -1 - 1 + i
                        current_data_points[store_index] = str(
                            float(current_data_points[store_index]) +
                            float(point))
                    gene_table_data[gene] = GENE_TABLE_DELIMITER.join(
                        current_data_points)
                else:
                    # add data point to end of list
                    gene_table_data[
                        gene] = current_data + GENE_TABLE_DELIMITER.join(
                            data_points) + GENE_TABLE_DELIMITER
        samples += sample_names
        index += len(sample_names)
    # if all of the header names for the files are the same
    # then use the file names as headers
    # FIX: guard against IndexError when no input tables were provided
    if samples and samples.count(samples[0]) == len(samples):
        samples = file_basenames
    # write the joined gene table
    if not start_column_id:
        start_column_id = "# header "
    sample_header = [start_column_id] + samples
    total_gene_tables = len(samples)
    sorted_gene_list = util.fsort(list(gene_table_data))
    try:
        file_handle = open(output, "w")
        file_handle.write(GENE_TABLE_DELIMITER.join(sample_header) + "\n")
    except EnvironmentError:
        sys.exit("Unable to write file: " + output)
    # FIX: ensure the handle is closed even if a write below raises
    try:
        for gene in sorted_gene_list:
            # extend gene data for any gene that is not included in all samples
            current_data = gene_table_data[gene]
            fill = total_gene_tables - current_data.count(
                GENE_TABLE_DELIMITER)
            if fill:
                current_data = current_data + GENE_TABLE_DELIMITER.join(
                    ["0"] * fill) + GENE_TABLE_DELIMITER
            file_handle.write(gene + GENE_TABLE_DELIMITER +
                              current_data.rstrip(GENE_TABLE_DELIMITER) +
                              "\n")
    finally:
        file_handle.close()