Code example #1
def regroup( table, map_feature_groups, function, precision, ungrouped=False ):
    """Collapse the rows of <table> into feature groups with the selected reduction."""
    function = c_funcmap[function]
    seen_before = {}
    feature_counts = {}
    # index of new group names to old table rows
    mapping = {}

    for i, rowhead in enumerate( table.rowheads ):
        feature, name, stratum = util.fsplit( rowhead )
        if feature not in feature_counts:
            feature_counts[feature] = 0
        # decide which groups to use
        if feature in map_feature_groups:
            groups = map_feature_groups[feature]
        elif ungrouped:
            groups = [util.c_ungrouped]
        else:
            groups = []
        # track grouping
        for group in groups:
            if feature not in seen_before and group != util.c_ungrouped:
                feature_counts[feature] += 1
            # account for stratified feature
            groupname = group 
            if stratum is not None:
                groupname = util.fjoin( groupname, stratum=stratum )
            mapping.setdefault( groupname, [] ).append( i )
        # we have processed an instance of this feature
        seen_before[feature] = 1

    # rebuild table
    groupnames = util.fsort( mapping.keys( ) )
    groupdata = []
    for groupname in groupnames:
        oldrow_index = mapping[groupname]
        newrow = [[] for j in range( len( table.colheads ) )]
        for i in oldrow_index:
            for j in range( len( table.colheads ) ):
                newrow[j].append( float( table.data[i][j] ) )
        # collapse groups
        newrow = [function( block ) for block in newrow]
        if precision is not None:
            newrow = [round( k, precision ) for k in newrow]
        groupdata.append( newrow )
    table.rowheads = groupnames
    table.data = groupdata

    # report
    n = len( feature_counts )
    n_ungrouped = list( feature_counts.values( ) ).count( 0 )
    grouped_total = n - n_ungrouped
    grouped_multi = grouped_total - list( feature_counts.values( ) ).count( 1 )
    print( "Original Feature Count: %d; Grouped 1+ times: %d (%.1f%%); Grouped 2+ times: %d (%.1f%%)" %
           ( n,
             grouped_total,
             100 * grouped_total / float( n ),
             grouped_multi,
             100 * grouped_multi / float( n ),
           ), file=sys.stderr )
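
The core of `regroup` is an inverted index from (possibly stratified) group names to the original row indices, followed by a column-wise collapse with the reduction looked up in `c_funcmap`. As a rough standalone sketch of that collapse step, with plain lists, made-up data, and `sum` standing in for the configurable reduction (rather than humann's `util.Table`), the same logic looks like this:

# Minimal sketch of the collapse step (assumption: sum as the reduction function)
mapping = {"GROUP1": [0, 2], "GROUP2": [1]}      # new group name -> old row indices
data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]      # old rows x columns
precision = 2

groupdata = []
for groupname in sorted(mapping):
    # gather the old values per column, then reduce each column to one value
    newrow = [[] for _ in range(len(data[0]))]
    for i in mapping[groupname]:
        for j, value in enumerate(data[i]):
            newrow[j].append(float(value))
    newrow = [round(sum(block), precision) for block in newrow]
    groupdata.append(newrow)

print(groupdata)   # [[6.0, 8.0], [3.0, 4.0]]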
Code example #2
def main():

    args = get_args()
    T = util.Table(args.input)

    # build the taxmap (uniref -> taxon mapping)
    print("Building taxonomic map for input table", file=sys.stderr)
    if args.devdb is not None:
        p_datafile = args.devdb
    elif args.database in databases:
        p_datafile = databases[args.database]
    else:
        sys.exit(
            "Must specify a valid database (from utility mapping or --devdb)")
    taxmap = build_taxmap(T.rowheads, args.level, args.lca_choice, p_datafile)

    # refine the taxmap (remove rare taxa)
    counts = Counter(taxmap.values())
    total = float(sum(counts.values()))
    counts = {k: v / total for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items() if counts[new] >= args.threshold
    }

    # reindex the table (which rows to keep, which rows to pseudo-stratify)
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(T.rowheads)
    index = {}
    for i, rowhead in enumerate(T.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        # in unclassified mode, make a new row for the total...
        elif stratum == c_unclassified and args.mode == c_umode:
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        # update strata in stratified mode
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)

    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        newrow = [0 for k in T.colheads]
        for i in index[rowhead]:
            oldrow = map(float, T.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    T.rowheads = rowheads2
    T.data = data2
    print("Writing new table", file=sys.stderr)
    T.write(args.output, unfloat=True)

    # report on performance
    success, total = 0, 0
    for rowhead in T.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=round(100 * success / float(total), 1),
        ),
        file=sys.stderr,
    )
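
The refinement block above is the part that enforces `args.threshold`: taxa that account for too small a share of the mapped UniRef features are removed from `taxmap` before the table is reindexed. A standalone sketch of that filtering with made-up identifiers (the real `taxmap` comes from `build_taxmap` and the utility mapping database):

from collections import Counter

# Hypothetical uniref -> taxon assignments and a relative-frequency threshold
taxmap = {"UniRef90_A": "g__Bacteroides", "UniRef90_B": "g__Bacteroides",
          "UniRef90_C": "g__Bacteroides", "UniRef90_D": "g__Prevotella",
          "UniRef90_E": "g__RareGenus"}
threshold = 0.25

counts = Counter(taxmap.values())
total = float(sum(counts.values()))
frequencies = {taxon: n / total for taxon, n in counts.items()}
# keep only assignments to taxa at or above the threshold
taxmap = {old: new for old, new in taxmap.items() if frequencies[new] >= threshold}

print(sorted(taxmap))   # ['UniRef90_A', 'UniRef90_B', 'UniRef90_C']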
Code example #3
def main():
    args = get_args()
    tbl = util.Table(args.input)
    # build the taxmap
    print("Building taxonomic map for input table", file=sys.stderr)
    p_datafile = args.dev if args.dev is not None else databases[args.resolution]
    taxmap = build_taxmap(tbl.rowheads, args.level, p_datafile)
    # refine the taxmap
    counts = {}
    for old, new in taxmap.items():
        counts[new] = counts.get(new, 0) + 1
    total = float(sum(counts.values()))
    count = {k: v / total for k, v in counts.items()}
    taxmap = {
        old: new
        for old, new in taxmap.items() if count[new] >= args.threshold
    }
    # reindex the table
    print("Reindexing the input table", file=sys.stderr)
    ticker = util.Ticker(tbl.rowheads)
    index = {}
    for i, rowhead in enumerate(tbl.rowheads):
        ticker.tick()
        feature, name, stratum = util.fsplit(rowhead)
        new_rowhead = tax_connect(rowhead, taxmap)
        # unmapped is never stratified
        if feature == util.c_unmapped:
            index.setdefault(rowhead, []).append(i)
        # outside of unclassified mode, keep totals
        elif stratum is None and args.mode != c_umode:
            index.setdefault(rowhead, []).append(i)
            # in totals mode, guess at taxonomy from uniref name
            if args.mode == c_tmode:
                index.setdefault(new_rowhead, []).append(i)
        elif stratum == c_unclassified and args.mode == c_umode:
            # in unclassified mode, make a new row for the total...
            index.setdefault(util.fjoin(feature, name, None), []).append(i)
            # ...then replace "unclassified" with inferred taxonomy
            index.setdefault(new_rowhead, []).append(i)
        elif stratum is not None and args.mode == c_smode:
            index.setdefault(new_rowhead, []).append(i)
    # rebuild the table
    print("Rebuilding the input table", file=sys.stderr)
    rowheads2, data2 = [], []
    ticker = util.Ticker(index)
    for rowhead in util.fsort(index):
        ticker.tick()
        rowheads2.append(rowhead)
        newrow = [0 for k in tbl.colheads]
        for i in index[rowhead]:
            oldrow = map(float, tbl.data[i])
            newrow = [a + b for a, b in zip(newrow, oldrow)]
        data2.append(newrow)
    tbl.rowheads = rowheads2
    tbl.data = data2
    # output
    print("Writing new table", file=sys.stderr)
    tbl.write(args.output, unfloat=True)
    # report on performance
    success, total = 0, 0
    for rowhead in tbl.rowheads:
        feature, name, stratum = util.fsplit(rowhead)
        if stratum is not None:
            total += 1
            if stratum != c_unclassified:
                success += 1
    print(
        "Summary: Of {TOTAL} stratifications, {SUCCESS} mapped at {TARGET} level ({PERCENT}%)"
        .format(
            TOTAL=total,
            SUCCESS=success,
            TARGET=args.level,
            PERCENT=round(100 * success / float(total), 1),
        ),
        file=sys.stderr,
    )
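
Both variants rebuild the output the same way: `index` maps each new rowhead to the old row indices that contribute to it, and those rows are summed column by column. A self-contained sketch of that rebuild step, with plain lists in place of the `util.Table` fields and Python's sorted in place of `util.fsort`:

# Hypothetical index of new rowheads -> contributing old row indices
index = {"GENE1": [0, 1, 2], "GENE1|g__Bacteroides": [0, 1]}
data = [["1", "2"], ["3", "4"], ["5", "6"]]   # old rows arrive as strings

rowheads2, data2 = [], []
for rowhead in sorted(index):
    newrow = [0.0] * len(data[0])
    for i in index[rowhead]:
        oldrow = map(float, data[i])
        newrow = [a + b for a, b in zip(newrow, oldrow)]
    rowheads2.append(rowhead)
    data2.append(newrow)

print(rowheads2)   # ['GENE1', 'GENE1|g__Bacteroides']
print(data2)       # [[9.0, 12.0], [4.0, 6.0]]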
Code example #4
def join_gene_tables(gene_tables, output, verbose=None):
    """
    Join the gene tables to a single gene table
    """

    gene_table_data = {}
    start_column_id = ""
    samples = []
    file_basenames = []
    index = 0
    for gene_table in gene_tables:

        if verbose:
            print("Reading file: " + gene_table)

        lines = util.process_gene_table_with_header(
            gene_table, allow_for_missing_header=True)
        header = next(lines)

        # get the basename of the file
        file_basename = '.'.join(os.path.basename(gene_table).split('.')[:-1])
        file_basenames.append(file_basename)

        if header:
            header_info = header.split(GENE_TABLE_DELIMITER)
            if not start_column_id:
                start_column_id = header_info[0]
            # allow for multiple samples
            sample_names = header_info[1:]
        else:
            # if there is no header in the file then use the file name as the sample name
            sample_names = [file_basename]

        for line in lines:
            data = line.split(GENE_TABLE_DELIMITER)
            try:
                gene = data[0]
                # if the header names multiple samples, merge all samples
                # this prevents extra columns from being included in some rows
                # this requires files containing multiple samples to include a header
                data_points = data[1:len(sample_names) + 1]
            except IndexError:
                gene = ""

            if gene:
                current_data = gene_table_data.get(gene, "")
                fill = index - current_data.count(GENE_TABLE_DELIMITER)
                if fill > 0:
                    # fill in zeros for samples without data, then add the data points
                    gene_table_data[gene] = (
                        current_data
                        + GENE_TABLE_DELIMITER.join(["0"] * fill)
                        + GENE_TABLE_DELIMITER
                        + GENE_TABLE_DELIMITER.join(data_points)
                        + GENE_TABLE_DELIMITER)
                elif fill < 0:
                    # add the data points to existing data points from the same sample
                    current_data_points = current_data.split(GENE_TABLE_DELIMITER)
                    for i, point in enumerate(data_points):
                        store_index = len(data_points) * -1 - 1 + i
                        current_data_points[store_index] = str(
                            float(current_data_points[store_index]) + float(point))
                    gene_table_data[gene] = GENE_TABLE_DELIMITER.join(
                        current_data_points)
                else:
                    # add the data points to the end of the row
                    gene_table_data[gene] = (
                        current_data
                        + GENE_TABLE_DELIMITER.join(data_points)
                        + GENE_TABLE_DELIMITER)

        samples += sample_names
        index += len(sample_names)
    # if all of the header names for the files are the same
    # then use the file names as headers
    if samples.count(samples[0]) == len(samples):
        samples = file_basenames

    # write the joined gene table
    if not start_column_id:
        start_column_id = "# header "
    sample_header = [start_column_id] + samples
    total_gene_tables = len(samples)
    sorted_gene_list = util.fsort(list(gene_table_data))
    try:
        file_handle = open(output, "w")
        file_handle.write(GENE_TABLE_DELIMITER.join(sample_header) + "\n")
    except EnvironmentError:
        sys.exit("Unable to write file: " + output)

    for gene in sorted_gene_list:
        # extend gene data for any gene that is not included in all samples
        current_data = gene_table_data[gene]
        fill = total_gene_tables - current_data.count(GENE_TABLE_DELIMITER)
        if fill:
            current_data = current_data + GENE_TABLE_DELIMITER.join(
                ["0"] * fill) + GENE_TABLE_DELIMITER
        file_handle.write(gene + GENE_TABLE_DELIMITER +
                          current_data.rstrip(GENE_TABLE_DELIMITER) + "\n")

    file_handle.close()
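
`join_gene_tables` keeps each gene's row as a delimiter-joined string, padding it with zeros whenever a gene was absent from earlier samples and padding again at write time for genes missing from later samples. The same zero-fill idea, sketched with plain dictionaries and made-up data instead of the humann file handling:

# Hypothetical per-sample tables: sample name -> {gene: value}
tables = {
    "sample1": {"geneA": 1.0, "geneB": 2.0},
    "sample2": {"geneB": 3.0, "geneC": 4.0},
}

samples = list(tables)
genes = sorted({gene for table in tables.values() for gene in table})

# zero-fill: every gene gets exactly one value per sample, 0 when absent
rows = {gene: [tables[sample].get(gene, 0.0) for sample in samples] for gene in genes}

print("\t".join(["# header"] + samples))
for gene in genes:
    print("\t".join([gene] + [str(value) for value in rows[gene]]))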