Example #1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: codonbias_weights2tsv.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--methods",
                      dest="methods",
                      type="string",
                      help="methods to apply.")

    parser.add_option("--is-frequencies",
                      dest="is_frequencies",
                      action="store_true",
                      help="data is frequencies (default: weights).")

    parser.add_option("-s",
                      "--sort",
                      dest="sort",
                      type="choice",
                      choices=("percent-difference", "aa"),
                      help="sort order of output table.")

    parser.add_option(
        "-g",
        "--global-sort",
        dest="global_sort",
        action="store_true",
        help="globally sort results (otherwise: by species pair).")

    parser.set_defaults(
        methods="",
        is_frequencies=False,
        sort="percent-difference",
        global_sort=False,
    )

    (options, args) = E.Start(parser)
    if options.methods:
        options.methods = options.methods.split(",")

    fields, table = CSV.ReadTable(sys.stdin)

    ## convert weights to floats
    table = CSV.getConvertedTable(table, range(1, len(fields)))

    for method in options.methods:

        if method == "overview":
            if options.is_frequencies:
                WriteOverviewFrequencies(fields, table, options)
            else:
                WriteOverviewWeights(fields, table, options)
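
All of these examples assume the same contract for CSV.ReadTable: it reads a tab-separated stream with a header line and returns a tuple (fields, table), row-wise with as_rows=True or column-wise with as_rows=False. For experimenting with the snippets outside the CGAT code base, a minimal pure-Python stand-in under that assumption (read_table is hypothetical, not the CGAT implementation):

def read_table(infile, with_header=True, as_rows=True):
    # split non-comment lines on tabs
    lines = [l.rstrip("\n").split("\t")
             for l in infile if not l.startswith("#")]
    fields = lines.pop(0) if with_header else []
    if as_rows:
        return fields, lines
    # column-wise: one list of values per field
    return fields, [list(col) for col in zip(*lines)]
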
Example #2
def readAndExpandTable( infile, options ):
    '''split fields in a table at a separator.

    If a field in a row contains multiple values,
    the row is expanded into multiple rows so that
    every value gets its own row.
    '''

    fields, table  = CSV.ReadTable( infile, with_header = options.has_headers, as_rows = True )

    options.stdout.write("\t".join(fields) + "\n")
    
    for row in table:

        data = []
        for x in range(len(fields)):
            data.append( row[x].split( options.separator ) )

        nrows = max( [ len(d) for d in data ] )

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write( "\t".join( [ d[n] for d in data ] ) + "\n" )
Example #3
def readAndGroupTable( infile, options ):
    """read table from infile and group.
    """
    fields, table  = CSV.ReadTable( infile, with_header = options.has_headers, as_rows = True )
    options.columns = getColumns( fields, options.columns )
    assert options.group_column not in options.columns

    converter = float
    new_fields = [ fields[options.group_column] ] + [ fields[x] for x in options.columns ]

    if options.group_function == "min":
        f = min
    elif options.group_function == "max":
        f = max
    elif options.group_function == "sum":
        f = lambda z: reduce( lambda x,y: x+y, z)
    elif options.group_function == "mean":
        f = scipy.mean
    elif options.group_function == "cat":
        f = lambda x: ";".join( [ y for y in x if y != "" ] )
        converter = str
    elif options.group_function == "uniq":
        f = lambda x: ";".join( [ y for y in set(x) if y != "" ] )
        converter = str
    elif options.group_function == "stats":
        f = lambda x: str(Stats.DistributionalParameters(x))
        # update headers
        new_fields = [ fields[options.group_column] ]
        for c in options.columns:
            new_fields += list( map(lambda x: "%s_%s" % (fields[c], x), Stats.DistributionalParameters().getHeaders() ) )

    ## convert values to floats (except for the group column);
    ## rows with unconvertible values in the selected columns are dropped
    new_table = []
    for row in table:
        skip = False
        new_row = [ row[options.group_column] ]

        for c in options.columns:
            if row[c] == options.missing_value:
                new_row.append(row[c])
            else:
                try:
                    new_row.append( converter(row[c]) )
                except ValueError:
                    skip = True
                    break
        if not skip: new_table.append(new_row)
    table = new_table

    new_rows = CSV.GroupTable( table,
                               group_column = 0,
                               group_function = f )

    options.stdout.write("\t".join(new_fields) + "\n")        
    for row in new_rows:
        options.stdout.write( "\t".join( map(str,row) ) + "\n")
Example #4
def getGODescriptions(infile):
    '''return dictionary mapping GO category to description
    and namespace.
    '''

    with IOTools.openFile(infile) as inf:
        fields, table = CSV.ReadTable(inf, as_rows=False)

    return dict([(go_id, (go_type, description))
                 for go_type, go_id, description in zip(
                     table[fields.index("go_type")],
                     table[fields.index("go_id")],
                     table[fields.index("description")])])
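
Given a table with go_type, go_id and description columns, the result maps each GO identifier to a (namespace, description) pair, e.g. (hypothetical file name and identifier):

descriptions = getGODescriptions("go_info.tsv.gz")
namespace, description = descriptions["GO:0008150"]
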
Example #5
def readAndJoinTable(infile, options):
    '''join rows in a table over a key column.

    Reshapes a long table into a wide one: one output row per value of
    the join column and one block of columns per value of the name
    column, with "na" filling missing combinations.
    '''

    fields, table = CSV.ReadTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    join_column = options.join_column - 1
    join_name = options.join_column_name - 1

    join_rows = list(set(map(lambda x: x[join_column], table)))
    join_rows.sort()

    join_names = list(set(map(lambda x: x[join_name], table)))
    join_names.sort()

    join_columns = list(
        set(range(len(fields))).difference(set((join_column, join_name))))
    join_columns.sort()

    new_table = []
    map_old2new = {}

    map_name2start = {}
    x = 1
    for name in join_names:
        map_name2start[name] = x
        x += len(join_columns)

    row_width = len(join_columns) * len(join_names)
    for x in join_rows:
        map_old2new[x] = len(map_old2new)
        new_row = [
            x,
        ] + ["na"] * row_width
        new_table.append(new_row)

    for row in table:
        row_index = map_old2new[row[join_column]]
        start = map_name2start[row[join_name]]
        for x in join_columns:
            new_table[row_index][start] = row[x]
            start += 1

    # print new table
    options.stdout.write(fields[join_column])
    for name in join_names:
        for column in join_columns:
            options.stdout.write("\t%s%s%s" %
                                 (name, options.separator, fields[column]))
    options.stdout.write("\n")

    for row in new_table:
        options.stdout.write("\t".join(row) + "\n")
Example #6
def readAndCollapseTable(infile, options, missing_value=""):
    '''collapse a table.

    Collapse a table of two columns with row names in the first
    column. Outputs a table with multiple columns for each row name.
    '''

    fields, table = CSV.ReadTable(infile,
                                  with_header=options.has_headers,
                                  as_rows=True)

    if len(fields) != 2:
        raise NotImplementedError("can only work on tables with two columns")

    values = collections.defaultdict(list)

    # the first row name marks the start of each block of rows
    separator = table[0][0]
    row_names = set([x[0] for x in table])

    row_name, value = table[0]

    values[row_name].append(value)
    added = set([row_name])
    for row_name, value in table[1:]:
        if row_name == separator:
            for r in row_names:
                if r not in added:
                    values[r].append(missing_value)
            added = set()

        values[row_name].append(value)
        added.add(row_name)

    for r in row_names:
        if r not in added:
            values[r].append(missing_value)

    sizes = set([len(x) for x in values.values()])
    assert len(sizes) == 1, "unequal number of row_names"
    size = list(sizes)[0]

    options.stdout.write("row\t%s\n" %
                         ("\t".join(["column_%i" % x for x in range(size)])))

    for key, row in values.items():
        options.stdout.write("%s\t%s\n" % (key, "\t".join(row)))
Example #7
def computeFDR( infile, options ):
    '''compute FDR on a table.

    Note: the body below is identical to readAndExpandTable (Example #2)
    and performs no FDR computation; the actual correction lives in the
    "fdr" method of table2table.py (Example #13).
    '''

    fields, table  = CSV.ReadTable( infile, with_header = options.has_headers, as_rows = True )

    options.stdout.write("\t".join(fields) + "\n")
    
    for row in table:

        data = []
        for x in range(len(fields)):
            data.append( row[x].split( options.separator ) )

        nrows = max( [ len(d) for d in data ] )

        for d in data:
            d += [""] * (nrows - len(d))

        for n in range(nrows):
            options.stdout.write( "\t".join( [ d[n] for d in data ] ) + "\n" )
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_sites_slr.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("summary-slr", "summary-filtered",
                               "over-representation", "positive-site-table",
                               "negative-site-table", "neutral-site-table",
                               "positive-site-list", "negative-site-list",
                               "neutral-site-list"),
                      help="method to apply.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix for rows.")

    parser.add_option("-s",
                      "--filename-sites",
                      dest="filename_sites",
                      type="string",
                      help="filename with sites information.")

    parser.add_option("-l",
                      "--filename-log",
                      dest="filename_log",
                      type="string",
                      help="filename with logging information.")

    parser.add_option(
        "-m",
        "--filename-mali",
        dest="filename_mali",
        type="string",
        help="filename of the multiple alignment that was input to SLR. "
        "If given, it is used to filter indels.")

    parser.add_option(
        "--filter-probability",
        dest="filter_probability",
        type="float",
        help="threshold for probability above which to include positive sites."
    )

    parser.add_option("--no-header",
                      dest="write_header",
                      action="store_false",
                      help="only output header.")

    parser.add_option("--only-header",
                      dest="only_header",
                      action="store_true",
                      help="only output header.")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold",
                      type="float",
                      help="threshold for significance tests [%default].")

    parser.add_option("--use-adjusted",
                      dest="use_adjusted",
                      action="store_true",
                      help="use SLR adjusted probability values.")

    parser.add_option("--truncate-sites-list",
                      dest="truncate_sites_list",
                      type="int",
                      help="truncate sites list after ## entries (0 for all).")

    parser.add_option(
        "--context-size",
        dest="context_size",
        type="int",
        help="size of left/right context around a selected residue.")

    parser.set_defaults(
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        filename_sites="-",
        filename_log=None,
        filename_mali=None,
        significance_threshold=0.05,
        write_header=True,
        only_header=False,
        use_adjusted=False,
        context_size=0,
        truncate_sites_list=0,
    )

    (options, args) = E.Start(parser)

    slr = WrapperSlr.Slr()

    # a "%s" in the sites filename means one input file per prefix;
    # enable the prefix column in that case
    if "%s" in options.filename_sites:
        options.prefix = True

    if options.method == "summary-slr":

        # write header
        if options.write_header or options.only_header:

            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This uses the thresholds as set in SLR. Use "counts" for filtering
# residues based on your own thresholds
""")
            thresholds = "95%", "99%", "95% corrected", "99% corrected"

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnsyn\tngap\t")
            options.stdout.write("\t".join(
                map(lambda x: "npos_" + x.replace(" ", "_"), thresholds)))
            options.stdout.write("\t")
            options.stdout.write("\t".join(
                map(lambda x: "nneg_" + x.replace(" ", "_"), thresholds)))
            options.stdout.write("\n")

    elif options.method == "summary-filtered":

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This method uses the supplied threshold and the multiple alignment to filter.
# All positions that are above the threshold (P-Value) and which are located in
# indels: >= 1 sequence missing from column, are removed.
""")

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write(
                "ltree\tomega\tkappa\tlnL\tnsites\tnfiltered\tntotal\tnsyn\tnneg\tnpos\n"
            )

    elif options.method in ("positive-site-table", "negative-site-table",
                            "neutral-site-table"):

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and significance was
# determined using a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")
            options.stdout.write("cluster\tnsites\tp-value\tsites\n")

    elif options.method in ("positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Sites under positive/neutral/negative selection according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and significance was
# determined using a threshold of %5.2e
""" % options.significance_threshold)

            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write(
                "sequence\tn\taa\tseq_pos\tmali_pos\tcontext\n")

    elif options.method == "over-representation":

        # write header
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write("""# Genes with over-represented sites.
#
# This method uses as input the output of summary-filtered.
""")

    if options.only_header:
        sys.exit(0)

    if options.method in ("summary-slr", "summary-filtered",
                          "positive-site-table", "negative-site-table",
                          "neutral-site-table", "positive-site-list",
                          "negative-site-list", "neutral-site-list"):

        ninput, noutput, nskipped = 0, 0, 0

        if "%s" in options.filename_sites:

            headers, table = CSV.ReadTable(sys.stdin)

            fprefix = headers.index("prefix")

            try:
                fsignificance = headers.index("p")
            except ValueError:
                fsignificance = None

            for row in table:

                id = row[fprefix]
                if fsignificance is not None:
                    p_value = row[fsignificance]
                else:
                    p_value = None

                ninput += 1

                fn = re.sub("%s", id, options.filename_sites)
                if not os.path.exists(fn):
                    nskipped += 1
                    continue

                lines_sites = open(fn, "r").readlines()
                if options.filename_log:
                    lines_log = open(re.sub("%s", id, options.filename_log),
                                     "r").readlines()

                result = slr.parseOutput(lines_sites, lines_log)

                if options.method in ("summary-filtered",
                                      "positive-site-table",
                                      "negative-site-table",
                                      "neutral-site-table"):
                    mali = Mali.Mali()
                    mali.readFromFile(
                        open(re.sub("%s", id, options.filename_mali), "r"))
                else:
                    mali = None

                ProcessResult(result,
                              options,
                              mali,
                              prefix=id,
                              p_value=p_value)
                noutput += 1
        else:
            if options.filename_sites == "-":
                lines_sites = sys.stdin.readlines()
            else:
                lines_sites = open(options.filename_sites, "r").readlines()

            ninput += 1
            # guard against options.filename_log being unset
            lines_log = []
            if options.filename_log:
                lines_log = open(options.filename_log, "r").readlines()

            result = slr.parseOutput(lines_sites, lines_log)

            if options.filename_mali:
                mali = Mali.Mali()
                mali.readFromFile(open(options.filename_mali, "r"))
            else:
                if options.method == "summary-filtered":
                    raise "please supply a multiple alignment for filtering."

                mali = None

            ProcessResult(result, options, mali, prefix=options.prefix)
            noutput += 1

        if options.loglevel >= 1:
            options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                                 (ninput, noutput, nskipped))

    else:
        if options.method == "over-representation":

            results = []
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                data = line[:-1].split("\t")
                if data[0] == "prefix":
                    continue

                results.append(
                    Result(data[0], int(data[6]), int(data[7]), int(data[8]),
                           int(data[9]), int(data[10])))

            # probability of a single site being positive
            ntotal = sum(map(lambda x: x.mNTotal, results))
            npositives = sum(map(lambda x: x.mNPositive, results))
            p = float(npositives) / float(ntotal)

            if options.loglevel >= 1:
                options.stdlog.write("# sites: total=%i, positive=%i, p=%f\n" %
                                     (ntotal, npositives, p))

            new_results = []
            for result in results:
                if result.mNTotal == 0:
                    continue

                # use -1, because I need P( x >= X)
                # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X )
                # = P (x > X ).
                r = scipy.stats.binom.sf(result.mNPositive - 1, result.mNTotal,
                                         p)

                result.mSignificance = r

                if r < options.significance_threshold:
                    new_results.append(result)

            new_results.sort(key=lambda x: x.mSignificance)

            # header precedes the result rows on stdout
            options.stdout.write(Result().getHeader() + "\n")

            for result in new_results:
                options.stdout.write(str(result) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write("# ntotal=%i, npos=%i\n" %
                                     (len(results), len(new_results)))

    E.Stop()
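
The over-representation branch applies a one-sided binomial test: with per-site positive probability p, the chance of observing at least k positive sites among n is P(X >= k) = scipy.stats.binom.sf(k - 1, n, p), which is why the code subtracts 1. A standalone check against the complement of the CDF (hypothetical counts):

import scipy.stats

n, k, p = 100, 12, 0.05
print(scipy.stats.binom.sf(k - 1, n, p))       # P(X >= 12)
print(1 - scipy.stats.binom.cdf(k - 1, n, p))  # same value by definition
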
Example #9
def main():

    parser = E.OptionParser(
        version="%prog version: $Id: analyze_readpositions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern",
                      type="string",
                      help="pattern for additional output files [%default].")

    parser.set_defaults(
        length=1000,
        minimum_coverage=0.90,
        maximum_reads=[1, 10, 20, 50, 100],
        output_filename_pattern="%s",
        normalize=True,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    fields, table = CSV.ReadTable(sys.stdin, dictreader=CSV.DictReaderLarge)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    coverage_5prime = numpy.zeros(options.length, numpy.float)
    coverage_3prime = numpy.zeros(options.length, numpy.float)

    coverage_maxreads5prime = numpy.zeros(options.length, numpy.float)
    coverage_maxreads3prime = numpy.zeros(options.length, numpy.float)

    coverage_full5prime = numpy.zeros(options.length, numpy.float)
    coverage_full3prime = numpy.zeros(options.length, numpy.float)

    coverage_min5prime = numpy.zeros(options.length, numpy.float)
    coverage_min3prime = numpy.zeros(options.length, numpy.float)

    histograms = []
    for x in range(len(options.maximum_reads)):
        histograms.append([
            numpy.zeros(options.length, numpy.float),
            numpy.zeros(options.length, numpy.float), 0
        ])

    ninput, noutput, nfull, nmincov, nskipped, nlength, nmaxreads = 0, 0, 0, 0, 0, 0, 0
    for row in table:
        length, covered, meancov, data, nreads = (int(row["cov_nval"]),
                                                  float(row["cov_covered"]),
                                                  float(row["cov_mean"]),
                                                  row["cov_values"],
                                                  int(row["nover2"]))
        ninput += 1
        if length < options.length:
            nlength += 1
            continue

        if data == "na":
            nskipped += 1
            continue

        noutput += 1
        mincov = covered / length
        values = map(float, data.split(";"))
        m = max(values)
        values = [x / m for x in values]
        coverage_5prime += values[0:1000]
        coverage_3prime += values[-1000:]

        if mincov >= 1.0:
            coverage_full5prime += values[0:1000]
            coverage_full3prime += values[-1000:]
            nfull += 1

        if meancov >= options.minimum_coverage:
            coverage_min5prime += values[0:1000]
            coverage_min3prime += values[-1000:]
            nmincov += 1

        for maxreads in range(len(options.maximum_reads)):
            if nreads <= options.maximum_reads[maxreads]:
                histograms[maxreads][0] += values[0:1000]
                histograms[maxreads][1] += values[-1000:]
                histograms[maxreads][2] += 1

    if options.normalize:
        for x5, x3 in ((coverage_5prime, coverage_3prime),
                       (coverage_min5prime, coverage_min3prime),
                       (coverage_full5prime, coverage_full3prime)):
            m = max((max(x5), max(x3)))
            x3 /= m
            x5 /= m

        for x5, x3, c in histograms:
            m = max((max(x5), max(x3)))
            x5 /= m
            x3 /= m

    outfile = options.stdout
    outfile.write("\t".join(("distance", "minlen-5'", "minlen-3'", "mincov-5'",
                             "mincov-3'", "full-5'", "full-3'")) + "\n")

    for x in range(0, options.length):
        outfile.write( "\t".join( [ "%6.4f" % x for x in \
                                        (x,
                                         coverage_5prime[x],
                                         coverage_3prime[x],
                                         coverage_min5prime[x],
                                         coverage_min3prime[x],
                                         coverage_full5prime[x],
                                         coverage_full3prime[x] ) ] ) + "\n" )

    outfile5 = open(options.output_filename_pattern % "reads5", "w")
    outfile3 = open(options.output_filename_pattern % "reads3", "w")

    outfile5.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    outfile3.write("\t".join([
        "distance",
    ] + [
        "reads%i" % options.maximum_reads[y]
        for y in range(len(options.maximum_reads))
    ]) + "\n")
    for x in range(0, options.length):
        outfile5.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][0][x]
            for y in range(len(options.maximum_reads))
        ])))
        outfile3.write("%i\t%s\n" % (x, "\t".join([
            "%6.4f" % histograms[y][1][x]
            for y in range(len(options.maximum_reads))
        ])))

    E.info( "ninput=%i, noutput=%i, nmaxreads=%i, nfull=%i, nmincov=%i, nskipped=%i, nlength=%i" %\
                (ninput, noutput, nmaxreads, nfull, nmincov, nskipped, nlength) )

    E.Stop()
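
A note on the normalization loop: x5 /= m divides the numpy arrays in place, so rebinding them to loop variables still rescales coverage_5prime and friends. A small demonstration of that aliasing:

import numpy

a = numpy.array([1.0, 2.0, 4.0])
b = a        # b aliases a's buffer
b /= 4.0     # in-place division
print(a)     # [ 0.25  0.5   1.  ] - a sees the change
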
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join.")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.ReadTable(open(options.filename1, "r"))
    fields2, table2 = CSV.ReadTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # NB: the remainder of this function looks incomplete in the original
    # script: options.join_fields1/join_fields2, ``lines`` and
    # ``input_fields`` are never defined, and the writer was created
    # inside the loop over fields1. See csv_set.py (Example #11) for a
    # working variant of the same join logic.

    # build new field list
    new_fields = []

    for x in options.join_fields1:
        new_fields.append(x)

    for x in fields1:
        if x not in options.join_fields1:
            new_fields.append(x)
        if x not in options.join_fields2:
            new_fields.append(x)

    writer = csv.DictWriter(outfile,
                            new_fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    if len(lines) > 0:

        old_fields = lines[0][:-1].split("\t")

        if options.remove:
            fields = []
            for x in old_fields:
                if x not in input_fields:
                    fields.append(x)
        else:
            fields = input_fields

        reader = csv.DictReader(lines, dialect=options.csv_dialect)

        print "\t".join(fields)

        first_row = True
        for row in reader:
            row = CSV.ConvertDictionary(row)
            writer.writerow(row)

    E.Stop()
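
Since the function above is fragmentary, a minimal working intersection of two row lists keyed on one column, for comparison (a sketch of the apparent intent; Example #11 shows the full variant with configurable join fields):

def intersect(table1, table2, key=0):
    # keep rows of table2 whose key value also occurs in table1
    keys = set(row[key] for row in table1)
    return [row for row in table2 if row[key] in keys]
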
Example #11
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: csv_set.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.add_option("-1",
                      "--join-fields1",
                      dest="join_fields1",
                      type="string",
                      help="join fields in first table.")
    parser.add_option("-2",
                      "--join-fields2",
                      dest="join_fields2",
                      type="string",
                      help="join fields in second table.")
    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="set operation to perform.",
                      choices=("intersection", "rest", "union"))

    parser.set_defaults(
        remove=False,
        unique=False,
        join_fields1=None,
        join_fields2=None,
        method="intersection",
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join.")

    if not options.join_fields1 or not options.join_fields2:
        raise ValueError("please specify at least one join field per table.")

    options.join_fields1 = options.join_fields1.split(",")
    options.join_fields2 = options.join_fields2.split(",")

    options.filename1, options.filename2 = args

    fields1, table1 = CSV.ReadTable(open(options.filename1, "r"))
    fields2, table2 = CSV.ReadTable(open(options.filename2, "r"))

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    nfields1 = []
    for x in range(len(fields1)):
        if fields1[x] in options.join_fields1: nfields1.append(x)
    nfields2 = []
    for x in range(len(fields2)):
        if fields2[x] in options.join_fields2: nfields2.append(x)

    ## calculate row indices: duplicate keys are not taken care of here
    keys = {}
    for row1 in table1:
        v = map(lambda x: row1[x], nfields1)
        key = hashlib.md5("".join(v)).digest()
        keys[key] = row1

    if options.method == "intersection":
        ## build new field list
        take = range(len(fields1))
        c = len(take)
        for x in fields2:
            if x not in options.join_fields2:
                take.append(c)
            c += 1

        t = fields1 + fields2

        new_fields = map(lambda x: t[x], take)

        print "\t".join(new_fields)

        for row2 in table2:
            v = map(lambda x: row2[x], nfields2)
            key = hashlib.md5("".join(v)).digest()
            if key in keys:
                new_row = keys[key] + row2
                outfile.write("\t".join(map(lambda x: new_row[x], take)) +
                              "\n")

    elif options.method == "rest":

        new_fields = fields2
        print "\t".join(new_fields)

        for row2 in table2:
            v = map(lambda x: row2[x], nfields2)
            key = hashlib.md5("".join(v)).digest()
            if key not in keys:
                outfile.write("\t".join(row2) + "\n")

    E.Stop()
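
The md5 digests keep the key dictionary compact for wide join keys, but "".join(v) cannot distinguish ("ab", "c") from ("a", "bc"), and duplicate keys silently keep only the last row. Using the value tuple itself as the dictionary key avoids the ambiguity (a sketch of the alternative):

def build_keys(table, key_columns):
    keys = {}
    for row in table:
        # tuples of the key values are hashable and unambiguous
        keys[tuple(row[c] for c in key_columns)] = row
    return keys
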
Example #12
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: data2bins.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--column",
                      dest="column",
                      type="int",
                      help="column to split on.")

    parser.add_option("--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins to create.")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("equal-sized-bins", ),
                      help="method to use to bin data.")

    parser.add_option("--no-headers",
                      dest="has_headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option(
        "-p",
        "--output-filename-pattern",
        dest="output_filename_pattern",
        type="string",
        help="OUTPUT filename with histogram information on aggregate "
        "coverages [%default].")

    parser.set_defaults(
        has_headers=True,
        method="equal-sized-bins",
        column=1,
        num_bins=4,
        output_filename_pattern="bin%i",
    )

    (options, args) = E.Start(parser)
    options.column -= 1

    if args:
        if args[0] == "-":
            infile = sys.stdin
        else:
            infile = open(args[0], "r")
    else:
        infile = sys.stdin

    fields, data = CSV.ReadTable(infile)

    c = options.column
    values = [float(x[c]) for x in data]

    bins = []

    if options.method == "equal-sized-bins":
        increment = int(math.floor(float(len(values)) / options.num_bins))
        indices = range(0, len(values))
        indices.sort(key=lambda x: values[x])
        for x in xrange(len(values)):
            values[indices[x]] = x
        bins = range(0, len(values) - increment, increment)

    elif options.method == "pass":
        pass

    E.debug("bins=%s" % str(bins))

    outputters = []
    for x in xrange(0, len(bins)):
        outputters.append(
            Outputter(options.output_filename_pattern % x, fields))

    # output tables
    for x in xrange(0, len(data)):
        bin = bisect.bisect(bins, values[x]) - 1
        outputters[bin].write(data[x])

    # stats
    if options.loglevel >= 1:
        options.stdlog.write("# bin\tstart\tcounts\tfilename\n")
        for x in xrange(0, len(bins)):
            options.stdlog.write(
                "# %i\t%f\t%i\t%s\n" %
                (x, bins[x], outputters[x].mCounts, outputters[x].mFilename))

    E.info("ninput=%i, noutput=%i" %
           (len(data), sum((x.mCounts for x in outputters))))

    E.Stop()
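
After the rank transform, values[i] holds the rank of row i and bins holds the rank at which each bin starts, so bisect assigns each row to its quantile. On literal ranks (hypothetical):

import bisect

ranks = [3, 0, 2, 1]   # ranks of 4 rows; num_bins = 2 -> increment = 2
bins = [0, 2]          # bins start at ranks 0 and 2
print([bisect.bisect(bins, r) - 1 for r in ranks])
# [1, 0, 1, 0]
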
Example #13
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: table2table.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=( "transpose", "normalize-by-max","normalize-by-value","multiply-by-value",
                               "percentile","remove-header","normalize-by-table",
                               "upper-bound","lower-bound","kullback-leibler",
                                "expand","compress", "fdr", "grep" ),
                      help="""actions to perform on table.""")
    
    parser.add_option("-s", "--scale", dest="scale", type="float",
                      help="factor to scale matrix by."  )
    
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output number format."  )

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="Parameters for various functions."  )

    parser.add_option("-t", "--headers", dest="has_headers", action="store_true",
                      help="matrix has row/column headers."  )

    parser.add_option("--transpose", dest="transpose", action="store_true",
                      help="transpose table."  )

    parser.add_option("--set-transpose-field", dest="set_transpose_field", type="string",
                      help="set first field (row 1 and col 1) to this value [%default]."  )

    parser.add_option("--transpose-format", dest="transpose_format", type="choice",
                      choices=("default", "separated", ),
                      help="input format of un-transposed table"  )

    parser.add_option("--expand", dest="expand_table", action="store_true",
                      help="expand table - multi-value cells with be expanded over several rows."  )

    parser.add_option("--no-headers", dest="has_headers", action="store_false",
                      help="matrix has no row/column headers."  )

    parser.add_option("--columns", dest="columns", type="string",
                      help="columns to use."  )

    parser.add_option( "--file", dest="file", type="string",
                      help="columns to test from table.",
                      metavar="FILE" )

    parser.add_option("-d", "--delimiter", dest="delimiter", type="string",
                      help="delimiter of columns." ,
                      metavar="DELIM" )

    parser.add_option("-V", "--invert-match", dest="invert_match", action="store_true",
                      help="invert match." )

    parser.add_option("--sort-by-rows", dest="sort_rows", type="string",
                      help="output order for rows."  )

    parser.add_option("-a", "--value", dest="value", type="float",
                      help="value to use for various algorithms."  )

    parser.add_option("--group", dest="group_column", type="int",
                      help="group values by column. Supply an integer column [default=%default]"  )

    parser.add_option("--group-function", dest="group_function", type="choice",
                      choices=("min", "max", "sum", "mean", "stats", "cat", "uniq"),
                      help="function to group values by."  )

    parser.add_option("--join-table", dest="join_column", type="int",
                      help="join rows in a table by columns."  )

    parser.add_option("--collapse-table", dest="collapse_table", type="string",
                      help="collapse a table. Value determines the missing variable [%default]."  )

    parser.add_option("--join-column-name", dest="join_column_name", type="int",
                      help="use this column as a prefix."  )

    parser.add_option("--flatten-table", dest="flatten_table", action="store_true",
                      help="flatten a table [%default]."  )

    parser.add_option("--as-column", dest="as_column", action="store_true",
                      help="output table as a single column."  )

    parser.add_option("--split-fields", dest="split_fields", action="store_true",
                      help="split fields."  )

    parser.add_option("--separator", dest="separator", type="string",
                      help="separator for multi-valued fields [default=%default]."  )

    parser.add_option( "--fdr-method", dest="fdr_method", type="choice",
                      choices = ( "BH", "bonferroni", "holm", "hommel", "hochberg", "BY" ),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )

    parser.add_option( "--fdr-add-column", dest="fdr_add_column", type="string",
                       help = "add new column instead of replacing existing columns. "
                       "The value of the option will be used as prefix if there are multiple columns [%default]" )

    #IMS: add option to use a column as the row id in flatten
    parser.add_option("--id-column", dest="id_column", type ="string",
                      help="list of column(s) to use as the row id when flattening the table. "
                      "If None, then row number is used. [default=%default].")

    parser.add_option("--variable-name", dest="variable_name", type = "string",
                      help="the column header for the 'variable' column when flattening [default=%default].")

    parser.add_option("--value-name", dest="value_name", type = "string",
                      help="the column header for the 'value' column when flattening [default=%default].")


    parser.set_defaults(
        methods=[],
        scale=1.0,
        has_headers=True,
        format="%5.2f",
        value=0.0,
        parameters="",
        columns="all",
        transpose=False,
        set_transpose_field=None,
        transpose_format="default",
        group=False,
        group_column=0,
        group_function="mean",
        missing_value="na",
        sort_rows=None,
        flatten_table=False,
        collapse_table=None,
        separator=";",
        expand=False,
        join_column=None,
        join_column_name=None,
        compute_fdr=None,
        as_column=False,
        fdr_method="BH",
        fdr_add_column=None,
        id_column=None,
        variable_name="column",
        value_name="value",
        file=None,
        delimiter="\t",
        invert_match=False,
    )
    
    (options, args) = E.Start( parser, add_pipe_options = True )

    options.parameters = options.parameters.split(",")
    
    if options.group_column:
        options.group = True
        options.group_column -= 1

    ######################################################################
    ######################################################################
    ######################################################################
    ## if only to remove header, do this quickly
    if options.methods == ["remove-header"]:
        
        first = True
        for line in options.stdin:
            if line[0] == "#": continue
            if first:
                first = False
                continue
            options.stdout.write( line )

    elif options.transpose or "transpose" in options.methods:

        readAndTransposeTable( options.stdin, options )


    elif options.flatten_table:
        #IMS: bug fixed to make work. Also added options for keying on a
        #     particular column and adding custom column headings

        fields, table  = CSV.ReadTable( options.stdin, with_header = options.has_headers, as_rows = True )
        
        options.columns = getColumns( fields, options.columns )
        
        if options.id_column:
            id_columns = map(lambda x: int(x) -1,options.id_column.split(","))
            id_header = "\t".join([fields[id_column] for id_column in id_columns])
            options.columns = [x for x in options.columns if x not in id_columns]
        else:
            id_header = "row"

        options.stdout.write( "%s\t%s\t%s\n" %(id_header, options.variable_name, options.value_name) )
        
        for x, row in enumerate(table):

            if options.id_column:
                row_id = "\t".join([row[i] for i in id_columns])
            else:
                row_id = str(x)

            for y in options.columns:
                options.stdout.write( "%s\t%s\t%s\n" % (row_id,fields[y], row[y] ))

    elif options.as_column:
        
        fields, table  = CSV.ReadTable( options.stdin, with_header = options.has_headers, as_rows = True )
        options.columns = getColumns( fields, options.columns )
        table = zip( *table )
        
        options.stdout.write( "value\n" )
        
        for column in options.columns:
            options.stdout.write("\n".join( table[column] ) + "\n" )

    elif options.split_fields:

        # split comma separated fields
        fields, table  = CSV.ReadTable( options.stdin, 
                                        with_header = options.has_headers, 
                                        as_rows = True )
        

        options.stdout.write( "%s\n" % ("\t".join(fields)))

        for row in table:
            row = [ x.split(options.separator) for x in row ]
            for d in itertools.product( *row ):
                options.stdout.write( "%s\n" % "\t".join( d ) )
            
    elif options.group:
        readAndGroupTable( options.stdin, options )

    elif options.join_column:
        readAndJoinTable( options.stdin, options )

    elif options.expand_table:
        readAndExpandTable( options.stdin, options )

    elif options.collapse_table is not None:
        readAndCollapseTable( options.stdin, options, options.collapse_table )

    elif "grep" in options.methods:

        options.columns = map(lambda x: int(x)-1, options.columns.split(","))

        patterns = []

        if options.file:
            infile = open( options.file, "r")
            for line in infile:
                if line[0] == "#": continue
                patterns.append( line[:-1].split(options.delimiter)[0] )
        else:
            patterns=args

        for line in options.stdin:

            data = line[:-1].split(options.delimiter)
            found = False

            for c in options.columns:

                if data[c] in patterns:
                    found = True
                    break

            if (not found and options.invert_match) or (found and not options.invert_match):
                print line[:-1]
    else:

        ######################################################################
        ######################################################################
        ######################################################################
        ## Apply remainder of transformations
        fields, table  = CSV.ReadTable( options.stdin, with_header = options.has_headers, as_rows = False )
        # convert columns to list
        table = [ list(x) for x in table]

        ncols = len(fields)
        if len(table) == 0:
            raise ValueError( "table is empty" )
            
        nrows = len(table[0])

        E.info( "processing table with %i rows and %i columns" % (nrows, ncols) )

        options.columns = getColumns( fields, options.columns )
        
        ## convert all values to float
        for c in options.columns:
            for r in range(nrows):
                try:
                    table[c][r] = float(table[c][r])
                except ValueError:
                    continue

        for method in options.methods:

            if method == "normalize-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map( lambda x: x / value, table[c] )

            elif method == "multiply-by-value":

                value = float(options.parameters[0])
                del options.parameters[0]

                for c in options.columns:
                    table[c] = map( lambda x: x * value, table[c] )

            elif method == "normalize-by-max":

                for c in options.columns:
                    m = max( table[c] )
                    table[c] = map( lambda x: x / m, table[c] )

            elif method == "kullback-leibler":
                options.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n")
                for x in range(0,len(options.columns)-1):
                    for y in range(x+1, len(options.columns)):
                        c1 = options.columns[x]
                        c2 = options.columns[y]
                        e1 = 0
                        e2 = 0
                        for z in range(nrows):
                            p = table[c1][z]
                            q = table[c2][z]
                            e1 += p * math.log( p / q )
                            e2 += q * math.log( q / p )

                        options.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (fields[c1], fields[c2],
                                                                       options.format % e1,
                                                                       options.format % e2,
                                                                       options.format % ((e1 + e2) / 2)) )
                E.Stop()
                sys.exit(0)

            elif method == "rank":

                for c in options.columns:
                    tt = table[c]
                    t = zip( tt, range(nrows) )
                    t.sort()
                    for i,n in zip( map(lambda x: x[1], t), range(nrows)):
                        tt[i] = n

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[0])
                del options.parameters[0]
                new_value = float(options.parameters[0])
                del options.parameters[0]

                if method == "upper-bound":
                    for c in options.columns:                
                        for r in range(nrows):
                            if type(table[c][r]) == types.FloatType and \
                                   table[c][r] > boundary:
                                table[c][r] = new_value
                else:
                    for c in options.columns:                
                        for r in range(nrows):
                            if type(table[c][r]) == types.FloatType and \
                                    table[c][r] < boundary:
                                table[c][r] = new_value

            elif method == "fdr":
                pvalues = []
                for c in options.columns: pvalues.extend( table[c] )

                assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % str(max(pvalues))
                assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % str(min(pvalues))

                # convert to str to avoid test for float downstream
                qvalues = map(str, Stats.adjustPValues( pvalues, method = options.fdr_method ))

                if options.fdr_add_column is None:
                    x = 0
                    for c in options.columns: 
                        table[c] = qvalues[x:x+nrows]
                        x += nrows
                else:
                    # add new column headers

                    if len(options.columns) == 1:
                        fields.append( options.fdr_add_column )
                    else:
                        for co in options.columns:
                            fields.append( options.fdr_add_column + fields[co] )

                    x = 0
                    for c in options.columns:
                        # add a new column
                        table.append(qvalues[x:x+nrows])
                        x += nrows
                    ncols += len(options.columns)

            elif method == "normalize-by-table":

                other_table_name = options.parameters[0]
                del options.parameters[0]
                other_fields, other_table  = CSV.ReadTable( open(other_table_name, "r"), with_header = options.has_headers, as_rows = False )

                # convert all values to float
                for c in options.columns:
                    for r in range(nrows):
                        try:
                            other_table[c][r] = float (other_table[c][r] )
                        except ValueError:
                            continue

                ## divide by the other table; where either value is not
                ## numeric or the divisor is 0, set the missing value
                for c in options.columns:            
                    for r in range(nrows):
                        if type(table[c][r]) == types.FloatType and \
                               type(other_table[c][r]) == types.FloatType and \
                               other_table[c][r] != 0:
                               table[c][r] /= other_table[c][r]
                        else:
                            table[c][r] = options.missing_value

        ## convert back
        for c in options.columns:
            for r in range(nrows):
                if type(table[c][r]) == types.FloatType:
                    table[c][r] = options.format % table[c][r]

        options.stdout.write( "\t".join(fields) + "\n" )
        if options.sort_rows:
            old2new = {}
            for r in range(nrows):
                old2new[table[0][r]] = r
            for x in options.sort_rows.split(","):
                if x not in old2new: continue
                r = old2new[x]
                options.stdout.write( "\t".join( [ table[c][r] for c in range(ncols) ] ) + "\n")            
        else:
            for r in range(nrows):
                options.stdout.write( "\t".join( [ table[c][r] for c in range(ncols) ] ) + "\n")

    E.Stop()
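
The kullback-leibler method computes, for every column pair, D(P||Q) = sum_z p_z * log(p_z / q_z) and the reverse D(Q||P); it assumes strictly positive column entries, since log(p/q) is undefined at zero. A standalone check on hypothetical distributions:

import math

p = [0.5, 0.3, 0.2]
q = [0.4, 0.4, 0.2]
kl = sum(pi * math.log(pi / qi) for pi, qi in zip(p, q))
print("%5.2f" % kl)    # D(P||Q)
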
Example #14
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_ribosomes.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--schemas",
                      dest="schemas",
                      type="string",
                      help="schemas in the set.")

    parser.add_option("-e",
                      "--field-extract",
                      dest="field_extract",
                      type="string",
                      help="pattern for the field to extract.")

    parser.add_option("-c",
                      "--field-compare",
                      dest="field_compare",
                      type="string",
                      help="pattern for the field to compare.")

    parser.add_option("-i",
                      "--filename-identifiers",
                      dest="filename_identifiers",
                      type="string",
                      help="identifiers in the positive set.")

    parser.add_option("-u",
                      "--filename-subset",
                      dest="filename_subset",
                      type="string",
                      help="subset in the positive set.")

    parser.add_option("--filter-min-ratio",
                      dest="filter_min_ratio",
                      type="float",
                      help="minimum boundary for filter.")

    parser.add_option("--filter-max-ratio",
                      dest="filter_max_ratio",
                      type="float",
                      help="maximum boundary for filter.")

    parser.add_option(
        "-o",
        "--output-fields",
        dest="output_fields",
        type="string",
        help="output fields, choices are: zscore, val, nvals, sum, min, "
        "max, stddev, mean, median.")

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help="pattern for table headers, should contain %s for schema "
        "and %s for field name.")

    parser.add_option(
        "-f",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("table", "list", "values"),
        help="output format. Tabular form (one row per ortholog) or list form."
    )

    parser.add_option("--format",
                      dest="format",
                      type="string",
                      help="output format for numbers.")

    parser.add_option("--remove-na",
                      dest="remove_na",
                      action="store_true",
                      help="remove entries with any na values.")

    parser.set_defaults(
        field_extract="%s_length",
        field_compare="%s_length",
        filename_identifiers=None,
        filename_subset=None,
        filter_min_ratio=0.00,
        filter_max_ratio=0.00,
        schemas="",
        output_fields="",
        output_pattern="%s_%s",
        output_format="table",
        format="%6.4f",
        remove_na=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    options.schemas = options.schemas.split(",")
    if not options.schemas:
        raise "please supply schemas."

    if options.output_fields:
        options.output_fields = options.output_fields.split(",")
    else:
        options.output_fields = ()

    fields, table = CSV.ReadTable(sys.stdin)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    if options.loglevel >= 1:
        options.stdlog.write("# read a %i x %i table.\n" %
                             (len(table), len(fields)))

    if options.filename_subset:
        subset, nerrors = IOTools.ReadList(open(options.filename_subset, "r"))
        subset = set(subset)

        table = filter(lambda x: x[0] in subset, table)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# subset of %i entries reduced table to a %i x %i table.\n" %
                (len(subset), len(table), len(fields)))

    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
    else:
        identifiers = []

    identifiers = set(identifiers)

    # extract rows with positive identifiers
    positive_rows = filter(lambda x: x[0] in identifiers, table)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# subset of %i identifiers gives %i positive entries.\n" %
            (len(identifiers), len(positive_rows)))

    if options.output_format == "table":
        options.stdout.write("id")
        for schema in options.schemas:
            if options.output_fields:
                for field in options.output_fields:
                    options.stdout.write("\t" + options.output_pattern %
                                         (schema, field))
            else:
                options.stdout.write("\t%s" % (schema))

        options.stdout.write("\n")
    else:
        options.stdout.write("schema\tvalue\n")

    if identifiers:
        for row in positive_rows:

            if options.output_format == "table":
                options.stdout.write(row[0])

            for schema in options.schemas:

                # set fields for extraction
                f_extract = map_fields2column[options.field_extract % schema]
                f_compare = map_fields2column[options.field_compare % schema]

                # get region for extraction
                if row[f_compare] != "na":
                    r = float(row[f_compare])
                    if options.filter_min_ratio or options.filter_max_ratio:
                        mi = r * options.filter_min_ratio
                        ma = r * options.filter_max_ratio
                        f = lambda x: x[f_compare] != "na" and float(
                            x[f_compare]
                        ) >= mi and float(x[f_compare]) <= ma and x[
                            0] not in identifiers and x[f_extract] != "na"
                    else:
                        f = lambda x: x[0] not in identifiers and x[f_extract
                                                                    ] != "na"
                    # extract values: filter by minimum and maximum range and remove
                    # positive identifiers.
                    v = float(row[f_extract])
                    values = map(lambda x: float(x[f_extract]),
                                 filter(f, table))

                    stats = Stats.DistributionalParameters(values)
                else:
                    v = None

                for field in options.output_fields:

                    if v is not None:
                        if field == "zscore":
                            f = options.format % stats.getZScore(v)
                        elif field == "diff":
                            f = options.format % (v - stats["mean"])
                        elif field == "reldiff":
                            f = options.format % (
                                (v - stats["mean"]) / stats["mean"])
                        elif field == "val":
                            f = options.format % v
                        else:
                            f = options.format % stats[field]
                    else:
                        f = "na"

                    if options.output_format == "table":
                        options.stdout.write("\t%s" % f)
                    elif options.output_format == "list":
                        options.stdout.write("%s\t%s\n" % (schema, f))
                    elif options.output_format == "values":
                        options.stdout.write(
                            "%s\t%s\t%5.2f\t%s\n" %
                            (row[0], schema, v, ",".join(
                                map(lambda x: options.format % x, values))))

            if options.output_format == "table":
                options.stdout.write("\n")

    else:

        extract_columns = []

        for schema in options.schemas:
            extract_columns.append(map_fields2column[options.field_extract %
                                                     schema])

        # simply dump a subset of values
        for row in table:

            skip = False

            if options.filter_min_ratio or options.filter_max_ratio:

                master = options.schemas[0]

                v = row[map_fields2column[options.field_compare % master]]

                if v == "na":
                    continue

                v = float(v)

                mi = v * options.filter_min_ratio
                ma = v * options.filter_max_ratio

                for schema in options.schemas[1:]:

                    r = row[map_fields2column[options.field_compare % schema]]

                    if r == "na":
                        if options.remove_na:
                            skip = True
                        continue

                    r = float(r)

                    if r < mi or r > ma:
                        skip = True
                        if options.loglevel >= 3:
                            if options.format == "table":
                                options.stdout.write("* ")
                                options.stdout.write("%s\t" % row[0])
                                options.stdout.write("\t".join(
                                    [row[y] for y in extract_columns]))
                                options.stdout.write("\n")
                        break

            if skip:
                continue

            if options.output_format == "table":
                options.stdout.write("%s\t" % row[0])
                options.stdout.write("\t".join(
                    [row[y] for y in extract_columns]))
                options.stdout.write("\n")

            elif options.output_format == "list":
                has_na = False
                for x in range(len(options.schemas)):
                    v = row[extract_columns[x]]
                    if v == "na":
                        has_na = True

                if has_na and options.remove_na:
                    continue

                for x in range(len(options.schemas)):
                    options.stdout.write(
                        "%s\t%s\n" %
                        (options.schemas[x], row[extract_columns[x]]))

    E.Stop()
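
The zscore output field standardizes the extracted value against the distribution over the background rows. A minimal stand-in for Stats.DistributionalParameters().getZScore(), assuming the usual (v - mean) / stddev definition (an assumption, not the CGAT implementation):

def zscore(v, values):
    # standard score of v relative to the sample in values
    mean = sum(values) / float(len(values))
    stddev = (sum((x - mean) ** 2 for x in values) / float(len(values))) ** 0.5
    return (v - mean) / stddev
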