Code example #1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/export_clade_data.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--filename-groups",
                      dest="filename_groups",
                      type="string",
                      help="filename with orthologous groups to extract.")

    parser.set_defaults(
        table_name_malis="malis_genes_aa",
        table_name_members="groups_members",
        mode="sequences",
        filename_groups=None,
        output_format="fasta",
        separator="|",
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    # database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    # initialise so the output block below cannot fail with a NameError
    # when no group file is given
    result = []
    if options.filename_groups:
        data, errors = IOTools.ReadList(open(options.filename_groups, "r"))
        groups = [x.split(options.separator)[:2] for x in data]

        result = getMembersOfGroups(dbhandle, groups, options)

    if options.output_format == "fasta":
        for schema, gene_id, sequence in result:
            # write FASTA records to the output stream; stdlog is reserved
            # for log messages
            options.stdout.write(">%s%s%s\n%s\n" %
                                 (schema, options.separator, gene_id,
                                  re.sub("-", "", sequence)))

    E.Stop()
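
All of the examples on this page revolve around IOTools.ReadList from the CGAT utility library. Its implementation is not shown here, so the following is only a minimal stand-in reconstructed from the call sites: it reads one value per line from a file-like object, skips blank and comment lines, applies an optional map_function and an optional map_category lookup, and returns the parsed values together with the lines that could not be parsed.

# Hypothetical sketch of IOTools.ReadList, inferred from its callers below;
# the real CGAT implementation may differ in detail.
def read_list(infile, map_function=str, map_category=None):
    values, errors = [], []
    for line in infile:
        line = line.rstrip("\n")
        if not line or line.startswith("#"):
            continue
        try:
            value = map_function(line)
            if map_category:
                # translate a category label into its mapped value
                value = map_category[value]
            values.append(value)
        except (ValueError, KeyError):
            errors.append(line)
    return values, errors
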
Code example #2
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1",
                      "--infile",
                      dest="filename_input",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = open(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = open(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)
            # compute power of test
            power = R.power_t_test(n=len(values),
                                   delta=abs(stat["mean"]),
                                   sd=stat["stddev"],
                                   sig_level=0.05)['power']
            diff_at_power95 = R.power_t_test(n=len(values),
                                             power=0.95,
                                             sd=stat["stddev"],
                                             sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    if power:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.Stop()
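
One idiom in this example (repeated in example #4) deserves a note: leftover positional arguments are forwarded to the R test function, with any token of the form key=value turned into a keyword argument. Extracted as a standalone helper, with invented input for illustration:

def split_extra_args(args):
    # split trailing command-line tokens into positional and keyword arguments
    kwargs, xargs = {}, []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=", 1)  # split on the first '=' only
            kwargs[key] = value
        else:
            xargs.append(arg)
    return xargs, kwargs

# split_extra_args(["two.sided", "mu=0.5"]) -> (["two.sided"], {"mu": "0.5"})
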
Code example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="string",
        help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.Start(
        parser,
        add_pipe_options=True,
        add_psql_options=True,
    )

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map,
                                                  "r"),
                                             map_functions=(str, float))

    values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"),
                                        map_category=map_category2value)
    values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                        map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
      )

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')"""
      )
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')"""
      )
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )

    print "## Results for %s" % result['method']
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print x, result[x]

    E.Stop()
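
This script also relies on IOTools.ReadMap to build the category-to-value dictionary that ReadList uses for translating labels into numbers. As with ReadList, the real code is not shown; a plausible minimal version, assuming a two-column tab-separated input, would be:

# Hypothetical sketch of IOTools.ReadMap; column one maps to column two.
def read_map(infile, map_functions=(str, str)):
    key_function, value_function = map_functions
    result = {}
    for line in infile:
        if line.startswith("#"):
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) >= 2:
            result[key_function(fields[0])] = value_function(fields[1])
    return result
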
Code example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for histograms."
                      "")
    parser.add_option("-f",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n",
        "--norm-test",
        dest="norm_test",
        action="store_true",
        help=
        """test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot",
                      dest="plot",
                      action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title",
                      dest="title",
                      type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map,
                                                  "r"),
                                             map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n"
            % (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                            map_function=f,
                                            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=False,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=True,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:

        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and (options.min_value or options.max_value):
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
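
The break-point computation buried in the plotting section is the trickiest part of this example. Pulled out into a function of its own (a sketch of the same logic, not code from the original script), it reads:

def compute_breaks(values1, values2, num_bins=0, bin_size=None,
                   min_value=None, max_value=None):
    # returns either an integer bin count or an explicit list of break points
    lo = min(min(values1), min(values2))
    if min_value is not None:
        lo = min(lo, min_value)
    hi = max(max(values1), max(values2))
    if max_value is not None:
        hi = max(hi, max_value)

    if num_bins and min_value is None and max_value is None:
        return num_bins  # let R place num_bins equally sized bins
    if num_bins:
        size = float(hi - lo) / (num_bins + 1)
        return [lo + x * size for x in range(num_bins)]
    if bin_size is not None:
        n = int((hi - lo) / bin_size) + 1
        return [lo + x * bin_size for x in range(n + 1)]
    return None  # fall back to R's default binning
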
Code example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2malis.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-d",
        "--pattern-output",
        dest="pattern_output",
        type="string",
        help="filename pattern for output multiple alignment files.")

    parser.add_option("-f",
                      "--filename-filter",
                      dest="filename_filter",
                      type="string",
                      help="filename with strings to filter by.")

    parser.add_option("--list-filter",
                      dest="list_filter",
                      type="string",
                      help="list of strings to filter by.")

    parser.set_defaults(
        pattern_output="%s.mali",
        methods="",
        parameters="",
        filename_filter=None,
        list_filter=None,
    )

    addOptions(parser)

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    if not options.pattern_mali:
        raise "Please specifiy a pattern to find the malis using --pattern-mali"

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read components
    ####################################################################
    map_seq_id2component, map_component2seq_id, map_component2input_id = \
        readComponents( options )

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read filtering information
    ####################################################################
    if options.filename_filter:
        id_filter, nerrors = IOTools.ReadList(
            open(options.filename_filter, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read %i identifiers to filter each multiple alignment with.\n"
                % len(id_filter))
            options.stdlog.flush()
    elif options.list_filter:
        id_filter = options.list_filter.split(",")
    else:
        id_filter = None

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read regions to mask
    ####################################################################
    map_component2masks = readMasks(options, map_component2input_id)

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read regions to extract
    ####################################################################
    map_component2extracts = readExtracts(options, map_component2input_id)

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read regions to annotate
    ####################################################################
    map_component2annotations = readAnnotations(options,
                                                map_component2input_id)

    ####################################################################
    ####################################################################
    ####################################################################
    ## Prepare for run
    ####################################################################
    component_ids = map_component2seq_id.keys()
    component_ids.sort()

    if options.loglevel >= 1:
        options.stdlog.write("# %i component ids to start with.\n" %
                             (len(component_ids)))

    component_ids, map_sample2reference = selectComponents(
        component_ids, map_component2seq_id, map_component2input_id, id_filter,
        options)

    if options.test:
        component_ids = component_ids[:options.test]

    if options.loglevel >= 1:
        options.stdlog.write("# %i component ids selected for output.\n" %
                             (len(component_ids)))

    ninput = 0
    noutput = 0
    nskipped = 0
    nskipped_length = 0

    for component_id in component_ids:

        ninput += 1

        if options.loglevel >= 3:
            options.stdlog.write("# processing component %s\n" %
                                 (component_id))

        mali = getMali(component_id, map_component2seq_id,
                       map_component2input_id, id_filter, options)

        if mali is None:
            E.warn("empty mali returned for component %s" % (component_id))
            nskipped += 1
            continue

        if mali.getNumColumns() == 0:
            E.warn("skipping output of empty alignment for component %s" %
                   (component_id))
            nskipped += 1
            continue

        mali.setName(str(component_id))

        ###############################################################
        ## add annotations
        if map_component2annotations is not None:
            annotateAlignment(mali, map_component2annotations, options)

        ###############################################################
        ## mask the alignment
        maskAlignment(mali, map_component2masks, map_component2extracts,
                      map_sample2reference, options)

        if mali.getNumColumns() < options.minimum_mali_length:
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped, because length %i less than threshold.\n"
                    % (component_id, mali.getNumColumns()))
            continue

        ###############################################################
        ## prepare the mali for output
        if "%s" not in options.pattern_output:
            append = True
        else:
            append = False

        output_filename = re.sub("%s", component_id, options.pattern_output)
        input_id = map_component2input_id[component_id]

        if options.loglevel >= 2:
            options.stdlog.write(
                "# component %s: input from %s, goes to %s\n" %
                (component_id, input_id, output_filename))

        dirname = os.path.dirname(output_filename)

        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)

        if not os.path.exists(output_filename):
            mali.writeToFile(open(output_filename, "w"),
                             format=options.output_format)
            noutput += 1
        else:
            if append:
                mali.writeToFile(open(output_filename, "a"),
                                 format=options.output_format)
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# skipping because output for component %s already exists: %s\n"
                        % (component_id, output_filename))
                nskipped += 1

        # if we only sample, stop if you have reached
        # the desired number
        if options.sample and noutput == options.sample:
            break

    E.info("ninput=%i, noutput=%i, nskipped=%i, nskipped_length=%i" %
           (ninput, noutput, nskipped, nskipped_length))

    E.Stop()
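
A small but important convention in this script: if --pattern-output contains a %s placeholder, every component is written to its own file; without a placeholder, all alignments are appended to one fixed file. Condensed into a helper (the file names below are invented):

import re

def output_target(pattern, component_id):
    # returns (filename, append) following the script's naming convention
    append = "%s" not in pattern
    filename = re.sub("%s", component_id, pattern)
    return filename, append

# output_target("%s.mali", "cluster12") -> ("cluster12.mali", False)
# output_target("all.mali", "cluster12") -> ("all.mali", True)
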
Code example #6
                        aggregate="mean",
                        value_format="%5.2f",
                        method="counts")

    (options, args) = E.Start(parser)

    if not options.filename_map:
        raise "please supply filename mapping probesets to identifiers."

    map_probe2locus = IOTools.ReadMap(open(options.filename_map, "r"))

    matrix, row_headers, col_headers = MatlabTools.readMatrix(
        sys.stdin, format="full", headers=options.headers)

    if options.filename_tissues:
        tissues, nerrors = IOTools.ReadList(open(options.filename_tissues,
                                                 "r"))
        tissues = set(tissues)
        columns = []
        for x in range(len(col_headers)):
            if col_headers[x] in tissues:
                columns.append(x)
    else:
        columns = range(len(col_headers))

    nrows, ncols = len(row_headers), len(col_headers)

    ninput, noutput, nkept = 0, 0, 0

    no_map = []
    degenerate = []
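
This snippet is truncated at both ends, but the column-selection idiom at its centre is complete: keep only the matrix columns whose header appears in the tissue whitelist. In isolation:

def select_columns(col_headers, tissues):
    # indices of the columns whose header is in the given whitelist
    wanted = set(tissues)
    return [x for x, header in enumerate(col_headers) if header in wanted]

# select_columns(["liver", "brain", "kidney"], ["brain"]) -> [1]
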
Code example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2summary.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-a",
        "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na"),
        help="alphabet to use [default=%default].",
    )

    parser.add_option(
        "-p",
        "--pattern-mali",
        dest="pattern_mali",
        type="string",
        help="filename pattern for input multiple alignment files.")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        mask_chars="nN",
        gap_chars="-.",
        alphabet="na",
        pattern_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.pattern_mali:
        prefix_header = "prefix\t"
        prefix_row = "\t"
    else:
        prefix_header = ""
        prefix_row = ""

    options.stdout.write(
        "%sncol_mean\tpcol_mean\tncol_median\tpcol_median\tnrow_mean\tprow_mean\tnrow_median\tprow_median\n"
        % (prefix_header, ))

    ninput, nskipped, noutput, nempty = 0, 0, 0, 0

    if options.pattern_mali:

        ids, errors = IOTools.ReadList(sys.stdin)

        E.debug("read %i identifiers.\n" % len(ids))

        nsubstitutions = len(re.findall("%s", options.pattern_mali))

        for id in ids:

            filename = options.pattern_mali % tuple([id] * nsubstitutions)
            ninput += 1

            if not os.path.exists(filename):
                nskipped += 1
                continue

            # read multiple alignment in various formats
            mali = Mali.Mali()
            mali.readFromFile(open(filename, "r"), format=options.input_format)

            if mali.isEmpty():
                nempty += 1
                continue

            E.debug("read mali with %i entries from %s.\n" %
                    (len(mali), filename))

            if analyzeMali(mali, options, prefix_row="%s\t" % id):
                noutput += 1

    else:

        # read multiple alignment in various formats
        mali = Mali.Mali()
        mali.readFromFile(sys.stdin, format=options.input_format)
        ninput += 1

        if mali.isEmpty():
            nempty += 1
        else:
            E.debug("read mali with %i entries." % (len(mali)))

            if analyzeMali(mali, options, prefix_row=""):
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nempty=%i." %
           (ninput, noutput, nskipped, nempty))

    E.Stop()
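
Example #7 derives one filename per identifier from a pattern that may contain several %s placeholders, all filled with the same id; counting the placeholders first keeps the substitution generic. The same trick in isolation:

import re

def build_filename(pattern, identifier):
    # fill every %s in the pattern with the same identifier
    nsubstitutions = len(re.findall("%s", pattern))
    return pattern % tuple([identifier] * nsubstitutions)

# build_filename("malis/%s/%s.fasta", "gene1") -> "malis/gene1/gene1.fasta"
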
Code example #8
File: filter_fasta.py  Project: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/filter_fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("longest-transcript", "ids", "quality"),
        help=
        """method to apply to sequences ["longest-transcript", "ids", "quality"]."""
    )

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-t",
                      "--type",
                      dest="type",
                      type="choice",
                      choices=("aa", "na"),
                      help="sequence type (aa or na).")

    parser.set_defaults(
        methods="",
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        separator="|",
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.FastaIterator(sys.stdin)

    if options.method == "quality":
        filter_quality = set(options.parameters)
    else:
        filter_quality = None

    sequences = []
    ninput, noutput, nskipped = 0, 0, 0

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        ninput += 1

        if filter_quality:
            id = re.split(" ", cur_record.title)[0]
            species, transcript, gene, quality = id.split(options.separator)

            if quality not in filter_quality:
                nskipped += 1
                continue

        sequences.append(cur_record)

    take = None

    if options.method == "longest-transcript":

        take = []
        lengths = []
        for x in range(len(sequences)):
            # store negative length so that sorting puts the longest
            # transcript of each gene first
            length = len(re.sub(" ", "", sequences[x].sequence))
            id = re.split(" ", sequences[x].title)[0]
            species, transcript, gene = id.split(options.separator)[:3]
            lengths.append((species, gene, -length, x))

        lengths.sort()

        last_species = None
        last_gene = None

        for species, gene, _, x in lengths:
            if last_species == species and last_gene == gene:
                continue
            take.append(x)
            last_species, last_gene = species, gene

    elif options.method == "ids":

        take = []
        ids, nerrors = IOTools.ReadList(open(options.parameters[0], "r"))
        del options.parameters[0]

        ids = set(ids)

        for x in range(len(sequences)):
            id = re.split(" ", sequences[x].title)[0]
            if id in ids:
                take.append(x)

    if take is not None:
        sequences = [sequences[x] for x in take]

    noutput = len(sequences)

    for sequence in sequences:
        options.stdout.write(">%s\n%s\n" % (sequence.title, sequence.sequence))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
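
The longest-transcript filter above rests on a sorting trick: order records by (species, gene, -length) so that the longest transcript of each gene comes first, then keep only the first record per (species, gene) pair. Reduced to its essentials:

def pick_longest(records):
    # records: iterable of (species, gene, length, index) tuples
    keyed = sorted((species, gene, -length, index)
                   for species, gene, length, index in records)
    take, last = [], None
    for species, gene, _, index in keyed:
        if (species, gene) == last:
            continue  # a longer transcript of this gene was already taken
        take.append(index)
        last = (species, gene)
    return take

# pick_longest([("hs", "g1", 10, 0), ("hs", "g1", 30, 1)]) -> [1]
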
Code example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--filename-filter-positives",
                      dest="filename_filter_positives",
                      type="string",
                      help="filename with positive list of trees to analyze.")

    parser.add_option("-s",
                      "--filename-species-tree",
                      dest="filename_species_tree",
                      type="string",
                      help="filename with species tree.")

    parser.add_option(
        "--filename-species2colour",
        dest="filename_species2colour",
        type="string",
        help=
        "filename with map of species to colours. If not given, random colours are assigned to species."
    )

    parser.add_option("-t",
                      "--species-tree",
                      dest="species_tree",
                      type="string",
                      help="species tree.")

    parser.add_option(
        "-e",
        "--filename-locations",
        dest="filename_locations",
        type="string",
        help=
        "filename with map of transcript information to location information.")

    parser.add_option("--no-create",
                      dest="create",
                      action="store_false",
                      help="do not create files, but append to them.")

    parser.add_option(
        "--max-separation",
        dest="max_separation",
        type="int",
        help=
        "maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough)."
    )

    parser.add_option(
        "--filename-species2url",
        dest="filename_species2url",
        type="string",
        help="filename with mapping information of species to URL.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to add as first column.")

    parser.add_option(
        "--outgroup-species",
        dest="outgroup_species",
        type="string",
        help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option("--subtrees-trees",
                      dest="subtrees_trees",
                      action="store_true",
                      help="write trees for subtrees.")

    parser.add_option("--subtrees-identifiers",
                      dest="subtrees_identifiers",
                      action="store_true",
                      help="write identifiers of subtrees.")

    parser.add_option("--svg-add-ids",
                      dest="svg_add_ids",
                      action="store_true",
                      help="add node ids to svg plot.")

    parser.add_option("--svg-otus",
                      dest="svg_otus",
                      type="string",
                      help="otus to output in svg species tree.")

    parser.add_option("--svg-branch-lenghts",
                      dest="svg_branch_lengths",
                      type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")

    parser.add_option("--print-totals",
                      dest="print_totals",
                      action="store_true",
                      help="output totals sections.")

    parser.add_option("--print-subtotals",
                      dest="print_subtotals",
                      action="store_true",
                      help="output subtotals sections.")

    parser.add_option(
        "--print-best",
        dest="print_best",
        action="store_true",
        help="output best node assignment for each node in gene tree.")

    parser.add_option("--print-svg",
                      dest="print_svg",
                      action="store_true",
                      help="output svg files.")

    parser.add_option("--print-species-svg",
                      dest="print_species_svg",
                      action="store_true",
                      help="output species svg files.")

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        """output pattern for separate output of sections [default: %default].
                       Set to None, if output to stdout. Can contain one %s to be substituted with section."""
    )

    parser.add_option(
        "--output-pattern-svg",
        dest="output_pattern_svg",
        type="string",
        help=
        "filename for svg output. If it contains %s, this is replaced by gene_tree name."
    )

    parser.add_option(
        "--filename-node-types",
        dest="filename_node_types",
        type="string",
        help="filename with node type information from a previous run.")

    parser.add_option("--analyze-resolution-data",
                      dest="analyze_resolution_data",
                      type="choice",
                      action="append",
                      choices=("stats", "histograms"),
                      help="stdin is resolution data.")

    parser.add_option("--filter-quality",
                      dest="filter_quality",
                      type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")

    parser.add_option("--filter-location",
                      dest="filter_location",
                      type="choice",
                      choices=("all", "local", "non-local", "cis", "unplaced"),
                      help="filter predictions by location.")

    parser.add_option("--remove-unplaced",
                      dest="remove_unplaced",
                      action="store_true",
                      help="remove predictions on unplaced contigs.")

    parser.add_option("--skip-without-outgroups",
                      dest="skip_without_outgroups",
                      action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        species_regex="^([^|]+)\|",
        gene_regex="^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={
            "Speciation": 0,
            "SpeciationDeletion": 1,
            "Transcripts": 2,
            "DuplicationLineage": 3,
            "Duplication": 4,
            "DuplicationDeletion": 5,
            "DuplicationInconsistency": 6,
            "Outparalogs": 7,
            "InconsistentTranscripts": 8,
            "Inconsistency": 9,
            "Masked": 10
        },
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    #########################################################################
    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        filter_positives, nerrors = IOTools.ReadList(
            open(options.filename_filter_positives, "r"))
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    #########################################################################
    #########################################################################
    # read location info
    #########################################################################
    if options.filename_locations:
        map_id2location = TreeReconciliation.readLocations(
            open(options.filename_locations, "r"), extract_species)
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all"
        ) and not options.filename_locations:
        raise "please supply a file with location information."

    #########################################################################
    #########################################################################
    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "details", "trees",
                        "nodes", "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    #########################################################################
    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #######################################################################
        #######################################################################
        #######################################################################
        # get identifier for this tree and update prefixes accordingly
        #######################################################################
        if options.prefix:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
                prefix_prefix = options.prefix + "_" + gene_tree.name + "_"
                prefix_name = options.prefix + "_" + gene_tree.name
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
                prefix_prefix = options.prefix + "_"
                prefix_name = options.prefix
        else:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
                prefix_prefix = gene_tree.name + "_"
                prefix_name = gene_tree.name
            else:
                prefix_header, prefix_row, prefix_prefix, prefix_name = "", "", "", ""

        #######################################################################
        #######################################################################
        #######################################################################
        # apply filters to gene tree
        #######################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        this_species_list = map(extract_species, otus)
        # check, if only outgroups
        if options.outgroup_species:
            if not set(this_species_list).difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n"
                        % gene_tree.name)
                continue

            if options.skip_without_outgroups and not set(
                    this_species_list).intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" %
                        gene_tree.name)
                continue

        #######################################################################
        #######################################################################
        #######################################################################
        # reroot gene tree, if outgroups have been given.
        #######################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #######################################################################
        #######################################################################
        #######################################################################
        # compute distance to root for each node
        #######################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #######################################################################
        #######################################################################
        #######################################################################
        # compute counts
        #######################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to maximum distance to root)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = TreeReconciliation.getAnalysisSets(
            gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root,
            gene_tree,
            gene_set,
            options,
            extract_species,
            method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n"
                    % gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name, options.format_branch_length % height,
                     node_id, node.data.taxon))

            relheight = height / reference_height
            try:
                heights_per_species[species].append(height)
            except KeyError:
                heights_per_species[species] = [height]
                relheights_per_species[species] = []

            relheights_per_species[species].append(relheight)

            # do not use outgroup species
            if options.outgroup_species and species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree, options,
                        prefix_header, prefix_row)

        #######################################################################
        #######################################################################
        #######################################################################
        # update total counts
        #######################################################################
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)

        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
            prefix_prefix = options.prefix + "_" + "total" + "_"
            prefix_name = options.prefix + "_" + "total"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"
            prefix_prefix = "total" + "_"
            prefix_name = "total"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree, options,
                    prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n"
            % (ninput, nfiltered, nskipped, nskipped_filter,
               nskipped_outgroups, noutput))

    E.Stop()
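
A readability note on example #9: heights and relative heights are accumulated per species with a try/except KeyError block; the same bookkeeping is usually written with collections.defaultdict today. An equivalent sketch:

from collections import defaultdict

def accumulate_heights(samples):
    # samples: iterable of (species, height, relheight) tuples
    heights_per_species = defaultdict(list)
    relheights_per_species = defaultdict(list)
    for species, height, relheight in samples:
        heights_per_species[species].append(height)
        relheights_per_species[species].append(relheight)
    return heights_per_species, relheights_per_species
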
Code example #10
File: malis2mali.py  Project: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    addOptions(parser)

    parser.add_option(
        "--filename-coordinates",
        dest="filename_coordinates",
        type="string",
        help="filename of coordinates that constitute the multiple alignment.")

    parser.add_option("--filename-identifiers",
                      dest="filename_identifiers",
                      type="string",
                      help="filename with list of identifiers to use.")

    parser.add_option(
        "-x",
        "--pattern-identifier",
        dest="pattern_identifier",
        type="string",
        help="pattern to extract identifier from a sequence header.")

    parser.add_option(
        "-w",
        "--width",
        dest="width",
        type="int",
        help=
        "width of an alignment column (choose 3 for codon alignments) [default=%default]."
    )

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      choices=("filter-variants", ),
                      help="methods to apply")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("--mask-acgtn",
                      dest="mask_actgn",
                      action="store_true",
                      help="mask. Anything not [ACGTN] will be N.")

    parser.set_defaults(
        pattern_identifier="(^\S+)",
        methods=[],
        parameters="",
        filename_identifiers=None,
        filename_coordinates=None,
        mask_acgtn=False,
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    if not options.pattern_mali:
        raise ValueError(
            "Please specifiy a pattern to find the malis using --pattern-mali")

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read components
    ####################################################################
    map_seq_id2component, map_component2seq_id, map_component2input_id = \
        readComponents( options )

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read regions to mask
    ####################################################################
    map_component2masks = readMasks(options, map_component2input_id)

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read regions to extract
    ####################################################################
    map_component2extracts = readExtracts(options, map_component2input_id)

    ####################################################################
    ####################################################################
    ####################################################################
    ## read identifiers
    ####################################################################
    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
        identifiers_set = set(identifiers)
    else:
        identifiers = None
        identifiers_set = None

    ####################################################################
    ####################################################################
    ####################################################################
    ## Prepare for run
    ####################################################################

    rx_identifier = re.compile(options.pattern_identifier)
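    # e.g. with the default pattern "(^\S+)", a header such as
    # "species|gene1 some description" yields the identifier "species|gene1"
    # (illustrative header; the pattern is user-configurable via -x).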

    ## build list of concatenated malis
    sequences = {}
    if identifiers:
        for id in identifiers_set:
            sequences[id] = []
    else:
        identifiers_set = set()
        for seq_id in map_seq_id2component.keys():
            id = rx_identifier.search(seq_id).groups()[0]
            sequences[id] = []
            identifiers_set.add(id)
        identifiers = list(identifiers_set)
        identifiers.sort()

    component_ids = map_component2seq_id.keys()
    component_ids.sort()

    if options.test:
        component_ids = component_ids[:options.test]

    ####################################################################
    ####################################################################
    ####################################################################
    ## Build list of components to output.
    ####################################################################
    component_ids, map_sample2reference = selectComponents(
        component_ids, map_component2seq_id, map_component2input_id, None,
        options)

    nskipped = 0
    new_component_ids = []

    for component_id in component_ids:

        try:
            mali = getMali(component_id, map_component2seq_id,
                           map_component2input_id, None, options)
        except OSError, msg:
            E.warn("could not find mali %s: %s" % (component_id, msg))
            nskipped += 1
            continue

        ###############################################################
        ###############################################################
        ###############################################################
        ## check if all identifiers in component are present in mali
        ## and build a temporary alignment with all of those found
        component_set = set(map_component2seq_id[component_id])
        if len(component_set.difference(set(mali.getIdentifiers()))) != 0:
            nskipped += 1
            continue

        found = {}
        is_double = None
        temp_mali = Mali.Mali()
        temp_mali.setName(str(component_id))

        for seq_id in map_component2seq_id[component_id]:
            id = rx_identifier.search(seq_id).groups()[0]
            if id not in identifiers_set:
                continue
            if id in found:

                if options.skip_doubles:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# component %s: removed double entry %s\n" %
                            (component_id, seq_id))
                    continue
                else:
                    is_double = id
                    break

            if options.output_format == "codeml":
                if len(mali[seq_id]) % 3 != 0:
                    raise "length of sequence %s is not a multiple of 3: %i" % (
                        seq_id, len(mali[seq_id]))

            ## change identifier to id
            found[id] = True
            entry = mali.getEntry(seq_id)
            temp_mali.addSequence(id, entry.mFrom, entry.mTo, entry.mString)

        if is_double:
            nskipped += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped because it contains double entry %s\n"
                    % (component_id, is_double))
            continue

        if set(found.keys()) != identifiers_set:
            nskipped += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped because incomplete: %s\n" %
                    (component_id, str(found.keys())))
            continue

        ###############################################################
        ###############################################################
        ###############################################################
        ## mask the temporary alignment
        maskAlignment(temp_mali, map_component2masks, map_component2extracts,
                      map_sample2reference, options)

        for id, o in temp_mali.items():
            if options.mask_acgtn:
                s = re.sub("[^ACGTNactgn]", "N", o.mString)
            else:
                s = o.mString
            sequences[id].append(s)

        new_component_ids.append(component_id)

        ## if we are only sampling, stop once the desired
        ## number of components has been reached
        if options.sample and len(new_component_ids) == options.sample:
            break
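
The excerpt ends before the concatenated blocks are written out. A minimal sketch of the presumed output step (writeConcatenated is an illustrative name, not part of the original script; plain FASTA output is assumed):

def writeConcatenated(sequences, identifiers, outfile):
    """write the concatenated alignment blocks as FASTA (illustrative)."""
    for id in identifiers:
        outfile.write(">%s\n%s\n" % (id, "".join(sequences[id])))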
Code Example #11
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/regions2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option(
        "-i",
        "--ids",
        dest="ids",
        type="string",
        help=
        "comma separated list of prediction ids. Use 'all' to use all predictions."
    )

    parser.add_option("-f",
                      "--filename-ids",
                      dest="filename_ids",
                      type="string",
                      help="filename with prediction ids.")

    parser.add_option("-t",
                      "--type",
                      dest="type",
                      type="choice",
                      choices=("genes", "mrnas", "introns", "intronic",
                               "exons", "exonic", "intergenic",
                               "exons-third-codons"),
                      help="type to output.")

    parser.add_option(
        "-e",
        "--extend-region",
        dest="extend_region",
        type="int",
        help="regions are extended by this margin at either end.")

    parser.add_option(
        "-r",
        "--shorten-region",
        dest="shorten_region",
        type="int",
        help="regions are shortened by this margin at either end.")

    parser.add_option("-m",
                      "--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of segment.")

    parser.add_option("-s",
                      "--schema",
                      dest="schema",
                      type="string",
                      help="schema to take data from.")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("fasta", "table", "region"),
                      help="output formats.")

    parser.add_option("--fasta-format",
                      dest="fasta_format",
                      type="choice",
                      choices=("id-coordinates", "coordinates",
                               "schema-coordinates"),
                      help="output formats for fasta formatted headers.")

    parser.add_option("--orthologs",
                      dest="orthologs",
                      action="store_true",
                      help="lookup up orthologs of prediction ids.")

    parser.add_option("--multiple",
                      dest="multiple",
                      action="store_true",
                      help="""lookup up predictions in multiple species.
                       Identifiers should be given as schema|prediction_id[|additional_fields].
                       Note that the genome file locations have to be consistent."""
                      )

    parser.add_option("--id-format",
                      dest="id_format",
                      type="choice",
                      choices=("id", "schema-id", "full"),
                      help="output format for ids.")

    parser.add_option("--taboo-regions",
                      dest="taboo_regions",
                      type="choice",
                      choices=("same", "both"),
                      help="check for overlap in same/both strands.")

    parser.add_option("--filename-taboo-regions",
                      dest="filename_taboo_regions",
                      type="string",
                      help="filename with information about taboo regions.")

    parser.add_option(
        "--filename-properties",
        dest="filename_properties",
        type="string",
        help=
        "filename with mapping information between features and properties.")

    parser.add_option(
        "--invert-properties",
        dest="invert_properties",
        action="store_true",
        help=
        "instead of printing features which have properties, print those that do not."
    )

    parser.add_option(
        "--output-coordinate-format",
        dest="output_coordinate_format",
        type="choice",
        choices=("full", "long"),
        help=
        """output format of coordinates. Output format is contig:strand:from:to in zero based
/forward/reverse strand coordinates in open/closed notation. 'long' includes the contig length as fifth field"""
    )

    parser.set_defaults(genome_file="genome",
                        identifiers=None,
                        filename_ids="-",
                        ids=None,
                        extend_region=0,
                        shorten_region=0,
                        tablename_predictions="predictions",
                        tablename_exons="exons",
                        tablename_genes="genes",
                        tablename_quality="quality",
                        schema=None,
                        output_format="fasta",
                        fasta_format="id-coordinates",
                        type="mrnas",
                        min_length=1,
                        id_format="id",
                        multiple=False,
                        separator="|",
                        filename_taboo_regions=None,
                        output_coordinate_format="full",
                        filename_properties=None,
                        invert_properties=False,
                        report_step=10000)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.orthologs: options.id_format = "schema-id"

    ## database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    ## Step 1 : Input of predictions

    ## read identifiers from file, command line arguments or stdin.

    if options.ids in ("all", "nr"):
        prediction_ids = options.ids
        if options.loglevel >= 1:
            options.stdlog.write("# using all prediction ids.\n")
            options.stdlog.flush()
    elif options.ids:
        prediction_ids = options.ids.split(",")
    elif len(args) > 0:
        prediction_ids = args

    elif options.filename_ids:
        prediction_ids = []

        if options.filename_ids == "-":
            prediction_ids += IOTools.ReadList(sys.stdin)[0]
        elif options.filename_ids:
            prediction_ids += IOTools.ReadList(open(options.filename_ids,
                                                    "r"))[0]

        if len(prediction_ids) == 0:
            raise "no prediction identifiers given."

        if options.loglevel >= 1:
            options.stdlog.write("# read %i prediction ids.\n" %
                                 len(prediction_ids))
            options.stdlog.flush()

    if options.filename_taboo_regions:
        ## Note: the input has to be in forward coordinates in order for option "both" to work.
        taboo_regions = Regions.RegionFilter()
        if options.taboo_regions == "both":
            ignore_strand = True
        else:
            ignore_strand = False
        taboo_regions.readFromFile(open(options.filename_taboo_regions, "r"),
                                   ignore_strand=ignore_strand)
    else:
        taboo_regions = None

    map_feature2property = getMapFeature2Property(options)

    processPredictions(dbhandle, options.schema, options, prediction_ids,
                       taboo_regions, map_feature2property)

    E.Stop()
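
For reference, the --output-coordinate-format help above implies coordinate strings of the following shape (contig name, coordinates and length are hypothetical):

chr2L:+:1000:2000            # 'full': contig:strand:from:to
chr2L:+:1000:2000:23011544   # 'long': contig length added as fifth field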
Code Example #12
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # the snippet uses ``parser`` without showing its creation;
    # a parser is instantiated here so the example is self-contained.
    parser = E.OptionParser(usage=globals()["__doc__"])

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="schema of master species.")

    parser.set_defaults(
        tablename_orthologs="orthology_pairwise1v5.orthologlinks_first",
        filename_ids="-",
        schemas=None,
        species=None,
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    dbhandle = pgdb.connect(options.psql_connection)

    if options.filename_ids == "-":
        ids, errors = IOTools.ReadList(sys.stdin)

    extra_options = ["schema1 = '%s'" % options.species,
                     "prediction_id1 IN ('%s')" % "','".join( ids ) ]
    
    if options.schemas:
        extra_options.append( "schema2 IN ('%s')" % "','".join(options.schemas))
        
    statement = """SELECT prediction_id1, schema2, prediction_id2, gene_id2, gd1, gd2, td1, td2
    FROM %s
    WHERE schema1 != schema2 AND %s
    ORDER BY prediction_id1""" % (options.tablename_orthologs,
                                  " AND ".join(extra_options))

    cc = dbhandle.cursor()
    cc.execute(statement)
    result = cc.fetchall()
    cc.close()

    if options.schemas:
        schemas = options.schemas
    else:
        schemas = set(map(lambda x: x[1], result))

    ## compute counts
    degeneracies = {}
    for x in ids:
        degeneracies[x] = {}
        for s in schemas:
            degeneracies[x][s] = (0, 0, 0, 0)

    for prediction_id1, schema2, prediction_id2, gene_id2, gd1, gd2, td1, td2 in result:
        degeneracies[prediction_id1][schema2] = (gd1, gd2, td1, td2)

    ## output
    options.stdout.write("%s\t%s\n" % ("prediction_id", "\t".join(schemas)))
    for x in ids:
        options.stdout.write("%s" % x)
        for s in schemas:
            options.stdout.write("\t%s:%s:%s:%s" % degeneracies[x][s])
        options.stdout.write("\n")
    
    E.Stop()
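
The script emits one row per prediction id with a gd1:gd2:td1:td2 tuple per schema; an illustrative two-schema output (hypothetical ids and values):

prediction_id	schema_a	schema_b
12345	1:1:2:1	0:0:0:0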
Code Example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_ribosomes.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--schemas",
                      dest="schemas",
                      type="string",
                      help="schemas in the set.")

    parser.add_option("-e",
                      "--field-extract",
                      dest="field_extract",
                      type="string",
                      help="pattern for the field to extract.")

    parser.add_option("-c",
                      "--field-compare",
                      dest="field_compare",
                      type="string",
                      help="pattern for the field to compare.")

    parser.add_option("-i",
                      "--filename-identifiers",
                      dest="filename_identifiers",
                      type="string",
                      help="identifiers in the positive set.")

    parser.add_option("-u",
                      "--filename-subset",
                      dest="filename_subset",
                      type="string",
                      help="subset in the positive set.")

    parser.add_option("--filter-min-ratio",
                      dest="filter_min_ratio",
                      type="float",
                      help="minimum boundary for filter.")

    parser.add_option("--filter-max-ratio",
                      dest="filter_max_ratio",
                      type="float",
                      help="maximum boundary for filter.")

    parser.add_option(
        "-o",
        "--output-fields",
        dest="output_fields",
        type="string",
        help=
        "output fields, choices are: zscore, val, nvals, sum, min, max, stddev, mean, median."
    )

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        "pattern for table headers, should contain %s for schema and %s for field anme."
    )

    parser.add_option(
        "-f",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("table", "list", "values"),
        help="output format. Tabular form (one row per ortholog) or list form."
    )

    parser.add_option("--format",
                      dest="format",
                      type="string",
                      help="output format for numbers.")

    parser.add_option("--remove-na",
                      dest="remove_na",
                      action="store_true",
                      help="remove entries with any na values.")

    parser.set_defaults(
        field_extract="%s_length",
        field_compare="%s_length",
        filename_identifiers=None,
        filename_subset=None,
        filter_min_ratio=0.00,
        filter_max_ratio=0.00,
        schemas="",
        output_fields="",
        output_pattern="%s_%s",
        output_format="table",
        format="%6.4f",
        remove_na=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    options.schemas = options.schemas.split(",")
    if not options.schemas:
        raise "please supply schemas."

    if options.output_fields:
        options.output_fields = options.output_fields.split(",")
    else:
        options.output_fields = ()

    fields, table = CSV.ReadTable(sys.stdin)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    if options.loglevel >= 1:
        options.stdlog.write("# read a %i x %i table.\n" %
                             (len(table), len(fields)))

    if options.filename_subset:
        subset, nerrors = IOTools.ReadList(open(options.filename_subset, "r"))
        subset = set(subset)

        table = filter(lambda x: x[0] in subset, table)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# subset of %i entries reduced table to a %i x %i table.\n" %
                (len(subset), len(table), len(fields)))

    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
    else:
        identifiers = []

    identifiers = set(identifiers)

    # extract rows with positive identifiers
    positive_rows = filter(lambda x: x[0] in identifiers, table)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# subset of %i identifiers gives %i positive entries.\n" %
            (len(identifiers), len(positive_rows)))

    if options.output_format == "table":
        options.stdout.write("id")
        for schema in options.schemas:
            if options.output_fields:
                for field in options.output_fields:
                    options.stdout.write("\t" + options.output_pattern %
                                         (schema, field))
            else:
                options.stdout.write("\t%s" % (schema))

        options.stdout.write("\n")
    else:
        options.stdout.write("schema\tvalue\n")

    if identifiers:
        for row in positive_rows:

            if options.output_format == "table":
                options.stdout.write(row[0])

            for schema in options.schemas:

                # set fields for extraction
                f_extract = map_fields2column[options.field_extract % schema]
                f_compare = map_fields2column[options.field_compare % schema]

                # get region for extraction
                if row[f_compare] != "na":
                    r = float(row[f_compare])
                    if options.filter_min_ratio or options.filter_max_ratio:
                        mi = r * options.filter_min_ratio
                        ma = r * options.filter_max_ratio
                        f = lambda x: x[f_compare] != "na" and float(
                            x[f_compare]
                        ) >= mi and float(x[f_compare]) <= ma and x[
                            0] not in identifiers and x[f_extract] != "na"
                    else:
                        f = lambda x: x[0] not in identifiers and x[f_extract
                                                                    ] != "na"
                    # extract values: filter by minimum and maximum range and remove
                    # positive identifiers.
                    v = float(row[f_extract])
                    values = map(lambda x: float(x[f_extract]),
                                 filter(f, table))

                    stats = Stats.DistributionalParameters(values)
                else:
                    v = None

                for field in options.output_fields:

                    if v is not None:
                        if field == "zscore":
                            f = options.format % stats.getZScore(v)
                        elif field == "diff":
                            f = options.format % (v - stats["mean"])
                        elif field == "reldiff":
                            f = options.format % (
                                (v - stats["mean"]) / stats["mean"])
                        elif field == "val":
                            f = options.format % v
                        else:
                            f = options.format % stats[field]
                    else:
                        f = "na"

                    if options.output_format == "table":
                        options.stdout.write("\t%s" % f)
                    elif options.output_format == "list":
                        options.stdout.write("%s\t%s\n" % (schema, f))
                    elif options.output_format == "values":
                        options.stdout.write(
                            "%s\t%s\t%5.2f\t%s\n" %
                            (row[0], schema, v, ",".join(
                                map(lambda x: options.format % x, values))))

            if options.output_format == "table":
                options.stdout.write("\n")

    else:

        extract_columns = []

        for schema in options.schemas:
            extract_columns.append(map_fields2column[options.field_extract %
                                                     schema])

        # simply dump a subset of values
        for row in table:

            skip = False

            if options.filter_min_ratio or options.filter_max_ratio:

                master = options.schemas[0]

                v = row[map_fields2column[options.field_compare % master]]

                if v == "na":
                    continue

                v = float(v)

                mi = v * options.filter_min_ratio
                ma = v * options.filter_max_ratio

                for schema in options.schemas[1:]:

                    r = row[map_fields2column[options.field_compare % schema]]

                    if r == "na":
                        if options.remove_na:
                            skip = True
                        continue

                    r = float(r)

                    if r < mi or r > ma:
                        skip = True
                        if options.loglevel >= 3:
                            if options.format == "table":
                                options.stdout.write("* ")
                                options.stdout.write("%s\t" % row[0])
                                options.stdout.write("\t".join(
                                    [row[y] for y in extract_columns]))
                                options.stdout.write("\n")
                        break

            if skip:
                continue

            if options.output_format == "table":
                options.stdout.write("%s\t" % row[0])
                options.stdout.write("\t".join(
                    [row[y] for y in extract_columns]))
                options.stdout.write("\n")

            elif options.output_format == "list":
                has_na = False
                for x in range(len(options.schemas)):
                    v = row[extract_columns[x]]
                    if v == "na":
                        has_na = True

                if has_na and options.remove_na:
                    continue

                for x in range(len(options.schemas)):
                    options.stdout.write(
                        "%s\t%s\n" %
                        (options.schemas[x], row[extract_columns[x]]))

    E.Stop()
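
The zscore/diff/reldiff fields above reduce to simple distributional arithmetic over the background values. A minimal standalone sketch of the z-score step (this helper is illustrative; the script itself relies on Stats.DistributionalParameters):

def zscore(value, values):
    """(value - mean) / standard deviation over the background values."""
    n = float(len(values))
    mean = sum(values) / n
    stddev = (sum((x - mean) ** 2 for x in values) / n) ** 0.5
    return (value - mean) / stddev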
Code Example #14
File: matrix2matrix.py Project: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: matrix2matrix.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "normalize-by-min-diagonal",
                          "normalize-by-column",
                          "log",
                          "ln",
                          "negzero2value",
                          "set-diagonal",
                          "subtract-matrix",
                          "mix-matrix",
                          "normalize-by-matrix",
                          "normalize-by-column-max",
                          "normalize-by-row-max",
                          "normalize-by-column-min",
                          "normalize-by-row-min",
                          "normalize-by-column-median",
                          "normalize-by-row-median",
                          "normalize-by-column-mean",
                          "normalize-by-row-mean",
                          "normalize-by-column-total",
                          "normalize-by-row-total",
                          "correspondence-analysis",
                          "normalize-by-value",
                          "add-value",
                          "sort-rows",
                          "sort-columns",
                          "transpose",
                          "upper-bound",
                          "lower-bound",
                          "subtract-first-col",
                          "multiply-by-value",
                          "divide-by-value",
                          "mask-rows",
                          "mask-columns",
                          "mask-rows-and-columns",
                          "symmetrize-mean",
                          "symmetrize-max",
                          "symmetrize-min",
                      ),
                      help="""method to use [default=%default]""")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="factor to scale matrix by [default=%default].")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output number format [default=%default].")

    parser.add_option("--filename-rows",
                      dest="filename_rows",
                      type="string",
                      help="filename with rows to mask [default=%default].")

    parser.add_option("--filename-columns",
                      dest="filename_columns",
                      type="string",
                      help="filename with columns to mask [default=%default].")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers",
                      dest="headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-a",
                      "--value",
                      dest="value",
                      type="float",
                      help="value to use for various algorithms.")

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix.""")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix.""")

    parser.add_option(
        "--missing",
        dest="missing",
        type="float",
        help=
        "value to use for missing values. If not set, missing values will cause the script to fail [default=%default]."
    )

    parser.set_defaults(
        methods=[],
        scale=1.0,
        headers=True,
        format="%6.4f",
        output_format="full",
        input_format="full",
        value=0.0,
        parameters="",
        write_separators=True,
        filename_rows=None,
        filename_columns=None,
        missing=None,
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    lines = filter(lambda x: x[0] != "#", sys.stdin.readlines())

    if len(lines) == 0:
        raise IOError("no input")

    chunks = filter(lambda x: lines[x][0] == ">", range(len(lines)))

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))
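    # chunks now holds the indices of the ">" separator lines plus a terminal
    # sentinel, e.g. [0, 12, 24] for two 11-line matrices (illustrative);
    # each matrix spans lines[chunks[i] + 1:chunks[i + 1]].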

    if options.filename_rows:
        row_names, n = IOTools.ReadList(open(options.filename_rows, "r"))
    if options.filename_columns:
        column_names, n = IOTools.ReadList(open(options.filename_columns, "r"))

    for chunk in range(len(chunks) - 1):

        try:
            raw_matrix, row_headers, col_headers = MatlabTools.readMatrix(
                StringIO.StringIO("".join(lines[chunks[chunk] +
                                                1:chunks[chunk + 1]])),
                format=options.input_format,
                headers=options.headers,
                missing=options.missing)
        except ValueError, msg:
            E.warn("matrix could not be read: %s" % msg)
            continue

        nrows, ncols = raw_matrix.shape

        E.debug("read matrix: %i x %i, %i row titles, %i colum titles" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        parameter = 0

        for method in options.methods:

            matrix = numpy.reshape(numpy.array(raw_matrix), raw_matrix.shape)

            if method in ("normalize-by-matrix", "subtract-matrix",
                          "mix-matrix", "add-matrix"):

                other_matrix, other_row_headers, other_col_headers = MatlabTools.ReadMatrix(
                    open(options.parameters[parameter], "r"),
                    headers=options.headers)

                other_nrows, other_ncols = other_matrix.shape

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# read second matrix from %s: %i x %i, %i row titles, %i colum titles.\n"
                        % (options.parameters[parameter], other_nrows,
                           other_ncols, len(other_row_headers),
                           len(other_col_headers)))

                parameter += 1

            elif method == "normalize-by-min-diagonal":
                for x in range(nrows):
                    for y in range(ncols):
                        m = min(raw_matrix[x, x], raw_matrix[y, y])
                        if m > 0:
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "normalize-by-column":
                if nrows != ncols:
                    raise "only supported for symmeric matrices."

                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[y, y] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[y, y]

            elif method == "normalize-by-value":
                matrix = raw_matrix / float(options.parameters[parameter])
                parameter += 1

            elif method == "normalize-by-row":
                if nrows != ncols:
                    raise "only supported for symmeric matrices."

                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[x, x] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[x, x]

            elif method == "subtract-first-col":
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] -= raw_matrix[x, 0]

            elif method.startswith("normalize-by-column"):
                if method.endswith("max"):
                    f = max
                elif method.endswith("min"):
                    f = min
                elif method.endswith("median"):
                    f = scipy.median
                elif method.endswith("mean"):
                    f = scipy.mean
                elif method.endswith("total"):
                    f = sum

                for y in range(ncols):
                    m = f(matrix[:, y])
                    if m != 0:
                        for x in range(nrows):
                            matrix[x, y] = matrix[x, y] / m

            elif method.startswith("normalize-by-row"):
                if method.endswith("max"):
                    f = max
                elif method.endswith("min"):
                    f = min
                elif method.endswith("median"):
                    f = scipy.median
                elif method.endswith("mean"):
                    f = scipy.mean
                elif method.endswith("total"):
                    f = sum

                for x in range(nrows):
                    m = f(matrix[x, :])
                    if m != 0:
                        for y in range(ncols):
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "negzero2value":
                # set zero/negative values to a value
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] <= 0:
                            matrix[x, y] = options.value

            elif method == "minmax":
                # set zero/negative values to a value
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y], matrix[y, x] = \
                            min(matrix[x, y], matrix[y, x]), \
                            max(matrix[x, y], matrix[y, x])

            elif method == "log":
                # apply log to all values.
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log10(matrix[x, y])

            elif method == "ln":
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log(matrix[x, y])

            elif method == "transpose":
                matrix = numpy.transpose(matrix)
                row_headers, col_headers = col_headers, row_headers
                nrows, ncols = ncols, nrows

            elif method == "mul":
                matrix = numpy.dot(matrix, numpy.transpose(matrix))
                col_headers = row_headers

            elif method == "multiply-by-value":
                matrix *= options.value

            elif method == "divide-by-value":
                matrix /= options.value

            elif method == "add-value":
                matrix += options.value

            elif method == "angle":
                # write angles between col vectors
                v1 = numpy.sqrt(numpy.sum(numpy.power(matrix, 2), 0))
                matrix = numpy.dot(numpy.transpose(matrix), matrix)
                row_headers = col_headers
                nrows = ncols
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] /= v1[x] * v1[y]

            elif method == "euclid":
                # convert to euclidean distance matrix
                matrix = numpy.zeros((ncols, ncols), numpy.float)
                for c1 in range(0, ncols - 1):
                    for c2 in range(c1 + 1, ncols):
                        for r in range(0, nrows):
                            d = raw_matrix[r][c1] - raw_matrix[r][c2]
                            matrix[c1, c2] += (d * d)
                        matrix[c2, c1] = matrix[c1, c2]
                matrix = numpy.sqrt(matrix)
                row_headers = col_headers
                nrows = ncols

            elif method.startswith("symmetrize"):
                f = method.split("-")[1]
                if f == "max":
                    f = max
                elif f == "min":
                    f = min
                elif f == "mean":
                    f = lambda x, y: float(x + y) / 2

                if nrows != ncols:
                    raise ValueError(
                        "symmetrize only available for symmetric matrices")
                if row_headers != col_headers:
                    raise ValueError(
                        "symmetrize not available for permuted matrices")
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] = matrix[y, x] = f(
                            matrix[x, y], matrix[y, x])
            elif method == "sub":
                matrix = options.value - matrix

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[parameter])
                new_value = float(options.parameters[parameter + 1])
                parameter += 2
                if method == "upper-bound":
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] > boundary:
                                matrix[x, y] = new_value
                else:
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] < boundary:
                                matrix[x, y] = new_value

            elif method == "subtract-matrix":
                matrix = matrix - other_matrix

            elif method == "add-matrix":
                matrix = matrix + other_matrix

            elif method == "normalize-by-matrix":

                # set 0s to 1 in the other matrix
                for x in range(nrows):
                    for y in range(ncols):
                        if other_matrix[x, y] == 0:
                            other_matrix[x, y] = 1.0

                matrix = matrix / other_matrix

            elif method == "mix-matrix":
                for x in range(len(other_row_headers) - 1):
                    for y in range(x + 1, len(other_col_headers)):
                        matrix[x, y] = other_matrix[x, y]

            elif method == "set-diagonal":
                value = float(options.parameters[parameter])
                for x in range(min(nrows, ncols)):
                    matrix[x, x] = value
                parameter += 1

            elif method == "transpose":
                matrix = numpy.transpose(raw_matrix)
                row_headers, col_headers = col_headers, row_headers

            elif method == "correspondence-analysis":
                row_indices, col_indices = CorrespondenceAnalysis.GetIndices(
                    raw_matrix)
                map_row_new2old = numpy.argsort(row_indices)
                map_col_new2old = numpy.argsort(col_indices)

                matrix, row_headers, col_headers = CorrespondenceAnalysis.GetPermutatedMatrix(
                    raw_matrix,
                    map_row_new2old,
                    map_col_new2old,
                    row_headers=row_headers,
                    col_headers=col_headers)

            elif method == "mask-rows":
                r = set(row_names)
                for x in range(len(row_headers)):
                    if row_headers[x] in r:
                        matrix[x, :] = options.value

            elif method == "mask-columns":
                r = set(column_names)
                for x in range(len(col_headers)):
                    if col_headers[x] in r:
                        matrix[:, x] = options.value

            elif method == "mask-rows-and-columns":

                r = set(row_names)
                c = set(column_names)
                for x in range(len(row_headers)):
                    for y in range(len(col_headers)):
                        if row_headers[x] in r and col_headers[y] in c:
                            matrix[x, y] = options.value

            raw_matrix = numpy.reshape(numpy.array(matrix), matrix.shape)

        else:
            # for simple re-formatting jobs
            matrix = raw_matrix

        if options.write_separators:
            options.stdout.write(lines[chunks[chunk]])

        MatlabTools.writeMatrix(sys.stdout,
                                matrix,
                                value_format=options.format,
                                format=options.output_format,
                                row_headers=row_headers,
                                col_headers=col_headers)
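
The per-cell loops above keep the normalisation branches explicit; for comparison, a vectorised sketch of the normalize-by-row-max step with plain numpy (array values are illustrative):

import numpy

m = numpy.array([[1.0, 2.0], [3.0, 6.0]])
row_max = m.max(axis=1)
# guard all-zero rows, mirroring the script's "if m != 0" check
row_max[row_max == 0] = 1.0
normalized = m / row_max[:, numpy.newaxis]
# normalized is now [[0.5, 1.0], [0.5, 1.0]]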
Code Example #15
File: codonbias_acai2tsv.py Project: lesheng/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: codonbias_acai2tsv.py 865 2007-01-15 13:44:43Z andreas $"
    )

    parser.add_option("-o",
                      "--input-file-trace",
                      dest="input_filename_trace",
                      type="string",
                      help="input filename for cai.",
                      metavar="FILE")

    parser.add_option("-e",
                      "--input-file-genes",
                      dest="input_filename_genes",
                      type="string",
                      help="input filename for genes information from cai.",
                      metavar="FILE")

    parser.add_option("-c",
                      "--input-file-codons",
                      dest="input_filename_codons",
                      type="string",
                      help="input filename for codon usage information.",
                      metavar="FILE")

    parser.add_option("--input-file-sequences",
                      dest="input_filename_sequences",
                      type="string",
                      help="input filename with sequences.",
                      metavar="FILE")

    parser.add_option("-t",
                      "--input-file-subset",
                      dest="input_filename_subset",
                      type="string",
                      help="input filename with subset.",
                      metavar="FILE")

    parser.add_option("--codon-table-format",
                      dest="codon_table_format",
                      type="choice",
                      choices=("list", "matrix"),
                      help="output options for output codon tables.")

    parser.add_option("--codon-table-type",
                      dest="codon_table_type",
                      type="choice",
                      choices=("counts", "frequencies", "weights",
                               "absolute-frequencies"),
                      help="type of codon table.")

    parser.add_option("-r",
                      "--reference",
                      dest="reference",
                      type="string",
                      help="dump CAI reference weights for species.")

    parser.add_option("-s",
                      "--select",
                      dest="select",
                      type="string",
                      help="fields to select from genes table.")

    parser.add_option("-m",
                      "--map",
                      dest="input_filename_map",
                      type="string",
                      help="filename with mapping information for gene names.",
                      metavar="FILE")

    parser.add_option("-i",
                      "--invert-map",
                      dest="invert_map",
                      action="store_true",
                      help="invert map.")

    parser.add_option(
        "-d",
        "--dominant-set",
        dest="dominant_set",
        type="float",
        help="only print out dominant set (# fraction of most biased genes).")

    parser.add_option(
        "--reverse-set",
        dest="reverse_set",
        action="store_true",
        help="print the reverse set, i.e., then non-dominant set.")

    parser.add_option(
        "-u",
        "--codon-usage",
        dest="codon_usage",
        type="string",
        help="print codon usage for the full/biased set of genes [full|biased]."
    )

    parser.add_option(
        "-w",
        "--weights",
        dest="weights",
        type="string",
        help=
        "print weights [final-list|final-matrix|random|biased|subset|compute-counts|compute-weights|compute-frequencies|uniform-weights|uniform-frequencies|counts|frequencies|absolute-frequencies]."
    )

    parser.add_option("--weights-matrix2table",
                      dest="weights_matrix2table",
                      action="store_true",
                      help="convert a weights matrix to a weights table.")

    parser.add_option("--get-preferred-codons",
                      dest="get_preferred_codons",
                      type="string",
                      help="compute overview of preferred codons.")

    parser.set_defaults(input_filename="-",
                        input_filename_trace=None,
                        input_filename_genes=None,
                        input_filename_codons=None,
                        input_filename_map=None,
                        input_filename_subset=None,
                        input_filename_sequences=None,
                        invert_map=False,
                        select=None,
                        codon_usage=None,
                        weights=None,
                        reverse_set=False,
                        pseudocounts=1,
                        codon_table_format="list",
                        codon_table_type="weights",
                        weights_matrix2table=False,
                        random_size=1000,
                        get_preferred_codons=None,
                        dominant_set=0.0)

    (options, args) = E.Start(parser)
    if options.select:
        options.select = options.select.split(",")

    outfile = options.stdout

    ###################################################################
    # convert weights table to a codon table
    if options.weights_matrix2table:
        lines = options.stdin.readlines()
        data = []
        for line in lines:
            if line[0] == "#":
                continue
            data += list(map(float, line[:-1].split(",")))

        weights = {}
        x = 0
        for cc in OUTPUT_ORDER_CODON_MATRIX:
            for c in cc:
                weights[c] = data[x]
                x += 1

        outfile.write("CODON\tWEIGHT\n")
        codons = weights.keys()
        codons.sort()
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, weights[codon]))

        E.Stop()
        sys.exit(0)

    ###################################################################
    map_genes = {}

    if options.input_filename_map:
        data = map(
            lambda x: x[:-1].split("\t")[:2],
            filter(lambda x: x[0] != "#",
                   open(options.input_filename_map, "r").readlines()))

        for a, b in data:
            if options.invert_map:
                a, b = b, a
            map_genes[a] = b

    result = WrapperAdaptiveCAI.AdaptiveCAIResult()

    if options.input_filename_genes:
        gene_file = open(options.input_filename_genes, "r")
    else:
        gene_file = None

    if options.input_filename_codons:
        codon_file = open(options.input_filename_codons, "r")
    else:
        codon_file = None

    if options.input_filename_trace:
        trace_file = open(options.input_filename_trace, "r")
    else:
        trace_file = None

    if options.input_filename_subset:
        l, e = IOTools.ReadList(open(options.input_filename_subset, "r"))
        subset = set(l)
        if options.loglevel >= 1:
            options.stdlog.write("# read %i entries into subset from %s.\n" %
                                 (len(subset), options.input_filename_subset))
    else:
        subset = None

    result.Read(gene_file=gene_file,
                codon_file=codon_file,
                trace_file=trace_file)

    if gene_file:
        gene_file.close()
    if codon_file:
        codon_file.close()
    if trace_file:
        trace_file.close()

    if options.reference:
        if options.reference not in CODON_PREFERENCES:
            raise "unknown species %s: possibles species are: %s" % (
                options.reference, str(CODON_PREFERNCES.keys()))

        weights = Genomics.CalculateCAIWeightsFromCounts(
            CODON_PREFERENCES[options.reference], options.pseudocounts)

        for x in range(len(OUTPUT_ORDER_CODON_MATRIX)):
            outfile.write(",".join(
                map(lambda z: "%5.3f" % z, [
                    weights[codon.upper()]
                    for codon in OUTPUT_ORDER_CODON_MATRIX[x]
                ])))
            outfile.write("\n")

    if options.dominant_set and gene_file:
        cai_threshold = result.GetDominantThreshold(options.dominant_set)
    else:
        if options.reverse_set:
            cai_threshold = 1.0
        else:
            cai_threshold = 0.0

    if options.select:

        fields = []
        titles = []
        for x in options.select:
            f = re.match("(\S+) (AS|as) (\S+)", x)
            if f:
                fields.append(f.groups()[0].upper())
                titles.append(f.groups()[2])
            else:
                fields.append(x.upper())
                titles.append(x)

        outfile.write("GENENAME\t" + string.join(titles, "\t") + "\n")

        for genename, data in result.mGeneInfo.items():
            if genename in map_genes:
                genename = map_genes[genename]

            if options.reverse_set:
                if data["CAICLASS"] >= cai_threshold:
                    continue
            else:
                if data["CAICLASS"] < cai_threshold:
                    continue

            outfile.write(genename)
            for c in fields:
                outfile.write("\t%s" % str(data[c]))
            outfile.write("\n")

    if options.weights:

        format = options.codon_table_format

        if options.weights in ("compute-counts", "compute-weights",
                               "compute-frequencies"):
            # compute codon usage weights from a set of sequences
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for x in codons:
                counts[x] = 0

            if options.input_filename_sequences:
                sequences = Genomics.ReadPeptideSequences(open(
                    options.input_filename_sequences, "r"),
                                                          filter=subset)
                for key, sequence in sequences.items():
                    sequence = re.sub(" ", "", sequence)
                    if len(sequence) % 3 != 0:
                        raise "warning: sequence %s is not multiple of 3" % key
                    for codon in [
                            sequence[x:x + 3]
                            for x in range(0, len(sequence), 3)
                    ]:
                        counts[codon.upper()] += 1

            if options.weights == "compute-frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "compute-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            else:
                weights = counts

        elif options.weights in ("final-list", "final-matrix"):

            weights = result.mFinalWeights
            if options.weights == "final-list":
                format = "list"
            else:
                format = "matrix"

        elif options.weights == "random":
            # get random weights
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for x in codons:
                counts[x] = random.randint(1, options.random_size)

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            format = "matrix"

        elif options.weights == "biased":
            # get biased weights
            codons = Genomics.GetUniformCodonUsage()

            weights = Genomics.CalculateCAIWeightsFromCounts(
                codons, options.pseudocounts)
            format = "matrix"

        elif options.weights in ("uniform-weights", "uniform-frequencies"):
            # get uniform weights
            codons = Genomics.GetUniformCodonUsage()

            if options.weights == "uniform-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    codons, options.pseudocounts)
                format = "matrix"
            else:
                weights = codons
                format = "list"

        elif options.weights in ("counts", "frequencies",
                                 "absolute-frequencies"):
            # get weights as frequencies
            # compute from scratch: the caijava output stores the absolute
            # frequency f = count / gene_length per codon, so the count is
            # recovered as f * gene_length.
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():

                if options.reverse_set:
                    if data["CAICLASS"] >= cai_threshold:
                        continue
                else:
                    if data["CAICLASS"] < cai_threshold:
                        continue

                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.weights == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "counts":
                weights = counts
            elif options.weights == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do not
                # normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m

            format = "list"

        elif options.weights == "subset":

            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():

                found = genename in subset
                if (not found and not options.reverse_set) or (
                        found and options.reverse_set):
                    continue

                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.codon_table_type == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "counts":
                weights = counts
            if options.codon_table_type == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do not
                # normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m

        else:
            raise "unknown weights %s" % options.weights

        if format == "list":
            outfile.write("CODON\tWEIGHT\n")
            codons = weights.keys()
            codons.sort()
            for codon in codons:
                outfile.write("%s\t%f\n" % (codon, weights[codon]))

        elif format == "matrix":

            for x in range(len(OUTPUT_ORDER_CODON_MATRIX)):
                outfile.write(",".join(
                    map(lambda z: "%5.3f" % z, [
                        weights[codon.upper()]
                        for codon in OUTPUT_ORDER_CODON_MATRIX[x]
                    ])))
                outfile.write("\n")

    if options.codon_usage:
        outfile.write("CODON\tFREQUENCY\n")

        if options.codon_usage == "biased":
            usages = result.mCodonUsages[-1]
        elif options.codon_usage == "full":
            usages = result.mCodonUsages[0]
        elif options.codon_usage == "weights":
            usages = WrapperAdaptiveCAI.CalculateWeightsFromUsage(
                result.mCodonUsages[0])
        else:
            raise "unknown option '%s' for codon-usage." % options.codon_usage

        codons = usages.keys()
        codons.sort()
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, usages[codon]))

    E.Stop()
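
The matrix blocks above are produced from Genomics.CalculateCAIWeightsFromCounts. As a hedged point of reference, the classical relative-adaptiveness weights of Sharp & Li (1987) can be sketched as follows; the SYNONYMS table (truncated here) and the pseudocount handling are illustrative assumptions, not the CGAT implementation.

# a minimal sketch of CAI relative-adaptiveness weights, assuming a
# SYNONYMS table mapping each amino acid to its codons (truncated here);
# not the actual Genomics.CalculateCAIWeightsFromCounts.
SYNONYMS = {
    "F": ("TTT", "TTC"),
    "K": ("AAA", "AAG"),
}


def calculate_cai_weights_from_counts(counts, pseudocounts=0):
    """return w(codon) = count / max count among synonymous codons."""
    weights = {}
    for aa, codons in SYNONYMS.items():
        observed = [counts.get(c, 0) + pseudocounts for c in codons]
        best = float(max(observed))
        for codon, n in zip(codons, observed):
            weights[codon] = n / best if best > 0 else 0.0
    return weights
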
Code Example #16
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: diff_transcript_sets.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percent columns")
    parser.add_option("-d",
                      "--dump-sets",
                      dest="dump_sets",
                      action="append",
                      type="choice",
                      choices=("rest_genes1", "rest_genes2", "intersection",
                               "union"),
                      help="dump sets of transcripts/genes")
    parser.add_option(
        "-o",
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern to use for dumped sets. Should contain one %s.")

    parser.set_defaults(
        separator="|",
        add_percent=False,
        dump_sets=[],
        output_pattern="%s",
    )

    (options, args) = E.Start(parser)

    options.filename1, options.filename2 = args

    ids1, nerrors1 = IOTools.ReadList(open(options.filename1, "r"))
    ids2, nerrors2 = IOTools.ReadList(open(options.filename2, "r"))

    genes1, transcripts1 = countGenesTranscripts(ids1, options)
    genes2, transcripts2 = countGenesTranscripts(ids2, options)

    options.stdout.write(
        "species\tntranscripts1\tngenes1\tntranscripts2\tngenes2"
        "\ttr_inter\ttr_union\ttr_rest1\ttr_rest2"
        "\tg_inter\tg_union\tg_rest1\tg_rest2")
    options.stdout.write("\ttr_rest1\ttr_rest2\tg_rest1\tg_rest2")

    options.stdout.write("\n")

    for species in set(genes1.keys()).union(set(genes2.keys())):
        nt1, nt2, ng1, ng2 = "na", "na", "na", "na"

        if species in genes1:
            g1 = genes1[species]
            t1 = transcripts1[species]
            nt1 = "%i" % len(transcripts1[species])
            ng1 = "%i" % len(genes1[species])
        else:
            t1, g1 = None, None

        if species in genes2:
            g2 = genes2[species]
            t2 = transcripts2[species]
            nt2 = "%i" % len(transcripts2[species])
            ng2 = "%i" % len(genes2[species])
        else:
            t2, g2 = None, None

        if species in transcripts1 and species in transcripts2:
            ct = "%i" % len(t1.intersection(t2))
            ut = "%i" % len(t2.union(t1))
            rt1 = "%i" % len(t1.difference(t2))
            rt2 = "%i" % len(t2.difference(t1))
        else:
            ct, ut, rt1, rt2 = ["na"] * 4

        if species in genes1 and species in genes2:
            cg = "%i" % len(g1.intersection(g2))
            ug = "%i" % len(g2.union(g1))
            rg1 = "%i" % len(g1.difference(g2))
            rg2 = "%i" % len(g2.difference(g1))
        else:
            cg, ug, rg1, rg2 = ["na"] * 4

        options.stdout.write("\t".join((species, nt1, ng1, nt2, ng2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((ct, ut, rt1, rt2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((cg, ug, rg1, rg2)))

        if options.add_percent:
            if species in genes1 and species in genes2:
                rg1 = "%5.2f" % (100.0 * len(g1.difference(g2)) / len(g1))
                rg2 = "%5.2f" % (100.0 * len(g2.difference(g1)) / len(g2))
            if species in transcripts1 and species in transcripts2:
                rt1 = "%5.2f" % (100.0 * len(t1.difference(t2)) / len(t1))
                rt2 = "%5.2f" % (100.0 * len(t2.difference(t1)) / len(t2))
            options.stdout.write("\t")
            options.stdout.write("\t".join((rt1, rt2, rg1, rg2)))

        options.stdout.write("\n")

        for choice in options.dump_sets:

            output_set = None

            if choice == "rest_genes1" and g1 and g2:
                output_set = getTranscriptsForGenes(g1.difference(g2), ids1,
                                                    options)

            elif choice == "rest_genes2" and g1 and g2:
                output_set = getTranscriptsForGenes(g2.difference(g1), ids2,
                                                    options)

            if output_set:
                outfile = IOTools.openFile(options.output_pattern % (choice),
                                           "w")
                for x in output_set:
                    outfile.write("%s\n" % (x, ))
                outfile.close()

    E.Stop()
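
countGenesTranscripts is defined elsewhere in the script. A minimal sketch of what it plausibly does, assuming ids of the form species|transcript|gene joined by options.separator (the exact field layout is an assumption, not the real implementation):

# a hedged sketch of per-species gene/transcript counting; the field
# order is assumed and the real countGenesTranscripts may differ.
import collections


def count_genes_transcripts(ids, separator="|"):
    genes = collections.defaultdict(set)
    transcripts = collections.defaultdict(set)
    for identifier in ids:
        fields = identifier.split(separator)
        species = fields[0]
        if len(fields) > 1:
            transcripts[species].add(fields[1])
        if len(fields) > 2:
            genes[species].add(fields[2])
    return genes, transcripts
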
Code Example #17
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t",
                      "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes",
                      dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j",
                      "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes",
                      dest="unset_genes",
                      type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u",
                      "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i",
                      "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g",
                      "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G",
                      "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r",
                      "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes",
                      dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a",
                      "--apply",
                      dest="filename_filter",
                      type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample_size",
                      type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length",
                      type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end "
        "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping",
                      dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates",
                      dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates",
                      dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates",
                      dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                           strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])
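            # UCSC-style duplicates carry a "_dup<N>" suffix; both the
            # suffixed copies and their base transcript id are marked for
            # removal, so only transcripts that never had a duplicate
            # are written out.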

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin),
                                         strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                               strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons
                                      ]), max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(GTF.flat_gene_iterator(
            GTF.iterator(options.stdin)),
                                          sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = gff_chunks.next()
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join
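        # iterate_chunks groups consecutive gene chunks that overlap on
        # the same contig and strand: a new group starts whenever the
        # contig or strand changes, or when the next gene starts after
        # the end of the previous one (d > 0); each group is merged into
        # a single gene below.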

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(
                options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]),
                               max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()
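            # sweep over genes sorted by (contig, start): a gene joins the
            # current overlap cluster while its start lies before the
            # running maximum end; within each cluster only the gene with
            # the largest genomic span is kept.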

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript",
                                "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript is the transcript that
                shares the largest number of exons with the other
                transcripts of the gene.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript
                                      if x.feature == "exon"])
                exon_counts = {}
                for key, exons in itertools.groupby(sorted(all_exons)):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([
                        exon_counts[(x.start, x.end)] for x in transcript
                        if x.feature == "exon"
                    ])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                raise ValueError("please supply either a filename "
                                 "with ids to filter with (--apply) "
                                 "or a sample-size.")

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)
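            # introns are the gaps between the combined exonic/CDS
            # intervals; they are optionally trimmed by --intron-border
            # and filtered by --intron-min-length below.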

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges if x[1] - x[0] > l
                    ]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs
                                      ]), max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [
            item for item in set(transcript_ids)
            if transcript_ids.count(item) > 1
        ]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute(
                        'gene_id',
                        gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute(
                        'transcript_id', gtf.transcript_id + "." +
                        str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin),
                                           strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start
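                # classify untranslated segments relative to the CDS
                # midpoint: on the forward strand, exonic sequence before
                # the midpoint becomes UTR5 and after it UTR3; on the
                # reverse strand the assignment is mirrored. Fragments of
                # three bases or fewer are dropped.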

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
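
Several branches above reduce to one interval operation: merging sorted (start, end) ranges that overlap or lie within a given distance of each other. Below is a minimal sketch of such an operation, assuming half-open coordinates; it mirrors what Intervals.combineAtDistance is used for here, but is not the CGAT implementation.

def combine_at_distance(intervals, min_distance=0):
    """merge (start, end) intervals separated by at most min_distance."""
    if not intervals:
        return []
    intervals = sorted(intervals)
    merged = [list(intervals[0])]
    for start, end in intervals[1:]:
        if start - merged[-1][1] <= min_distance:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return [tuple(x) for x in merged]

# for example, combine_at_distance([(0, 10), (12, 20), (30, 40)], 5)
# returns [(0, 20), (30, 40)]
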
Code Example #18
def selectComponents(component_ids, map_component2seq_id,
                     map_component2input_id, id_filter, options):
    """select a set of components from component_ids.
    """

    map_sample2reference = {}

    if options.sample:

        if options.sample_method == "simple-without-replacement":
            random.shuffle(component_ids)

        elif options.sample_method == "length-without-replacement":

            map_component_id2length = {}

            for component_id in component_ids:

                mali = getMali(component_id, map_component2seq_id,
                               map_component2input_id, id_filter, options)

                if not mali: continue

                map_component_id2length[component_id] = mali.getWidth()

            reference_ids, nerrors = IOTools.ReadList(
                open(options.filename_sample_reference, "r"))

            # do not sample from the reference set
            sampled_components = set(reference_ids)

            new_component_ids = []

            ninput, noutput, nskipped = 0, 0, 0
            # now go through reference set
            for ref_id in reference_ids:

                ninput += 1
                if ref_id not in map_component_id2length:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# reference component %s not found.\n" %
                            (str(ref_id)))
                    nskipped += 1
                    continue

                ref_length = map_component_id2length[ref_id]
                ref_length_min = ref_length - 50
                ref_length_max = ref_length + 50

                # find all components with a length similar to ref_length excluding previously sampled ones.
                test_components = filter(
                    lambda x: x in map_component_id2length and
                    ref_length_min < map_component_id2length[x] <
                    ref_length_max, component_ids)
                test_components = list(
                    set(test_components).difference(sampled_components))

                if len(test_components) == 0:
                    if options.loglevel >= 1:
                        options.stdlog.write("# reference components %s: skipped - no others with equivalent length around %i found." % \
                                                 ( ref_id, ref_length ) )
                    nskipped += 1
                    continue

                random.shuffle(test_components)

                component_id = test_components[0]
                sampled_components.add(component_id)

                map_sample2reference[component_id] = ref_id

                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# reference component mapping: %s\t%s\t%i\t%i\t%i\n" %
                        (ref_id, component_id, ref_length,
                         map_component_id2length[component_id],
                         len(test_components)))

                new_component_ids.append(component_id)
                noutput += 1

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sampling results: ninput=%i, noutput=%i, nskipped=%i\n"
                    % (ninput, noutput, nskipped))

            component_ids = new_component_ids
            options.sample = len(new_component_ids)

    return component_ids, map_sample2reference
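
The length-matched sampling above can be summarized independently of the mali machinery. A minimal sketch under stated assumptions: plain dicts of id -> length for the reference and candidate sets, and a symmetric length tolerance (all names here are illustrative, not CGAT API).

import random


def sample_length_matched(reference_lengths, candidate_lengths,
                          tolerance=50):
    """for each reference id, draw one unused candidate of similar length.

    Returns a dict mapping sampled candidate id -> reference id.
    """
    used = set(reference_lengths)  # never sample from the reference set
    mapping = {}
    for ref_id, ref_length in reference_lengths.items():
        pool = [c for c, length in candidate_lengths.items()
                if abs(length - ref_length) < tolerance and c not in used]
        if not pool:
            continue  # no length-matched candidate left
        choice = random.choice(pool)
        used.add(choice)
        mapping[choice] = ref_id
    return mapping
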
Code Example #19
File: orthologs2list.py Project: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/orthologs2list.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-s",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "-g",
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("-b",
                      "--only-best",
                      dest="only_best",
                      action="store_true",
                      help="write only the best pair for a pairing.")

    parser.add_option("-w",
                      "--no-within",
                      dest="within",
                      action="store_false",
                      help="do not write within species pairs.")

    parser.add_option("-d",
                      "--distances",
                      dest="filename_distances",
                      type="string",
                      help="filename with distances between transcripts.")

    parser.add_option(
        "-c",
        "--no-combine-genes",
        dest="combine_genes",
        action="store_false",
        help="do not combine orthologous clusters which contain the same gene."
    )

    parser.add_option("--filename-restrict-filter1",
                      dest="filename_restrict_filter1",
                      type="string",
                      help="filename with ids to filter out.")
    parser.add_option("--filename-restrict-filter2",
                      dest="filename_restrict_filter2",
                      type="string",
                      help="filename with ids to filter out.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("graph", "components"),
                      help="output format.")

    parser.add_option("-m",
                      "--mode",
                      dest="mode",
                      type="choice",
                      choices=("orthologs", "orphans"),
                      help="analyze either 'orthologs' or 'orphans'.")

    parser.add_option("--genome1",
                      dest="genome1",
                      type="string",
                      help="first genome.")
    parser.add_option("--genome2",
                      dest="genome2",
                      type="string",
                      help="second genome.")

    parser.set_defaults(
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        only_best=None,
        filename_distances=None,
        within=True,
        combine_genes=True,
        report_step=100000,
        use_networkx=False,
        separator="|",
        genome1=None,
        genome2=None,
        mode="orthologs",
        filename_restrict_filter1=None,
        filename_restrict_filter2=None,
        format="graph",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    rs = re.compile(options.species_regex)
    rg = re.compile(options.gene_regex)

    t0 = time.time()
    # retrieve matches between pairs:
    pairs = {}
    max_dist = 0

    if options.filename_distances and options.only_best:
        infile = open(options.filename_distances, "r")
        for line in infile:
            if line[0] == "#":
                continue
            a, b, d = line[:-1].split("\t")[:3]
            d = float(d)
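            # store each pair under an order-independent key so that later
            # lookups succeed no matter which transcript comes first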
            if a < b:
                key = "%s-%s" % (a, b)
            else:
                key = "%s-%s" % (b, a)

            max_dist = max(d, max_dist)
            pairs[key] = d

        infile.close()

    cluster_id = 0
    ninput, noutput, nmissed, nskipped, nsingletons = 0, 0, 0, 0, 0

    # Read positive filter information:
    filter_restrict1 = {}
    if options.filename_restrict_filter1:
        xx, e = IOTools.ReadList(open(options.filename_restrict_filter1, "r"))
        for x in xx:
            filter_restrict1[Orthologs.Transcript(x).mTranscript] = True

    filter_restrict2 = {}
    if options.filename_restrict_filter2:
        xx, e = IOTools.ReadList(open(options.filename_restrict_filter2, "r"))
        for x in xx:
            filter_restrict2[Orthologs.Transcript(x).mTranscript] = True

    if options.loglevel >= 1:
        options.stdlog.write("# read filtering information: %i/%i\n" %
                             (len(filter_restrict1), len(filter_restrict2)))

    t1 = time.time()

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" % (t1 - t0))

    orthologs = []

    if options.mode == "orthologs":
        orthologs = Orthologs.ReadInterpretation(
            sys.stdin,
            options.separator,
            genome1=options.genome1,
            genome2=options.genome2,
            filter_restrict_transcripts1=filter_restrict1,
            filter_restrict_transcripts2=filter_restrict2)
    else:
        orthologs = Orthologs.ReadOrphans(
            sys.stdin,
            options.separator,
            genome1=options.genome1,
            genome2=options.genome2,
            filter_restrict_transcripts1=filter_restrict1,
            filter_restrict_transcripts2=filter_restrict2)

    ninput = len(orthologs)

    if orthologs:
        max_dist = max(max_dist, max(map(lambda x: x[4], orthologs)))

    t2 = time.time()

    if options.loglevel >= 1:
        options.stdlog.write("# reading %i groups in %i seconds.\n" %
                             (ninput, t2 - t1))

    if options.combine_genes:

        if options.use_networkx:

            nclusters = len(orthologs)
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# before combining genes: %i clusters\n" % len(orthologs))
                options.stdlog.flush()

            # build links between all genes
            # ignore warnings from networkx/matplotlib that a display
            # can not be found
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                import networkx

            graph = networkx.Graph()

            # This procedure skips genes with "0". This is a patch, because
            # these genes should not be there in the first place.
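            # Nodes are (species_index, gene) tuples; all genes co-occurring
            # in an ortholog cluster are linked, so the connected components
            # computed below merge clusters that share a gene.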
            iteration = 0
            for transcripts1, transcripts2, genes1, genes2, weight in orthologs:
                iteration += 1

                if options.loglevel >= 1:
                    if (iteration % options.report_step == 0):
                        options.stdlog.write(
                            "# iteration: %i/%i (%i%%) in %i seconds.\n" %
                            (iteration, nclusters, 100 * iteration / nclusters,
                             time.time() - t2))
                        options.stdlog.flush()

                for g in genes1.keys():
                    graph.add_node((1, g))
                for g in genes2.keys():
                    graph.add_node((2, g))
                for g1 in genes1.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes1.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((1, g1), (1, g2))
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((1, g1), (2, g2))
                for g1 in genes2.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((2, g1), (2, g2))

            if options.loglevel >= 1:
                options.stdlog.write("# created graph in %i seconds.\n" %
                                     (time.time() - t2))
                options.stdlog.flush()

            tt2 = time.time()

            components = networkx.connected_components(graph)

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# calculated connected components in %i seconds\n" %
                    (time.time() - tt2))
                options.stdlog.flush()

        else:

            graph = GraphTools.ExternalGraph()

            iteration = 0
            nclusters = len(orthologs)

            for transcripts1, transcripts2, genes1, genes2, weight in orthologs:

                iteration += 1

                if options.loglevel >= 1:
                    if (iteration % options.report_step == 0):
                        options.stdlog.write(
                            "# iteration: %i/%i (%i%%) in %i seconds.\n" %
                            (iteration, nclusters, 100 * iteration / nclusters,
                             time.time() - t1))
                        options.stdlog.flush()

                f = "%s;%s"

                for g1 in genes1.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes1.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (1, g1), f % (1, g2))
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (1, g1), f % (2, g2))
                for g1 in genes2.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (2, g1), f % (2, g2))

            if options.loglevel >= 1:
                options.stdlog.write("# created graph in %i seconds\n" %
                                     (time.time() - t2))
                options.stdlog.flush()

            tt2 = time.time()

            graph.finalize()
            components = graph.connected_components()

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# retrieved %i connected components in %i seconds\n" %
                    (len(components), time.time() - tt2))
                options.stdlog.flush()

            for x in range(len(components)):
                components[x] = map(lambda y: y.split(";"), components[x])

        tt2 = time.time()

        map_gene2cluster = {}
        for x in range(len(components)):
            for a, b in components[x]:
                map_gene2cluster[b] = x
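        # note: the mapping is keyed by the bare gene id (the species index
        # a is dropped), which assumes gene ids do not collide across species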

        new_orthologs = [[[], [], 0] for x in range(len(components))]

        singletons = []

        for transcripts1, transcripts2, genes1, genes2, weight in orthologs:
            if genes1:
                try:
                    cluster_id = map_gene2cluster[genes1.keys()[0]]
                except KeyError:
                    singletons.append(genes1)
                    continue
            elif genes2:
                try:
                    cluster_id = map_gene2cluster[genes2.keys()[0]]
                except KeyError:
                    singletons.append(genes2)
                    continue
            else:
                raise ValueError("Error, both genes1 and genes2 are empty.")

            new_orthologs[cluster_id][0] += transcripts1
            new_orthologs[cluster_id][1] += transcripts2
            new_orthologs[cluster_id][2] = weight

        nsingletons = len(singletons)

        orthologs = map(
            lambda x: (x[0], x[1], Orthologs.GetGenes(x[0]),
                       Orthologs.GetGenes(x[1]), x[2]), new_orthologs)

        if options.loglevel >= 1:
            options.stdlog.write("# combining genes in %i seconds\n" %
                                 (time.time() - tt2))
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# after combining genes: %i clusters, %i singletons\n" %
                (len(orthologs), nsingletons))

    t3 = time.time()

    if options.loglevel >= 1:
        options.stdlog.write("# gene clustering in %i seconds.\n" % (t3 - t2))

    cluster_id = 0

    def getCode(s):
        if len(s) == 1:
            return "1"
        elif len(s) == 0:
            return "0"
        else:
            return "m"

    for transcripts1, transcripts2, genes1, genes2, weight in orthologs:

        cluster_id += 1

        g1 = getCode(genes1)
        g2 = getCode(genes2)
        t1 = getCode(transcripts1)
        t2 = getCode(transcripts2)

        if options.format == "graph":

            # find best transcripts
            best_transcripts = {}
            if options.only_best:
                # print only best match between each possible set of genes in
                # ortholog pair
                for gg1, tt1 in genes1.items():
                    for gg2, tt2 in genes2.items():
                        best = max_dist
                        best_pair = None
                        for x in tt1:
                            for y in tt2:
                                if x < y:
                                    key = "%s-%s" % (x, y)
                                else:
                                    key = "%s-%s" % (y, x)

                                if key in pairs and best > pairs[key]:
                                    best = pairs[key]
                                    best_pair = (x, y)
                        if best_pair:
                            best_transcripts[best_pair[0]] = 1
                            best_transcripts[best_pair[1]] = 1
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (best_pair[0], best_pair[1], weight, g1, g2,
                                 str(t1), str(t2), cluster_id))
                            noutput += 1
                        else:
                            options.stdlog.write(
                                "# missed link between: %s %s\n" %
                                (str(genes1), str(genes2)))
                            nmissed += 1
            else:
                for x in transcripts1:
                    for y in transcripts2:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (x, y, weight, g1, g2, str(t1), str(t2),
                             cluster_id))
                        noutput += 1

            if options.within:

                # add self links for first species.
                for x in range(len(transcripts1) - 1):
                    for y in range(x + 1, len(transcripts1)):
                        if not best_transcripts or \
                           (transcripts1[x] in best_transcripts and
                                transcripts1[y] in best_transcripts):
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (str(transcripts1[x]), str(transcripts1[y]),
                                 weight, g1, g2, str(t1), str(t2), cluster_id))
                            noutput += 1

                # add self links for second species
                for x in range(len(transcripts2) - 1):
                    for y in range(x + 1, len(transcripts2)):
                        if not best_transcripts or \
                           (transcripts2[x] in best_transcripts and
                                transcripts2[y] in best_transcripts):
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (str(transcripts2[x]), str(transcripts2[y]),
                                 weight, g1, g2, str(t1), str(t2), cluster_id))
                            noutput += 1

                # For orphans, also add self links for genes with a single
                # transcript.
                if options.mode == "orphans":
                    if len(transcripts1) == 1:
                        x, y = 0, 0
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (str(transcripts1[x]), str(transcripts1[y]),
                             weight, g1, g2, str(t1), str(t2), cluster_id))
                    elif len(transcripts2) == 1:
                        x, y = 0, 0
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (str(transcripts2[x]), str(transcripts2[y]),
                             weight, g1, g2, str(t1), str(t2), cluster_id))

        elif options.format == "components":

            for gg1, tt1 in genes1.items():
                for t in tt1:
                    options.stdout.write("%s\t%i\n" % (str(t), cluster_id))

            for gg2, tt2 in genes2.items():
                for t in tt2:
                    options.stdout.write("%s\t%i" % (str(t), cluster_id))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nmissed=%i, nskipped=%i, nsingletons=%i\n"
            % (ninput, noutput, nmissed, nskipped, nsingletons))

    E.Stop()
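A minimal invocation sketch for this script (file names are hypothetical,
assuming a CGAT checkout): ortholog groups are read from stdin and, with
--format=components, one transcript-to-cluster line is written per transcript:

    python optic/orthologs2list.py --format=components \
        < orthologs.interpretation > clusters.tsv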
Code example #20
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: graph_check.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("--filename-missing", dest="filename_missing", type="string",
                      help="missing entries.")
    parser.add_option("--filename-found", dest="filename_found", type="string",
                      help="found entries.")
    parser.add_option("--report-step1", dest="report_step1", type="int",
                      help="report interval for input.")
    parser.add_option("--report-step2", dest="report_step2", type="int",
                      help="report interval for processing.")
    parser.add_option("-n", "--filename-vertices", dest="filename_vertices", type="string",
                      help="filename with vertices.")
    parser.add_option("-u", "--num-fields", dest="num_fields", type="int",
                      help="number of fields to expect.")
    parser.add_option("-o", "--filename-output-pattern", dest="filename_output_pattern", type="string",
                      help="filenames for output (should contain one %s for one section).")
    parser.add_option("-s", "--sort-order", dest="sort_order", type="choice",
                      choices=("numeric", "alphanumeric"),
                      help="sort order - if numeric, vertices are cast to int.")

    parser.set_defaults(
        filename_vertices=None,
        report_step1=100000,
        report_step2=10000,
        filename_output_pattern="%s",
        subsets=False,
        num_fields=11,
        sort_order="alphanumeric",
    )

    (options, args) = E.Start(parser)

    if options.loglevel >= 1:
        options.stdlog.write("# output goes to:\n")
        options.stdlog.write("# errors: %s\n" %
                             options.filename_output_pattern % "errors")
        options.stdlog.write("# missed query: %s\n" %
                             options.filename_output_pattern % "missed_queries")
        options.stdlog.write("# missed sbjct: %s\n" %
                             options.filename_output_pattern % "missed_sbjcts")
        options.stdlog.write("# missed self: %s\n" %
                             options.filename_output_pattern % "missed_self")

    outfile_errors = open(options.filename_output_pattern % "errors", "w")

    if options.sort_order == "numeric":
        f = int
    else:
        f = str

    if options.filename_vertices:
        vv, errors = IOTools.ReadList(
            open(options.filename_vertices, "r"), map_function=f)
        vertices = {}
        # use flags for vertices
        # 1st bit: is query: 1
        # 2nd bit: is sbjct: 2
        # 3rd bit: has self: 4
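        # e.g. vertices[v] == 7 means v was seen as query and as sbjct and
        # has a self link; a vertex still at 0 after the pass was never seen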
        for v in vv:
            vertices[v] = 0
    else:
        raise "for the time being, specify a vertex file."

    options.stdout.write(
        "nqueries\tnsbjcts\tnvertices\tnlinks\tnlines\tnerrors\tncomments\tis_sorted\tnexpected\tnmissed_queries\tnmissed_sbjcts\tnmissed_self\n")

    ncomments, nlinks, nerrors, nlines = 0, 0, 0, 0

    is_sorted = True

    last = None

    for line in sys.stdin:

        nlines += 1
        if line[0] == "#":
            ncomments += 1
            continue

        nlinks += 1

        data = line[:-1].split("\t")

        if len(data) != options.num_fields:
            nerrors += 1
            outfile_errors.write(line)
            outfile_errors.flush()
            continue

        q, s = f(data[0]), f(data[1])

        if q == s:
            vertices[q] |= 4
        vertices[q] |= 1
        vertices[s] |= 2

        if last and last > q:
            is_sorted = False
            outfile_errors.write(
                "# sort inconsistency between %s and %s at line %i\n" % (last, q, nlines))
            outfile_errors.flush()
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sort inconsistency between %s and %s at line %i\n" % (last, q, nlines))
                options.stdlog.flush()

        if options.report_step1 and nlines % options.report_step1 == 0:
            writeInfo(options.stdlog, vertices, nlinks,
                      nlines, nerrors, ncomments, is_sorted)

        last = q

    missed_queries, missed_sbjcts, missed_self = writeInfo(
        options.stdout, vertices, nlinks, nlines, nerrors, ncomments, is_sorted)

    outfile_errors.close()

    if nerrors == 0:
        os.remove(options.filename_output_pattern % "errors")

    if missed_queries:
        writeSet(open(options.filename_output_pattern %
                      "missed_queries", "w"), missed_queries)

    if missed_sbjcts:
        writeSet(open(options.filename_output_pattern %
                      "missed_sbjcts", "w"), missed_sbjcts)

    if missed_self:
        writeSet(open(options.filename_output_pattern %
                      "missed_self", "w"), missed_self)

    E.Stop()
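A minimal invocation sketch (file names are hypothetical): the graph is read
from stdin as tab-separated links and is checked against an explicit vertex
list, with the summary table going to stdout:

    python graph_check.py --filename-vertices=vertices.list \
        --num-fields=11 < graph.links > graph.check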
Code example #21
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/annotate_clusters.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--filename-map",
        dest="filename_map_id2cluster",
        type="string",
        help="filename with mapping information from id to cluster.")

    parser.add_option("--filename-interpro",
                      dest="filename_interpro",
                      type="string",
                      help="filename with interpro domain information.")

    parser.add_option("--filename-pfam",
                      dest="filename_pfam",
                      type="string",
                      help="filename with pfam domain information.")

    parser.set_defaults(
        master_species="dmel_vs_dmel4",
        separator="|",
        filename_map_id2cluster="input.map",
        filename_interpro="/home/andreas/projects/flies/data_1v5/interpro.list",
        filename_pfam="/home/andreas/projects/flies/data_1v5/pfam.list",
        write_no_annotation=True,
        separator_fields=";",
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    clusters, nerrors = IOTools.ReadList(sys.stdin)

    map_id2cluster, map_cluster2id = IOTools.ReadMap(open(
        options.filename_map_id2cluster, "r"),
                                                     both_directions=True)

    if len(clusters) == 0:
        clusters = map_cluster2id.keys()
        clusters.sort()

    map_id2interpro = {}
    if options.filename_interpro:
        map_id2interpro = readAnnotationInterpro(
            open(options.filename_interpro, "r"))

    map_id2pfam = {}
    if options.filename_pfam:
        map_id2pfam = readAnnotationPfam(open(options.filename_pfam, "r"))

    ninput, noutput, nnomaster, nnoannotation = 0, 0, 0, 0
    nskipped = 0

    options.stdout.write("cluster\tgenes")

    if map_id2interpro:
        options.stdout.write("\tinterpro\tidescription")
    if map_id2pfam:
        options.stdout.write("\tpfam\tpdescription")
    options.stdout.write("\n")

    for cluster in clusters:

        ninput += 1
        if cluster not in map_cluster2id:
            if options.loglevel >= 1:
                options.stdlog.write("# cluster %s not in map.\n" % cluster)
            nskipped += 1
            continue

        genes = set()

        for id in map_cluster2id[cluster]:

            s, t, g, q = id.split(options.separator)

            if s != options.master_species:
                continue

            genes.add(g)

        if not genes:
            nnomaster += 1
            continue

        annotations_interpro = {}
        if map_id2interpro:
            for gene in genes:
                if gene in map_id2interpro:
                    for annotation in map_id2interpro[gene]:
                        annotations_interpro[
                            annotation.mIdentifier] = annotation

        annotations_pfam = {}

        if map_id2pfam:
            for gene in genes:
                if gene in map_id2pfam:
                    for annotation in map_id2pfam[gene]:
                        annotations_pfam[annotation.mIdentifier] = annotation

        nannotations = max(len(annotations_pfam), len(annotations_interpro))

        if nannotations == 0 and not options.write_no_annotation:
            nnoannotation += 1
            continue

        options.stdout.write("%s\t%s" % (cluster, ";".join(genes)))

        if map_id2interpro:
            printAnnotations(options.stdout, annotations_interpro, options)

        if map_id2pfam:
            printAnnotations(options.stdout, annotations_pfam, options)

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, nnomaster=%i, nnoannotation=%i\n"
            % (ninput, noutput, nskipped, nnomaster, nnoannotation))

    E.Stop()
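A minimal invocation sketch (file names are hypothetical): cluster
identifiers are read from stdin and annotated with InterPro/Pfam domains via
the id-to-cluster map:

    python optic/annotate_clusters.py --filename-map=input.map \
        < clusters.ids > clusters.annotated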