Example #1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/export_clade_data.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--filename-groups",
                      dest="filename_groups",
                      type="string",
                      help="filename with orthologous groups to extract.")

    parser.set_defaults(
        table_name_malis="malis_genes_aa",
        table_name_members="groups_members",
        mode="sequences",
        filename_groups=None,
        output_format="fasta",
        separator="|",
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    # database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    result = []
    if options.filename_groups:
        data, errors = IOTools.ReadList(open(options.filename_groups, "r"))
        groups = map(lambda x: x.split(options.separator)[:2], data)

        result = getMembersOfGroups(dbhandle, groups, options)

    if options.output_format == "fasta":
        # sequence records are data output and go to stdout, not the log;
        # alignment gaps ("-") are stripped from the sequence
        for schema, gene_id, sequence in result:
            options.stdout.write(">%s%s%s\n%s\n" %
                                 (schema, options.separator, gene_id,
                                  re.sub("-", "", sequence)))

    E.Stop()
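
A minimal sketch of the FASTA output step above, isolated from the database code: the records are invented stand-ins for what getMembersOfGroups() would return, and "|" mirrors the default separator.

import re

separator = "|"
# hypothetical (schema, gene_id, sequence) records
records = [("hs", "ENSG0001", "MKT--AL"), ("mm", "ENSMUSG0002", "MKTV-AL")]

for schema, gene_id, sequence in records:
    # header is ">schema|gene_id"; alignment gaps ("-") are stripped
    print(">%s%s%s\n%s" % (schema, separator, gene_id,
                           re.sub("-", "", sequence)))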
Example #2
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1",
                      "--infile",
                      dest="filename_input",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = open(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split on the first "=" only, so values may themselves contain "="
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = open(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None
    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)
            # compute power of test
            power = R.power_t_test(n=len(values),
                                   delta=abs(stat["mean"]),
                                   sd=stat["stddev"],
                                   sig_level=0.05)['power']
            diff_at_power95 = R.power_t_test(n=len(values),
                                             power=0.95,
                                             sd=stat["stddev"],
                                             sig_level=0.05)['delta']

    elif options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    if power is not None:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.Stop()
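
The key=value parsing above, which forwards extra command-line arguments to the R test call, works standalone; a sketch of the same pattern (the argument strings are made up):

def split_args(args):
    """Split 'key=value' strings into kwargs; keep the rest positional."""
    kwargs, xargs = {}, []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)
    return xargs, kwargs

# hypothetical extra arguments for R's t.test
print(split_args(["alternative=greater", "conf.level=0.99", "exact"]))
# -> (['exact'], {'alternative': 'greater', 'conf.level': '0.99'})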
Example #3
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("ks", "mwu"),
        help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.Start(
        parser,
        add_pipe_options=True,
        add_psql_options=True,
    )

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map,
                                                  "r"),
                                             map_functions=(str, float))

    values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"),
                                        map_category=map_category2value)
    values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                        map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
      )

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')"""
      )
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')"""
      )
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )

    print "## Results for %s" % result['method']
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print x, result[x]

    E.Stop()
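
The script drives R through rpy for both tests. For reference only, scipy.stats offers equivalents; this is an alternative sketch, not what the script itself uses:

from scipy import stats

values1 = [1.2, 2.3, 2.9, 3.1, 4.0]
values2 = [2.0, 2.8, 3.5, 4.2, 5.1]

# method "ks": two-sample Kolmogorov-Smirnov test
print(stats.ks_2samp(values1, values2))
# method "mwu": Mann-Whitney U, i.e. the unpaired Wilcoxon rank-sum test
print(stats.mannwhitneyu(values1, values2, alternative="two-sided"))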
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for histograms."
                      "")
    parser.add_option("-f",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n",
        "--norm-test",
        dest="norm_test",
        action="store_true",
        help=
        """test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot",
                      dest="plot",
                      action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title",
                      dest="title",
                      type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        max_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split on the first "=" only, so values may themselves contain "="
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map,
                                                  "r"),
                                             map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n"
            % (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                            map_function=f,
                                            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=False,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=True,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:

        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and (options.min_value or options.max_value):
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot and options.hardcopy:
        R.dev_off()

    E.Stop()
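
The histogram break computation above is plain Python and worth isolating; a sketch covering the same two modes (num_bins with an explicit range, or a fixed bin_size), with made-up values:

def compute_breaks(min_value, max_value, num_bins=None, bin_size=None):
    """Breakpoints as computed in the plotting section above."""
    if num_bins is not None:
        step = float(max_value - min_value) / (num_bins + 1)
        return [min_value + x * step for x in range(num_bins)]
    if bin_size is not None:
        n = int((max_value - min_value) / bin_size) + 1
        return [min_value + x * bin_size for x in range(n + 1)]
    return None

print(compute_breaks(0.0, 10.0, num_bins=4))   # [0.0, 2.0, 4.0, 6.0]
print(compute_breaks(0.0, 1.0, bin_size=0.5))  # [0.0, 0.5, 1.0, 1.5]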
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2malis.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-d",
        "--pattern-output",
        dest="pattern_output",
        type="string",
        help="filename pattern for output multiple alignment files.")

    parser.add_option("-f",
                      "--filename-filter",
                      dest="filename_filter",
                      type="string",
                      help="filename with strings to filter by.")

    parser.add_option("--list-filter",
                      dest="list_filter",
                      type="string",
                      help="list of strings to filter by.")

    parser.set_defaults(
        pattern_output="%s.mali",
        methods="",
        parameters="",
        filename_filter=None,
        list_filter=None,
    )

    addOptions(parser)

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    if not options.pattern_mali:
        raise ValueError(
            "Please specify a pattern to find the malis using --pattern-mali")

    ####################################################################
    ## Read components
    ####################################################################
    map_seq_id2component, map_component2seq_id, map_component2input_id = \
        readComponents(options)

    ####################################################################
    ## Read filtering information
    ####################################################################
    if options.filename_filter:
        id_filter, nerrors = IOTools.ReadList(
            open(options.filename_filter, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read %i identifiers to filter each multiple alignment with.\n"
                % len(id_filter))
            options.stdlog.flush()
    elif options.list_filter:
        id_filter = options.list_filter.split(",")
    else:
        id_filter = None

    ####################################################################
    ## Read regions to mask
    ####################################################################
    map_component2masks = readMasks(options, map_component2input_id)

    ####################################################################
    ## Read regions to extract
    ####################################################################
    map_component2extracts = readExtracts(options, map_component2input_id)

    ####################################################################
    ## Read regions to annotate
    ####################################################################
    map_component2annotations = readAnnotations(options,
                                                map_component2input_id)

    ####################################################################
    ## Prepare for run
    ####################################################################
    component_ids = sorted(map_component2seq_id.keys())

    if options.loglevel >= 1:
        options.stdlog.write("# %i component ids to start with.\n" %
                             (len(component_ids)))

    component_ids, map_sample2reference = selectComponents(
        component_ids, map_component2seq_id, map_component2input_id, id_filter,
        options)

    if options.test:
        component_ids = component_ids[:options.test]

    if options.loglevel >= 1:
        options.stdlog.write("# %i component ids selected for output.\n" %
                             (len(component_ids)))

    ninput = 0
    noutput = 0
    nskipped = 0
    nskipped_length = 0

    for component_id in component_ids:

        ninput += 1

        if options.loglevel >= 3:
            options.stdlog.write("# processing component %s\n" %
                                 (component_id))

        mali = getMali(component_id, map_component2seq_id,
                       map_component2input_id, id_filter, options)

        if mali is None:
            E.warn("empty mali returned for component %s" % (component_id))
            nskipped += 1
            continue

        if mali.getNumColumns() == 0:
            E.warn("skipping output of empty alignment for component %s" %
                   (component_id))
            nskipped += 1
            continue

        mali.setName(str(component_id))

        ###############################################################
        ## add annotations
        if map_component2annotations is not None:
            annotateAlignment(mali, map_component2annotations, options)

        ###############################################################
        ## mask the alignment
        maskAlignment(mali, map_component2masks, map_component2extracts,
                      map_sample2reference, options)

        if mali.getNumColumns() < options.minimum_mali_length:
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped, because length %i less than threshold.\n"
                    % (component_id, mali.getNumColumns()))
            continue

        ###############################################################
        ## prepare the mali for output
        if "%s" not in options.pattern_output:
            append = True
        else:
            append = False

        output_filename = re.sub("%s", component_id, options.pattern_output)
        input_id = map_component2input_id[component_id]

        if options.loglevel >= 2:
            options.stdlog.write(
                "# component %s: input from %s, goes to %s\n" %
                (component_id, input_id, output_filename))

        dirname = os.path.dirname(output_filename)

        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)

        if not os.path.exists(output_filename):
            mali.writeToFile(open(output_filename, "w"),
                             format=options.output_format)
            noutput += 1
        else:
            if append:
                mali.writeToFile(open(output_filename, "a"),
                                 format=options.output_format)
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# skipping because output for component %s already exists: %s\n"
                        % (component_id, output_filename))
                nskipped += 1

        # if we only sample, stop if you have reached
        # the desired number
        if options.sample and noutput == options.sample:
            break

    E.info("ninput=%i, noutput=%i, nskipped=%i, nskipped_length=%i" %
           (ninput, noutput, nskipped, nskipped_length))

    E.Stop()
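
A sketch of the output-naming convention used above: a pattern containing %s yields one file per component (existing files are skipped), while a fixed filename means every alignment is appended to it. The component id here is invented:

import re

def output_mode(pattern_output, component_id):
    append = "%s" not in pattern_output
    filename = re.sub("%s", component_id, pattern_output)
    return filename, append

print(output_mode("%s.mali", "cluster_42"))   # ('cluster_42.mali', False)
print(output_mode("all.mali", "cluster_42"))  # ('all.mali', True)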
Example #6
                        aggregate="mean",
                        value_format="%5.2f",
                        method="counts")

    (options, args) = E.Start(parser)

    if not options.filename_map:
        raise ValueError(
            "please supply a filename mapping probesets to identifiers.")

    map_probe2locus = IOTools.ReadMap(open(options.filename_map, "r"))

    matrix, row_headers, col_headers = MatlabTools.readMatrix(
        sys.stdin, format="full", headers=options.headers)

    if options.filename_tissues:
        tissues, nerrors = IOTools.ReadList(open(options.filename_tissues,
                                                 "r"))
        tissues = set(tissues)
        columns = [
            x for x, header in enumerate(col_headers) if header in tissues
        ]
    else:
        columns = range(len(col_headers))

    nrows, ncols = len(row_headers), len(col_headers)

    ninput, noutput, nkept = 0, 0, 0

    no_map = []
    degenerate = []
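
The fragment above breaks off before its main loop, but the column-selection step is simple to show on its own; tissue and header names here are invented:

col_headers = ["liver", "brain", "kidney", "heart"]
tissues = set(["brain", "heart"])

# keep only the columns whose header names a requested tissue
columns = [x for x, header in enumerate(col_headers) if header in tissues]
print(columns)  # [1, 3]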
Example #7
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2summary.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-a",
        "--alphabet",
        dest="alphabet",
        type="choice",
        choices=("aa", "na"),
        help="alphabet to use [default=%default].",
    )

    parser.add_option(
        "-p",
        "--pattern-mali",
        dest="pattern_mali",
        type="string",
        help="filename pattern for input multiple alignment files.")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        mask_chars="nN",
        gap_chars="-.",
        alphabet="na",
        pattern_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.pattern_mali:
        prefix_header = "prefix\t"
        prefix_row = "\t"
    else:
        prefix_header = ""
        prefix_row = ""

    options.stdout.write(
        "%sncol_mean\tpcol_mean\tncol_median\tpcol_median\tnrow_mean\tprow_mean\tnrow_median\tprow_median\n"
        % (prefix_header, ))

    ninput, nskipped, noutput, nempty = 0, 0, 0, 0

    if options.pattern_mali:

        ids, errors = IOTools.ReadList(sys.stdin)

        E.debug("read %i identifiers.\n" % len(ids))

        nsubstitutions = len(re.findall("%s", options.pattern_mali))

        for id in ids:

            filename = options.pattern_mali % tuple([id] * nsubstitutions)
            ninput += 1

            if not os.path.exists(filename):
                nskipped += 1
                continue

            # read multiple alignment in various formats
            mali = Mali.Mali()
            mali.readFromFile(open(filename, "r"), format=options.input_format)

            if mali.isEmpty():
                nempty += 1
                continue

            E.debug("read mali with %i entries from %s.\n" %
                    (len(mali), filename))

            if analyzeMali(mali, options, prefix_row="%s\t" % id):
                noutput += 1

    else:

        # read multiple alignment in various formats
        mali = Mali.Mali()
        mali.readFromFile(sys.stdin, format=options.input_format)
        ninput += 1

        if mali.isEmpty():
            nempty += 1
        else:
            E.debug("read mali with %i entries." % (len(mali)))

            if analyzeMali(mali, options, prefix_row=""):
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nempty=%i." %
           (ninput, noutput, nskipped, nempty))

    E.Stop()
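
The filename construction above counts the %s placeholders in the pattern and substitutes the same identifier into each; a standalone sketch with an invented pattern and id:

import re

pattern_mali = "data/%s/%s.fasta"
nsubstitutions = len(re.findall("%s", pattern_mali))

id = "cluster_7"
filename = pattern_mali % tuple([id] * nsubstitutions)
print(filename)  # data/cluster_7/cluster_7.fasta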
Example #8
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/filter_fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        choices=("longest-transcript", "ids", "quality"),
        help=
        """method to apply to sequences ["longest-transcript", "ids", "quality"]."""
    )

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-t",
                      "--type",
                      dest="type",
                      type="choice",
                      choices=("aa", "na"),
                      help="sequence type (aa or na).")

    parser.set_defaults(
        methods="",
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        separator="|",
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.FastaIterator(sys.stdin)

    if options.method == "quality":
        filter_quality = set(options.parameters)
    else:
        filter_quality = None

    sequences = []
    ninput, noutput, nskipped = 0, 0, 0

    for cur_record in iterator:

        ninput += 1

        if filter_quality:
            id = re.split(" ", cur_record.title)[0]
            species, transcript, gene, quality = id.split(options.separator)

            if quality not in filter_quality:
                nskipped += 1
                continue

        sequences.append(cur_record)

    take = None

    if options.method == "longest-transcript":

        take = []
        lengths = []
        for x in range(len(sequences)):
            length = len(re.sub(" ", "", sequences[x].sequence))
            id = re.split(" ", sequences[x].title)[0]
            species, transcript, gene = id.split(options.separator)[:3]
            # negative length so that sorting puts the longest first
            lengths.append((species, gene, -length, x))

        lengths.sort()

        last_species = None
        last_gene = None

        for species, gene, length, x in lengths:
            if last_species == species and last_gene == gene:
                continue
            take.append(x)
            last_species, last_gene = species, gene

    elif options.method == "ids":

        take = []
        ids, nerrors = IOTools.ReadList(open(options.parameters[0], "r"))
        del options.parameters[0]

        ids = set(ids)

        for x in range(len(sequences)):
            id = re.split(" ", sequences[x].title)[0]
            if id in ids:
                take.append(x)

    if take is not None:
        sequences = [sequences[x] for x in take]

    noutput = len(sequences)

    for sequence in sequences:
        options.stdout.write(">%s\n%s\n" % (sequence.title, sequence.sequence))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
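
The longest-transcript filter above works by sorting (species, gene, -length, index) tuples so that the first entry per (species, gene) pair is the longest; a sketch with invented records:

# (species, gene, length, index) stand-ins for parsed FASTA titles
records = [("hs", "g1", 300, 0), ("hs", "g1", 450, 1), ("mm", "g2", 200, 2)]

lengths = sorted((s, g, -l, x) for s, g, l, x in records)

take = []
last = None
for species, gene, neg_length, x in lengths:
    if (species, gene) == last:
        continue  # a longer transcript of this gene was already kept
    take.append(x)
    last = (species, gene)

print(take)  # [1, 2]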
Example #9
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("--filename-filter-positives",
                      dest="filename_filter_positives",
                      type="string",
                      help="filename with positive list of trees to analyze.")

    parser.add_option("-s",
                      "--filename-species-tree",
                      dest="filename_species_tree",
                      type="string",
                      help="filename with species tree.")

    parser.add_option(
        "--filename-species2colour",
        dest="filename_species2colour",
        type="string",
        help=
        "filename with map of species to colours. If not given, random colours are assigned to species."
    )

    parser.add_option("-t",
                      "--species-tree",
                      dest="species_tree",
                      type="string",
                      help="species tree.")

    parser.add_option(
        "-e",
        "--filename-locations",
        dest="filename_locations",
        type="string",
        help=
        "filename with map of transcript information to location information.")

    parser.add_option("--no-create",
                      dest="create",
                      action="store_false",
                      help="do not create files, but append to them.")

    parser.add_option(
        "--max-separation",
        dest="max_separation",
        type="int",
        help=
        "maximum allowable separation between syntenic segments for border plot (set to 0, if syntey is enough)."
    )

    parser.add_option(
        "--filename-species2url",
        dest="filename_species2url",
        type="string",
        help="filename with mapping information of species to URL.")

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="prefix to add as first column.")

    parser.add_option(
        "--outgroup-species",
        dest="outgroup_species",
        type="string",
        help="species to used as outgroups. Separate multiple species by ','.")

    parser.add_option("--subtrees-trees",
                      dest="subtrees_trees",
                      action="store_true",
                      help="write trees for subtrees.")

    parser.add_option("--subtrees-identifiers",
                      dest="subtrees_identifiers",
                      action="store_true",
                      help="write identifiers of subtrees.")

    parser.add_option("--svg-add-ids",
                      dest="svg_add_ids",
                      action="store_true",
                      help="add node ids to svg plot.")

    parser.add_option("--svg-otus",
                      dest="svg_otus",
                      type="string",
                      help="otus to output in svg species tree.")

    parser.add_option("--svg-branch-lenghts",
                      dest="svg_branch_lengths",
                      type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")

    parser.add_option("--print-totals",
                      dest="print_totals",
                      action="store_true",
                      help="output totals sections.")

    parser.add_option("--print-subtotals",
                      dest="print_subtotals",
                      action="store_true",
                      help="output subtotals sections.")

    parser.add_option(
        "--print-best",
        dest="print_best",
        action="store_true",
        help="output best node assignment for each node in gene tree.")

    parser.add_option("--print-svg",
                      dest="print_svg",
                      action="store_true",
                      help="output svg files.")

    parser.add_option("--print-species-svg",
                      dest="print_species_svg",
                      action="store_true",
                      help="output species svg files.")

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        """output pattern for separate output of sections [default: %default].
                       Set to None, if output to stdout. Can contain one %s to be substituted with section."""
    )

    parser.add_option(
        "--output-pattern-svg",
        dest="output_pattern_svg",
        type="string",
        help=
        "filename for svg output. If it contains %s, this is replaced by gene_tree name."
    )

    parser.add_option(
        "--filename-node-types",
        dest="filename_node_types",
        type="string",
        help="filename with node type information from a previous run.")

    parser.add_option("--analyze-resolution-data",
                      dest="analyze_resolution_data",
                      type="choice",
                      action="append",
                      choices=("stats", "histograms"),
                      help="stdin is resolution data.")

    parser.add_option("--filter-quality",
                      dest="filter_quality",
                      type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")

    parser.add_option("--filter-location",
                      dest="filter_location",
                      type="choice",
                      choices=("all", "local", "non-local", "cis", "unplaced"),
                      help="filter predictions by location.")

    parser.add_option("--remove-unplaced",
                      dest="remove_unplaced",
                      action="store_true",
                      help="remove predictions on unplaced contigs.")

    parser.add_option("--skip-without-outgroups",
                      dest="skip_without_outgroups",
                      action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        species_regex=r"^([^|]+)\|",
        gene_regex=r"^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={
            "Speciation": 0,
            "SpeciationDeletion": 1,
            "Transcripts": 2,
            "DuplicationLineage": 3,
            "Duplication": 4,
            "DuplicationDeletion": 5,
            "DuplicationInconsistency": 6,
            "Outparalogs": 7,
            "InconsistentTranscripts": 8,
            "Inconsistency": 9,
            "Masked": 10
        },
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]
    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    # read positive list of malis
    #########################################################################
    if options.filename_filter_positives:
        filter_positives, nerrors = IOTools.ReadList(
            open(options.filename_filter_positives, "r"))
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    # read location info
    #########################################################################
    if options.filename_locations:
        map_id2location = TreeReconciliation.readLocations(
            open(options.filename_locations, "r"), extract_species)
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all") \
            and not options.filename_locations:
        raise ValueError("please supply a file with location information.")

    #########################################################################
    # delete output files
    #########################################################################
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids", "details", "trees",
                        "nodes", "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)

    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    # main loop over gene trees
    #########################################################################
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #######################################################################
        # get identifier for this tree and update prefixes accordingly
        #######################################################################
        if options.prefix:
            # with more than one input tree, include the tree name in prefixes
            if len(gene_nexus.trees) > 1:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
                prefix_prefix = options.prefix + "_" + gene_tree.name + "_"
                prefix_name = options.prefix + "_" + gene_tree.name
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
                prefix_prefix = options.prefix + "_"
                prefix_name = options.prefix
        else:
            if len(gene_nexus.trees) > 1:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
                prefix_prefix = gene_tree.name + "_"
                prefix_name = gene_tree.name
            else:
                prefix_header, prefix_row, prefix_prefix, prefix_name = "", "", "", ""

        #######################################################################
        # apply filters to gene tree
        #######################################################################
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        this_species_list = map(extract_species, otus)
        # check, if only outgroups
        if options.outgroup_species:
            if not set(this_species_list).difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n"
                        % gene_tree.name)
                continue

            if options.skip_without_outgroups and not set(
                    this_species_list).intersection(options.outgroup_species):
                nskipped_outgroups += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: no outgroups - skipped.\n" %
                        gene_tree.name)
                continue

        #######################################################################
        # reroot gene tree, if outgroups have been given.
        #######################################################################
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #######################################################################
        # compute distance to root for each node
        #######################################################################
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #######################################################################
        # compute counts
        #######################################################################
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to maximum distance to root)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = TreeReconciliation.getAnalysisSets(
            gene_tree, extract_quality, options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root,
            gene_tree,
            gene_set,
            options,
            extract_species,
            method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n"
                    % gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name, options.format_branch_length % height,
                     node_id, node.data.taxon))

            relheight = height / reference_height
            try:
                heights_per_species[species].append(height)
            except KeyError:
                heights_per_species[species] = [height]
                relheights_per_species[species] = []

            relheights_per_species[species].append(relheight)

            # do not use outgroup species
            if options.outgroup_species and species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree, options,
                        prefix_header, prefix_row)

        #######################################################################
        # update total counts
        #######################################################################
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)

        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
            prefix_prefix = options.prefix + "_" + "total" + "_"
            prefix_name = options.prefix + "_" + "total"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"
            prefix_prefix = "total" + "_"
            prefix_name = "total"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree, options,
                    prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, nskipped_outgroups=%i, noutput=%i\n"
            % (ninput, nfiltered, nskipped, nskipped_filter,
               nskipped_outgroups, noutput))

    E.Stop()
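
The per-species height accumulation above grows dict-of-list values with try/except KeyError; collections.defaultdict expresses the same idea more compactly (species names and heights here are invented):

from collections import defaultdict

heights_per_species = defaultdict(list)
relheights_per_species = defaultdict(list)
reference_height = 2.0

for species, height in [("hs", 0.5), ("mm", 0.8), ("hs", 1.2)]:
    heights_per_species[species].append(height)
    relheights_per_species[species].append(height / reference_height)

print(dict(heights_per_species))     # {'hs': [0.5, 1.2], 'mm': [0.8]}
print(dict(relheights_per_species))  # {'hs': [0.25, 0.6], 'mm': [0.4]}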
Example #10
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    addOptions(parser)

    parser.add_option(
        "--filename-coordinates",
        dest="filename_coordinates",
        type="string",
        help="filename of coordinates that constitute the multiple alignment.")

    parser.add_option("--filename-identifiers",
                      dest="filename_identifiers",
                      type="string",
                      help="filename with list of identifiers to use.")

    parser.add_option(
        "-x",
        "--pattern-identifier",
        dest="pattern_identifier",
        type="string",
        help="pattern to extract identifier from a sequence header.")

    parser.add_option(
        "-w",
        "--width",
        dest="width",
        type="int",
        help=
        "width of an alignment column (choose 3 for codon alignments) [default=%default]."
    )

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      choices=("filter-variants", ),
                      help="methods to apply")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("--mask-acgtn",
                      dest="mask_actgn",
                      action="store_true",
                      help="mask. Anything not [ACGTN] will be N.")

    parser.set_defaults(
        pattern_identifier="(^\S+)",
        methods=[],
        parameters="",
        filename_identifiers=None,
        filename_coordinates=None,
        mask_acgtn=False,
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    if not options.pattern_mali:
        raise ValueError(
            "Please specify a pattern to find the malis using --pattern-mali")

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read components
    ####################################################################
    map_seq_id2component, map_component2seq_id, map_component2input_id = \
        readComponents( options )

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read regions to mask
    ####################################################################
    map_component2masks = readMasks(options, map_component2input_id)

    ####################################################################
    ####################################################################
    ####################################################################
    ## Read regions to extract
    ####################################################################
    map_component2extracts = readExtracts(options, map_component2input_id)

    ####################################################################
    ####################################################################
    ####################################################################
    ## read identifiers
    ####################################################################
    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
        identifiers_set = set(identifiers)
    else:
        identifiers = None
        identifiers_set = None

    ####################################################################
    ####################################################################
    ####################################################################
    ## Prepare for run
    ####################################################################

    rx_identifier = re.compile(options.pattern_identifier)

    ## build list of concatenated malis
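    ## each identifier collects one aligned string per component; these are
    ## gathered for concatenation into a single alignment.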
    sequences = {}
    if identifiers:
        for id in identifiers_set:
            sequences[id] = []
    else:
        identifiers_set = set()
        for seq_id in map_seq_id2component.keys():
            id = rx_identifier.search(seq_id).groups()[0]
            sequences[id] = []
            identifiers_set.add(id)
        identifiers = list(identifiers_set)
        identifiers.sort()

    component_ids = map_component2seq_id.keys()
    component_ids.sort()

    if options.test:
        component_ids = component_ids[:options.test]

    ####################################################################
    ####################################################################
    ####################################################################
    ## Build list of components to output.
    ####################################################################
    component_ids, map_sample2reference = selectComponents(
        component_ids, map_component2seq_id, map_component2input_id, None,
        options)

    nskipped = 0
    new_component_ids = []

    for component_id in component_ids:

        try:
            mali = getMali(component_id, map_component2seq_id,
                           map_component2input_id, None, options)
        except OSError, msg:
            E.warn("could not find mali %s: %s" % (component_id, msg))
            nskipped += 1
            continue

        ###############################################################
        ###############################################################
        ###############################################################
        ## check if all identifiers in component are present in mali
        ## and build a temporary alignment with all of those found
        component_set = set(map_component2seq_id[component_id])
        if len(component_set.difference(set(mali.getIdentifiers()))) != 0:
            nskipped += 1
            continue

        found = {}
        is_double = None
        temp_mali = Mali.Mali()
        temp_mali.setName(str(component_id))

        for seq_id in map_component2seq_id[component_id]:
            id = rx_identifier.search(seq_id).groups()[0]
            if id not in identifiers_set:
                continue
            if id in found:

                if options.skip_doubles:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# component %s: removed double entry %s\n" %
                            (component_id, seq_id))
                    continue
                else:
                    is_double = id
                    break

            if options.output_format == "codeml":
                if len(mali[seq_id]) % 3 != 0:
                    raise "length of sequence %s is not a multiple of 3: %i" % (
                        seq_id, len(mali[seq_id]))

            ## change identifier to id
            found[id] = True
            entry = mali.getEntry(seq_id)
            temp_mali.addSequence(id, entry.mFrom, entry.mTo, entry.mString)

        if is_double:
            nskipped += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped because it contains double entry %s\n"
                    % (component_id, is_double))
            continue

        if set(found.keys()) != identifiers_set:
            nskipped += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped because incomplete: %s\n" %
                    (component_id, str(found.keys())))
            continue

        ###############################################################
        ###############################################################
        ###############################################################
        ## mask the temporary alignment
        maskAlignment(temp_mali, map_component2masks, map_component2extracts,
                      map_sample2reference, options)

        for id, o in temp_mali.items():
            if options.mask_acgtn:
                s = re.sub("[^ACGTNactgn]", "N", o.mString)
            else:
                s = o.mString
            sequences[id].append(s)

        new_component_ids.append(component_id)

        ## if we only sample, stop if you have reached
        ## the desired number
        if options.sample and len(new_component_ids) == options.sample:
            break
Example #11
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/regions2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option(
        "-i",
        "--ids",
        dest="ids",
        type="string",
        help=
        "comma separated list of prediction ids. Use 'all' to use all predictions."
    )

    parser.add_option("-f",
                      "--filename-ids",
                      dest="filename_ids",
                      type="string",
                      help="filename with prediction ids.")

    parser.add_option("-t",
                      "--type",
                      dest="type",
                      type="choice",
                      choices=("genes", "mrnas", "introns", "intronic",
                               "exons", "exonic", "intergenic",
                               "exons-third-codons"),
                      help="type to output.")

    parser.add_option(
        "-e",
        "--extend-region",
        dest="extend_region",
        type="int",
        help="regions are extended by this margin at either end.")

    parser.add_option(
        "-r",
        "--shorten-region",
        dest="shorten_region",
        type="int",
        help="regions are shortened by this margin at either end.")

    parser.add_option("-m",
                      "--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of segment.")

    parser.add_option("-s",
                      "--schema",
                      dest="schema",
                      type="string",
                      help="schema to take data from.")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("fasta", "table", "region"),
                      help="output formats.")

    parser.add_option("--fasta-format",
                      dest="fasta_format",
                      type="choice",
                      choices=("id-coordinates", "coordinates",
                               "schema-coordinates"),
                      help="output formats for fasta formatted headers.")

    parser.add_option("--orthologs",
                      dest="orthologs",
                      action="store_true",
                      help="lookup up orthologs of prediction ids.")

    parser.add_option("--multiple",
                      dest="multiple",
                      action="store_true",
                      help="""lookup up predictions in multiple species.
                       Identifiers should be given as schema|prediction_id[|additional_fields].
                       Note that the genome file locations have to be consistent."""
                      )

    parser.add_option("--id-format",
                      dest="id_format",
                      type="choice",
                      choices=("id", "schema-id", "full"),
                      help="output format for ids.")

    parser.add_option("--taboo-regions",
                      dest="taboo_regions",
                      type="choice",
                      choices=("same", "both"),
                      help="check for overlap in same/both strands.")

    parser.add_option("--filename-taboo-regions",
                      dest="filename_taboo_regions",
                      type="string",
                      help="filename with information about taboo regions.")

    parser.add_option(
        "--filename-properties",
        dest="filename_properties",
        type="string",
        help=
        "filename with mapping information between features and properties.")

    parser.add_option(
        "--invert-properties",
        dest="invert_properties",
        action="store_true",
        help=
        "instead of printing features which have properties, print those that do not."
    )

    parser.add_option(
        "--output-coordinate-format",
        dest="output_coordinate_format",
        type="choice",
        choices=("full", "long"),
        help=
        """output format of coordinates. Coordinates are written as contig:strand:from:to in
zero-based forward/reverse strand coordinates in open/closed notation. 'long' includes
the contig length as a fifth field"""
    )

    parser.set_defaults(genome_file="genome",
                        identifiers=None,
                        filename_ids="-",
                        ids=None,
                        extend_region=0,
                        shorten_region=0,
                        tablename_predictions="predictions",
                        tablename_exons="exons",
                        tablename_genes="genes",
                        tablename_quality="quality",
                        schema=None,
                        output_format="fasta",
                        fasta_format="id-coordinates",
                        type="mrnas",
                        min_length=1,
                        id_format="id",
                        mmultiple=False,
                        separator="|",
                        filename_taboo_regions=False,
                        output_coordinate_format="full",
                        filename_properties=None,
                        invert_property=False,
                        report_step=10000)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.orthologs: options.id_format = "schema-id"

    ## database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    ## Step 1 : Input of predictions

    ## read identifiers from file, command line arguments or stdin.
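    ## precedence: --ids (including the special values 'all' and 'nr'),
    ## then positional arguments, then --filename-ids ('-' reads stdin).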

    if options.ids in ("all", "nr"):
        prediction_ids = options.ids
        if options.loglevel >= 1:
            options.stdlog.write("# using all prediction ids.\n")
            options.stdlog.flush()
    elif options.ids:
        prediction_ids = options.ids.split(",")
    elif len(args) > 0:
        prediction_ids = args

    elif options.filename_ids:
        prediction_ids = []

        if options.filename_ids == "-":
            prediction_ids += IOTools.ReadList(sys.stdin)[0]
        elif options.filename_ids:
            prediction_ids += IOTools.ReadList(open(options.filename_ids,
                                                    "r"))[0]

        if len(prediction_ids) == 0:
            raise ValueError("no prediction identifiers given.")

        if options.loglevel >= 1:
            options.stdlog.write("# read %i prediction ids.\n" %
                                 len(prediction_ids))
            options.stdlog.flush()

    if options.filename_taboo_regions:
        ## Note: the input has to be in forward coordinates in order for option "both" to work.
        taboo_regions = Regions.RegionFilter()
        if options.taboo_regions == "both":
            ignore_strand = True
        else:
            ignore_strand = False
        taboo_regions.readFromFile(open(options.filename_taboo_regions, "r"),
                                   ignore_strand=ignore_strand)
    else:
        taboo_regions = None

    map_feature2property = getMapFeature2Property(options)

    processPredictions(dbhandle, options.schema, options, prediction_ids,
                       taboo_regions, map_feature2property)

    E.Stop()
Example #12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(usage=globals()["__doc__"])

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="schema of master species.")

    parser.set_defaults(
        tablename_orthologs="orthology_pairwise1v5.orthologlinks_first",
        filename_ids="-",
        schemas=None,
        species=None,
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    dbhandle = pgdb.connect(options.psql_connection)

    if options.filename_ids == "-":
        ids, errors = IOTools.ReadList(sys.stdin)

    extra_options = ["schema1 = '%s'" % options.species,
                     "prediction_id1 IN ('%s')" % "','".join( ids ) ]
    
    if options.schemas:
        extra_options.append( "schema2 IN ('%s')" % "','".join(options.schemas))
        
    statement = """SELECT prediction_id1, schema2, prediction_id2, gene_id2, gd1, gd2, td1, td2
    FROM %s
    WHERE schema1 != schema2 AND %s
    ORDER BY prediction_id1""" % (options.tablename_orthologs,
                                  " AND ".join(extra_options))

    cc = dbhandle.cursor()
    cc.execute(statement)
    result = cc.fetchall()
    cc.close()

    if options.schemas:
        schemas = options.schemas
    else:
        # sort for deterministic column order in the output
        schemas = sorted(set(map(lambda x: x[1], result)))

    ## compute counts
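    ## degeneracies[x][s] holds the (gd1, gd2, td1, td2) tuple for
    ## prediction x in schema s; (0, 0, 0, 0) marks a missing ortholog.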
    degeneracies = {}
    for x in ids:
        degeneracies[x] = {}
        for s in schemas:
            degeneracies[x][s] = (0, 0, 0, 0)
            
    for prediction_id1, schema2, prediction_id2, gene_id2, gd1, gd2, td1, td2 in result:
        degeneracies[prediction_id1][schema2] = (gd1, gd2, td1, td2)

    ## output
    options.stdout.write("%s\t%s\n" % ("prediction_id", "\t".join(schemas)))
    for x in ids:
        options.stdout.write("%s" % x)
        for s in schemas:
            options.stdout.write("\t%s:%s:%s:%s" % degeneracies[x][s])
        options.stdout.write("\n")
    
    E.Stop()
Example #13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_ribosomes.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--schemas",
                      dest="schemas",
                      type="string",
                      help="schemas in the set.")

    parser.add_option("-e",
                      "--field-extract",
                      dest="field_extract",
                      type="string",
                      help="pattern for the field to extract.")

    parser.add_option("-c",
                      "--field-compare",
                      dest="field_compare",
                      type="string",
                      help="pattern for the field to compare.")

    parser.add_option("-i",
                      "--filename-identifiers",
                      dest="filename_identifiers",
                      type="string",
                      help="identifiers in the positive set.")

    parser.add_option("-u",
                      "--filename-subset",
                      dest="filename_subset",
                      type="string",
                      help="subset in the positive set.")

    parser.add_option("--filter-min-ratio",
                      dest="filter_min_ratio",
                      type="float",
                      help="minimum boundary for filter.")

    parser.add_option("--filter-max-ratio",
                      dest="filter_max_ratio",
                      type="float",
                      help="maximum boundary for filter.")

    parser.add_option(
        "-o",
        "--output-fields",
        dest="output_fields",
        type="string",
        help=
        "output fields, choices are: zscore, val, nvals, sum, min, max, stddev, mean, median."
    )

    parser.add_option(
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help=
        "pattern for table headers, should contain %s for schema and %s for field name."
    )

    parser.add_option(
        "-f",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("table", "list", "values"),
        help="output format. Tabular form (one row per ortholog) or list form."
    )

    parser.add_option("--format",
                      dest="format",
                      type="string",
                      help="output format for numbers.")

    parser.add_option("--remove-na",
                      dest="remove_na",
                      action="store_true",
                      help="remove entries with any na values.")

    parser.set_defaults(
        field_extract="%s_length",
        field_compare="%s_length",
        filename_identifiers=None,
        filename_subset=None,
        filter_min_ratio=0.00,
        filter_max_ratio=0.00,
        schemas="",
        output_fields="",
        output_pattern="%s_%s",
        output_format="table",
        format="%6.4f",
        remove_na=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    options.schemas = options.schemas.split(",")
    if not options.schemas:
        raise "please supply schemas."

    if options.output_fields:
        options.output_fields = options.output_fields.split(",")
    else:
        options.output_fields = ()

    fields, table = CSV.ReadTable(sys.stdin)

    map_fields2column = {}
    for x in fields:
        map_fields2column[x] = len(map_fields2column)

    if options.loglevel >= 1:
        options.stdlog.write("# read a %i x %i table.\n" %
                             (len(table), len(fields)))

    if options.filename_subset:
        subset, nerrors = IOTools.ReadList(open(options.filename_subset, "r"))
        subset = set(subset)

        table = filter(lambda x: x[0] in subset, table)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# subset of %i entries reduced table to a %i x %i table.\n" %
                (len(subset), len(table), len(fields)))

    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
    else:
        identifiers = []

    identifiers = set(identifiers)

    # extract rows with positive identifiers
    positive_rows = filter(lambda x: x[0] in identifiers, table)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# subset of %i identifiers gives %i positive entries.\n" %
            (len(identifiers), len(positive_rows)))

    if options.output_format == "table":
        options.stdout.write("id")
        for schema in options.schemas:
            if options.output_fields:
                for field in options.output_fields:
                    options.stdout.write("\t" + options.output_pattern %
                                         (schema, field))
            else:
                options.stdout.write("\t%s" % (schema))

        options.stdout.write("\n")
    else:
        options.stdout.write("schema\tvalue\n")

    if identifiers:
        for row in positive_rows:

            if options.output_format == "table":
                options.stdout.write(row[0])

            for schema in options.schemas:

                # set fields for extraction
                f_extract = map_fields2column[options.field_extract % schema]
                f_compare = map_fields2column[options.field_compare % schema]

                # get region for extraction
                if row[f_compare] != "na":
                    r = float(row[f_compare])
                    if options.filter_min_ratio or options.filter_max_ratio:
                        mi = r * options.filter_min_ratio
                        ma = r * options.filter_max_ratio
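                        # keep only rows whose compare value lies within
                        # [r * filter_min_ratio, r * filter_max_ratio].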
                        f = lambda x: x[f_compare] != "na" and float(
                            x[f_compare]
                        ) >= mi and float(x[f_compare]) <= ma and x[
                            0] not in identifiers and x[f_extract] != "na"
                    else:
                        f = lambda x: x[0] not in identifiers and x[f_extract
                                                                    ] != "na"
                    # extract values: filter by minimum and maximum range and remove
                    # positive identifiers.
                    v = float(row[f_extract])
                    values = map(lambda x: float(x[f_extract]),
                                 filter(f, table))

                    stats = Stats.DistributionalParameters(values)
                else:
                    v = None

                for field in options.output_fields:

                    if v is not None:
                        if field == "zscore":
                            f = options.format % stats.getZScore(v)
                        elif field == "diff":
                            f = options.format % (v - stats["mean"])
                        elif field == "reldiff":
                            f = options.format % (
                                (v - stats["mean"]) / stats["mean"])
                        elif field == "val":
                            f = options.format % v
                        else:
                            f = options.format % stats[field]
                    else:
                        f = "na"

                    if options.output_format == "table":
                        options.stdout.write("\t%s" % f)
                    elif options.output_format == "list":
                        options.stdout.write("%s\t%s\n" % (schema, f))
                    elif options.output_format == "values":
                        options.stdout.write(
                            "%s\t%s\t%5.2f\t%s\n" %
                            (row[0], schema, v, ",".join(
                                map(lambda x: options.format % x, values))))

            if options.output_format == "table":
                options.stdout.write("\n")

    else:

        extract_columns = []

        for schema in options.schemas:
            extract_columns.append(map_fields2column[options.field_extract %
                                                     schema])

        # simply dump a subset of values
        for row in table:

            skip = False

            if options.filter_min_ratio or options.filter_max_ratio:

                master = options.schemas[0]

                v = row[map_fields2column[options.field_compare % master]]

                if v == "na":
                    continue

                v = float(v)

                mi = v * options.filter_min_ratio
                ma = v * options.filter_max_ratio

                for schema in options.schemas[1:]:

                    r = row[map_fields2column[options.field_compare % schema]]

                    if r == "na":
                        if options.remove_na:
                            skip = True
                        continue

                    r = float(r)

                    if r < mi or r > ma:
                        skip = True
                        if options.loglevel >= 3:
                            if options.format == "table":
                                options.stdout.write("* ")
                                options.stdout.write("%s\t" % row[0])
                                options.stdout.write("\t".join(
                                    [row[y] for y in extract_columns]))
                                options.stdout.write("\n")
                        break

            if skip:
                continue

            if options.output_format == "table":
                options.stdout.write("%s\t" % row[0])
                options.stdout.write("\t".join(
                    [row[y] for y in extract_columns]))
                options.stdout.write("\n")

            elif options.output_format == "list":
                has_na = False
                for x in range(len(options.schemas)):
                    v = row[extract_columns[x]]
                    if v == "na":
                        has_na = True

                if has_na and options.remove_na:
                    continue

                for x in range(len(options.schemas)):
                    options.stdout.write(
                        "%s\t%s\n" %
                        (options.schemas[x], row[extract_columns[x]]))

    E.Stop()
Example #14
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: matrix2matrix.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "normalize-by-min-diagonal",
                          "normalize-by-column",
                          "log",
                          "ln",
                          "negzero2value",
                          "set-diagonal",
                          "subtract-matrix",
                          "mix-matrix",
                          "normalize-by-matrix",
                          "normalize-by-column-max",
                          "normalize-by-row-max",
                          "normalize-by-column-min",
                          "normalize-by-row-min",
                          "normalize-by-column-median",
                          "normalize-by-row-median",
                          "normalize-by-column-mean",
                          "normalize-by-row-mean",
                          "normalize-by-column-total",
                          "normalize-by-row-total",
                          "correspondence-analysis",
                          "normalize-by-value",
                          "add-value",
                          "sort-rows",
                          "sort-columns",
                          "transpose",
                          "upper-bound",
                          "lower-bound",
                          "subtract-first-col",
                          "multiply-by-value",
                          "divide-by-value",
                          "mask-rows",
                          "mask-columns",
                          "mask-rows-and-columns",
                          "symmetrize-mean",
                          "symmetrize-max",
                          "symmetrize-min",
                      ),
                      help="""method to use [default=%default]""")

    parser.add_option("-s",
                      "--scale",
                      dest="scale",
                      type="float",
                      help="factor to scale matrix by [default=%default].")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output number format [default=%default].")

    parser.add_option("--filename-rows",
                      dest="filename_rows",
                      type="string",
                      help="filename with rows to mask [default=%default].")

    parser.add_option("--filename-columns",
                      dest="filename_columns",
                      type="string",
                      help="filename with columns to mask [default=%default].")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="Parameters for various functions.")

    parser.add_option("-t",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers",
                      dest="headers",
                      action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-a",
                      "--value",
                      dest="value",
                      type="float",
                      help="value to use for various algorithms.")

    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix.""")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix.""")

    parser.add_option(
        "--missing",
        dest="missing",
        type="float",
        help=
        "value to use for missing values. If not set, missing values will cause the script to fail [default=%default]."
    )

    parser.set_defaults(
        methods=[],
        scale=1.0,
        headers=True,
        format="%6.4f",
        output_format="full",
        input_format="full",
        value=0.0,
        parameters="",
        write_separators=True,
        filename_rows=None,
        filename_columns=None,
        missing=None,
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    lines = filter(lambda x: x[0] != "#", sys.stdin.readlines())

    if len(lines) == 0:
        raise IOError("no input")

    chunks = filter(lambda x: lines[x][0] == ">", range(len(lines)))

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))
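    # chunks now holds the indices of the ">" separator lines plus a sentinel
    # at len(lines); block i spans lines[chunks[i] + 1:chunks[i + 1]].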

    if options.filename_rows:
        row_names, n = IOTools.ReadList(open(options.filename_rows, "r"))
    if options.filename_columns:
        column_names, n = IOTools.ReadList(open(options.filename_columns, "r"))

    for chunk in range(len(chunks) - 1):

        try:
            raw_matrix, row_headers, col_headers = MatlabTools.readMatrix(
                StringIO.StringIO("".join(lines[chunks[chunk] +
                                                1:chunks[chunk + 1]])),
                format=options.input_format,
                headers=options.headers,
                missing=options.missing)
        except ValueError, msg:
            E.warn("matrix could not be read: %s" % msg)
            continue

        nrows, ncols = raw_matrix.shape

        E.debug("read matrix: %i x %i, %i row titles, %i colum titles" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        parameter = 0

        for method in options.methods:

            matrix = numpy.reshape(numpy.array(raw_matrix), raw_matrix.shape)

            if method in ("normalize-by-matrix", "subtract-matrix",
                          "mix-matrix", "add-matrix"):

                other_matrix, other_row_headers, other_col_headers = MatlabTools.ReadMatrix(
                    open(options.parameters[parameter], "r"),
                    headers=options.headers)

                other_nrows, other_ncols = other_matrix.shape

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# read second matrix from %s: %i x %i, %i row titles, %i column titles.\n"
                        % (options.parameters[parameter], other_nrows,
                           other_ncols, len(other_row_headers),
                           len(other_col_headers)))

                parameter += 1

            elif method == "normalize-by-min-diagonal":
                for x in range(nrows):
                    for y in range(ncols):
                        m = min(raw_matrix[x, x], raw_matrix[y, y])
                        if m > 0:
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "normalize-by-column":
                if nrows != ncols:
                    raise ValueError("only supported for symmetric matrices.")

                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[y, y] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[y, y]

            elif method == "normalize-by-value":
                matrix = raw_matrix / float(options.parameters[parameter])
                parameter += 1

            elif method == "normalize-by-row":
                if nrows != ncols:
                    raise ValueError("only supported for symmetric matrices.")

                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[x, x] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[x, x]

            elif method == "subtract-first-col":
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] -= raw_matrix[x, 0]

            elif method.startswith("normalize-by-column"):
                if method.endswith("max"):
                    f = max
                elif method.endswith("min"):
                    f = min
                elif method.endswith("median"):
                    f = scipy.median
                elif method.endswith("mean"):
                    f = scipy.mean
                elif method.endswith("total"):
                    f = sum

                for y in range(ncols):
                    m = f(matrix[:, y])
                    if m != 0:
                        for x in range(nrows):
                            matrix[x, y] = matrix[x, y] / m

            elif method.startswith("normalize-by-row"):
                if method.endswith("max"):
                    f = max
                elif method.endswith("min"):
                    f = min
                elif method.endswith("median"):
                    f = scipy.median
                elif method.endswith("mean"):
                    f = scipy.mean
                elif method.endswith("total"):
                    f = sum

                for x in range(nrows):
                    m = f(matrix[x, :])
                    if m != 0:
                        for y in range(ncols):
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "negzero2value":
                # set zero/negative values to a value
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] <= 0:
                            matrix[x, y] = options.value

            elif method == "minmax":
                # replace each symmetric pair (matrix[x, y], matrix[y, x])
                # by the (min, max) of the two values
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y], matrix[y, x] = \
                            min(matrix[x, y], matrix[y, x]), \
                            max(matrix[x, y], matrix[y, x])

            elif method == "log":
                # apply log10 to all positive values.
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log10(matrix[x, y])

            elif method == "ln":
                for x in range(nrows):
                    for y in range(ncols):
                        if matrix[x, y] > 0:
                            matrix[x, y] = math.log(matrix[x, y])

            elif method == "transpose":
                matrix = numpy.transpose(matrix)
                row_headers, col_headers = col_headers, row_headers
                nrows, ncols = ncols, nrows

            elif method == "mul":
                matrix = numpy.dot(matrix, numpy.transpose(matrix))
                col_headers = row_headers

            elif method == "multiply-by-value":
                matrix *= options.value

            elif method == "divide-by-value":
                matrix /= options.value

            elif method == "add-value":
                matrix += options.value

            elif method == "angle":
                # write angles between col vectors
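                # the entries written are the cosines of those angles:
                # dot products normalised by the column vector norms.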
                v1 = numpy.sqrt(numpy.sum(numpy.power(matrix, 2), 0))
                matrix = numpy.dot(numpy.transpose(matrix), matrix)
                row_headers = col_headers
                nrows = ncols
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] /= v1[x] * v1[y]

            elif method == "euclid":
                # convert to euclidean distance matrix
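                # pairwise distances between column vectors; the result is a
                # symmetric ncols x ncols matrix.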
                matrix = numpy.zeros((ncols, ncols), numpy.float)
                for c1 in range(0, ncols - 1):
                    for c2 in range(c1 + 1, ncols):
                        for r in range(0, nrows):
                            d = raw_matrix[r][c1] - raw_matrix[r][c2]
                            matrix[c1, c2] += (d * d)
                        matrix[c2, c1] = matrix[c1, c2]
                matrix = numpy.sqrt(matrix)
                row_headers = col_headers
                nrows = ncols

            elif method.startswith("symmetrize"):
                f = method.split("-")[1]
                if f == "max":
                    f = max
                elif f == "min":
                    f = min
                elif f == "mean":
                    f = lambda x, y: float(x + y) / 2

                if nrows != ncols:
                    raise ValueError(
                        "symmetrize only available for symmetric matrices")
                if row_headers != col_headers:
                    raise ValueError(
                        "symmetrize not available for permuted matrices")
                for x in range(nrows):
                    for y in range(ncols):
                        matrix[x, y] = matrix[y, x] = f(matrix[x, y],
                                                        matrix[y, x])
            elif method == "sub":
                matrix = options.value - matrix

            elif method in ("lower-bound", "upper-bound"):

                boundary = float(options.parameters[parameter])
                new_value = float(options.parameters[parameter + 1])
                parameter += 2
                if method == "upper-bound":
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] > boundary:
                                matrix[x, y] = new_value
                else:
                    for x in range(nrows):
                        for y in range(ncols):
                            if matrix[x, y] < boundary:
                                matrix[x, y] = new_value

            elif method == "subtract-matrix":
                matrix = matrix - other_matrix

            elif method == "add-matrix":
                matrix = matrix + other_matrix

            elif method == "normalize-by-matrix":

                # set 0s to 1 in the other matrix
                for x in range(nrows):
                    for y in range(ncols):
                        if other_matrix[x, y] == 0:
                            other_matrix[x, y] = 1.0

                matrix = matrix / other_matrix

            elif method == "mix-matrix":
                for x in range(len(other_row_headers) - 1):
                    for y in range(x + 1, len(other_col_headers)):
                        matrix[x, y] = other_matrix[x, y]

            elif method == "set-diagonal":
                value = float(options.parameters[parameter])
                for x in range(min(nrows, ncols)):
                    matrix[x, x] = value
                parameter += 1

            elif method == "transpose":
                matrix = numpy.transpose(raw_matrix)
                row_headers, col_headers = col_headers, row_headers

            elif method == "correspondence-analysis":
                row_indices, col_indices = CorrespondenceAnalysis.GetIndices(
                    raw_matrix)
                map_row_new2old = numpy.argsort(row_indices)
                map_col_new2old = numpy.argsort(col_indices)

                matrix, row_headers, col_headers = CorrespondenceAnalysis.GetPermutatedMatrix(
                    raw_matrix,
                    map_row_new2old,
                    map_col_new2old,
                    row_headers=row_headers,
                    col_headers=col_headers)

            elif method == "mask-rows":
                r = set(row_names)
                for x in range(len(row_headers)):
                    if row_headers[x] in r:
                        matrix[x, :] = options.value

            elif method == "mask-columns":
                r = set(column_names)
                for x in range(len(col_headers)):
                    if col_headers[x] in r:
                        matrix[:, x] = options.value

            elif method == "mask-rows-and-columns":

                r = set(row_names)
                c = set(column_names)
                for x in range(len(row_headers)):
                    for y in range(len(col_headers)):
                        if row_headers[x] in r and col_headers[y] in c:
                            matrix[x, y] = options.value

            raw_matrix = numpy.reshape(numpy.array(matrix), matrix.shape)

        else:
            # for simple re-formatting jobs
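            # (for/else: this always runs because the loop has no break; it
            # ensures `matrix` is defined when no --method was given.)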
            matrix = raw_matrix

        if options.write_separators:
            options.stdout.write(lines[chunks[chunk]])

        MatlabTools.writeMatrix(sys.stdout,
                                matrix,
                                value_format=options.format,
                                format=options.output_format,
                                row_headers=row_headers,
                                col_headers=col_headers)
Example #15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: codonbias_acai2tsv.py 865 2007-01-15 13:44:43Z andreas $"
    )

    parser.add_option("-o",
                      "--input-file-trace",
                      dest="input_filename_trace",
                      type="string",
                      help="input filename for cai.",
                      metavar="FILE")

    parser.add_option("-e",
                      "--input-file-genes",
                      dest="input_filename_genes",
                      type="string",
                      help="input filename for genes information from cai.",
                      metavar="FILE")

    parser.add_option("-c",
                      "--input-file-codons",
                      dest="input_filename_codons",
                      type="string",
                      help="input filename for codon usage information.",
                      metavar="FILE")

    parser.add_option("--input-file-sequences",
                      dest="input_filename_sequences",
                      type="string",
                      help="input filename with sequences.",
                      metavar="FILE")

    parser.add_option("-t",
                      "--input-file-subset",
                      dest="input_filename_subset",
                      type="string",
                      help="input filename with subset.",
                      metavar="FILE")

    parser.add_option("--codon-table-format",
                      dest="codon_table_format",
                      type="choice",
                      choices=("list", "matrix"),
                      help="output options for output codon tables.")

    parser.add_option("--codon-table-type",
                      dest="codon_table_type",
                      type="choice",
                      choices=("counts", "frequencies", "weights",
                               "absolute-frequencies"),
                      help="type of codon table.")

    parser.add_option("-r",
                      "--reference",
                      dest="reference",
                      type="string",
                      help="dump CAI reference weights for species.")

    parser.add_option("-s",
                      "--select",
                      dest="select",
                      type="string",
                      help="fields to select from genes table.")

    parser.add_option("-m",
                      "--map",
                      dest="input_filename_map",
                      type="string",
                      help="filename with mapping information for gene names.",
                      metavar="FILE")

    parser.add_option("-i",
                      "--invert-map",
                      dest="invert_map",
                      action="store_true",
                      help="invert map.")

    parser.add_option(
        "-d",
        "--dominant-set",
        dest="dominant_set",
        type="float",
        help="only print out dominant set (# fraction of most biased genes).")

    parser.add_option(
        "--reverse-set",
        dest="reverse_set",
        action="store_true",
        help="print the reverse set, i.e., then non-dominant set.")

    parser.add_option(
        "-u",
        "--codon-usage",
        dest="codon_usage",
        type="string",
        help="print codon usage for the full/biased set of genes [full|biased]."
    )

    parser.add_option(
        "-w",
        "--weights",
        dest="weights",
        type="string",
        help=
        "print weights [final-list|final-matrix|random|compute|weights|frequencies|absolute-frequencies]."
    )

    parser.add_option("--weights-matrix2table",
                      dest="weights_matrix2table",
                      action="store_true",
                      help="convert a weights matrix to a weights table.")

    parser.add_option("--get-preferred-codons",
                      dest="get_preferred_codons",
                      type="string",
                      help="compute overview of preferred codons.")

    parser.set_defaults(input_filename="-",
                        input_filename_trace=None,
                        input_filename_genes=None,
                        input_filename_codons=None,
                        input_filename_map=None,
                        input_filename_subset=None,
                        input_filename_sequences=None,
                        invert_map=False,
                        select=None,
                        codon_usage=None,
                        weights=None,
                        reverse_set=False,
                        pseudocounts=1,
                        codon_table_format="list",
                        codon_table_type="weights",
                        weights_matrix2table=False,
                        random_size=1000,
                        get_preferred_codons=None,
                        dominant_set=0.0)

    (options, args) = E.Start(parser)
    if options.select:
        options.select = options.select.split(",")

    outfile = options.stdout

    ###################################################################
    # convert weights table to a codon table
    if options.weights_matrix2table:
        lines = options.stdin.readlines()
        data = []
        for line in lines:
            if line[0] == "#":
                continue
            data += list(map(float, line[:-1].split(",")))

        weights = {}
        x = 0
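        # walk the codon matrix in output order, pairing each codon with the
        # next value read from the comma-separated input.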
        for cc in OUTPUT_ORDER_CODON_MATRIX:
            for c in cc:
                weights[c] = data[x]
                x += 1

        outfile.write("CODON\tWEIGHT\n")
        codons = weights.keys()
        codons.sort()
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, weights[codon]))

        E.Stop()
        sys.exit(0)

    ###################################################################
    map_genes = {}

    if options.input_filename_map:
        data = map(
            lambda x: x[:-1].split("\t")[:2],
            filter(lambda x: x[0] != "#",
                   open(options.input_filename_map, "r").readlines()))

        for a, b in data:
            if options.invert_map:
                a, b = b, a
            map_genes[a] = b

    result = WrapperAdaptiveCAI.AdaptiveCAIResult()

    if options.input_filename_genes:
        gene_file = open(options.input_filename_genes, "r")
    else:
        gene_file = None

    if options.input_filename_codons:
        codon_file = open(options.input_filename_codons, "r")
    else:
        codon_file = None

    if options.input_filename_trace:
        trace_file = open(options.input_filename_trace, "r")
    else:
        trace_file = None

    if options.input_filename_subset:
        l, e = IOTools.ReadList(open(options.input_filename_subset, "r"))
        subset = set(l)
        if options.loglevel >= 1:
            options.stdlog.write("# read %i entries into subset from %s.\n" %
                                 (len(subset), options.input_filename_subset))
    else:
        subset = None

    result.Read(gene_file=gene_file,
                codon_file=codon_file,
                trace_file=trace_file)

    if gene_file:
        gene_file.close()
    if codon_file:
        codon_file.close()
    if trace_file:
        trace_file.close()

    if options.reference:
        if options.reference not in CODON_PREFERENCES:
            raise ValueError(
                "unknown species %s: possible species are: %s" %
                (options.reference, str(CODON_PREFERENCES.keys())))

        weights = Genomics.CalculateCAIWeightsFromCounts(
            CODON_PREFERENCES[options.reference], options.pseudocounts)

        for x in range(len(OUTPUT_ORDER_CODON_MATRIX)):
            outfile.write(",".join(
                map(lambda z: "%5.3f" % z, [
                    weights[codon.upper()]
                    for codon in OUTPUT_ORDER_CODON_MATRIX[x]
                ])))
            outfile.write("\n")

    if options.dominant_set and gene_file:
        cai_threshold = result.GetDominantThreshold(options.dominant_set)
    else:
        if options.reverse_set:
            cai_threshold = 1.0
        else:
            cai_threshold = 0.0

    if options.select:

        fields = []
        titles = []
        for x in options.select:
            f = re.match("(\S+) (AS|as) (\S+)", x)
            if f:
                fields.append(f.groups()[0].upper())
                titles.append(f.groups()[2])
            else:
                fields.append(x.upper())
                titles.append(x)

        outfile.write("GENENAME\t" + string.join(titles, "\t") + "\n")

        for genename, data in result.mGeneInfo.items():
            if genename in map_genes:
                genename = map_genes[genename]

            if options.reverse_set:
                if data["CAICLASS"] >= cai_threshold:
                    continue
            else:
                if data["CAICLASS"] < cai_threshold:
                    continue

            outfile.write(genename)
            for c in fields:
                outfile.write("\t%s" % str(data[c]))
            outfile.write("\n")

    if options.weights:

        format = options.codon_table_format

        if options.weights in ("compute-counts", "compute-weights",
                               "compute-frequencies"):
            # compute codon usage weights from a set of sequences
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for x in codons:
                counts[x] = 0

            if options.input_filename_sequences:
                sequences = Genomics.ReadPeptideSequences(open(
                    options.input_filename_sequences, "r"),
                                                          filter=subset)
                for key, sequence in sequences.items():
                    sequence = re.sub(" ", "", sequence)
                    if len(sequence) % 3 != 0:
                        raise ValueError(
                            "sequence %s is not a multiple of 3" % key)
                    for codon in [
                            sequence[x:x + 3]
                            for x in range(0, len(sequence), 3)
                    ]:
                        counts[codon.upper()] += 1

            if options.weights == "compute-frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "compute-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            else:
                weights = counts

        elif options.weights in ("final-list", "final-matrix"):

            weights = result.mFinalWeights
            if options.weights == "final-list":
                format = "list"
            else:
                format = "matrix"

        elif options.weights == "random":
            # get random weights
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for x in codons:
                counts[x] = random.randint(1, options.random_size)

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            format = "matrix"

        elif options.weights == "biased":
            # get biased weights
            codons = Genomics.GetUniformCodonUsage()

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            format = "matrix"

        elif options.weights in ("uniform-weights", "uniform-frequencies"):
            # get uniform weights
            codons = Genomics.GetUniformCodonUsage()

            if options.weights == ("uniform-weights"):
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
                format = "matrix"
            else:
                weights = codons
                format = "list"

        elif options.weights in ("counts", "frequencies",
                                 "absolute-frequencies"):
            # get weights as frequencies
            # compute from scratch. The caijava file stores the absolute
            # frequency f = count / gene_length, so the count for each
            # codon is recovered as f * gene_length.
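            # e.g. an absolute frequency of 0.02 in a gene of length
            # 3000 recovers int(0.02 * 3000) = 60 codon counts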
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():

                if options.reverse_set:
                    if data["CAICLASS"] >= cai_threshold:
                        continue
                else:
                    if data["CAICLASS"] < cai_threshold:
                        continue

                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.weights == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "counts":
                weights = counts
            elif options.weights == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do not
                # normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m

            format = "list"

        elif options.weights == "subset":

            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():

                found = genename in subset
                if (not found and not options.reverse_set) or (
                        found and options.reverse_set):
                    continue

                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.codon_table_type == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "counts":
                weights = counts
            if options.codon_table_type == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do not
                # normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m

        else:
            raise ValueError("unknown weights %s" % options.weights)

        if format == "list":
            outfile.write("CODON\tWEIGHT\n")
            codons = sorted(weights.keys())
            for codon in codons:
                outfile.write("%s\t%f\n" % (codon, weights[codon]))

        elif format == "matrix":

            for row in OUTPUT_ORDER_CODON_MATRIX:
                outfile.write(",".join(
                    ["%5.3f" % weights[codon.upper()] for codon in row]))
                outfile.write("\n")

    if options.codon_usage:
        outfile.write("CODON\tFREQUENCY\n")

        if options.codon_usage == "biased":
            usages = result.mCodonUsages[-1]
        elif options.codon_usage == "full":
            usages = result.mCodonUsages[0]
        elif options.codon_usage == "weights":
            usages = WrapperAdaptiveCAI.CalculateWeightsFromUsage(
                result.mCodonUsages[0])
        else:
            raise ValueError(
                "unknown option '%s' for codon-usage." % options.codon_usage)

        codons = sorted(usages.keys())
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, usages[codon]))

    E.Stop()
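
A minimal, self-contained sketch of the computation this script delegates
to Genomics.CalculateCAIWeightsFromCounts (an assumption about that
function's behaviour, not its actual source): within each family of
synonymous codons, a codon's relative adaptiveness is its
pseudocount-adjusted count divided by the count of the most frequent
codon for that amino acid. The two-family genetic code below is
illustrative only.

GENETIC_CODE = {
    "TTT": "F", "TTC": "F",                          # phenylalanine
    "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",  # alanine
}

def cai_weights_from_counts(counts, pseudocounts=0):
    """return a dict codon -> relative adaptiveness in [0, 1]."""
    by_aa = {}
    for codon, aa in GENETIC_CODE.items():
        by_aa.setdefault(aa, []).append(codon)
    weights = {}
    for aa, codons in by_aa.items():
        family = [counts.get(c, 0) + pseudocounts for c in codons]
        best = float(max(family))
        for codon, n in zip(codons, family):
            weights[codon] = n / best if best > 0 else 0.0
    return weights

print(cai_weights_from_counts(
    {"TTT": 30, "TTC": 90, "GCT": 10, "GCC": 40, "GCA": 25, "GCG": 25}))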
Example #16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: diff_transcript_sets.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percent columns")
    parser.add_option("-d",
                      "--dump-sets",
                      dest="dump_sets",
                      action="append",
                      type="choice",
                      choices=("rest_genes1", "rest_genes2", "intersection",
                               "union"),
                      help="dump sets of transcripts/genes")
    parser.add_option(
        "-o",
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern to use for dumped sets. Should contain one %s.")

    parser.set_defaults(
        separator="|",
        add_percent="False",
        dump_sets=[],
        output_pattern="%s",
    )

    (options, args) = E.Start(parser)

    options.filename1, options.filename2 = args

    ids1, nerrors1 = IOTools.ReadList(open(options.filename1, "r"))
    ids2, nerrors2 = IOTools.ReadList(open(options.filename2, "r"))

    genes1, transcripts1 = countGenesTranscripts(ids1, options)
    genes2, transcripts2 = countGenesTranscripts(ids2, options)

    options.stdout.write(
        "species\tntranscripts1\tngenes1\tntranscripts2\tngenes2"
        "\ttr_inter\ttr_union\ttr_rest1\ttr_rest2"
        "\tg_inter\tg_union\tg_rest1\tg_rest2"
    )
    options.stdout.write("\ttr_rest1\ttr_rest2\tg_rest1\tg_rest2")

    options.stdout.write("\n")

    for species in set(genes1.keys()).union(set(genes2.keys())):
        nt1, nt2, ng1, ng2 = "na", "na", "na", "na"

        if species in genes1:
            g1 = genes1[species]
            t1 = transcripts1[species]
            nt1 = "%i" % len(transcripts1[species])
            ng1 = "%i" % len(genes1[species])
        else:
            t1, g1 = None, None

        if species in genes2:
            g2 = genes2[species]
            t2 = transcripts2[species]
            nt2 = "%i" % len(transcripts2[species])
            ng2 = "%i" % len(genes2[species])
        else:
            t2, g2 = None, None

        if species in transcripts1 and species in transcripts2:
            ct = "%i" % len(t1.intersection(t2))
            ut = "%i" % len(t2.union(t1))
            rt1 = "%i" % len(t1.difference(t2))
            rt2 = "%i" % len(t2.difference(t1))
        else:
            ct, ut, rt1, rt2 = ["na"] * 4

        if species in genes1 and species in genes2:
            cg = "%i" % len(g1.intersection(g2))
            ug = "%i" % len(g2.union(g1))
            rg1 = "%i" % len(g1.difference(g2))
            rg2 = "%i" % len(g2.difference(g1))
        else:
            cg, ug, rg1, rg2 = ["na"] * 4

        options.stdout.write("\t".join((species, nt1, ng1, nt2, ng2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((ct, ut, rt1, rt2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((cg, ug, rg1, rg2)))

        if options.add_percent:
            if species in genes1 and species in genes2:
                rg1 = "%5.2f" % (100.0 * len(g1.difference(g2)) / len(g1))
                rg2 = "%5.2f" % (100.0 * len(g2.difference(g1)) / len(g2))
            if species in transcripts1 and species in transcripts2:
                rt1 = "%5.2f" % (100.0 * len(t1.difference(t2)) / len(t1))
                rt2 = "%5.2f" % (100.0 * len(t2.difference(t1)) / len(t2))
            options.stdout.write("\t")
            options.stdout.write("\t".join((rt1, rt2, rg1, rg2)))

        options.stdout.write("\n")

        for choice in options.dump_sets:

            output_set = None

            if choice == "rest_genes1" and g1 and g2:
                output_set = getTranscriptsForGenes(g1.difference(g2), ids1,
                                                    options)

            elif choice == "rest_genes2" and g1 and g2:
                output_set = getTranscriptsForGenes(g2.difference(g1), ids2,
                                                    options)

            if output_set:
                outfile = IOTools.openFile(options.output_pattern % (choice),
                                           "w")
                for x in output_set:
                    outfile.write("%s\n" % (x, ))
                outfile.close()

    E.Stop()
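
The per-species counts in the table above come down to plain set
arithmetic on transcript (or gene) identifier sets. A toy sketch, with
made-up identifiers, of the intersection/union/rest counts and the
percentages added by --add-percent:

t1 = {"tr1", "tr2", "tr3", "tr4"}
t2 = {"tr3", "tr4", "tr5"}

inter, union = t1 & t2, t1 | t2
rest1, rest2 = t1 - t2, t2 - t1

print("inter=%i union=%i rest1=%i rest2=%i" %
      (len(inter), len(union), len(rest1), len(rest2)))
print("rest1=%5.2f%% rest2=%5.2f%%" %
      (100.0 * len(rest1) / len(t1),
       100.0 * len(rest2) / len(t2)))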
Example #17
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t",
                      "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes",
                      dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j",
                      "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes",
                      dest="unset_genes",
                      type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u",
                      "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i",
                      "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g",
                      "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G",
                      "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r",
                      "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes",
                      dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a",
                      "--apply",
                      dest="filename_filter",
                      type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample_size",
                      type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length",
                      type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end "
        "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping",
                      dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates",
                      dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates",
                      dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates",
                      dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                           strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin),
                                         strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                               strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start = min([x.start for x in exons])
            all_end = max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(GTF.flat_gene_iterator(
            GTF.iterator(options.stdin)),
                                          sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):
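            # group consecutive gene chunks that overlap on the same
            # contig and strand; a new contig, a new strand or a
            # positive gap closes the current group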

            last = next(gff_chunks)
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(
                options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]),
                               max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

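            # sweep over genes sorted by contig and start; when a new
            # non-overlapping block starts, keep the longest gene of
            # the previous block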
            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript",
                                "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript is the transcript that
                shares the largest number of exons with the other
                transcripts in the gene.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript
                                      if x.feature == "exon"])
                # itertools.groupby only groups consecutive items, so
                # count exon occurrences with a Counter instead
                exon_counts = collections.Counter(all_exons)
                transcript_counts = []
                for transcript in gene:
                    count = sum([
                        exon_counts[(x.start, x.end)] for x in transcript
                        if x.feature == "exon"
                    ])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--apply) or a sample-size."

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges if x[1] - x[0] > l
                    ]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start = min([x.start for x in gffs])
            all_end = max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        gene_counts = collections.Counter(gene_ids)
        transcript_counts = collections.Counter(transcript_ids)
        dup_gene = set(k for k, v in gene_counts.items() if v > 1)
        dup_transcript = set(k for k, v in transcript_counts.items()
                             if v > 1)

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict.fromkeys(dup_gene, 0)
        transcript_dict = dict.fromkeys(dup_transcript, 0)

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute(
                        'gene_id',
                        gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute(
                        'transcript_id', gtf.transcript_id + "." +
                        str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin),
                                           strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) // 2 + cds_start

                utr_ranges = []
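                # classify exonic sequence outside the CDS as 5' or 3'
                # UTR, depending on the strand and on which side of the
                # CDS midpoint the fragment falls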
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
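
Several of the branches above (--merge-exons, --exons2introns, the
default merge) rely on combining sorted interval lists. A minimal
sketch, assuming Intervals.combineAtDistance behaves like the classic
interval-merge algorithm with a gap tolerance (the function name comes
from the script; the body here is a guess at its semantics):

def combine_at_distance(intervals, distance=0):
    """merge (start, end) intervals whose gap is <= distance."""
    if not intervals:
        return []
    intervals = sorted(intervals)
    merged = [list(intervals[0])]
    for start, end in intervals[1:]:
        if start - merged[-1][1] <= distance:
            # close enough: extend the current interval
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return [tuple(x) for x in merged]

print(combine_at_distance([(10, 20), (22, 30), (50, 60)], distance=5))
# expected: [(10, 30), (50, 60)]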
Example #18
0
def selectComponents(component_ids, map_component2seq_id,
                     map_component2input_id, id_filter, options):
    """select a set of components from component_ids.
    """

    map_sample2reference = {}

    if options.sample:

        if options.sample_method == "simple-without-replacement":
            random.shuffle(component_ids)

        elif options.sample_method == "length-without-replacement":

            map_component_id2length = {}

            for component_id in component_ids:

                mali = getMali(component_id, map_component2seq_id,
                               map_component2input_id, id_filter, options)

                if not mali:
                    continue

                map_component_id2length[component_id] = mali.getWidth()

            reference_ids, nerrors = IOTools.ReadList(
                open(options.filename_sample_reference, "r"))

            # do not sample from the reference set
            sampled_components = set(reference_ids)

            new_component_ids = []

            ninput, noutput, nskipped = 0, 0, 0
            # now go through reference set
            for ref_id in reference_ids:

                ninput += 1
                if ref_id not in map_component_id2length:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# reference component %s not found.\n" %
                            (str(ref_id)))
                    nskipped += 1
                    continue

                ref_length = map_component_id2length[ref_id]
                ref_length_min = ref_length - 50
                ref_length_max = ref_length + 50

                # find all components of a length similar to ref_length,
                # excluding previously sampled ones
                test_components = [
                    x for x in component_ids
                    if ref_length_min < map_component_id2length[x] <
                    ref_length_max]
                test_components = list(
                    set(test_components).difference(sampled_components))

                if len(test_components) == 0:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# reference component %s: skipped - no others "
                            "with equivalent length around %i found.\n" %
                            (ref_id, ref_length))
                    nskipped += 1
                    continue

                random.shuffle(test_components)

                component_id = test_components[0]
                sampled_components.add(component_id)

                map_sample2reference[component_id] = ref_id

                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# reference component mapping: %s\t%s\t%i\t%i\t%i\n" %
                        (ref_id, component_id, ref_length,
                         map_component_id2length[component_id],
                         len(test_components)))

                new_component_ids.append(component_id)
                noutput += 1

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sampling results: ninput=%i, noutput=%i, nskipped=%i\n"
                    % (ninput, noutput, nskipped))

            component_ids = new_component_ids
            options.sample = len(new_component_ids)

    return component_ids, map_sample2reference
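
A toy, self-contained version of the length-matched sampling idea in
selectComponents (the +/- 50 tolerance mirrors the code above; the
function name, data and everything else are illustrative):

import random

def sample_length_matched(lengths, reference_ids, tolerance=50):
    """lengths: dict id -> alignment width; returns sample_id -> ref_id."""
    sampled = set(reference_ids)   # never sample from the reference set
    mapping = {}
    for ref_id in reference_ids:
        ref_length = lengths[ref_id]
        candidates = [x for x in lengths
                      if x not in sampled
                      and abs(lengths[x] - ref_length) < tolerance]
        if not candidates:
            continue               # no length-equivalent component left
        choice = random.choice(candidates)
        sampled.add(choice)
        mapping[choice] = ref_id
    return mapping

lengths = {"ref1": 100, "a": 120, "b": 300, "c": 90}
print(sample_length_matched(lengths, ["ref1"]))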
Example #19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/orthologs2list.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-s",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")

    parser.add_option(
        "-g",
        "--gene-regex",
        dest="gene_regex",
        type="string",
        help="regular expression to extract gene from identifier.")

    parser.add_option("-b",
                      "--only-best",
                      dest="only_best",
                      action="store_true",
                      help="write only the best pair for a pairing.")

    parser.add_option("-w",
                      "--no-within",
                      dest="within",
                      action="store_false",
                      help="do not write within species pairs.")

    parser.add_option("-d",
                      "--distances",
                      dest="filename_distances",
                      type="string",
                      help="filename with distances between transcripts.")

    parser.add_option(
        "-c",
        "--no-combine-genes",
        dest="combine_genes",
        action="store_false",
        help="do not combine orthologous clusters which contain the same gene."
    )

    parser.add_option("--filename-restrict-filter1",
                      dest="filename_restrict_filter1",
                      type="string",
                      help="filename with ids to filter out.")
    parser.add_option("--filename-restrict-filter2",
                      dest="filename_restrict_filter2",
                      type="string",
                      help="filename with ids to filter out.")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="choice",
                      choices=("graph", "components"),
                      help="output format.")

    parser.add_option("-m",
                      "--mode",
                      dest="mode",
                      type="choice",
                      choices=("orthologs", "orphans"),
                      help="analyze either 'orthologs' or 'orphans'.")

    parser.add_option("--genome1",
                      dest="genome1",
                      type="string",
                      help="first genome.")
    parser.add_option("--genome2",
                      dest="genome2",
                      type="string",
                      help="second genome.")

    parser.set_defaults(
        species_regex="^([^|]+)\|",
        gene_regex="^[^|]+\|[^|]+\|([^|]+)\|",
        only_best=None,
        filename_distances=None,
        within=True,
        combine_genes=True,
        report_step=100000,
        use_networkx=False,
        separator="|",
        genome1=None,
        genome2=None,
        mode="orthologs",
        filename_restrict_filter1=None,
        filename_restrict_filter2=None,
        format="graph",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    rs = re.compile(options.species_regex)
    rg = re.compile(options.gene_regex)

    t0 = time.time()
    # retrieve matches between pairs:
    pairs = {}
    max_dist = 0

    if options.filename_distances and options.only_best:
        infile = open(options.filename_distances, "r")
        for line in infile:
            if line[0] == "#":
                continue
            a, b, d = line[:-1].split("\t")[:3]
            d = float(d)
            if a < b:
                key = "%s-%s" % (a, b)
            else:
                key = "%s-%s" % (b, a)

            max_dist = max(d, max_dist)
            pairs[key] = d

        infile.close()

    cluster_id = 0
    ninput, noutput, nmissed, nskipped, nsingletons = 0, 0, 0, 0, 0

    # Read positive filter information:
    filter_restrict1 = {}
    if options.filename_restrict_filter1:
        xx, e = IOTools.ReadList(open(options.filename_restrict_filter1, "r"))
        for x in xx:
            filter_restrict1[Orthologs.Transcript(x).mTranscript] = True

    filter_restrict2 = {}
    if options.filename_restrict_filter2:
        xx, e = IOTools.ReadList(open(options.filename_restrict_filter2, "r"))
        for x in xx:
            filter_restrict2[Orthologs.Transcript(x).mTranscript] = True

    if options.loglevel >= 1:
        options.stdlog.write("# read filtering information: %i/%i\n" %
                             (len(filter_restrict1), len(filter_restrict2)))

    t1 = time.time()

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" % (t1 - t0))

    orthologs = []

    if options.mode == "orthologs":
        orthologs = Orthologs.ReadInterpretation(
            sys.stdin,
            options.separator,
            genome1=options.genome1,
            genome2=options.genome2,
            filter_restrict_transcripts1=filter_restrict1,
            filter_restrict_transcripts2=filter_restrict2)
    else:
        orthologs = Orthologs.ReadOrphans(
            sys.stdin,
            options.separator,
            genome1=options.genome1,
            genome2=options.genome2,
            filter_restrict_transcripts1=filter_restrict1,
            filter_restrict_transcripts2=filter_restrict2)

    ninput = len(orthologs)

    if orthologs:
        # maximum weight across all groups; a bare map() here would leave
        # a list in max_dist and break the "best" comparisons below
        max_dist = max(map(lambda x: x[4], orthologs))

    t2 = time.time()

    if options.loglevel >= 1:
        options.stdlog.write("# reading %i groups in %i seconds.\n" %
                             (ninput, t2 - t1))

    if options.combine_genes:

        if options.use_networkx:

            nclusters = len(orthologs)
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# before combining genes: %i clusters\n" % len(orthologs))
                options.stdlog.flush()

            # build links between all genes
            # ignore warnings from networkx/matplotlib that a display
            # can not be found
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                import networkx

            graph = networkx.Graph()

            # This procedure skips genes with "0". This is a patch, because
            # these genes should not be there in the first place.
            iteration = 0
            for transcripts1, transcripts2, genes1, genes2, weight in orthologs:
                iteration += 1

                if options.loglevel >= 1:
                    if (iteration % options.report_step == 0):
                        options.stdlog.write(
                            "# iteration: %i/%i (%i%%) in %i seconds.\n" %
                            (iteration, nclusters, 100 * iteration / nclusters,
                             time.time() - t2))
                        options.stdlog.flush()

                for g in genes1.keys():
                    graph.add_node((1, g))
                for g in genes2.keys():
                    graph.add_node((2, g))
                for g1 in genes1.keys():
                    if g1 == "0":
                        continue
                    # within-species links for the first species
                    for g2 in genes1.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((1, g1), (1, g2))
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((1, g1), (2, g2))
                for g1 in genes2.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((2, g1), (2, g2))

            if options.loglevel >= 1:
                options.stdlog.write("# created graph in %i seconds.\n" %
                                     (time.time() - t2))
                options.stdlog.flush()

            tt2 = time.time()

            components = networkx.connected_components(graph)

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# calculated connected components in %i seconds\n" %
                    (time.time() - tt2))
                options.stdlog.flush()

        else:

            graph = GraphTools.ExternalGraph()

            iteration = 0
            nclusters = len(orthologs)

            for transcripts1, transcripts2, genes1, genes2, weight in orthologs:

                iteration += 1

                if options.loglevel >= 1:
                    if (iteration % options.report_step == 0):
                        options.stdlog.write(
                            "# iteration: %i/%i (%i%%) in %i seconds.\n" %
                            (iteration, nclusters, 100 * iteration / nclusters,
                             time.time() - t1))
                        options.stdlog.flush()

                f = "%s;%s"

                for g1 in genes1.keys():
                    if g1 == "0":
                        continue
                    # within-species links for the first species
                    for g2 in genes1.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (1, g1), f % (1, g2))
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (1, g1), f % (2, g2))
                for g1 in genes2.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (2, g1), f % (2, g2))

            if options.loglevel >= 1:
                options.stdlog.write("# created graph in %i seconds\n" %
                                     (time.time() - t2))
                options.stdlog.flush()

            tt2 = time.time()

            graph.finalize()
            components = graph.connected_components()

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# retrieved %i connected components in %i seconds\n" %
                    (len(components), time.time() - tt2))
                options.stdlog.flush()

            for x in range(len(components)):
                components[x] = map(lambda y: y.split(";"), components[x])

        tt2 = time.time()

        map_gene2cluster = {}
        for x in range(len(components)):
            for a, b in components[x]:
                map_gene2cluster[b] = x

        new_orthologs = [[[], [], 0] for x in range(len(components))]

        singletons = []

        for transcripts1, transcripts2, genes1, genes2, weight in orthologs:
            if genes1:
                try:
                    cluster_id = map_gene2cluster[genes1.keys()[0]]
                except KeyError:
                    singletons.append(genes1)
                    continue
            elif genes2:
                try:
                    cluster_id = map_gene2cluster[genes2.keys()[0]]
                except KeyError:
                    singletons.append(genes2)
                    continue
            else:
                raise ValueError("both genes1 and genes2 are empty.")

            new_orthologs[cluster_id][0] += transcripts1
            new_orthologs[cluster_id][1] += transcripts2
            new_orthologs[cluster_id][2] = weight

        nsingletons = len(singletons)

        orthologs = map(
            lambda x: (x[0], x[1], Orthologs.GetGenes(x[0]),
                       Orthologs.GetGenes(x[1]), x[2]), new_orthologs)

        if options.loglevel >= 1:
            options.stdlog.write("# combining genes in %i seconds\n" %
                                 (time.time() - tt2))
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# after combining genes: %i clusters, %i singletons\n" %
                (len(orthologs), nsingletons))

    t3 = time.time()

    if options.loglevel >= 1:
        options.stdlog.write("# gene clustering in %i seconds.\n" % (t3 - t2))

    cluster_id = 0

    def getCode(s):
        if len(s) == 1:
            return "1"
        elif len(s) == 0:
            return "0"
        else:
            return "m"

    for transcripts1, transcripts2, genes1, genes2, weight in orthologs:

        cluster_id += 1

        g1 = getCode(genes1)
        g2 = getCode(genes2)
        t1 = getCode(transcripts1)
        t2 = getCode(transcripts2)

        if options.format == "graph":

            # find best transcripts
            best_transcripts = {}
            if options.only_best:
                # print only best match between each possible set of genes in
                # ortholog pair
                for gg1, tt1 in genes1.items():
                    for gg2, tt2 in genes2.items():
                        best = max_dist
                        best_pair = None
                        for x in tt1:
                            for y in tt2:
                                if x < y:
                                    key = "%s-%s" % (x, y)
                                else:
                                    key = "%s-%s" % (y, x)

                                # test every candidate pair, not only the
                                # last one left by the inner loop
                                if key in pairs and best > pairs[key]:
                                    best = pairs[key]
                                    best_pair = (x, y)
                        if best_pair:
                            best_transcripts[best_pair[0]] = 1
                            best_transcripts[best_pair[1]] = 1
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (best_pair[0], best_pair[1], weight, g1, g2,
                                 str(t1), str(t2), cluster_id))
                            noutput += 1
                        else:
                            options.stdlog.write(
                                "# missed link between: %s %s\n" %
                                (str(genes1), str(genes2)))
                            nmissed += 1
            else:
                for x in transcripts1:
                    for y in transcripts2:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (x, y, weight, g1, g2, str(t1), str(t2),
                             cluster_id))
                        noutput += 1

            if options.within:

                # add self links for first species.
                for x in range(len(transcripts1) - 1):
                    for y in range(x + 1, len(transcripts1)):
                        if not best_transcripts or \
                           (transcripts1[x] in best_transcripts and
                                transcripts1[y] in best_transcripts):
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (str(transcripts1[x]), str(transcripts1[y]),
                                 weight, g1, g2, str(t1), str(t2), cluster_id))
                            noutput += 1

                # add self links for second species
                for x in range(len(transcripts2) - 1):
                    for y in range(x + 1, len(transcripts2)):
                        if not best_transcripts or \
                           (transcripts2[x] in best_transcripts and
                                transcripts2[y] in best_transcripts):
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (str(transcripts2[x]), str(transcripts2[y]),
                                 weight, g1, g2, str(t1), str(t2), cluster_id))
                            noutput += 1

                # If orphans, also add self links for genes with a single
                # transcript.
                if options.mode == "orphans":
                    if len(transcripts1) == 1:
                        x, y = 0, 0
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (str(transcripts1[x]), str(transcripts1[y]),
                             weight, g1, g2, str(t1), str(t2), cluster_id))
                        noutput += 1
                    elif len(transcripts2) == 1:
                        x, y = 0, 0
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (str(transcripts2[x]), str(transcripts2[y]),
                             weight, g1, g2, str(t1), str(t2), cluster_id))
                        noutput += 1

        elif options.format == "components":

            for gg1, tt1 in genes1.items():
                for t in tt1:
                    options.stdout.write("%s\t%i\n" % (str(t), cluster_id))

            for gg2, tt2 in genes2.items():
                for t in tt2:
                    options.stdout.write("%s\t%i" % (str(t), cluster_id))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nmissed=%i, nskipped=%i, nsingletons=%i\n"
            % (ninput, noutput, nmissed, nskipped, nsingletons))

    E.Stop()
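The gene-combination step above is the densest part of this script; the following self-contained sketch (gene names invented) isolates the idea: genes become graph nodes tagged by species, co-membership in an ortholog group becomes an edge, and connected components yield the merged clusters.

import networkx

# hedged sketch with invented gene ids; nodes are (species, gene) tuples
clusters = [({"geneA1": None}, {"geneB1": None}),
            ({"geneA1": None}, {"geneB2": None}),
            ({"geneA2": None}, {"geneB3": None})]

graph = networkx.Graph()
for genes1, genes2 in clusters:
    for g1 in genes1:
        for g2 in genes2:
            graph.add_edge((1, g1), (2, g2))

# geneA1 links the first two clusters, so they merge into one component
for component in networkx.connected_components(graph):
    print(sorted(component))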
Example #20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: graph_check.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("--filename-missing", dest="filename_missing", type="string",
                      help="missing entries.")
    parser.add_option("--filename-found", dest="filename_found", type="string",
                      help="found entries.")
    parser.add_option("--report-step1", dest="report_step1", type="int",
                      help="report interval for input.")
    parser.add_option("--report-step2", dest="report_step2", type="int",
                      help="report interval for processing.")
    parser.add_option("-n", "--filename-vertices", dest="filename_vertices", type="string",
                      help="filename with vertices.")
    parser.add_option("-u", "--num-fields", dest="num_fields", type="int",
                      help="number of fields to expect.")
    parser.add_option("-o", "--filename-output-pattern", dest="filename_output_pattern", type="string",
                      help="filenames for output (should contain one %s for one section).")
    parser.add_option("-s", "--sort-order", dest="sort_order", type="choice",
                      choices=("numeric", "alphanumeric"),
                      help="sort order - if numeric, vertices are cast to int.")

    parser.set_defaults(
        filename_vertices=None,
        report_step1=100000,
        report_step2=10000,
        filename_output_pattern="%s",
        subsets=False,
        num_fields=11,
        sort_order="alphanumeric",
    )

    (options, args) = E.Start(parser)

    if options.loglevel >= 1:
        options.stdlog.write("# output goes to:\n")
        options.stdlog.write("# errors: %s\n" %
                             options.filename_output_pattern % "errors")
        options.stdlog.write("# missed query: %s\n" %
                             options.filename_output_pattern % "missed_queries")
        options.stdlog.write("# missed sbjct: %s\n" %
                             options.filename_output_pattern % "missed_sbjcts")
        options.stdlog.write("# missed self: %s\n" %
                             options.filename_output_pattern % "missed_self")

    outfile_errors = open(options.filename_output_pattern % "errors", "w")

    if options.sort_order == "numeric":
        f = int
    else:
        f = str

    if options.filename_vertices:
        vv, errors = IOTools.ReadList(
            open(options.filename_vertices, "r"), map_function=f)
        vertices = {}
        # use flags for vertices
        # 1st bit: is query: 1
        # 2nd bit: is sbjct: 2
        # 3rd bit: has self: 4
        for v in vv:
            vertices[v] = 0
    else:
        raise "for the time being, specify a vertex file."

    options.stdout.write(
        "nqueries\tnsbjcts\tnvertices\tnlinks\tnlines\tnerrors\tncomments\tis_sorted\tnexpected\tnmissed_queries\tnmissed_sbjcts\tnmissed_self\n")

    ncomments, nlinks, nerrors, nlines = 0, 0, 0, 0

    is_sorted = True

    last = None

    for line in sys.stdin:

        nlines += 1
        if line[0] == "#":
            ncomments += 1
            continue

        nlinks += 1

        data = line[:-1].split("\t")

        if len(data) != options.num_fields:
            nerrors += 1
            outfile_errors.write(line)
            outfile_errors.flush()
            continue

        q, s = f(data[0]), f(data[1])

        if q == s:
            vertices[q] |= 4
        vertices[q] |= 1
        vertices[s] |= 2

        if last and last > q:
            is_sorted = False
            outfile_errors.write(
                "# sort inconsistency between %s and %s at line %i\n" % (last, q, nlines))
            outfile_errors.flush()
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sort inconsistency between %s and %s at line %i\n" % (last, q, nlines))
                options.stdlog.flush()

        if options.report_step1 and nlines % options.report_step1 == 0:
            writeInfo(options.stdlog, vertices, nlinks,
                      nlines, nerrors, ncomments, is_sorted)

        last = q

    missed_queries, missed_sbjcts, missed_self = writeInfo(
        options.stdout, vertices, nlinks, nlines, nerrors, ncomments, is_sorted)

    outfile_errors.close()
    if nerrors == 0:
        os.remove(options.filename_output_pattern % "errors")

    if missed_queries:
        writeSet(open(options.filename_output_pattern %
                      "missed_queries", "w"), missed_queries)

    if missed_sbjcts:
        writeSet(open(options.filename_output_pattern %
                      "missed_sbjcts", "w"), missed_sbjcts)

    if missed_self:
        writeSet(open(options.filename_output_pattern %
                      "missed_self", "w"), missed_self)

    E.Stop()
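writeInfo and writeSet are not shown in this example, so the following is an assumed reading of the bit flags set above: bit 1 marks a vertex seen as query, bit 2 as sbjct, bit 4 as having a self link, and each "missed" set collects the vertices where a bit stayed unset.

# hedged sketch: the flag values below are invented test data
vertices = {"a": 1 | 2 | 4, "b": 1, "c": 2, "d": 0}

missed_queries = [v for v, flag in vertices.items() if not flag & 1]
missed_sbjcts = [v for v, flag in vertices.items() if not flag & 2]
missed_self = [v for v, flag in vertices.items() if not flag & 4]

# "b" never appeared as sbjct, "c" never as query, only "a" has a self link
print(sorted(missed_queries), sorted(missed_sbjcts), sorted(missed_self))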
Example #21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/annotate_clusters.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--filename-map",
        dest="filename_map_id2cluster",
        type="string",
        help="filename with mapping information from id to cluster.")

    parser.add_option("--filename-interpro",
                      dest="filename_interpro",
                      type="string",
                      help="filename with interpro domain information.")

    parser.add_option("--filename-pfam",
                      dest="filename_pfam",
                      type="string",
                      help="filename with pfam domain information.")

    parser.set_defaults(
        master_species="dmel_vs_dmel4",
        separator="|",
        filename_map_id2cluster="input.map",
        filename_interpro="/home/andreas/projects/flies/data_1v5/interpro.list",
        filename_pfam="/home/andreas/projects/flies/data_1v5/pfam.list",
        write_no_annotation=True,
        separator_fields=";",
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    clusters, nerrors = IOTools.ReadList(sys.stdin)

    map_id2cluster, map_cluster2id = IOTools.ReadMap(
        open(options.filename_map_id2cluster, "r"),
        both_directions=True)

    if len(clusters) == 0:
        clusters = map_cluster2id.keys()
        clusters.sort()

    # initialize both maps so the checks below do not raise a NameError
    # when a filename option is unset
    map_id2interpro = {}
    if options.filename_interpro:
        map_id2interpro = readAnnotationInterpro(
            open(options.filename_interpro, "r"))

    map_id2pfam = {}
    if options.filename_pfam:
        map_id2pfam = readAnnotationPfam(open(options.filename_pfam, "r"))

    ninput, noutput, nnomaster, nnoannotation = 0, 0, 0, 0
    nskipped = 0

    options.stdout.write("cluster\tgenes")

    if map_id2interpro:
        options.stdout.write("\tinterpro\tidescription")
    if map_id2pfam:
        options.stdout.write("\tpfam\tpdescription")
    options.stdout.write("\n")

    for cluster in clusters:

        ninput += 1
        if cluster not in map_cluster2id:
            if options.loglevel >= 1:
                options.stdlog.write("# cluster %s not in map.\n" % cluster)
            nskipped += 1
            continue

        genes = set()

        for id in map_cluster2id[cluster]:

            s, t, g, q = id.split(options.separator)

            if s != options.master_species:
                continue

            genes.add(g)

        if not genes:
            nnomaster += 1
            continue

        annotations_interpro = {}
        if map_id2interpro:
            for gene in genes:
                if gene in map_id2interpro:
                    for annotation in map_id2interpro[gene]:
                        annotations_interpro[
                            annotation.mIdentifier] = annotation

        annotations_pfam = {}

        if map_id2pfam:
            for gene in genes:
                if gene in map_id2pfam:
                    for annotation in map_id2pfam[gene]:
                        annotations_pfam[annotation.mIdentifier] = annotation

        nannotations = max(len(annotations_pfam), len(annotations_interpro))

        if nannotations == 0 and not options.write_no_annotation:
            nnoannotation += 1
            continue

        options.stdout.write("%s\t%s" % (cluster, ";".join(genes)))

        if map_id2interpro:
            printAnnotations(options.stdout, annotations_interpro, options)

        if map_id2pfam:
            printAnnotations(options.stdout, annotations_pfam, options)

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, nnomaster=%i, nnoannotation=%i\n"
            % (ninput, noutput, nskipped, nnomaster, nnoannotation))

    E.Stop()
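The id.split(options.separator) call above implies a four-field identifier convention; here is a small sketch of that assumed layout (species|transcript|gene|quality, all values invented):

# hedged sketch of the assumed identifier layout; values are invented
identifier = "dmel_vs_dmel4|CG1234-RA|CG1234|CG"

species, transcript, gene, quality = identifier.split("|")
# only genes from the master species are collected into a cluster's gene set
if species == "dmel_vs_dmel4":
    print(gene)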