Example #1
0
def readComponents(options):
    """Read the sequence-to-component map named in the options.

    The file ``options.filename_components`` is parsed with
    ``IOTools.ReadMap`` (all columns, one direction only).  Each value of
    the resulting map is interpreted as follows:

    * a string is used as both the input and the output component id,
    * a tuple of length 2 is ``(input_id, output_id)``,
    * any other tuple uses its first element for both ids.

    Returns a triple ``(map_seq_id2component, map_component2seq_id,
    map_component2input_id)``; the second maps each output id to the list
    of keys assigned to it, the third maps each output id to its input id.
    If no filename was supplied, ``(None, None, None)`` is returned.

    Raises ValueError if a value is neither a string nor a tuple.
    """
    if not options.filename_components:
        return None, None, None

    # The map is fully consumed below via .items(), so the file can be
    # closed as soon as ReadMap returns.
    with open(options.filename_components, "r") as infile:
        map_seq_id2component = IOTools.ReadMap(infile,
                                               columns="all",
                                               both_directions=False)

    map_component2seq_id = {}
    map_component2input_id = {}
    for key, val in map_seq_id2component.items():
        if isinstance(val, str):
            input_id = output_id = val
        elif isinstance(val, tuple):
            input_id = val[0]
            # only a pair carries a separate output id
            output_id = val[1] if len(val) == 2 else val[0]
        else:
            raise ValueError("error in reading %s: %s->%s" %
                             (options.filename_components, key, val))

        map_component2seq_id.setdefault(output_id, []).append(key)
        map_component2input_id[output_id] = input_id

    return map_seq_id2component, map_component2seq_id, map_component2input_id
Example #2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Sequences are read with ``FastaIterator.iterate`` from ``--file`` (or
    stdin).  For each record an identifier is extracted from the title
    with ``--input-pattern``, optionally translated through the map given
    by ``--map`` (records whose identifier is not in the map are skipped),
    and the record is handed to a ``Files`` or ``FilesChunks`` writer.
    Records whose title does not match the input pattern are reported and
    skipped.  If ``--min-size`` is set, ``DeleteFiles`` is called on the
    writer once all input has been processed.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $"
    )

    parser.add_option("-f",
                      "--file",
                      dest="input_filename",
                      type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i",
        "--input-pattern",
        dest="input_pattern",
        type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o",
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n",
        "--num-sequences",
        dest="num_sequences",
        type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m",
                      "--map",
                      dest="map_filename",
                      type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    # boolean flag: takes no argument, hence no metavar
    parser.add_option("-s",
                      "--skip-identifiers",
                      dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.")

    parser.add_option("--min-size",
                      dest="min_size",
                      type="int",
                      help="minimum cluster size.")

    parser.set_defaults(
        input_filename=None,
        map_filename=None,
        skip_identifiers=False,
        input_pattern=r"^(\S+)",
        min_size=0,
        num_sequences=None,
        output_pattern="%s")

    # hand argv through so that an explicitly supplied argument list is
    # the one that gets parsed
    (options, args) = E.Start(parser, argv=argv)

    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        with open(options.map_filename, "r") as mapfile:
            map_id2filename = IOTools.ReadMap(mapfile)
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            match = rx.search(seq.title)
            if match is None:
                # Do not fall through: the identifier of the previous
                # record must not be reused for an unparsable title.
                print("# parsing error in description line %s" % (seq.title))
                continue
            identifier = match.groups()[0]
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    # only close what was opened here; never close stdin
    if options.input_filename:
        infile.close()

    ## delete all clusters below a minimum size
    ## Note: this has to be done at the end, because
    ## clusters sizes are only available once both the fasta
    ## file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" % (ninput, noutput,
                                                      ndeleted))

    E.Stop()
Example #3
0
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a chunk of trees.

    Arguments:
        lines: lines of newick text; the last character of every line
            is stripped before parsing with ``TreeTools.Newick2Nexus``.
        other_trees: sequence of trees used by "divide-by-tree"; indexed
            by *ntree* if it holds more than one tree, otherwise its
            first element is used for every tree.
        options: option object; ``methods``, ``parameters``, ``value``,
            ``outgroup``, ``loglevel``, ``stdlog``, ``stdout``,
            ``output_format`` and others are read here.  Methods that
            need a parameter consume it from the front of
            ``options.parameters``.
        map_old2new: identifier map used by "rename" and "build-map".
            "build-map" adds entries to it in place.
        ntree: running tree counter; incremented once per tree.

    Returns:
        tuple ``(ntotal, nskipped, ntree)``: number of trees read, number
        of "divide-by-tree" applications skipped because the trees were
        not identical, and the updated tree counter.
    """

    nexus = TreeTools.Newick2Nexus([x[:-1] for x in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None

    phylip_executable = None
    phylip_options = None

    index = 0

    # default: do not output internal node names
    # NOTE(review): this flag is set by "add-node-names" but the newick
    # output below passes write_all_taxa=True unconditionally - confirm
    # which behaviour is intended.
    write_all_taxa = False

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                # the wrapper is set up once, consuming two parameters:
                # the executable and its '@'-separated options
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                # a value of 0 means: normalize by the longest branch
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v == 0:
                    # nothing to scale by; leave the tree untouched
                    options.stdlog.write(
                        "# Warning: tree %i not normalized: maximum branch length is zero\n"
                        % index)
                else:
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    # moves on to the next method for this tree
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                # the map is read lazily from the next parameter if the
                # caller did not supply one
                if not map_old2new:

                    with open(options.parameters[0], "r") as infile:
                        map_old2new = IOTools.ReadMap(infile,
                                                      columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                # taxa without an entry in the map are removed
                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) +
                                                             1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # consume only the parameter just used, not the
                    # whole parameter list
                    del options.parameters[0]
                # collect the taxa that do NOT match the pattern
                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    if species2remove.search(t):
                        continue
                    taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                write_all_taxa = True
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
Example #4
0
def main(argv=sys.argv):
    """Read a multiple alignment, apply a list of methods, write it out.

    The alignment is read from ``options.stdin`` in ``--input-format``.
    The comma-separated ``--method`` list is applied in order; methods
    that need an argument consume it from the front of the
    comma-separated ``--parameters`` list.  The result is written to
    ``options.stdout`` in ``--output-format``.

    Raises ValueError if the alignment that was read is empty.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges",
        dest="with_ranges",
        action="store_true",
        help=
        "output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option(
        "--without-ranges",
        dest="with_ranges",
        action="store_false",
        help=
        "do not output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option("-u",
                      "--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="string",
        help=
        """methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""
    )

    parser.add_option(
        "-p",
        "--parameters",
        dest="parameters",
        type="string",
        help="parameter stack for methods that require one [default=%default]."
    )

    parser.add_option(
        "-a",
        "--mask-char",
        dest="mask_char",
        type="string",
        help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    # hand argv through so that an explicitly supplied argument list is
    # the one that gets parsed
    (options, args) = E.Start(parser, argv=argv)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    # 2. apply the methods in the order given
    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            with open(options.parameters[0], "r") as infile:
                map_id2offset = IOTools.ReadMap(infile,
                                                map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            # the parameter is either a filename or a ':'-separated list
            # of integers that is stored under the key "mali"
            if os.path.exists(options.parameters[0]):
                with open(options.parameters[0], "r") as infile:
                    map_id2transitions = IOTools.readMultiMap(
                        infile, map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = sorted(int(x) for x in options.parameters[0].split(':'))
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the sequence of the
            # same length
            other_mali = Mali.Mali()
            with open(options.parameters[0], "r") as infile:
                other_mali.readFromFile(infile, format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            # the part after "mask-" selects the masker
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        # this branch previously repeated the "exclude-with-stop" test
        # and was therefore unreachable
        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    # 3. output
    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
Example #5
0
                      help="aggregation function.")

    parser.set_defaults(filename_map=None,
                        filename_info=None,
                        filename_tissues=None,
                        headers=True,
                        aggregate="mean",
                        value_format="%5.2f",
                        method="counts")

    (options, args) = E.Start(parser)

    if not options.filename_map:
        raise "please supply filename mapping probesets to identifiers."

    map_probe2locus = IOTools.ReadMap(open(options.filename_map, "r"))

    matrix, row_headers, col_headers = MatlabTools.readMatrix(
        sys.stdin, format="full", headers=options.headers)

    if options.filename_tissues:
        tissues, nerrors = IOTools.ReadList(open(options.filename_tissues,
                                                 "r"))
        tissues = set(tissues)
        columns = []
        for x in range(len(col_headers)):
            if col_headers[x] in tissues:
                columns.append(x)
    else:
        columns = range(len(col_headers))
Example #6
0
def main(argv=None):
    """Write a table of per-sequence properties for fasta input.

    Sequences are read from ``options.stdin`` with
    ``FastaIterator.FastaIterator``.  For every record one row is written
    to ``options.stdout``: the identifier extracted from the title with
    ``--regex-identifier`` followed by the fields of each requested
    ``--section``.  With ``--add-total`` a final "total" row holding the
    accumulated properties is appended.

    If ``--weights-tsv-file`` is given, the codon usage tables are loaded
    (the special name "uniform" selects
    ``Genomics.GetUniformCodonUsage()``) and their pairwise differences
    are written to ``options.stdlog``.

    Raises ValueError for an empty sequence, for a title that does not
    match the identifier pattern, and for an unknown section or sequence
    type.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option("-s",
                      "--section",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa", "cpg",
                               "dn", "degeneracy", "gaps", "codons",
                               "codon-usage", "codon-translator",
                               "codon-bias"),
                      help="which sections to output [%default]")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option("--split-fasta-identifier",
                      dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")

    parser.add_option(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table "
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                # close the weights file once the table has been read
                infile = IOTools.openFile(filename, "r")
                try:
                    reference_codons.append(
                        IOTools.ReadMap(infile,
                                        has_header=True,
                                        map_functions=(str, float)))
                finally:
                    infile.close()

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    # Section name -> factory, per sequence type.  Lambdas keep the
    # construction lazy: a counter is only built when it is asked for.
    counter_factories = {
        "na": {
            "length":
            lambda: SequenceProperties.SequencePropertiesLength(),
            "sequence":
            lambda: SequenceProperties.SequencePropertiesSequence(),
            "hid":
            lambda: SequenceProperties.SequencePropertiesHid(),
            "na":
            lambda: SequenceProperties.SequencePropertiesNA(),
            "gaps":
            lambda: SequenceProperties.SequencePropertiesGaps(
                options.gap_chars),
            "cpg":
            lambda: SequenceProperties.SequencePropertiesCpg(),
            "dn":
            lambda: SequenceProperties.SequencePropertiesDN(),
            # these sections requires sequence length to be a multiple of 3
            "aa":
            lambda: SequenceProperties.SequencePropertiesAA(),
            "degeneracy":
            lambda: SequenceProperties.SequencePropertiesDegeneracy(),
            "codon-bias":
            lambda: SequenceProperties.SequencePropertiesBias(
                reference_codons),
            "codons":
            lambda: SequenceProperties.SequencePropertiesCodons(),
            "codon-usage":
            lambda: SequenceProperties.SequencePropertiesCodonUsage(),
            "codon-translator":
            lambda: SequenceProperties.SequencePropertiesCodonTranslator(),
        },
        "aa": {
            "length":
            lambda: SequenceProperties.SequencePropertiesLength(),
            "sequence":
            lambda: SequenceProperties.SequencePropertiesSequence(),
            "hid":
            lambda: SequenceProperties.SequencePropertiesHid(),
            "aa":
            lambda: SequenceProperties.SequencePropertiesAminoAcids(),
        },
    }

    def getCounter(section):
        """Return a new counter object for *section* and the current
        sequence type; raises ValueError if either is unknown."""
        try:
            factories = counter_factories[options.seqtype]
        except KeyError:
            raise ValueError("unknown sequence type %s" % options.seqtype)
        try:
            factory = factories[section]
        except KeyError:
            raise ValueError("unknown section %s" % section)
        return factory()

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    # NOTE(review): the result of this call is never used below; kept
    # unchanged in case loading a sequence has a required side effect -
    # confirm and remove if not.
    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        match = rx.search(cur_record.title)
        if match is None:
            raise ValueError("could not extract identifier from %s" %
                             cur_record.title)
        identifier = match.groups()[0]

        if options.split_id:
            options.stdout.write("%s" % identifier.split()[0])
        else:
            options.stdout.write("%s" % identifier)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write("\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.Stop()
Example #7
0
def main(argv=None):
    """Apply a chain of manipulation methods to fasta sequences.

    Records are read from ``options.stdin``, filtered by id pattern,
    random sampling and an optional id list, passed through every method
    given with ``--method`` (in command line order) and written in fasta
    format to ``options.stdout``.

    Methods that need auxiliary input (``apply-map``, ``map-codons``,
    ``mask-soft``, ``mask-codons``, ``back-translate``) take a filename
    from the front of the comma-separated ``--parameters`` stack.

    :param argv: command line arguments, defaults to ``sys.argv``. The
        value is kept for interface compatibility; it is not forwarded
        to ``E.Start`` here.
    :raises ValueError: on inconsistent input (sequence length not
        divisible by 3, mismatching titles or lengths, duplicate ids,
        missing sample proportion, exhausted reference stream).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "remove-stops", "upper", "lower", "reverse-complement",
                 "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x",
                      "--ignore-errors",
                      dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion",
                      dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern",
                      dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern",
                      dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      type="string",
                      action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="type",
        type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    # first whitespace-delimited token of a fasta title is the identifier
    rx_identifier = re.compile(r"^(\S+)")

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_seq2nid = IOTools.ReadMap(inf)
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_codon2code = IOTools.ReadMap(inf)
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        # stream stays open while the main loop consumes it in lockstep
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            # a set gives O(1) membership tests per record; rstrip keeps
            # the last id intact when the file lacks a trailing newline
            infile = IOTools.openFile(f.split("=", 1)[1])
            try:
                filter_id_list = set(line.rstrip("\n") for line in infile)
            finally:
                infile.close()

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    while True:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            # earlier methods in the chain may have changed the sequence,
            # so the length is refreshed for every method
            l = len(sequence)

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                # length without gap characters; re.sub takes
                # (pattern, replacement, string)
                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        # NOTE(review): this only skips the translation;
                        # the record is still passed to the remaining
                        # methods and written out.
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise ValueError(
                        "sequence titles don't match: %s %s" %
                        (cur_record.title, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                # str.maketrans/str.translate replace the python2-only
                # string.maketrans/string.translate functions
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    # gaps are carried along but do not count towards
                    # the codon
                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for y in c:
                                if y in options.gap_chars:
                                    new_sequence.append(y)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                # trailing characters of an incomplete codon
                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    # no more hard masked records: stop applying methods
                    # to this record
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (cur_record.title))

                # Check if hard masked seq contains repeat (N), if so replace N
                # with lowercase sequence from unmasked version. Identical
                # sequences contain nothing to mask and are left untouched
                # (previously they were replaced by an empty string).
                if sequence != hm_sequence:
                    new_sequence = []
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence.append(x.lower())
                        else:
                            new_sequence.append(x.upper())
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    # codons missing from the map are written as X
                    seq.append(map_codon2code.get(codon, "X"))

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":
                sequence = "".join(
                    [s for s in sequence if s not in options.gap_chars])

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                id = rx_identifier.match(cur_record.title).groups()[0]
                if id in map_seq2nid:
                    rest = cur_record.title[len(id):]
                    cur_record.title = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = rx_identifier.match(cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" % id)
                map_seq2nid[id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for start in range(0, l, 3):
                    nm = len([c for c in seq[start:start + 3]
                              if c in mask_chars])
                    # partially masked codons are masked completely
                    if 0 < nm < 3:
                        seq[start:start + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                # the default turns an exhausted stream into None instead
                # of leaking a StopIteration
                other_record = next(other_iterator, None)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        % (cur_record.title, len(other_sequence) * 3,
                           len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        # case of the mask follows case of the amino acid
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            # without this the record was counted as skipped but still output
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        # the stack may have been emptied by methods consuming filenames
        p = options.parameters[0] if options.parameters else ""
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in map_seq2nid.items():
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()
Example #8
0
def main():
    """Plot a tree in newick format as SVG.

    The master tree is the first tree read from ``--filename-tree``,
    ``--tree`` or stdin. Each ``--annotations`` option consumes one
    further tree from the same input, in order. The plot is written to
    ``sys.stdout``.

    :raises ValueError: if more than two branch length annotations
        ("error", "kaks", "value") are requested, as no layout is
        implemented for that case.
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: plot_tree.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-i",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-s",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree.")
    parser.add_option("-t", "--tree", dest="tree", type="string", help="tree.")
    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")
    parser.add_option("--colour-by-species",
                      dest="colour_by_species",
                      action="store_true",
                      help="colour by species.")
    parser.add_option("--support-style",
                      dest="support_style",
                      type="choice",
                      choices=("pie", "number"),
                      help="style for support information.")
    parser.add_option("--error-style",
                      dest="error_style",
                      type="choice",
                      choices=("pie", "number"),
                      help="style for error information.")
    parser.add_option("--branch-scale",
                      dest="branch_scale",
                      type="float",
                      help="branch length scale factor.")
    parser.add_option("--height-scale",
                      dest="height_scale",
                      type="float",
                      help="height scale factor.")
    parser.add_option("-a",
                      "--annotations",
                      dest="annotations",
                      type="choice",
                      action="append",
                      choices=("support", "error", "kaks", "master", "value",
                               "tables"),
                      help="annotations given by further trees.")
    parser.add_option(
        "--filename-tables",
        dest="filename_tables",
        type="string",
        help="add tables from file (need also set options -a tables) [%default]"
    )
    parser.add_option("--show-branchlengths",
                      dest="show_branchlengths",
                      action="store_true",
                      help="show branch lengths.")
    parser.add_option("--leaf-symbol",
                      dest="plot_leaf_symbol",
                      type="choice",
                      choices=("square", "circle"),
                      help="Symbol for leaves.")
    parser.add_option("--font-size-branches",
                      dest="font_size_branches",
                      type="int",
                      help="set font size for branches.")
    parser.add_option("--font-size-tips",
                      dest="font_size_tips",
                      type="int",
                      help="set font size for tips.")
    parser.add_option("--font-style-tips",
                      dest="font_style_tips",
                      type="choice",
                      choices=(
                          "normal",
                          "italic",
                      ),
                      help="set font style for tips.")
    parser.add_option("--filename-map",
                      dest="filename_map",
                      type="string",
                      help="filename with a name translation table.")
    parser.add_option("--filename-map-species2colour",
                      dest="filename_colour_map",
                      type="string",
                      help="filename with a map of species to colour.")
    parser.add_option("--no-leaf-labels",
                      dest="plot_leaf_labels",
                      action="store_false",
                      help="do not show labels at leafs.")
    parser.add_option("--no-ruler",
                      dest="plot_ruler",
                      action="store_false",
                      help="do not plot ruler.")

    parser.set_defaults(
        titles="",
        title="",
        footer="",
        filename_tree=None,
        # raw string: "\|" is not a valid escape in a plain string literal
        species_regex=r"^([^|]+)\|",
        colour_by_species=None,
        tree=None,
        branch_scale=0,
        height_scale=0,
        support_style=None,
        error_style="number",
        kaks_style="number",
        annotations=None,
        show_branchlengths=False,
        branch_length_format="%5.2f",
        font_size_tips=None,
        font_size_branches=None,
        font_style_tips=None,
        filename_map=None,
        filename_colour_map=None,
        plot_leaf_labels=True,
        plot_leaf_symbol=None,
        plot_ruler=True,
        filename_tables=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        with open(options.filename_tree, "r") as inf:
            tree_lines = inf.readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        tree_lines = sys.stdin.readlines()

    nexus = TreeTools.Newick2Nexus(tree_lines)
    master_tree = nexus.trees[0]

    if options.filename_map:
        with open(options.filename_map, "r") as inf:
            map_names = IOTools.ReadMap(inf)

        # rename taxa that have an entry in the translation table
        for node in master_tree.chain.values():
            if node.data.taxon in map_names:
                node.data.taxon = map_names[node.data.taxon]

    if options.loglevel >= 2:
        master_tree.display()

    plot = SVGTree.SVGTree(master_tree)

    if options.branch_scale:
        plot.setBranchScale(options.branch_scale)

    # the default of 0 is not None, so the height scale is always set
    if options.height_scale is not None:
        plot.setHeightScale(options.height_scale)

    if options.font_size_tips is not None:
        plot.setFontSize(options.font_size_tips)

    if options.plot_ruler == False:  # noqa: E712 - keep exact comparison
        plot.setRulerElements([])

    if options.show_branchlengths:
        b = SVGTree.BranchDecoratorHorizontalBranchLength(master_tree)
        if options.font_size_branches:
            b.setFontSize(options.font_size_branches)
        plot.setDecoratorHorizontalBranches(b)

    if options.colour_by_species:
        if options.filename_colour_map:
            with open(options.filename_colour_map, "r") as inf:
                map_species2colour = IOTools.ReadMap(inf)
        else:
            map_species2colour = None

        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
            SVGTree.NodeDecoratorBySpecies(
                master_tree,
                plot_symbol=options.plot_leaf_symbol,
                plot_label=options.plot_leaf_labels,
                map_species2colour=map_species2colour,
                extract_species=extract_species))

    if options.font_style_tips:
        plot.getDecoratorExternalNodes().setFontStyle(options.font_style_tips)

    plot.getDecoratorExternalNodes().setPlotLabel(options.plot_leaf_labels)

    # index of the next annotation tree; tree 0 is the master tree
    current_tree = 1

    # add annotations by further trees given on the command line
    branch_length_annotations = []

    current_reference_tree = master_tree

    if options.annotations:
        for annotation in options.annotations:

            tree = nexus.trees[current_tree]

            if annotation == "support":

                tree.branchlength2support()
                for node in tree.chain.values():
                    node.data.branchlength = 1.0

                if options.support_style == "pie":
                    plot.setDecoratorInternalNodes(
                        NodeDecoratorSupportPieChart(
                            nexus.trees[current_tree]))

            elif annotation == "error":

                if options.error_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthError(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "kaks":

                if options.kaks_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthWithKaks(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "value":

                b = SVGTree.BranchDecoratorHorizontalBranchLength(tree)
                if options.font_size_branches:
                    b.setFontSize(options.font_size_branches)
                branch_length_annotations.append(b)

            elif annotation == "master":
                # later "error"/"kaks" annotations refer to this tree
                current_reference_tree = tree

            elif annotation == "tables":
                b = BranchDecoratorTable(tree,
                                         filename=options.filename_tables)
                plot.setDecoratorHorizontalBranches(b)

            current_tree += 1

        if len(branch_length_annotations) > 2:
            raise ValueError(
                "obtained more than two branch length annotations. "
                "Layout not implemented")

        # Only install a decorator when one was collected. Previously an
        # unbound (or stale) `b` was used when the list was empty.
        if len(branch_length_annotations) == 1:
            plot.setDecoratorHorizontalBranches(branch_length_annotations[0])
        elif len(branch_length_annotations) == 2:
            b1, b2 = branch_length_annotations
            b1.setFontColour(SVGTree.BLUE)
            b2.setFontColour(SVGTree.RED)
            plot.setDecoratorHorizontalBranches(
                SVGTree.BranchDecoratorHorizontalAboveBelow(
                    master_tree, b1, b2))

    plot.initializePlot()

    plot.writeToFile(sys.stdout)

    E.Stop()
Example #9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Compares two clusterings given as two map files (positional
    arguments). Clusters of both files are nodes of a graph and are
    connected if they share a member. For each connected component the
    script writes its composition ("components"), the frequency of each
    (n1, n2) composition ("counts") and, for 1:1 components, the member
    overlap ("subsets"). Summary statistics go to ``options.stdout``.

    :raises ValueError: if not exactly two filenames are supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: compare_clusters.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-o",
                      "--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for filenames.")

    parser.set_defaults(
        output_pattern=None,
        format="%5.2f",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) != 2:
        # string exceptions are a TypeError in python 3
        raise ValueError("please supply to filenames with the clusters.")

    def percent(count, total):
        '''return count as percentage of total, 0.0 if total is 0.'''
        if not total:
            return 0.0
        return 100.0 * float(count) / total

    with open(args[0]) as inf:
        map_id2cluster1, map_cluster2ids1 = IOTools.ReadMap(
            inf, both_directions=True)
    with open(args[1]) as inf:
        map_id2cluster2, map_cluster2ids2 = IOTools.ReadMap(
            inf, both_directions=True)

    graph = networkx.Graph()

    # nodes are (file number, cluster id) so that identical cluster ids
    # in the two files stay distinct
    for a in map_cluster2ids1:
        graph.add_node((1, a))
    for b in map_cluster2ids2:
        graph.add_node((2, b))

    # build graph between clusters
    for cluster1, ids1 in map_cluster2ids1.items():
        for id1 in ids1:
            if id1 in map_id2cluster2:
                graph.add_edge((1, cluster1), (2, map_id2cluster2[id1]))

    # connected_components yields sets; materialize them as sorted lists
    # so that components can be indexed below and output is deterministic
    components = [sorted(c) for c in networkx.connected_components(graph)]

    #######################################################
    #######################################################
    #######################################################
    # write components and compute counts
    #######################################################
    outfile = getFile("components", options)
    outfile.write("id\ttotal\tn1\tn2\tmembers1\tmembers2\n")
    n = 0
    counts = {}
    subsets = []
    for component in components:

        m1, m2 = [], []

        for x in component:
            if x[0] == 1:
                m1.append(x[1])
            else:
                m2.append(x[1])

        t = len(component)
        n1 = len(m1)
        n2 = len(m2)
        cc = (n1, n2)
        if cc not in counts:
            counts[cc] = 0
        counts[cc] += 1

        # NOTE(review): subsets stores the 0-based index into components,
        # while the id written below is 1-based.
        if cc == (1, 1):
            subsets.append(n)

        n += 1
        outfile.write("%i\t%i\t%i\t%i\t%s\t%s\n" %
                      (n, t, n1, n2, ",".join(m1), ",".join(m2)))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    #######################################################
    #######################################################
    # write counts
    #######################################################
    outfile = getFile("counts", options)
    outfile.write("n1\tn2\tcounts\tpcounts1\tpcounts2\n")
    nclusters1 = len(map_cluster2ids1)
    nclusters2 = len(map_cluster2ids2)
    for cc, c in counts.items():
        outfile.write(
            "%i\t%i\t%i\t%s\t%s\n" %
            (cc[0], cc[1], c,
             options.format % percent(c, nclusters1),
             options.format % percent(c, nclusters2)))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    #######################################################
    #######################################################
    # analyze subsets - how many of the 1:1 clusters
    # contain the exact members?
    #######################################################
    outfile = getFile("subsets", options)
    outfile.write("id\tn1\tn2\tunion\tinter\tunique1\tunique2\n")

    ntrue = 0
    nrest1 = 0
    nrest2 = 0
    nother = 0

    for component_id in subsets:
        component = components[component_id]
        if component[0][0] == 1:
            id1, id2 = component[0][1], component[1][1]
        else:
            id1, id2 = component[1][1], component[0][1]

        members1 = set(map_cluster2ids1[id1])
        members2 = set(map_cluster2ids2[id2])

        union = len(members1.union(members2))
        intersection = len(members1.intersection(members2))
        rest1 = len(members1.difference(members2))
        rest2 = len(members2.difference(members1))

        if rest1 == 0 and rest2 == 0:
            ntrue += 1
        elif rest1 == 0:
            nrest1 += 1
        elif rest2 == 0:
            nrest2 += 1
        else:
            nother += 1

        outfile.write("%i\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                      (component_id, len(members1), len(members2), union,
                       intersection, rest1, rest2))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    # write subset statistics; percent() guards against ntotal == 0
    ntotal = len(subsets)
    options.stdout.write("# subset statistics of 1:1 corresponding clusters\n")
    options.stdout.write("class\tcounts\ttotal\n")
    options.stdout.write("%s\t%i\t%s\n" %
                         ("total", ntotal, options.format % 100))
    for label, count in (("true", ntrue),
                         ("unique1", nrest1),
                         ("unique2", nrest2),
                         ("other", nother)):
        options.stdout.write("%s\t%i\t%s\n" %
                             (label, count,
                              options.format % percent(count, ntotal)))

    E.Stop()
Example #10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every method in ``--methods`` and every genome in ``--genomes``
    the query information is fetched with ``GetQueryInfo`` and
    summarised with ``CountFoundGenes``:

    stats
        one row per genome with counts and percentages of found and
        missed genes/transcripts, computed once from column 2 and once
        from column 3 (the ``nr_`` columns) of the query records.
    missed
        one row per genome with the number and percentage of missed
        genes/transcripts. The missed identifiers are collected over
        all genomes and written with ``writeListMissed`` (to files
        named by ``--pattern-output``) and ``writeStatsMissed`` (to
        files named by ``--pattern-stats`` or to stdout).

    Raises
    ------
    ValueError
        if ``--sort`` and ``--priority`` contain a different number of
        classes.
    """

    # NOTE(review): argv is normalised here but not forwarded to E.Start
    # below - confirm whether E.Start should receive it.
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/analyze_queries.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-g",
                      "--genomes",
                      dest="genomes",
                      type="string",
                      help="genomes to analyse.")
    parser.add_option("-i",
                      "--priority",
                      dest="priority",
                      type="string",
                      help="quality priority.")
    parser.add_option("-s",
                      "--sort",
                      dest="sort",
                      type="string",
                      help="sort order.")
    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="filename with template peptide sequences.")
    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="string",
                      help="methods to apply [missed].")
    parser.add_option(
        "-f",
        "--filter",
        dest="filename_filter",
        type="string",
        help=
        "filename with schema|prediction_id|gene to use as filter. The prediction_ids are used for filtering."
    )
    parser.add_option("-q",
                      "--filter-quality",
                      dest="filter_quality",
                      type="string",
                      help="only consider predictions of given qualities.")
    parser.add_option("--pattern-output",
                      dest="pattern_output",
                      type="string",
                      help="output pattern for multiple file output.")
    parser.add_option("--pattern-stats",
                      dest="pattern_stats",
                      type="string",
                      help="output pattern for multiple statistics output.")
    parser.add_option("--outfile-clusters",
                      dest="outfile_clusters",
                      type="string",
                      help="output filename for clusters.")
    parser.add_option("--infile-clusters",
                      dest="infile_clusters",
                      type="string",
                      help="input filename for clusters.")
    parser.add_option("-n",
                      "--non-redundant",
                      dest="non_redundant",
                      action="store_true",
                      help="use non-redundant set for output.")
    parser.add_option("--clustering-method",
                      dest="clustering_method",
                      type="choice",
                      choices=("fragment", "hid"),
                      help="clustering method to use.")

    parser.set_defaults(
        genomes="",
        priority="CG,PG,SG,RG,CP,PP,SP,RP,CF,PF,SF,UG,UP,UF,BF,UK",
        sort="CG,PG,SG,RG,CP,PP,SP,RP,CF,PF,SF,UG,UP,UF,BF,UK",
        methods="missed",
        peptides=None,
        filename_filter=None,
        separator="|",
        filter_quality=None,
        pattern_output="%s",
        pattern_stats=None,
        clustering_method="fragment",
        outfile_clusters=None,
        infile_clusters=None,
        non_redundant=False,
        format_percent="%5.2f",
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.filename_peptides:
        with open(options.filename_peptides, "r") as infile:
            peptides = Genomics.ReadPeptideSequences(infile)
    else:
        peptides = {}

    # comma-separated option values are converted to lists in place
    if options.genomes:
        options.genomes = options.genomes.split(",")
    if options.priority:
        options.priority = options.priority.split(",")
    if options.methods:
        options.methods = options.methods.split(",")
    if options.sort:
        options.sort = options.sort.split(",")
    if options.filter_quality:
        options.filter_quality = options.filter_quality.split(",")

    # subset maps schema -> {prediction_id: 1}; the third field of each
    # filter line (gene) is parsed but not used.
    subset = {}
    if options.filename_filter:
        with open(options.filename_filter, "r") as infile:
            for line in infile:
                if line[0] == "#":
                    continue
                schema, prediction_id, _gene = line[:-1].split(
                    options.separator)[:3]
                if schema not in subset:
                    subset[schema] = {}
                subset[schema][prediction_id] = 1

    if len(options.sort) != len(options.priority):
        # string exceptions are not supported by python >= 2.6
        raise ValueError(
            "different number of classes in sort order and priority order")

    dbhandle = pgdb.connect(options.psql_connection)

    # Cluster peptides
    if options.infile_clusters:
        with open(options.infile_clusters, "r") as infile:
            map_peptide2cluster, map_cluster2peptide = IOTools.ReadMap(
                infile, both_directions=True)
    elif peptides:
        if options.clustering_method == "fragment":
            map_cluster2peptide, map_peptide2cluster = ClusterPeptidesByFragment(
                peptides)
        elif options.clustering_method == "hid":
            map_cluster2peptide, map_peptide2cluster = ClusterPeptidesByHid(
                peptides)
    else:
        map_cluster2peptide = {}
        map_peptide2cluster = {}

    if map_cluster2peptide and options.loglevel >= 1:
        options.stdlog.write(
            "# clustering of peptides: %i cluster for %i peptides\n" %
            (len(map_cluster2peptide), len(map_peptide2cluster)))
        # flush the stream that was actually written to
        options.stdlog.flush()

    if options.outfile_clusters and not options.infile_clusters:
        options.stdlog.write("# writing clusters to %s\n" %
                             options.outfile_clusters)
        with open(options.outfile_clusters, "w") as outfile:
            for k, v in map_peptide2cluster.items():
                outfile.write("%s\t%s\n" % (k, v))

    def percent(count, total):
        """return *count* as a percentage of *total*."""
        return 100 * float(count) / total

    for method in options.methods:

        if method == "stats":
            # Count number of missed unique genes/transcripts

            headers = (
                "species",
                "genes",
                "found_genes",
                "missed_genes",
                "pfound_genes",
                "pmissed_genes",
                "nr_found_genes",
                "nr_missed_genes",
                "pnr_found_genes",
                "pnr_missed_genes",
                "transcripts",
                "found_transcripts",
                "missed_transcripts",
                "pfound_transcripts",
                "pmissed_transcripts",
                "nr_found_transcripts",
                "nr_missed_transcripts",
                "pnr_found_transcripts",
                "pnr_missed_transcripts",
            )

            options.stdout.write("\t".join(headers) + "\n")

            for genome in options.genomes:

                r = GetQueryInfo(dbhandle, genome, options, subset)

                found_genes, genes, found_transcripts, transcripts =\
                    CountFoundGenes([(x[0], x[1], x[2]) for x in r],
                                    map_peptide2cluster)

                # same counts, but based on column 3 of the records; the
                # totals are taken from the first call above.
                nrfound_genes, nrgenes, nrfound_transcripts, nrtranscripts =\
                    CountFoundGenes([(x[0], x[1], x[3]) for x in r],
                                    map_peptide2cluster)

                nfound_genes = len(found_genes)
                nfound_transcripts = len(found_transcripts)
                nnrfound_genes = len(nrfound_genes)
                nnrfound_transcripts = len(nrfound_transcripts)

                ngenes = len(genes)
                ntranscripts = len(transcripts)

                # percentages are undefined without genes/transcripts
                if ngenes == 0 or ntranscripts == 0:
                    continue

                options.stdout.write(
                    "%s\t%i\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f\n"
                    %
                    (genome, ngenes, nfound_genes, ngenes - nfound_genes,
                     percent(nfound_genes, ngenes),
                     percent(ngenes - nfound_genes, ngenes),
                     nnrfound_genes, ngenes - nnrfound_genes,
                     percent(nnrfound_genes, ngenes),
                     percent(ngenes - nnrfound_genes, ngenes),
                     ntranscripts, nfound_transcripts,
                     ntranscripts - nfound_transcripts,
                     percent(nfound_transcripts, ntranscripts),
                     percent(ntranscripts - nfound_transcripts, ntranscripts),
                     nnrfound_transcripts, ntranscripts - nnrfound_transcripts,
                     percent(nnrfound_transcripts, ntranscripts),
                     percent(ntranscripts - nnrfound_transcripts,
                             ntranscripts)))

        elif method == "missed":

            headers = (
                "species",
                "genes",
                "transcripts",
                "missed_genes",
                "missed_transcripts",
                "percent_missed_genes",
                "percent_missed_transcripts",
            )

            options.stdout.write("\t".join(headers) + "\n")

            # identifier -> list of genomes in which it was missed
            all_missed_genes = {}
            all_missed_transcripts = {}

            # column of the query record that is handed to CountFoundGenes
            if options.non_redundant:
                flag_column = 3
            else:
                flag_column = 2

            for genome in options.genomes:

                r = GetQueryInfo(dbhandle, genome, options, subset)

                found_genes, genes, found_transcripts, transcripts =\
                    CountFoundGenes([(x[0], x[1], x[flag_column]) for x in r],
                                    map_peptide2cluster)

                missed_genes = set(genes).difference(found_genes)
                for x in missed_genes:
                    all_missed_genes.setdefault(x, []).append(genome)

                missed_transcripts = set(transcripts).difference(
                    found_transcripts)
                for x in missed_transcripts:
                    all_missed_transcripts.setdefault(x, []).append(genome)

                # As in the "stats" method, skip the summary row when the
                # percentages would require a division by zero.
                if len(genes) == 0 or len(transcripts) == 0:
                    continue

                options.stdout.write(
                    "%s\t%i\t%i\t%i\t%i\t%s\t%s\n" %
                    (genome, len(genes), len(transcripts), len(missed_genes),
                     len(missed_transcripts), options.format_percent %
                     (100.0 * float(len(missed_genes)) / len(genes)),
                     options.format_percent %
                     (100.0 * float(len(missed_transcripts)) /
                      len(transcripts))))

            for section in ("genes", "transcripts"):

                if section == "genes":
                    missed = all_missed_genes
                else:
                    missed = all_missed_transcripts

                with open(options.pattern_output % section, "w") as outfile:
                    writeListMissed(outfile, missed, options.genomes, options)

                if options.pattern_stats:
                    outfile = open(options.pattern_stats % section, "w")
                else:
                    outfile = options.stdout
                    outfile.write("# statistics for %s\n" % section)

                try:
                    writeStatsMissed(outfile, missed, options.genomes, options)
                finally:
                    # never close stdout, only files opened here
                    if outfile != options.stdout:
                        outfile.close()

    E.Stop()
Example #11
0
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={'CG': "circle", 'PG': "circle", 'SG': "circle"},
        quality2mask=(
            "RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK"),
        sort_by_size = True,
        input_format = "pairwise",
    )

    (options, args) = Experiment.Start(parser, add_pipe_options=True)

    if options.filename_contig_sizes:
        map_contig2size = IOTools.ReadMap(open(options.filename_contig_sizes, "r"),
                                          map_functions=(str, int))

    # read data and get contigs that are used (i.e.: remove empty contigs)
    chrs = {}
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        for line in lines:
            if line[0] == "#":
                continue

            d = line[:-1].split("\t")

            cluster_id, in_locations, in_tree = d[:3]

            for l in in_locations.split(";"):
Example #12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    A multiple alignment is read from stdin. For each sequence in the
    alignment a peptide-to-genome alignment is assembled codon by codon
    from the aligned exon columns returned by ``getAlignedColumns`` and
    written as a ``Prediction`` line to ``options.stdout``.

    Raises
    ------
    ValueError
        if a sequence does not start with a lower-case character (exon).
    NotImplementedError
        if two consecutive codons are separated by 1 to 10 residues;
        only longer separations are converted to introns.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2predictions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-l",
                      "--filename-locations",
                      dest="filename_locations",
                      type="string",
                      help="filename with locations")

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="the master determines the frame.")

    parser.set_defaults(filename_locations=None, gap_chars="-.", master=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        # single-argument print() gives the same output under the
        # python 2 print statement and the python 3 print function.
        print("%s no arguments required." % (USAGE,))
        sys.exit(2)

    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    identifiers = mali.getIdentifiers()

    aligned_columns, aligned_exons = getAlignedColumns(mali, options)

    map_id2location = {}

    if options.filename_locations:
        with open(options.filename_locations, "r") as infile:
            map_id2location = IOTools.ReadMap(infile)

    options.stdout.write(Prediction.Prediction().getHeader() + "\n")

    nid = 1

    for identifier in identifiers:

        if options.loglevel >= 2:
            options.stdlog.write("# processing %s\n" % (identifier))

        entry = mali.getEntry(identifier)

        sequence = entry.mString
        if sequence[0] not in string.ascii_lowercase:
            # string exceptions are not supported by python >= 2.6
            raise ValueError("all sequences should start with an exon.")

        # list of [state, query_length, genome_length] segments
        alignment = []

        # codon: list of (position in ungapped sequence, master residue)
        last_codon = []
        codon = []
        # n counts the non-gap characters seen so far
        n = 0

        last_master_residue = 0
        master_residue = 0
        for column, c in enumerate(sequence):

            if c in options.gap_chars:
                continue

            is_exon = column in aligned_exons

            if is_exon:
                master_residue = aligned_exons[column]
                codon.append((n, master_residue))

            n += 1

            # check if we have a complete codon
            if is_exon:
                # A codon is complete, if it ends at frame 2 or
                # it spans more than one codons in the master.
                # Gaps in the master that are a multiple of 3 are ignored
                d = master_residue - last_master_residue - 1

                if master_residue % 3 == 2 or (d % 3 != 0 and d > 0):

                    if last_codon:
                        # residues between the end of the previous codon
                        # and the start of this one
                        d = codon[0][0] - last_codon[-1][0] - 1
                        if d > 0:
                            # add in-frame introns
                            if d > 10:
                                alignment.append(["5", 0, 2])
                                alignment.append(["I", 0, d - 4])
                                alignment.append(["3", 0, 2])
                            else:
                                raise NotImplementedError("untreated case")

                    alignment += processCodon(codon)
                    last_codon = codon
                    codon = []

            last_master_residue = master_residue

        # merge adjacent segments that share the same state
        last = alignment[0]
        new_alignment = []
        for this in alignment[1:]:
            if this[0] == last[0]:
                last[1] += this[1]
                last[2] += this[2]
                continue

            new_alignment.append(last)
            last = this

        new_alignment.append(last)

        if options.loglevel >= 4:
            options.stdlog.write("# output=%s\n" % (str(new_alignment)))

        assert (new_alignment[-1][2] % 3 == 0)

        # total length of the alignment on the genomic sequence
        lalignment = sum(x[2] for x in new_alignment)

        prediction = Prediction.Prediction()

        prediction.mQueryToken = identifier

        genomic_sequence = re.sub("[%s]" % options.gap_chars, "",
                                  mali[identifier])

        prediction.mPredictionId = nid
        nid += 1

        if identifier in map_id2location:

            prediction.mSbjctToken, prediction.mSbjctStrand, sfrom, sto = map_id2location[
                identifier].split(":")[:4]

            prediction.mSbjctGenomeFrom = int(sfrom) + entry.mFrom
            # NOTE: overwritten below by mSbjctGenomeFrom + lalignment
            prediction.mSbjctGenomeTo = int(sto)

        else:
            prediction.mSbjctToken = "unk"
            prediction.mSbjctStrand = "+"
            prediction.mSbjctGenomeFrom = 0

        prediction.mQueryCoverage = 100
        prediction.mPercentIdentity = 100
        prediction.mPercentSimilarity = 100

        prediction.mQueryLength = prediction.mQueryTo

        prediction.mSbjctGenomeTo = prediction.mSbjctGenomeFrom + lalignment

        prediction.mMapPeptide2Genome = new_alignment
        prediction.mAlignmentString = " ".join(
            " ".join(map(str, x)) for x in prediction.mMapPeptide2Genome)

        prediction.mMapPeptide2Translation, prediction.mTranslation = Genomics.Alignment2PeptideAlignment(
            prediction.mMapPeptide2Genome, 0, 0, genomic_sequence)

        (prediction.mNIntrons, prediction.mNFrameShifts, prediction.mNGaps, prediction.mNSplits, prediction.mNStopCodons, disruptions) = \
            Genomics.CountGeneFeatures(0,
                                       prediction.mMapPeptide2Genome,
                                       genomic_sequence)

        options.stdout.write(str(prediction) + "\n")

    E.Stop()
Example #13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a BAM file from stdin whose reference names have the form
    ``contig|first_exon_start|first_exon_end-last_exon_start|last_exon_end|splice|strand``
    and writes a BAM file to stdout in which each retained read is
    placed on ``contig`` with a three-part cigar (match, skipped
    region, match).

    Reads are grouped by query name. A group is dropped if its first
    read is unmapped, if several reads share the best value of the
    mismatch tag, if the best read has more than one cigar operation or
    if the read does not extend across both exons.

    Raises
    ------
    ValueError
        if neither ``--template-bam`` nor ``--contig-sizes`` is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--template-bam",
                      dest="filename_genome_bam",
                      type="string",
                      help="input bam file for header information [%default]")

    parser.add_option("-s",
                      "--contig-sizes",
                      dest="filename_contigs",
                      type="string",
                      help="filename with contig sizes [%default]")

    parser.add_option(
        "-o",
        "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option("-i",
                      "--ignore-mismatches",
                      dest="ignore_mismatches",
                      action="store_true",
                      help="ignore mismatches [%default]")

    parser.add_option(
        "-c",
        "--remove-contigs",
        dest="remove_contigs",
        type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option("-f",
                      "--force",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files [%default]")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    # the output header is taken either from a template bam file or
    # built from a table of contig sizes.
    if options.filename_genome_bam:
        genomefile = pysam.Samfile(options.filename_genome_bam, "rb")
    elif options.filename_contigs:
        contigs = IOTools.ReadMap(IOTools.openFile(options.filename_contigs))
        referencenames = list(contigs.keys())
        referencelengths = [int(contigs[x]) for x in referencenames]
    else:
        raise ValueError(
            "please provide either --template-bam or --contig-sizes")

    infile = pysam.Samfile("-", "rb")
    outfile = pysam.Samfile("-",
                            "wb",
                            template=genomefile,
                            referencenames=referencenames,
                            referencelengths=referencelengths)

    # tag holding the number of mismatches used to rank alignments
    if options.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = dict((y, x) for x, y in enumerate(outfile.references))

    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split(
            "|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = map(
            int, (first_exon_start, first_exon_end, last_exon_start,
                  last_exon_end))
        # presumably converts an inclusive end coordinate to an
        # exclusive one - TODO confirm against the junction naming.
        first_exon_end += 1

        first_exon_length = first_exon_end - first_exon_start

        # match1/match2: number of read bases in the first/last exon
        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    # release the input handle as well
    infile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    ## write footer and output benchmark information.
    E.Stop()
Example #14
0
    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump output.")

    parser.set_defaults(
        separator="|",
        dump=False,
        filename_map=None,
        filename_alignment="-",
        filename_tree=None,
    )

    (options, args) = E.Start(parser)

    if options.filename_map:
        map_species2sp = IOTools.ReadMap(open(options.filename_map, "r"))

    E.debug("species map: %s" % str(map_species2sp))

    identifier_parser = IdentifierParserGPipe(map_species2sp=map_species2sp)

    njtree = NJTree(identifier_parser=identifier_parser)

    njtree.SetLog(options.stdlog)
    njtree.SetErr(options.stderr)

    if options.filename_tree:
        njtree.SetSpeciesTree(options.filename_tree)

    mali = Mali.Mali()
    if options.filename_alignment == "-":
Example #15
0
def main(argv=None):
    """script main.

    parses command line options, *argv* is passed on to ``E.Start``.

    Reads sequences in fasta format from ``options.stdin`` and writes a
    tab-separated table to ``options.stdout``: one row per non-empty
    sequence with the fields of every section requested via
    ``--sections``, followed by a final ``total`` row that accumulates
    the properties of all sequences.

    Raises
    ------
    ValueError
        if a section is not available for the chosen sequence type or
        if ``--regex-identifier`` does not match a sequence title.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--filename-weights",
        dest="filename_weights",
        type="string",
        help=
        "filename with codon frequencies. Multiple filenames can be separated by comma [default=%default]."
    )

    parser.add_option("-s",
                      "--sections",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "hid", "na", "aa", "degeneracy",
                               "bias", "codons", "codon-usage",
                               "codon-translator"),
                      help="which sections to output [default=%default]")

    parser.add_option(
        "-t",
        "--type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help=
        "type of sequence: na=nucleotides, aa=amino acids [default=%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help=
        "regular expression to extract identifier from fasta description line [default=%default]."
    )

    parser.set_defaults(
        filename_weights="uniform",
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
    )

    (options, args) = E.Start(parser, argv=argv)
    options.filename_weights = options.filename_weights.split(",")

    rx = re.compile(options.regex_identifier)

    # one codon usage table per entry in --filename-weights
    reference_codons = []
    if options.filename_weights:
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                with open(filename, "r") as infile:
                    reference_codons.append(
                        IOTools.ReadMap(infile,
                                        has_header=True,
                                        map_functions=(str, float)))

        ## print codon table differences
        E.info("difference between supplied codon usage preferences.")
        for x, a in enumerate(reference_codons):
            for y, b in enumerate(reference_codons):
                if x == y: continue
                # calculate KL distance: sum over all non-stop codons
                # of b * log(b / a)
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon): continue
                    d += b[codon] * math.log(b[codon] / p)
                E.info("tablediff\t%s\t%s\t%f" %
                       (options.filename_weights[x],
                        options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        """return a new properties object for *section*.

        The sections available depend on ``options.seqtype``.
        """

        if options.seqtype == "na":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "na":
                s = SequencePropertiesNA()
            elif section == "aa":
                s = SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequencePropertiesDegeneracy()
            elif section == "bias":
                s = SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "aa":
                s = SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    ## setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            E.warning("empty sequence %s" % cur_record.title)
            continue

        match = rx.search(cur_record.title)
        if match is None:
            # fail with a clear message instead of an AttributeError
            raise ValueError(
                "could not extract identifier from '%s' with pattern '%s'" %
                (cur_record.title, options.regex_identifier))
        identifier = match.groups()[0]

        options.stdout.write("%s" % identifier)
        options.stdout.flush()

        for section in options.sections:
            # a fresh object per sequence; the per-section totals are
            # accumulated separately
            s = getCounter(section)
            s.loadSequence(sequence)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    options.stdout.write("total")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getFields()))
    options.stdout.write("\n")

    E.Stop()
Example #16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads matrices from stdin and writes test statistics to
    ``options.stdout``. Lines starting with ``#`` are dropped; lines
    starting with ``>`` separate the input into chunks, one matrix per
    chunk. Without any ``>`` line the whole input is one matrix and no
    separator column is written.

    ``chi-squared``
        For every row pair selected by ``--iteration`` the two rows are
        stacked and passed to ``Stats.doChiSquaredTest``. Pairs for which
        the test raises ValueError are counted as skipped.

    ``pearson-chi-squared``
        Only for matrices with two rows. For every column
        ``Stats.doPearsonChiSquaredTest`` is called with the probability,
        the column sum and the value in the first row. The first entry of
        ``--parameters`` is a filename mapping chunk names to
        probabilities if the input has separators, a single probability
        otherwise.

    Raises ValueError if ``pearson-chi-squared`` is requested without a
    parameter or for a matrix that does not have exactly two rows.

    NOTE(review): *argv* is normalised here but not handed on to E.Start.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: matrix2stats.py 2795 2009-09-16 15:29:23Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("chi-squared", "pearson-chi-squared"),
                      help="statistical methods to apply.")

    parser.add_option("-t", "--header-names", dest="headers", action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers", dest="headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix.""")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix.""")

    parser.add_option("-p", "--parameters", dest="parameters", action="append", type="string",
                      help="parameters for various functions.")

    parser.add_option("-a", "--iteration", dest="iteration", type="choice",
                      choices=("pairwise", "all-vs-all"),
                      help="""how to compute stats [%default].""")

    parser.set_defaults(
        method="chi-squared",
        headers=True,
        value_format="%6.4f",
        pvalue_format="%6.4e",
        input_format="full",
        write_separators=True,
        parameters=[],
        iteration=None,
    )

    (options, args) = E.Start(parser)

    lines = [x for x in sys.stdin.readlines() if not x.startswith("#")]

    # indices of the separator lines; chunk x spans the lines between
    # chunks[x] and chunks[x + 1] (both exclusive).
    chunks = [x for x, line in enumerate(lines) if line.startswith(">")]

    if not chunks:
        # no separators: the whole input is a single matrix. The virtual
        # separator at -1 makes the first chunk start at line 0.
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    ninput, noutput, nskipped = 0, 0, 0

    # fail before any output is written if the required parameter is absent.
    # (The original raised a plain string, which is itself a TypeError.)
    if options.method == "pearson-chi-squared" and not options.parameters:
        raise ValueError(
            "out of parameters - please supply probability or filename with probabilities.")

    if options.write_separators:
        options.stdout.write("test\t")

    header_prefix = ""

    if options.method == "chi-squared":
        header_prefix = "observed\texpected"
        options.stdout.write("\t".join(
            (header_prefix, "n", "min", "max", "chi", "df", "P", "passed", "phi")) + "\n")

    elif options.method in ("pearson-chi-squared",):
        options.stdout.write("column\t")
        options.stdout.write("\t".join(
            (header_prefix, "n", "prob", "obs", "exp", "chi", "df", "P", "passed", "phi")) + "\n")

        # consume the first parameter
        param = options.parameters[0]
        del options.parameters[0]

        if options.write_separators:
            # one probability per chunk, looked up by chunk name
            probabilities = IOTools.ReadMap(
                IOTools.openFile(param, "r"), map_functions=(str, float))
        else:
            probability = float(param)

    for x in range(len(chunks) - 1):
        ninput += 1
        matrix, row_headers, col_headers = MatlabTools.readMatrix(
            StringIO("".join(lines[chunks[x] + 1:chunks[x + 1]])),
            format=options.input_format,
            headers=options.headers)
        nrows, ncols = matrix.shape

        if options.loglevel >= 2:
            options.stdlog.write("# read matrix: %i x %i, %i row titles, %i colum titles.\n" %
                                 (nrows, ncols, len(row_headers), len(col_headers)))

        if options.write_separators:
            # separator line without the leading ">" and the trailing newline
            options.stdout.write(lines[chunks[x]][1:-1] + "\t")

        # select the row pairs to compare; without --iteration there are none.
        nheaders = len(row_headers)
        if options.iteration == "pairwise":
            pairs = [(row1, row2)
                     for row1 in range(nheaders)
                     for row2 in range(row1 + 1, nheaders)]
        elif options.iteration == "all-vs-all":
            pairs = [(row1, row2)
                     for row1 in range(nheaders)
                     for row2 in range(nheaders)
                     if row1 != row2]
        else:
            pairs = []

        if options.method == "chi-squared":

            for row1, row2 in pairs:
                row_header1 = row_headers[row1]
                row_header2 = row_headers[row2]
                try:
                    result = Stats.doChiSquaredTest(
                        numpy.vstack((matrix[row1], matrix[row2])))
                except ValueError:
                    nskipped += 1
                    continue

                noutput += 1
                options.stdout.write("\t".join((
                    "%s" % row_header1,
                    "%s" % row_header2,
                    "%i" % result.mSampleSize,
                    "%i" % min(matrix.flat),
                    "%i" % max(matrix.flat),
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)) + "\n")

        elif options.method == "pearson-chi-squared":

            if nrows != 2:
                raise ValueError("only implemented for 2xn table")

            if options.write_separators:
                # the first word of the separator line names the chunk
                chunk_id = re.match(
                    r"(\S+)", lines[chunks[x]][1:-1]).groups()[0]
                probability = probabilities[chunk_id]

            for col in range(ncols):
                options.stdout.write("%s\t" % col_headers[col])
                result = Stats.doPearsonChiSquaredTest(
                    probability, sum(matrix[:, col]), matrix[0, col])
                options.stdout.write("\t".join((
                    "%i" % result.mSampleSize,
                    "%f" % probability,
                    "%i" % result.mObserved,
                    "%f" % result.mExpected,
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)))
                if col < ncols - 1:
                    # start the next row, repeating the chunk name
                    options.stdout.write("\n")
                    if options.write_separators:
                        options.stdout.write(lines[chunks[x]][1:-1] + "\t")

            options.stdout.write("\n")

    E.info("# ninput=%i, noutput=%i, nskipped=%i\n" %
           (ninput, noutput, nskipped))

    E.Stop()
Example #17
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a strain-to-species map from ``--filename-synonyms`` (if set),
    reads all of stdin, splits it into chunks at lines starting with ``>``
    and hands the chunks to processSpeciesTrees (``--species-tree``) or
    processGeneTrees (default).

    NOTE(review): *argv* is normalised here but not handed on to E.Start.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: tree_strain2species.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--filename-synonyms", dest="filename_synonyms", type="string",
                      help="filename with synonyms. Use this to aggregate several strains for a species.")

    parser.add_option("--filename-genes", dest="output_filename_genes", type="string",
                      help="output filename with new gene names.")

    parser.add_option("--species-tree", dest="species_tree", action="store_true",
                      help="input tree are species trees. If not given, the trees are assumed to be gene trees.")

    parser.add_option("--merge-mode", dest="merge_mode", type="choice",
                      choices=("ignore", "add-mean", "add-max", "add-min"),
                      help="how to deal with branch lengths of merged nodes.")

    parser.set_defaults(
        filename_synonyms="map_strain2species",
        pattern_gene="J%06i",
        output_format="nh",
        separator="|",
        output_filename_genes=None,
        keep_old_names=False,
        species_tree=False,
        merge_mode="ignore",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # read synonyms; the file is closed even if reading fails.
    if options.filename_synonyms:
        with open(options.filename_synonyms, "r") as infile:
            map_strain2species = IOTools.ReadMap(infile)
    else:
        map_strain2species = {}

    # Real lists are required below (len(), append()); map() and filter()
    # only return lists on python 2.
    lines = [x[:-1] for x in sys.stdin.readlines()]

    # indices of the chunk headers. startswith() instead of line[0] so that
    # empty lines do not raise an IndexError.
    chunks = [x for x, line in enumerate(lines) if line.startswith(">")]
    if not chunks:
        # no headers: treat all of the input as one chunk
        chunks = [0]
    # sentinel marking the end of the last chunk
    chunks.append(len(lines))

    if options.species_tree:
        processSpeciesTrees(chunks, lines, map_strain2species, options)
    else:
        processGeneTrees(chunks, lines, map_strain2species, options)

    E.Stop()
Example #18
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated lines (cluster id, ``;``-separated locations,
    newick tree) from stdin and draws the duplications onto a
    DuplicationPlot that is written to sys.stdout. Each location has the
    form ``gene_id:contig:strand:from:to``. Locations on contigs that are
    not listed in the ``--contig-sizes`` file are ignored.

    The script stops without writing a plot if no contig remains, which
    includes the case that no ``--contig-sizes`` file was given.

    NOTE(review): *argv* is normalised here but not handed on to E.Start.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-c",
                      "--contig-sizes",
                      dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r",
                      "--radius",
                      dest="radius",
                      type="int",
                      help="radius.")
    parser.add_option("-i",
                      "--increment",
                      dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u",
                      "--url",
                      dest="url",
                      type="string",
                      help="string to build url for annotation.")
    # NOTE(review): declared as string while the default below is the int
    # 10000; the value is not read anywhere in this function.
    parser.add_option("--min-contig",
                      dest="min_contig_size",
                      type="string",
                      help="minimum contig size to delineate.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum branch length.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={
            'CG': "circle",
            'PG': "circle",
            'SG': "circle"
        },
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG",
                      "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Always bind the map: without --contig-sizes the code below used to
    # fail with a NameError. An empty map takes the "nothing to plot" exit.
    map_contig2size = {}
    if options.filename_contig_sizes:
        with open(options.filename_contig_sizes, "r") as infile:
            map_contig2size = IOTools.ReadMap(infile,
                                              map_functions=(str, int))

    lines = sys.stdin.readlines()

    # read data and get contigs that are used (i.e.: remove empty contigs)
    if options.remove_empty_contigs:
        used_contigs = set()
        for line in lines:
            if line[0] == "#":
                continue

            cluster_id, in_locations, in_tree = line[:-1].split("\t")[:3]

            for location in in_locations.split(";"):
                gene_id, contig, strand, sbjct_from, sbjct_to = \
                    location.split(":")
                if contig in map_contig2size:
                    used_contigs.add(contig)

        # iterate over a copy of the keys, the dictionary shrinks in the loop
        for contig in list(map_contig2size):
            if contig not in used_contigs:
                del map_contig2size[contig]

    # contigs in alphabetical order
    contigs = sorted(map_contig2size)

    if not contigs:
        if options.loglevel >= 1:
            options.stdlog.write("# no contigs to plot - exiting\n")
        E.Stop()
        sys.exit(0)

    if options.sort_by_size:
        # stable sort: contigs of equal size stay in alphabetical order
        contigs.sort(key=lambda x: map_contig2size[x])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":

        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications

        for line in lines:
            if line[0] == "#":
                continue

            cluster_id, in_locations, in_tree = line[:-1].split("\t")[:3]

            # smallest/largest plot position and the number of locations
            # of this cluster that lie on known contigs
            mi, ma = 0, 0
            n = 0
            cluster_contigs = set()
            for location in in_locations.split(";"):
                gene_id, contig, strand, sbjct_from, sbjct_to = \
                    location.split(":")
                if contig not in map_contig2size:
                    continue
                cluster_contigs.add(contig)
                sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                # a minimum of 0 is treated as "not set yet"
                if not mi:
                    mi = xi
                else:
                    mi = min(mi, xi)

                n += 1
                ma = max(ma, xa)

            if n == 0:
                # no location on a known contig
                continue

            # True if all locations are on the same contig
            cis = len(cluster_contigs) == 1
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))
            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    # order by cis/trans, then number of duplications, then position
    data.sort()

    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # new ring whenever the number of duplications changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for location in in_locations.split(";"):
            gene_id, contig, strand, sbjct_from, sbjct_to = \
                location.split(":")
            if contig not in map_contig2size:
                continue
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
            map_gene2location[gene_id] = (contig, strand,
                                          sbjct_from, sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        subsets = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in subsets[:-1]:
            if len(children) == 1:
                continue
            # a real list, so that it can be traversed more than once
            c = [x.split(options.separator) for x in children]
            plot.addDuplication(c,
                                map_gene2location,
                                height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()
Example #19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values, applies the statistical test selected by
    ``--method`` through R and writes the fields of the test result
    followed by summary statistics of both lists to ``options.stdout``.

    The first list is read from ``--infile1`` or stdin. The second list is
    read from ``--infile2`` or, with ``--norm-test``, sampled from a normal
    distribution with the mean and standard deviation of the first list.
    Unless ``--skip-plot`` is given, a boxplot, a quantile-quantile plot
    and two histograms are drawn.

    Positional arguments are handed on to the R test function; arguments
    of the form ``key=value`` are passed as keyword arguments.

    Raises ValueError if the second input is missing or if a paired test
    is requested for lists of different length.

    NOTE(review): *argv* is normalised here but not handed on to E.Start.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for histograms.")
    parser.add_option("-f",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n",
        "--norm-test",
        dest="norm_test",
        action="store_true",
        help=
        """test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot",
                      dest="plot",
                      action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title",
                      dest="title",
                      type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # split positional arguments into keyword and plain arguments for R
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split at the first "=" only, the value may contain more
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    # with a category map the input is kept as strings, otherwise it is
    # converted to floats
    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile:
            map_category2value = IOTools.ReadMap(infile,
                                                 map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    try:
        values1, errors1 = IOTools.ReadList(infile1,
                                            map_function=f,
                                            map_category=map_category2value)
    finally:
        # only close what was opened here
        if options.filename_input1:
            infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n"
            % (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        if not options.filename_input2:
            raise ValueError(
                "please supply a second input file (--infile2) or use --norm-test.")
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2,
                map_function=f,
                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=False,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=True,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks: the plotted range is the range of the data,
        # widened to --min-value/--max-value if these are given.
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # test against None: a limit of 0.0 is a valid user choice
        has_range = (options.min_value is not None or
                     options.max_value is not None)

        extra_options = ""
        if options.num_bins:
            if has_range:
                # explicit breaks: num_bins bins of equal width need
                # num_bins + 1 edges. The last edge is pinned to max_value
                # so that the breaks always span all of the data.
                bin_size = float(max_value - min_value) / options.num_bins
                breaks = [
                    min_value + x * bin_size for x in range(options.num_bins)
                ]
                breaks.append(max_value)
                extra_options += ", breaks=c(%s)" % ",".join(
                    map(str, breaks))
            else:
                # let R choose the breaks for the requested number of bins
                extra_options += ", breaks=%i" % options.num_bins

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics; keys get the suffix 1 or 2 for the two inputs
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
Example #20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Copies a fasta file from stdin to ``options.stdout`` and rewrites the
    header lines. The identifier (the text after ``>`` up to the first
    ``/``, space or tab) is split at ``options.separator``:

    * ``species|gene`` becomes ``>species_gene``
    * ``species|transcript|gene[|...]`` becomes
      ``>transcript_species GENEID=gene``

    With ``--map`` the species is translated through the map. Species
    that are not in the map are kept as they are and counted as errors.

    Raises ValueError for identifiers without a separator.

    NOTE(review): *argv* is normalised here but not handed on to E.Start.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fasta2nj.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-m",
        "--map",
        dest="filename_map",
        type="string",
        help="filename with mapping of species ids to swissprot species ids.")

    parser.set_defaults(
        separator="|",
        filename_map=None,
    )

    (options, args) = E.Start(parser)

    # Always bind the map: it is tested for every header below and was
    # undefined (NameError) when the script was run without --map.
    map_species2sp = None
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            map_species2sp = IOTools.ReadMap(infile)

    ninput, noutput, nerrors = 0, 0, 0
    for line in sys.stdin:
        if not line.startswith(">"):
            # sequence lines are copied unchanged
            options.stdout.write(line)
            continue

        ninput += 1

        identifier = re.match(">([^/ \t]+)", line[:-1]).groups()[0]
        data = identifier.split(options.separator)

        species = data[0]

        if len(data) == 2:
            gene = data[1]
            transcript = None
        elif len(data) >= 3:
            gene = data[2]
            transcript = data[1]
        else:
            # Without this branch gene and transcript were unbound, or
            # silently carried over from the previous header.
            raise ValueError(
                "can not parse species and gene from identifier %s" %
                identifier)

        if map_species2sp:
            try:
                species = map_species2sp[species]
            except KeyError:
                # a failed dictionary lookup raises KeyError, not IndexError
                nerrors += 1
                if options.loglevel >= 1:
                    options.stdlog.write("# could not map species %s\n" %
                                         species)

        if transcript:
            options.stdout.write(">%s_%s GENEID=%s\n" %
                                 (transcript, species, gene))
        else:
            options.stdout.write(">%s_%s\n" % (species, gene))
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nerrors=%i\n" %
                             (ninput, noutput, nerrors))
    E.Stop()
Example #21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values from ``--infile1`` and ``--infile2``,
    compares them with the test selected by ``--method`` (``ks`` or
    ``mwu``) through R, draws a boxplot, a quantile-quantile plot and two
    histograms and writes the test result to ``options.stdout``.

    Raises ValueError for an unknown ``--method``.

    NOTE(review): *argv* is normalised here but not handed on to E.Start.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="string",
        help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.Start(
        parser,
        add_pipe_options=True,
        add_psql_options=True,
    )

    # --method is a free-form string: reject unknown values before any
    # work is done instead of failing later on an unbound result.
    if options.method not in ("ks", "mwu"):
        raise ValueError("unknown method %s" % options.method)

    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile:
            map_category2value = IOTools.ReadMap(infile,
                                                 map_functions=(str, float))

    with open(options.filename_input1, "r") as infile:
        values1, errors1 = IOTools.ReadList(infile,
                                            map_category=map_category2value)
    with open(options.filename_input2, "r") as infile:
        values2, errors2 = IOTools.ReadList(infile,
                                            map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    else:
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
      )

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')"""
      )
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')"""
      )
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )

    # same text as the former print statements, written through
    # options.stdout so that the block is valid on python 2 and 3.
    options.stdout.write("## Results for %s\n" % result['method'])
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        options.stdout.write("%s %s\n" % (x, result[x]))

    # close the png device so that the hardcopy is written to disk
    if options.hardcopy:
        R.dev_off()

    E.Stop()
Example #22
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta records from stdin and writes one shuffled record per
    input record to ``options.stdout``. Three modes are available:

    * default: the characters of each sequence are shuffled.
    * ``--codons``: the characters are reshuffled until no triplet
      (read in frame from position 0) is one of ``options.stop_codons``.
    * ``--conserve-aminos``: every codon is replaced by a codon listed
      for the same amino acid in ``Genomics.GetMapAA2Codons()``; codons
      whose amino acid is not in that map are dropped. The replacement is
      drawn uniformly unless ``--bias`` or ``--biased-codon-usage`` is
      given, in which case it is drawn from the ranges computed from the
      reference codon usage table(s).

    Raises:
        ValueError: if ``--conserve-aminos`` is combined with ``--bias``
            but no ``--biased-codon-usage`` table was supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: shuffle_fasta.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-c",
        "--codons",
        dest="codons",
        action="store_true",
        help="make sure that shuffled sequences only contain valid codons.")

    parser.add_option("-a",
                      "--conserve-aminos",
                      dest="conserve_aminos",
                      action="store_true",
                      help="conserve amino acids.")

    parser.add_option(
        "-b",
        "--bias",
        dest="bias",
        type="float",
        help=
        "introduce bias into codon usage choice. Complete bias is 1.0, while no bias is 0.0."
    )

    parser.add_option(
        "-i",
        "--biased-codon-usage",
        dest="filename_biased_codon_usage",
        type="string",
        help="Filename with reference codon usage table for biased codon usage."
    )

    parser.add_option(
        "-u",
        "--bulk-codon-usage",
        dest="filename_bulk_codon_usage",
        type="string",
        help=
        "Filename with reference codon usage table for unbiased codon usage.")

    parser.set_defaults(
        codons=False,
        conserve_aminos=False,
        bias=0.0,
        filename_biased_codon_usage=None,
        filename_bulk_codon_usage=None,
        stop_codons=("TAG", "TAA", "TGA"),
        precision=10000,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    iterator = FastaIterator.FastaIterator(sys.stdin)

    # get map of amino acids to codons
    map_aa2codons = Genomics.GetMapAA2Codons()

    # for codon based shuffling: build ranges based on strength of bias and
    # on reference codon usage. Bias switches from completely biased to
    # unbiased. Unbiased is uniform usage unless a bulk table is given.
    # codon_ranges stays None when no biased table is supplied so that the
    # sampling code below can detect the missing table.
    codon_ranges = None
    if options.filename_biased_codon_usage:

        with open(options.filename_biased_codon_usage, "r") as infile:
            map_codon2frequency = IOTools.ReadMap(infile,
                                                  map_functions=(str, float),
                                                  has_header=True)

        map_codon2frequency_bulk = None
        if options.filename_bulk_codon_usage:
            with open(options.filename_bulk_codon_usage, "r") as infile:
                map_codon2frequency_bulk = IOTools.ReadMap(
                    infile,
                    map_functions=(str, float),
                    has_header=True)

        codon_ranges = {}
        for aa, codons in map_aa2codons.items():
            # cumulative upper bounds, one per codon, in the same order as
            # the codons are listed for this amino acid
            cumulative = []
            upper = 0
            for codon in codons:

                if map_codon2frequency_bulk is not None:
                    u = map_codon2frequency_bulk[codon]
                else:
                    # uniform usage
                    u = 1.0 / len(codons)

                g = map_codon2frequency[codon]
                # bias=1.0 gives the biased frequency g, bias=0.0 gives the
                # background frequency u
                f = g + (u - g) * (1.0 - options.bias)
                upper += f * options.precision
                cumulative.append(upper)
            codon_ranges[aa] = cumulative

    use_codon_ranges = bool(options.bias
                            or options.filename_biased_codon_usage)

    # Previously this combination failed with a NameError on the first
    # codon; fail up front with an explicit message instead.
    if options.conserve_aminos and use_codon_ranges and codon_ranges is None:
        raise ValueError(
            "--bias requires a reference codon usage table "
            "(--biased-codon-usage)")

    while True:
        cur_record = iterator.next()

        if cur_record is None:
            break

        sequence = re.sub(" ", "", cur_record.sequence)
        seq_length = len(sequence)

        if options.conserve_aminos:
            replaced = []
            for start in range(0, seq_length, 3):
                codon = sequence[start:start + 3]
                aa = Genomics.MapCodon2AA(codon)
                if aa not in map_aa2codons:
                    continue
                synonymous = map_aa2codons[aa]
                if use_codon_ranges:
                    # get random number from 0 to precision
                    v = random.randint(0, options.precision)
                    # find the corresponding interval: the first range
                    # whose upper bound exceeds v, falling back to the
                    # last codon
                    ranges = codon_ranges[aa]
                    last = len(synonymous) - 1
                    choice = 0
                    while choice < last and v >= ranges[choice]:
                        choice += 1
                else:
                    choice = random.randint(0, len(synonymous) - 1)
                replaced.append(synonymous[choice])
            sequence = "".join(replaced)
        else:
            residues = list(sequence)
            if options.codons:
                # Rejection sampling: reshuffle until no in-frame triplet
                # is a stop codon. The triplets are slices of a list, so
                # they have to be joined back into strings before they can
                # match the entries of options.stop_codons.
                # NOTE: the expected number of attempts grows with sequence
                # length.
                while True:
                    random.shuffle(residues)
                    has_stop = any(
                        "".join(residues[start:start + 3]) in
                        options.stop_codons
                        for start in range(0, seq_length, 3))
                    if not has_stop:
                        break
            else:
                random.shuffle(residues)
            sequence = "".join(residues)
        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))

    E.Stop()
Example #23
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a list of clusters from stdin (falling back to all clusters of
    the id-to-cluster map, sorted, when the list is empty) and writes one
    tab-separated line per cluster to ``options.stdout``: the cluster, the
    ``;``-joined genes of the master species and, for each annotation
    source that was loaded and is non-empty, the columns written by
    ``printAnnotations``.

    Clusters missing from the map and clusters without genes of the master
    species are skipped and counted; the counts are written to
    ``options.stdlog`` at the end if ``options.loglevel >= 1``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/annotate_clusters.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--filename-map",
        dest="filename_map_id2cluster",
        type="string",
        help="filename with mapping information from id to cluster.")

    parser.add_option("--filename-interpro",
                      dest="filename_interpro",
                      type="string",
                      help="filename with interpro domain information.")

    parser.add_option("--filename-pfam",
                      dest="filename_pfam",
                      type="string",
                      help="filename with pfam domain information.")

    parser.set_defaults(
        master_species="dmel_vs_dmel4",
        separator="|",
        filename_map_id2cluster="input.map",
        filename_interpro="/home/andreas/projects/flies/data_1v5/interpro.list",
        filename_pfam="/home/andreas/projects/flies/data_1v5/pfam.list",
        write_no_annotation=True,
        separator_fields=";",
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    clusters, nerrors = IOTools.ReadList(sys.stdin)

    with open(options.filename_map_id2cluster, "r") as infile:
        map_id2cluster, map_cluster2id = IOTools.ReadMap(
            infile, both_directions=True)

    if not clusters:
        # no explicit selection on stdin: process every cluster in the map
        clusters = sorted(map_cluster2id.keys())

    # Bind both annotation maps unconditionally: they are tested below even
    # when the corresponding filename option is empty.
    map_id2interpro = None
    if options.filename_interpro:
        with open(options.filename_interpro, "r") as infile:
            map_id2interpro = readAnnotationInterpro(infile)

    map_id2pfam = None
    if options.filename_pfam:
        with open(options.filename_pfam, "r") as infile:
            map_id2pfam = readAnnotationPfam(infile)

    ninput, noutput, nnomaster, nnoannotation = 0, 0, 0, 0
    nskipped = 0

    # header line; annotation columns only appear for non-empty maps
    options.stdout.write("cluster\tgenes")

    if map_id2interpro:
        options.stdout.write("\tinterpro\tidescription")
    if map_id2pfam:
        options.stdout.write("\tpfam\tpdescription")
    options.stdout.write("\n")

    for cluster in clusters:

        ninput += 1
        if cluster not in map_cluster2id:
            if options.loglevel >= 1:
                options.stdlog.write("# cluster %s not in map.\n" % cluster)
            nskipped += 1
            continue

        genes = set()

        for identifier in map_cluster2id[cluster]:

            # identifiers must consist of exactly four fields joined by
            # options.separator; only the first (species) and the third
            # (gene) are used here.
            species, _, gene_id, _ = identifier.split(options.separator)

            if species != options.master_species:
                continue

            genes.add(gene_id)

        if not genes:
            nnomaster += 1
            continue

        # collect annotations keyed by identifier so that an annotation
        # shared by several genes of the cluster is kept only once
        annotations_interpro = {}
        if map_id2interpro:
            for gene in genes:
                if gene in map_id2interpro:
                    for annotation in map_id2interpro[gene]:
                        annotations_interpro[
                            annotation.mIdentifier] = annotation

        annotations_pfam = {}

        if map_id2pfam:
            for gene in genes:
                if gene in map_id2pfam:
                    for annotation in map_id2pfam[gene]:
                        annotations_pfam[annotation.mIdentifier] = annotation

        nannotations = max(len(annotations_pfam), len(annotations_interpro))

        if nannotations == 0 and not options.write_no_annotation:
            nnoannotation += 1
            continue

        options.stdout.write("%s\t%s" % (cluster, ";".join(genes)))

        if map_id2interpro:
            printAnnotations(options.stdout, annotations_interpro, options)

        if map_id2pfam:
            printAnnotations(options.stdout, annotations_pfam, options)

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, nnomaster=%i, nnoannotation=%i\n"
            % (ninput, noutput, nskipped, nnomaster, nnoannotation))

    E.Stop()