Python IOTools.ReadMap Examples

Programming Language: Python

Namespace/Package Name: CGAT

Class/Type: IOTools

Method/Function: ReadMap

Examples at hotexamples.com: 23

CGAT.IOTools.ReadMap is a Python function in the IOTools module of the CGAT package. It reads a map file in a specific format into a Python dictionary. The map file contains key-value pairs, one pair per line; the function reads the file and builds a dictionary that maps each key to its corresponding value. This makes it possible to look up a value by its key quickly, without searching through the entire file each time.

Python IOTools.ReadMap - 23 examples found. These are the top rated real world Python examples of CGAT.IOTools.ReadMap extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

openFile(30)

ReadMap(23)

ReadList(21)

isEmpty(14)

writeLines(9)

readMap(9)

which(8)

getInvertedDictionary(7)

readList(7)

prettyPercent(7)

zapFile(6)

convertDictionary(6)

snip(5)

FilePool(5)

iterate(5)

getNumLines(4)

readTable(4)

flatten(4)

readMultiMap(3)

str2val(3)

touchFile(3)

writeMatrix(3)

isComplete(2)

getLastLine(2)

readMatrix(2)

val2str(2)

human2bytes(1)

force_str(1)

cloneFile(1)

prettyFloat(1)

Example #1

Show file

def readComponents(options):
    """Read components from the filename supplied in the options.

    The file is parsed with ``IOTools.ReadMap(..., columns="all",
    both_directions=False)``.  Each value of the resulting map is either
    a string or a tuple:

    * string: it is used as both input id and output id,
    * tuple of length 2: ``(input_id, output_id)``,
    * any other tuple: the first element is used for both ids.

    Arguments
    ---------
    options : object
        Must provide the attribute ``filename_components``.  If it is
        false (``None`` or empty), nothing is read.

    Returns
    -------
    tuple
        ``(map_seq_id2component, map_component2seq_id,
        map_component2input_id)`` where the second maps each output id
        to the list of keys assigned to it and the third maps each
        output id to its input id.  ``(None, None, None)`` is returned
        if no filename was supplied.

    Raises
    ------
    ValueError
        If a value in the map is neither a string nor a tuple.
    """
    if not options.filename_components:
        return None, None, None

    # Use a context manager so that the file handle is released as soon
    # as the map has been read.
    with open(options.filename_components, "r") as infile:
        map_seq_id2component = IOTools.ReadMap(infile,
                                               columns="all",
                                               both_directions=False)

    map_component2seq_id = {}
    map_component2input_id = {}
    for key, val in map_seq_id2component.items():
        if isinstance(val, str):
            input_id = output_id = val
        elif isinstance(val, tuple):
            input_id = val[0]
            # only a pair carries a separate output id
            output_id = val[1] if len(val) == 2 else val[0]
        else:
            raise ValueError("error in reading %s: %s->%s" %
                             (options.filename_components, key, val))

        map_component2seq_id.setdefault(output_id, []).append(key)
        map_component2input_id[output_id] = input_id

    return map_seq_id2component, map_component2seq_id, map_component2input_id

Example #2

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Sequences are read from ``--file`` (or stdin if not given).  For each
    sequence an identifier is extracted from its title with
    ``--input-pattern``.  If a ``--map`` file is supplied, the identifier
    is translated through the map and sequences whose identifier is not
    in the map are skipped.  Each remaining sequence is handed to a
    ``Files``/``FilesChunks`` writer.  Finally, if ``--min-size`` is set,
    the writer is asked to delete files below that size, and a summary
    line is printed when the log level is at least 1.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below, so an
    # explicitly supplied argv presumably has no effect -- confirm
    # against the signature of E.Start.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $"
    )

    parser.add_option("-f",
                      "--file",
                      dest="input_filename",
                      type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i",
        "--input-pattern",
        dest="input_pattern",
        type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o",
        "--output-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n",
        "--num-sequences",
        dest="num_sequences",
        type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m",
                      "--map",
                      dest="map_filename",
                      type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s",
                      "--skip-identifiers",
                      dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size",
                      dest="min_size",
                      type="int",
                      help="minimum cluster size.")

    # raw string: "\S" is a regular expression escape, not a string escape
    parser.set_defaults(
        input_filename=None,
        map_filename=None,
        skip_identifiers=False,
        input_pattern=r"^(\S+)",
        min_size=0,
        num_sequences=None,
        output_pattern="%s")

    (options, args) = E.Start(parser)

    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        # close the map file as soon as it has been read
        with open(options.map_filename, "r") as mapfile:
            map_id2filename = IOTools.ReadMap(mapfile)
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)

    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None

    try:
        for seq in FastaIterator.iterate(infile):

            ninput += 1

            if rx:
                try:
                    identifier = rx.search(seq.title).groups()[0]
                except AttributeError:
                    # rx.search() returned None: the pattern did not match.
                    # NOTE(review): the sequence is still processed below
                    # with the identifier of the previous record (or None
                    # for the first record) -- confirm this is intended.
                    print("# parsing error in description line %s" %
                          (seq.title))
            else:
                identifier = seq.title

            if map_id2filename:
                if identifier in map_id2filename:
                    identifier = map_id2filename[identifier]
                else:
                    continue

            files.Write(identifier, seq)
            noutput += 1
    finally:
        # only close what was opened here; never close stdin
        if options.input_filename:
            infile.close()

    ## delete all clusters below a minimum size
    ## Note: this has to be done at the end, because
    ## clusters sizes are only available once both the fasta
    ## file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" % (ninput, noutput,
                                                      ndeleted))

    E.Stop()

Example #3

Show file

def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a chunk of trees.

    Arguments
    ---------
    lines : list
        Lines of tree data; the last character of each line is stripped
        before the lines are passed to ``TreeTools.Newick2Nexus``.
    other_trees : list
        Reference trees for the method "divide-by-tree".  If it holds
        more than one tree, the tree at position *ntree* is used,
        otherwise the first one.
    options : object
        Parsed command line options.  ``options.parameters`` is used as
        a stack: methods that need a parameter consume its first entry.
    map_old2new : dict
        Map of old to new taxon names, used by "rename" and "build-map".
        If it is false when "rename" is applied, it is read from the
        file named by the next parameter.
    ntree : int
        Running number of the first tree in this chunk.

    Returns
    -------
    tuple
        ``(ntotal, nskipped, ntree)``: the number of trees read, the
        number of times "divide-by-tree" was skipped because the trees
        were not identical, and *ntree* advanced by the number of trees
        processed.
    """

    nexus = TreeTools.Newick2Nexus([x[:-1] for x in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None

    phylip_executable = None
    phylip_options = None

    index = 0

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                # the wrapper is set up once, on first use, from the
                # next two parameters (executable, "@"-separated options)
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                if options.value == 0:
                    # no divisor given: use the largest branch length
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                # Divide by the divisor determined above.  Dividing by
                # options.value would always fail with a division by
                # zero when the maximum was requested.  A tree without
                # any positive branch length is left unchanged.
                if v:
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    # proceeds with the next method for this tree
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:

                    # close the map file as soon as it has been read
                    with open(options.parameters[0], "r") as mapfile:
                        map_old2new = IOTools.ReadMap(mapfile,
                                                      columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                # taxa without an entry in the map are pruned afterwards
                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) +
                                                             1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # consume only this method's parameter, as all other
                    # methods do; the remaining stack must stay intact
                    del options.parameters[0]
                # taxa matching the pattern are left out of the list
                # handed to PruneTree
                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    if not species2remove.search(t):
                        taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                # give every unnamed node a running name
                inode = 0
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        # NOTE(review): write_all_taxa is always True here, independent
        # of whether "add-node-names" was applied -- confirm that this
        # is intended.
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree

Example #4

Show file

def main(argv=sys.argv):
    """Read a multiple alignment, apply methods to it and write it out.

    The alignment is read from ``options.stdin`` in ``--input-format``.
    The ','-separated methods given by ``--method`` are applied in
    order.  Methods that require arguments take them from the front of
    the ','-separated ``--parameters`` list, which is used as a stack.
    The result is written to ``options.stdout`` in ``--output-format``.

    Raises
    ------
    ValueError
        If the multiple alignment that was read is empty.
    """
    # NOTE(review): argv is not forwarded to E.Start() below -- confirm
    # against the signature of E.Start.

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges",
        dest="with_ranges",
        action="store_true",
        help=
        "output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option(
        "--without-ranges",
        dest="with_ranges",
        action="store_false",
        help=
        "do not output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option("-u",
                      "--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="string",
        help=
        """methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""
    )

    parser.add_option(
        "-p",
        "--parameters",
        dest="parameters",
        type="string",
        help="parameter stack for methods that require one [default=%default]."
    )

    parser.add_option(
        "-a",
        "--mask-char",
        dest="mask_char",
        type="string",
        help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            # close the map file as soon as it has been read
            with open(options.parameters[0], "r") as infile:
                map_id2offset = IOTools.ReadMap(infile,
                                                map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            # the parameter is either the name of an existing file or
            # a ':'-separated list of positions given inline
            if os.path.exists(options.parameters[0]):
                with open(options.parameters[0], "r") as infile:
                    map_id2transitions = IOTools.readMultiMap(
                        infile, map_functions=(str, int))
            else:
                map_id2transitions = {}
                # sorted() returns a list whether map() yields a list
                # or an iterator
                r = sorted(map(int, options.parameters[0].split(':')))
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the sequence of the
            # same length
            other_mali = Mali.Mali()
            with open(options.parameters[0], "r") as infile:
                other_mali.readFromFile(infile, format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            # the part after "mask-" selects the masker
            _, masker = method.split("-")
            maskMali(mali, masker)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        # A second test for "exclude-with-stop" could never be reached;
        # the frameshift filter needs a method name of its own.
        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()

Example #5

Show file

                      help="aggregation function.")

    parser.set_defaults(filename_map=None,
                        filename_info=None,
                        filename_tissues=None,
                        headers=True,
                        aggregate="mean",
                        value_format="%5.2f",
                        method="counts")

    (options, args) = E.Start(parser)

    # the map of probesets to identifiers is mandatory
    if not options.filename_map:
        # raise a proper exception; a plain string cannot be raised
        raise ValueError(
            "please supply filename mapping probesets to identifiers.")

    # close the map file as soon as it has been read
    with open(options.filename_map, "r") as infile:
        map_probe2locus = IOTools.ReadMap(infile)

    matrix, row_headers, col_headers = MatlabTools.readMatrix(
        sys.stdin, format="full", headers=options.headers)

    # select the indices of the columns to use: either only those
    # whose header is listed in the tissues file, or all columns
    if options.filename_tissues:
        with open(options.filename_tissues, "r") as infile:
            tissues, nerrors = IOTools.ReadList(infile)
        tissues = set(tissues)
        columns = [x for x, header in enumerate(col_headers)
                   if header in tissues]
    else:
        columns = list(range(len(col_headers)))

Example #6

Show file

File: fasta2table.py Project: zpeng1989/cgat

def main(argv=None):
    """Write a table of sequence properties for a fasta file.

    Sequences are read from ``options.stdin``.  For every sequence one
    row is written to ``options.stdout``: the identifier, followed by
    the fields of each section selected with ``--section``.  With
    ``--add-total`` a final row of totals is appended.

    If ``--weights-tsv-file`` is given, the codon usage tables are
    loaded (the name "uniform" selects a uniform table) and a
    difference value for every ordered pair of tables is written to
    ``options.stdlog``.

    Arguments
    ---------
    argv : list
        Command line arguments, passed on to ``E.Start``.

    Raises
    ------
    ValueError
        For an unknown section or sequence type, or for an empty
        sequence.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option("-s",
                      "--section",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa", "cpg",
                               "dn", "degeneracy", "gaps", "codons",
                               "codon-usage", "codon-translator",
                               "codon-bias"),
                      help="which sections to output [%default]")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option("--split-fasta-identifier",
                      dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")

    # the trailing space keeps "table" and "[%default]" apart
    parser.add_option(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table "
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                # make sure the weights file is closed again, even if
                # parsing fails
                infile = IOTools.openFile(filename, "r")
                try:
                    reference_codons.append(
                        IOTools.ReadMap(infile,
                                        has_header=True,
                                        map_functions=(str, float)))
                finally:
                    infile.close()

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x, a in enumerate(reference_codons):
            for y, b in enumerate(reference_codons):
                if x == y:
                    continue
                # calculate KL distance
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        """Return a new counter object for *section*.

        Which sections are available depends on ``options.seqtype``.
        """

        if options.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections requires sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        else:
            # report the actual problem instead of failing on an
            # unassigned local variable
            raise ValueError("unknown sequence type %s" % options.seqtype)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    # NOTE(review): the result of this call is not used afterwards;
    # presumably it serves as an initial check of the counter -- confirm
    # before removing.
    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        identifier = rx.search(cur_record.title).groups()[0]

        if options.split_id is True:
            options.stdout.write("%s" % identifier.split()[0])
        else:
            options.stdout.write("%s" % identifier)
        options.stdout.flush()

        for section in options.sections:
            # a fresh counter per sequence; its values are also
            # accumulated in the totals
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write("\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.Stop()

Example #7

Show file

def main(argv=None):
    """Apply a chain of transformation methods to FASTA sequences.

    Records are read from ``options.stdin``. For every record that passes
    the include/exclude/sample/id filters, the methods given with
    ``--method`` are applied in the order supplied and the resulting
    sequence is written to ``options.stdout``.

    Methods that need an extra argument (``apply-map``, ``map-codons``,
    ``mask-soft``, ``mask-codons``/``back-translate``, ``build-map``) take
    it from the comma-separated ``--parameters`` stack; each consumer pops
    the first remaining element.

    Raises:
        ValueError: on malformed input (sequence length not divisible by
            three, mismatching titles or lengths between paired streams,
            duplicate identifiers for ``build-map``, missing sample
            proportion).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "remove-stops", "upper", "lower", "reverse-complement",
                 "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x",
                      "--ignore-errors",
                      dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion",
                      dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern",
                      dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern",
                      dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      type="string",
                      action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="type",
        type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_seq2nid = IOTools.ReadMap(inf)
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_codon2code = IOTools.ReadMap(inf)
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        # stream stays open: it is consumed in lock-step with the input
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            # a set gives constant-time membership tests per record
            with IOTools.openFile(f.split("=")[1]) as inf:
                filter_id_list = set(line[:-1] for line in inf)

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        # NOTE: l is the length of the *input* sequence; it is not
        # refreshed between methods.
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                # length without gap characters; re.sub takes
                # (pattern, replacement, string)
                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" % (
                        cur_record.title, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                # str.translate/str.maketrans replace the Python 2 only
                # string.translate/string.maketrans functions
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    # codon collects residues only, c collects residues
                    # and the gaps interleaved with them
                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for y in c:
                                if y in options.gap_chars:
                                    new_sequence.append(y)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                # trailing incomplete codon is passed through unchanged
                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    # no more hard-masked records: stop applying methods
                    # to this record
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)

                # Check lengths of unmasked and hard masked sequences
                # are the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (cur_record.title))

                # If the hard masked sequence differs (contains N), replace
                # masked positions with the lowercase residue of the
                # unmasked version. Identical sequences are left untouched
                # (previously they were replaced by an empty string).
                if sequence != hm_sequence:
                    new_sequence = []
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence.append(x.lower())
                        else:
                            new_sequence.append(x.upper())
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    # codons missing from the map are written as X
                    seq.append(map_codon2code.get(codon, "X"))

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                sequence = "".join(
                    [s for s in sequence if s not in options.gap_chars])

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                seq_id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                if seq_id in map_seq2nid:
                    rest = cur_record.title[len(seq_id):]
                    cur_record.title = map_seq2nid[seq_id] + rest

            elif method == "build-map":
                # build a map of identifiers
                seq_id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if seq_id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" %
                        seq_id)
                map_seq2nid[seq_id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for start in range(0, l, 3):
                    nm = len([c for c in seq[start:start + 3]
                              if c in mask_chars])
                    # partially masked codon: mask it completely
                    if 0 < nm < 3:
                        seq[start:start + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                # next() raises StopIteration on exhaustion; the default
                # turns that into the explicit error below.
                other_record = next(other_iterator, None)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        % (cur_record.title, len(other_sequence) * 3,
                           len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        # preserve the case of the amino acid mask character
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            # without this the too-short record was counted as skipped
            # but still written
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        # remaining parameter (if any) names the map output file; an
        # empty or exhausted parameter stack means stdout
        p = options.parameters[0] if options.parameters else ""
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in map_seq2nid.items():
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()

Example #8

Show file

File: tree2svg.py Project: yangjl/cgat

def main():
    """Render a Newick tree as an SVG plot written to ``sys.stdout``.

    The tree is taken from ``--filename-tree``, the ``--tree`` option or
    stdin (in that order of precedence). The first tree is the master
    tree that is plotted; each entry given with ``--annotations``
    consumes one further tree from the input, in order.

    Raises:
        ValueError: if more than two branch length annotations were
            collected (no layout is implemented for that case).
    """

    parser = E.OptionParser(
        version=
        "%prog version: $Id: plot_tree.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-i",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-s",
                      "--filename-tree",
                      dest="filename_tree",
                      type="string",
                      help="filename with tree.")
    parser.add_option("-t", "--tree", dest="tree", type="string", help="tree.")
    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extract species from identifier.")
    parser.add_option("--colour-by-species",
                      dest="colour_by_species",
                      action="store_true",
                      help="colour by species.")
    parser.add_option("--support-style",
                      dest="support_style",
                      type="choice",
                      choices=("pie", "number"),
                      help="style for support information.")
    parser.add_option("--error-style",
                      dest="error_style",
                      type="choice",
                      choices=("pie", "number"),
                      help="style for error information.")
    parser.add_option("--branch-scale",
                      dest="branch_scale",
                      type="float",
                      help="branch length scale factor.")
    parser.add_option("--height-scale",
                      dest="height_scale",
                      type="float",
                      help="height scale factor.")
    parser.add_option("-a",
                      "--annotations",
                      dest="annotations",
                      type="choice",
                      action="append",
                      choices=("support", "error", "kaks", "master", "value",
                               "tables"),
                      help="annotations given by further trees.")
    parser.add_option(
        "--filename-tables",
        dest="filename_tables",
        type="string",
        help="add tables from file (need also set options -a tables) [%default]"
    )
    parser.add_option("--show-branchlengths",
                      dest="show_branchlengths",
                      action="store_true",
                      help="show branch lengths.")
    parser.add_option("--leaf-symbol",
                      dest="plot_leaf_symbol",
                      type="choice",
                      choices=("square", "circle"),
                      help="Symbol for leaves.")
    parser.add_option("--font-size-branches",
                      dest="font_size_branches",
                      type="int",
                      help="set font size for branches.")
    parser.add_option("--font-size-tips",
                      dest="font_size_tips",
                      type="int",
                      help="set font size for tips.")
    parser.add_option("--font-style-tips",
                      dest="font_style_tips",
                      type="choice",
                      choices=(
                          "normal",
                          "italic",
                      ),
                      help="set font style for tips.")
    parser.add_option("--filename-map",
                      dest="filename_map",
                      type="string",
                      help="filename with a name translation table.")
    parser.add_option("--filename-map-species2colour",
                      dest="filename_colour_map",
                      type="string",
                      help="filename with a map of species to colour.")
    parser.add_option("--no-leaf-labels",
                      dest="plot_leaf_labels",
                      action="store_false",
                      help="do not show labels at leafs.")
    parser.add_option("--no-ruler",
                      dest="plot_ruler",
                      action="store_false",
                      help="do not plot ruler.")

    parser.set_defaults(
        titles="",
        title="",
        footer="",
        filename_tree=None,
        species_regex=r"^([^|]+)\|",
        colour_by_species=None,
        tree=None,
        branch_scale=0,
        height_scale=0,
        support_style=None,
        error_style="number",
        kaks_style="number",
        annotations=None,
        show_branchlengths=False,
        branch_length_format="%5.2f",
        font_size_tips=None,
        font_size_branches=None,
        font_style_tips=None,
        filename_map=None,
        filename_colour_map=None,
        plot_leaf_labels=True,
        plot_leaf_symbol=None,
        plot_ruler=True,
        filename_tables=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        with open(options.filename_tree, "r") as inf:
            tree_lines = inf.readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        tree_lines = sys.stdin.readlines()

    nexus = TreeTools.Newick2Nexus(tree_lines)
    master_tree = nexus.trees[0]

    if options.filename_map:
        with open(options.filename_map, "r") as inf:
            map_names = IOTools.ReadMap(inf)

        # rename taxa that have an entry in the translation table
        for node_id, node in master_tree.chain.items():
            if node.data.taxon in map_names:
                node.data.taxon = map_names[node.data.taxon]

    if options.loglevel >= 2:
        master_tree.display()

    plot = SVGTree.SVGTree(master_tree)

    if options.branch_scale:
        plot.setBranchScale(options.branch_scale)

    # default is 0, so this is applied unless the value is explicitly None
    if options.height_scale is not None:
        plot.setHeightScale(options.height_scale)

    if options.font_size_tips is not None:
        plot.setFontSize(options.font_size_tips)

    if not options.plot_ruler:
        plot.setRulerElements([])

    if options.show_branchlengths:
        b = SVGTree.BranchDecoratorHorizontalBranchLength(master_tree)
        if options.font_size_branches:
            b.setFontSize(options.font_size_branches)
        plot.setDecoratorHorizontalBranches(b)

    if options.colour_by_species:
        if options.filename_colour_map:
            with open(options.filename_colour_map, "r") as inf:
                map_species2colour = IOTools.ReadMap(inf)
        else:
            map_species2colour = None

        rx = re.compile(options.species_regex)

        def extract_species(x):
            # first capture group of the species regex
            return rx.search(x).groups()[0]

        plot.setDecoratorExternalNodes(
            SVGTree.NodeDecoratorBySpecies(
                master_tree,
                plot_symbol=options.plot_leaf_symbol,
                plot_label=options.plot_leaf_labels,
                map_species2colour=map_species2colour,
                extract_species=extract_species))

    if options.font_style_tips:
        plot.getDecoratorExternalNodes().setFontStyle(options.font_style_tips)

    plot.getDecoratorExternalNodes().setPlotLabel(options.plot_leaf_labels)

    # index of the next tree in the input to use for an annotation
    current_tree = 1

    # add annotations by further trees given on the command line
    branch_length_annotations = []

    current_reference_tree = master_tree

    if options.annotations:
        for annotation in options.annotations:

            tree = nexus.trees[current_tree]

            if annotation == "support":

                tree.branchlength2support()
                for node_id, node in tree.chain.items():
                    node.data.branchlength = 1.0

                if options.support_style == "pie":
                    plot.setDecoratorInternalNodes(
                        NodeDecoratorSupportPieChart(
                            nexus.trees[current_tree]))

            elif annotation == "error":

                if options.error_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthError(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "kaks":

                if options.kaks_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthWithKaks(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "value":

                b = SVGTree.BranchDecoratorHorizontalBranchLength(tree)
                if options.font_size_branches:
                    b.setFontSize(options.font_size_branches)
                branch_length_annotations.append(b)

            elif annotation == "master":
                current_reference_tree = tree

            elif annotation == "tables":
                b = BranchDecoratorTable(tree,
                                         filename=options.filename_tables)
                plot.setDecoratorHorizontalBranches(b)

            current_tree += 1

        if len(branch_length_annotations) > 2:
            raise ValueError(
                "obtained more than two branch length annotations. "
                "Layout not implemented")

        # Only install a decorator when one was actually collected;
        # previously an unbound or stale ``b`` was used when the list
        # was empty.
        if len(branch_length_annotations) == 1:
            plot.setDecoratorHorizontalBranches(branch_length_annotations[0])
        elif len(branch_length_annotations) == 2:
            b1, b2 = branch_length_annotations
            b1.setFontColour(SVGTree.BLUE)
            b2.setFontColour(SVGTree.RED)
            plot.setDecoratorHorizontalBranches(
                SVGTree.BranchDecoratorHorizontalAboveBelow(
                    master_tree, b1, b2))

    plot.initializePlot()

    plot.writeToFile(sys.stdout)

    E.Stop()

Example #9

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Compares two clusterings given as two map files (positional
    arguments). Clusters of both files are nodes of a graph; two clusters
    are linked if they share a member. For the connected components of
    that graph the script writes the components, counts of component
    shapes (n1, n2), and set statistics for the 1:1 components.

    Raises:
        ValueError: if not exactly two filenames are supplied.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: compare_clusters.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-o",
                      "--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for filenames.")

    parser.set_defaults(
        output_pattern=None,
        format="%5.2f",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) != 2:
        raise ValueError("please supply to filenames with the clusters.")

    with open(args[0]) as inf:
        map_id2cluster1, map_cluster2ids1 = IOTools.ReadMap(
            inf, both_directions=True)
    with open(args[1]) as inf:
        map_id2cluster2, map_cluster2ids2 = IOTools.ReadMap(
            inf, both_directions=True)

    def percent(count, total):
        """format count/total as percentage, "na" if total is zero."""
        if total == 0:
            return "na"
        return options.format % (100.0 * float(count) / total)

    graph = networkx.Graph()

    # nodes are (clustering, cluster_id) tuples so that identical cluster
    # names in the two files stay distinct
    for a in map_cluster2ids1.keys():
        graph.add_node((1, a))
    for b in map_cluster2ids2.keys():
        graph.add_node((2, b))

    # build graph between clusters
    for cluster1, ids1 in map_cluster2ids1.items():
        for id1 in ids1:
            if id1 in map_id2cluster2:
                graph.add_edge((1, cluster1), (2, map_id2cluster2[id1]))

    # Materialise as a list of lists: recent networkx versions return a
    # generator of sets, which can neither be iterated twice nor indexed
    # as done below.
    components = [list(c) for c in networkx.connected_components(graph)]

    #######################################################
    #######################################################
    #######################################################
    # write components and compute counts
    #######################################################
    outfile = getFile("components", options)
    outfile.write("id\ttotal\tn1\tn2\tmembers1\tmembers2\n")
    n = 0
    counts = {}
    subsets = []
    for component in components:

        m1, m2 = [], []

        for x in component:
            if x[0] == 1:
                m1.append(x[1])
            else:
                m2.append(x[1])

        t = len(component)
        n1 = len(m1)
        n2 = len(m2)
        cc = (n1, n2)
        if cc not in counts:
            counts[cc] = 0
        counts[cc] += 1

        # NOTE(review): subsets stores the 0-based index into components,
        # while the id written below is 1-based - confirm this is intended.
        if cc == (1, 1):
            subsets.append(n)

        n += 1
        outfile.write("%i\t%i\t%i\t%i\t%s\t%s\n" %
                      (n, t, n1, n2, ",".join(m1), ",".join(m2)))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    #######################################################
    #######################################################
    # write counts
    #######################################################
    outfile = getFile("counts", options)
    outfile.write("n1\tn2\tcounts\tpcounts1\tpcounts2\n")
    for cc, c in counts.items():
        outfile.write(
            "%i\t%i\t%i\t%s\t%s\n" %
            (cc[0], cc[1], c,
             percent(c, len(map_cluster2ids1)),
             percent(c, len(map_cluster2ids2))))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    #######################################################
    #######################################################
    # analyze subsets - how many of the 1:1 clusters
    # contain the exact members?
    #######################################################
    outfile = getFile("subsets", options)
    outfile.write("id\tn1\tn2\tunion\tinter\tunique1\tunique2\n")

    ntrue = 0
    nrest1 = 0
    nrest2 = 0
    nother = 0

    for component_id in subsets:
        component = components[component_id]
        # a 1:1 component has exactly two nodes, in arbitrary order
        if component[0][0] == 1:
            id1, id2 = component[0][1], component[1][1]
        else:
            id1, id2 = component[1][1], component[0][1]

        members1 = set(map_cluster2ids1[id1])
        members2 = set(map_cluster2ids2[id2])

        union = len(members1.union(members2))
        intersection = len(members1.intersection(members2))
        rest1 = len(members1.difference(members2))
        rest2 = len(members2.difference(members1))

        if rest1 == 0 and rest2 == 0:
            ntrue += 1
        elif rest1 == 0:
            nrest1 += 1
        elif rest2 == 0:
            nrest2 += 1
        else:
            nother += 1

        outfile.write("%i\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                      (component_id, len(members1), len(members2), union,
                       intersection, rest1, rest2))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    # write subset statistics; percent() guards against ntotal == 0
    ntotal = len(subsets)
    options.stdout.write("# subset statistics of 1:1 corresponding clusters\n")
    options.stdout.write("class\tcounts\ttotal\n")
    options.stdout.write("%s\t%i\t%s\n" %
                         ("total", ntotal, options.format % 100))
    for label, count in (("true", ntrue),
                         ("unique1", nrest1),
                         ("unique2", nrest2),
                         ("other", nother)):
        options.stdout.write("%s\t%i\t%s\n" %
                             (label, count, percent(count, ntotal)))

    E.Stop()

Example #10

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every method listed in ``--methods`` the genomes given by
    ``--genomes`` are queried through ``GetQueryInfo``:

    ``stats``
        writes one row per genome with counts and percentages of
        found/missed genes and transcripts (plain and ``nr_`` columns).
        Genomes without genes or transcripts are skipped.

    ``missed``
        writes one row per genome with the number of missed genes and
        transcripts and afterwards hands the collected missed
        identifiers to ``writeListMissed`` and ``writeStatsMissed``.

    Raises
    ------
    ValueError
        if ``--sort`` and ``--priority`` list a different number of
        quality classes.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/analyze_queries.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-g",
                      "--genomes",
                      dest="genomes",
                      type="string",
                      help="genomes to analyse.")
    parser.add_option("-i",
                      "--priority",
                      dest="priority",
                      type="string",
                      help="quality priority.")
    parser.add_option("-s",
                      "--sort",
                      dest="sort",
                      type="string",
                      help="sort order.")
    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="filename with template peptide sequences.")
    parser.add_option("-m",
                      "--methods",
                      dest="methods",
                      type="string",
                      help="methods to apply [missed].")
    parser.add_option(
        "-f",
        "--filter",
        dest="filename_filter",
        type="string",
        help=
        "filename with schema|prediction_id|gene to use as filter. The prediction_ids are used for filtering."
    )
    parser.add_option("-q",
                      "--filter-quality",
                      dest="filter_quality",
                      type="string",
                      help="only consider predictions of given qualities.")
    parser.add_option("--pattern-output",
                      dest="pattern_output",
                      type="string",
                      help="output pattern for multiple file output.")
    parser.add_option("--pattern-stats",
                      dest="pattern_stats",
                      type="string",
                      help="output pattern for multiple statistics output.")
    parser.add_option("--outfile-clusters",
                      dest="outfile_clusters",
                      type="string",
                      help="output filename for clusters.")
    parser.add_option("--infile-clusters",
                      dest="infile_clusters",
                      type="string",
                      help="input filename for clusters.")
    parser.add_option("-n",
                      "--non-redundant",
                      dest="non_redundant",
                      action="store_true",
                      help="use non-redundant set for output.")
    parser.add_option("--clustering-method",
                      dest="clustering_method",
                      type="choice",
                      choices=("fragment", "hid"),
                      help="clustering method to use.")

    parser.set_defaults(
        genomes="",
        priority="CG,PG,SG,RG,CP,PP,SP,RP,CF,PF,SF,UG,UP,UF,BF,UK",
        sort="CG,PG,SG,RG,CP,PP,SP,RP,CF,PF,SF,UG,UP,UF,BF,UK",
        methods="missed",
        peptides=None,
        filename_filter=None,
        separator="|",
        filter_quality=None,
        pattern_output="%s",
        pattern_stats=None,
        clustering_method="fragment",
        outfile_clusters=None,
        infile_clusters=None,
        non_redundant=False,
        format_percent="%5.2f",
    )

    # forward argv so that a caller-supplied argument list is honoured,
    # as promised by the docstring
    (options, args) = E.Start(parser, argv=argv, add_psql_options=True)

    def percent(part, total):
        """return *part* as percentage of *total*, 0.0 if *total* is 0."""
        if total == 0:
            return 0.0
        return 100.0 * float(part) / total

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        try:
            peptides = Genomics.ReadPeptideSequences(infile)
        finally:
            infile.close()
    else:
        peptides = {}

    # comma-separated options become lists
    if options.genomes:
        options.genomes = options.genomes.split(",")
    if options.priority:
        options.priority = options.priority.split(",")
    if options.methods:
        options.methods = options.methods.split(",")
    if options.sort:
        options.sort = options.sort.split(",")
    if options.filter_quality:
        options.filter_quality = options.filter_quality.split(",")

    # subset maps schema -> {prediction_id: 1}
    subset = {}
    if options.filename_filter:
        infile = open(options.filename_filter, "r")
        try:
            lines = infile.readlines()
        finally:
            infile.close()

        for line in lines:
            if line[0] == "#":
                continue
            # the third field (gene) is read but not used
            s, p, g = line[:-1].split(options.separator)[:3]
            if s not in subset:
                subset[s] = {}
            subset[s][p] = 1

    if len(options.sort) != len(options.priority):
        # string exceptions are not valid; raise a proper exception
        raise ValueError(
            "different number of classes in sort order and priority order")

    dbhandle = pgdb.connect(options.psql_connection)

    # Cluster peptides
    if options.infile_clusters:
        infile = open(options.infile_clusters, "r")
        try:
            map_peptide2cluster, map_cluster2peptide = IOTools.ReadMap(
                infile, both_directions=True)
        finally:
            infile.close()
    elif peptides:
        if options.clustering_method == "fragment":
            map_cluster2peptide, map_peptide2cluster = ClusterPeptidesByFragment(
                peptides)
        elif options.clustering_method == "hid":
            map_cluster2peptide, map_peptide2cluster = ClusterPeptidesByHid(
                peptides)
    else:
        map_cluster2peptide = {}
        map_peptide2cluster = {}

    if map_cluster2peptide and options.loglevel >= 1:
        options.stdlog.write(
            "# clustering of peptides: %i cluster for %i peptides\n" %
            (len(map_cluster2peptide), len(map_peptide2cluster)))
        # flush the stream that was actually written to
        options.stdlog.flush()

    if options.outfile_clusters and not options.infile_clusters:
        options.stdlog.write("# writing clusters to %s\n" %
                             options.outfile_clusters)
        outfile = open(options.outfile_clusters, "w")
        try:
            for k, v in map_peptide2cluster.items():
                outfile.write("%s\t%s\n" % (k, v))
        finally:
            outfile.close()

    for method in options.methods:

        if method == "stats":
            # Count number of missed unique genes/transcripts

            headers = (
                "species",
                "genes",
                "found_genes",
                "missed_genes",
                "pfound_genes",
                "pmissed_genes",
                "nr_found_genes",
                "nr_missed_genes",
                "pnr_found_genes",
                "pnr_missed_genes",
                "transcripts",
                "found_transcripts",
                "missed_transcripts",
                "pfound_transcripts",
                "pmissed_transcripts",
                "nr_found_transcripts",
                "nr_missed_transcripts",
                "pnr_found_transcripts",
                "pnr_missed_transcripts",
            )

            options.stdout.write("\t".join(headers) + "\n")

            for genome in options.genomes:

                r = GetQueryInfo(dbhandle, genome, options, subset)

                # column 2 of the query result feeds the plain counts,
                # column 3 the nr_ counts
                found_genes, genes, found_transcripts, transcripts =\
                    CountFoundGenes([(x[0], x[1], x[2]) for x in r],
                                    map_peptide2cluster)

                nrfound_genes, nrgenes, nrfound_transcripts, nrtranscripts =\
                    CountFoundGenes([(x[0], x[1], x[3]) for x in r],
                                    map_peptide2cluster)

                nfound_genes = len(found_genes)
                nfound_transcripts = len(found_transcripts)
                nnrfound_genes = len(nrfound_genes)
                nnrfound_transcripts = len(nrfound_transcripts)

                ngenes = len(genes)
                ntranscripts = len(transcripts)

                if ngenes == 0 or ntranscripts == 0:
                    continue

                f1 = lambda x: 100 * float(x) / ngenes
                f2 = lambda x: 100 * float(x) / ntranscripts
                options.stdout.write(
                    "%s\t%i\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f\n"
                    %
                    (genome, ngenes, nfound_genes, ngenes - nfound_genes,
                     f1(nfound_genes), f1(ngenes - nfound_genes),
                     nnrfound_genes, ngenes - nnrfound_genes,
                     f1(nnrfound_genes), f1(ngenes - nnrfound_genes),
                     ntranscripts, nfound_transcripts,
                     ntranscripts - nfound_transcripts, f2(nfound_transcripts),
                     f2(ntranscripts - nfound_transcripts),
                     nnrfound_transcripts, ntranscripts - nnrfound_transcripts,
                     f2(nnrfound_transcripts),
                     f2(ntranscripts - nnrfound_transcripts)))

        elif method == "missed":

            headers = (
                "species",
                "genes",
                "transcripts",
                "missed_genes",
                "missed_transcripts",
                "percent_missed_genes",
                "percent_missed_transcripts",
            )

            options.stdout.write("\t".join(headers) + "\n")

            # identifier -> list of genomes in which it was missed
            all_missed_genes = {}
            all_missed_transcripts = {}

            for genome in options.genomes:

                r = GetQueryInfo(dbhandle, genome, options, subset)

                if options.non_redundant:
                    found_genes, genes, found_transcripts, transcripts =\
                        CountFoundGenes([(x[0], x[1], x[3]) for x in r],
                                        map_peptide2cluster)
                else:
                    found_genes, genes, found_transcripts, transcripts =\
                        CountFoundGenes([(x[0], x[1], x[2]) for x in r],
                                        map_peptide2cluster)

                sg = set(genes)
                missed_genes = sg.difference(found_genes)

                for x in missed_genes:
                    if x not in all_missed_genes:
                        all_missed_genes[x] = []
                    all_missed_genes[x].append(genome)

                sm = set(transcripts)
                missed_transcripts = sm.difference(found_transcripts)

                for x in missed_transcripts:
                    if x not in all_missed_transcripts:
                        all_missed_transcripts[x] = []
                    all_missed_transcripts[x].append(genome)

                # percent() guards against genomes without any genes or
                # transcripts, which previously raised ZeroDivisionError
                options.stdout.write(
                    "%s\t%i\t%i\t%i\t%i\t%s\t%s\n" %
                    (genome, len(genes), len(transcripts), len(missed_genes),
                     len(missed_transcripts),
                     options.format_percent %
                     percent(len(missed_genes), len(genes)),
                     options.format_percent %
                     percent(len(missed_transcripts), len(transcripts))))

            for section in ("genes", "transcripts"):

                if section == "genes":
                    missed = all_missed_genes
                else:
                    missed = all_missed_transcripts

                outfile = open(options.pattern_output % section, "w")
                try:
                    writeListMissed(outfile, missed, options.genomes, options)
                finally:
                    outfile.close()

                if options.pattern_stats:
                    outfile = open(options.pattern_stats % section, "w")
                else:
                    outfile = options.stdout
                    outfile.write("# statistics for %s\n" % section)

                writeStatsMissed(outfile, missed, options.genomes, options)

                if outfile != options.stdout:
                    outfile.close()

    E.Stop()

Example #11

Show file

File: SVGDuplicationsWheel.py Project: lesheng/cgat

        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={'CG': "circle", 'PG': "circle", 'SG': "circle"},
        quality2mask=(
            "RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK"),
        sort_by_size = True,
        input_format = "pairwise",
    )

    (options, args) = Experiment.Start(parser, add_pipe_options=True)

    if options.filename_contig_sizes:
        map_contig2size = IOTools.ReadMap(open(options.filename_contig_sizes, "r"),
                                          map_functions=(str, int))

    # read data and get contigs that are used (i.e.: remove empty contigs)
    chrs = {}
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        for line in lines:
            if line[0] == "#":
                continue

            d = line[:-1].split("\t")

            cluster_id, in_locations, in_tree = d[:3]

            for l in in_locations.split(";"):

Example #12

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads a multiple alignment from stdin and writes one
    ``Prediction`` line per sequence to ``options.stdout``. For each
    sequence the exon columns reported by ``getAlignedColumns`` are
    grouped into codons, converted into alignment segments with
    ``processCodon`` and adjacent segments of the same type are merged.

    Raises
    ------
    ValueError
        if a sequence does not start with a lower-case character.
    NotImplementedError
        if two consecutive codons are separated by a gap of 1 to 10
        residues, a case the script does not handle.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2predictions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-l",
                      "--filename-locations",
                      dest="filename_locations",
                      type="string",
                      help="filename with locations")

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="the master determines the frame.")

    parser.set_defaults(filename_locations=None, gap_chars="-.", master=None)

    # forward argv so that a caller-supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if len(args) > 0:
        # same output as the former print statement, but valid syntax
        # on both Python 2 and Python 3
        sys.stdout.write("%s no arguments required.\n" % (USAGE,))
        sys.exit(2)

    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    identifiers = mali.getIdentifiers()

    aligned_columns, aligned_exons = getAlignedColumns(mali, options)

    map_id2location = {}

    if options.filename_locations:
        infile = open(options.filename_locations, "r")
        try:
            map_id2location = IOTools.ReadMap(infile)
        finally:
            infile.close()

    options.stdout.write(Prediction.Prediction().getHeader() + "\n")

    nid = 1

    for identifier in identifiers:

        if options.loglevel >= 2:
            options.stdlog.write("# processing %s\n" % (identifier))

        entry = mali.getEntry(identifier)

        sequence = entry.mString
        if sequence[0] not in string.ascii_lowercase:
            raise ValueError("all sequences should start with an exon.")

        alignment = []

        last_codon = []
        codon = []
        # n counts the non-gap residues seen so far in this sequence
        n = 0

        last_master_residue = 0
        master_residue = 0
        for column, c in enumerate(sequence):

            if c in options.gap_chars:
                continue

            is_exon = column in aligned_exons

            if is_exon:
                master_residue = aligned_exons[column]
                codon.append((n, master_residue))

            n += 1

            # check if we have a complete codon
            if is_exon:
                # A codon is complete, if it ends at frame 2 or
                # it spans more than one codons in the master.
                # Gaps in the master that are a multiple of 3 are ignored
                d = master_residue - last_master_residue - 1

                if master_residue % 3 == 2 or (d % 3 != 0 and d > 0):

                    if last_codon:
                        # distance in sequence residues between this
                        # codon and the previous one
                        d = codon[0][0] - last_codon[-1][0] - 1
                        if d > 0:
                            # add in-frame introns
                            if d > 10:
                                alignment.append(["5", 0, 2])
                                alignment.append(["I", 0, d - 4])
                                alignment.append(["3", 0, 2])
                            else:
                                raise NotImplementedError("untreated case")

                    alignment += processCodon(codon)
                    last_codon = codon
                    codon = []

            last_master_residue = master_residue

        # merge consecutive segments of the same type
        last = alignment[0]
        new_alignment = []
        for this in alignment[1:]:
            if this[0] == last[0]:
                last[1] += this[1]
                last[2] += this[2]
                continue

            new_alignment.append(last)
            last = this

        new_alignment.append(last)

        if options.loglevel >= 4:
            options.stdlog.write("# output=%s\n" % (str(new_alignment)))

        assert (new_alignment[-1][2] % 3 == 0)

        lalignment = sum(x[2] for x in new_alignment)

        prediction = Prediction.Prediction()

        prediction.mQueryToken = identifier

        genomic_sequence = re.sub("[%s]" % options.gap_chars, "",
                                  mali[identifier])

        prediction.mPredictionId = nid
        nid += 1

        if identifier in map_id2location:

            prediction.mSbjctToken, prediction.mSbjctStrand, sfrom, sto = map_id2location[
                identifier].split(":")[:4]

            prediction.mSbjctGenomeFrom = int(sfrom) + entry.mFrom
            # NOTE(review): this value is overwritten further down by
            # mSbjctGenomeFrom + lalignment - confirm which is intended
            prediction.mSbjctGenomeTo = int(sto)

        else:
            prediction.mSbjctToken = "unk"
            prediction.mSbjctStrand = "+"
            prediction.mSbjctGenomeFrom = 0

        prediction.mQueryCoverage = 100
        prediction.mPercentIdentity = 100
        prediction.mPercentSimilarity = 100

        prediction.mQueryLength = prediction.mQueryTo

        prediction.mSbjctGenomeTo = prediction.mSbjctGenomeFrom + lalignment

        prediction.mMapPeptide2Genome = new_alignment
        prediction.mAlignmentString = " ".join(
            [" ".join(map(str, x)) for x in prediction.mMapPeptide2Genome])

        prediction.mMapPeptide2Translation, prediction.mTranslation = Genomics.Alignment2PeptideAlignment(
            prediction.mMapPeptide2Genome, 0, 0, genomic_sequence)

        (prediction.mNIntrons, prediction.mNFrameShifts, prediction.mNGaps, prediction.mNSplits, prediction.mNStopCodons, disruptions) = \
            Genomics.CountGeneFeatures(0,
                                       prediction.mMapPeptide2Genome,
                                       genomic_sequence)

        options.stdout.write(str(prediction) + "\n")

    E.Stop()

Example #13

Show file

File: rnaseq_junction_bam2bam.py Project: yangjl/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads BAM records from stdin whose reference names have the form
    ``contig|first_exon_start|first_exon_end-last_exon_start|last_exon_end|splice|strand``
    and writes them to stdout with position, reference id and cigar
    rewritten as match / skip (cigar op 3) / match on ``contig``.

    Reads are grouped by query name. A group is dropped if its first
    read is unmapped, if more than one read shares the lowest value of
    the mismatch tag, if the remaining read has more than one cigar
    operation, or if the read does not extend into both exons.

    NOTE: ``--ignore-mismatches``, ``--remove-contigs``, ``--force``
    and ``--unique`` are parsed but not consulted in this function.

    Raises
    ------
    ValueError
        if neither ``--template-bam`` nor ``--contig-sizes`` is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--template-bam",
                      dest="filename_genome_bam",
                      type="string",
                      help="input bam file for header information [%default]")

    parser.add_option("-s",
                      "--contig-sizes",
                      dest="filename_contigs",
                      type="string",
                      help="filename with contig sizes [%default]")

    parser.add_option(
        "-o",
        "--colour",
        dest="colour_mismatches",
        action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option("-i",
                      "--ignore-mismatches",
                      dest="ignore_mismatches",
                      action="store_true",
                      help="ignore mismatches [%default]")

    parser.add_option(
        "-c",
        "--remove-contigs",
        dest="remove_contigs",
        type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option("-f",
                      "--force",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files [%default]")

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    if options.filename_genome_bam:
        genomefile = pysam.Samfile(options.filename_genome_bam, "rb")
    elif options.filename_contigs:
        contigfile = IOTools.openFile(options.filename_contigs)
        try:
            contigs = IOTools.ReadMap(contigfile)
        finally:
            contigfile.close()
        # build both lists from one snapshot so that names and lengths
        # stay aligned; also works for an empty table, where indexing
        # the zipped result used to raise IndexError
        names_and_sizes = list(contigs.items())
        referencenames = [name for name, size in names_and_sizes]
        referencelengths = [int(size) for name, size in names_and_sizes]
    else:
        raise ValueError(
            "please provide either --template-bam or --contig-sizes")

    infile = pysam.Samfile("-", "rb")
    outfile = pysam.Samfile("-",
                            "wb",
                            template=genomefile,
                            referencenames=referencenames,
                            referencelengths=referencelengths)

    # tag holding the number of mismatches of an alignment
    if options.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)])

    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split(
            "|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = map(
            int,
            (first_exon_start, first_exon_end, last_exon_start, last_exon_end))
        first_exon_end += 1

        first_exon_length = first_exon_end - first_exon_start

        # number of bases aligned within the first / last exon
        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]

        outfile.write(read)

        noutput += 1

    outfile.close()
    infile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    ## write footer and output benchmark information.
    E.Stop()

Example #14

Show file

File: WrapperNJTree.py Project: santayana/cgat

    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump output.")

    parser.set_defaults(
        separator="|",
        dump=False,
        filename_map=None,
        filename_alignment="-",
        filename_tree=None,
    )

    (options, args) = E.Start(parser)

    if options.filename_map:
        map_species2sp = IOTools.ReadMap(open(options.filename_map, "r"))

    E.debug("species map: %s" % str(map_species2sp))

    identifier_parser = IdentifierParserGPipe(map_species2sp=map_species2sp)

    njtree = NJTree(identifier_parser=identifier_parser)

    njtree.SetLog(options.stdlog)
    njtree.SetErr(options.stderr)

    if options.filename_tree:
        njtree.SetSpeciesTree(options.filename_tree)

    mali = Mali.Mali()
    if options.filename_alignment == "-":

Example #15

Show file

def main(argv=None):
    """script main.

    Reads fasta records from ``options.stdin`` and writes a
    tab-separated table to ``options.stdout``: one row per non-empty
    sequence with the fields of every counter selected via
    ``--sections``, followed by a final ``total`` row accumulating all
    sequences.

    If more than one codon usage table is given via
    ``--filename-weights``, a pairwise difference between the tables
    is reported through ``E.info``.

    Raises
    ------
    ValueError
        if a section is not available for the chosen sequence type or
        if ``--regex-identifier`` does not match a fasta title.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--filename-weights",
        dest="filename_weights",
        type="string",
        help=
        "filename with codon frequencies. Multiple filenames can be separated by comma [default=%default]."
    )

    parser.add_option("-s",
                      "--sections",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "hid", "na", "aa", "degeneracy",
                               "bias", "codons", "codon-usage",
                               "codon-translator"),
                      help="which sections to output [default=%default]")

    parser.add_option(
        "-t",
        "--type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help=
        "type of sequence: na=nucleotides, aa=amino acids [default=%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help=
        "regular expression to extract identifier from fasta description line [default=%default]."
    )

    parser.set_defaults(
        filename_weights="uniform",
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
    )

    (options, args) = E.Start(parser, argv=argv)
    options.filename_weights = options.filename_weights.split(",")

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                infile = open(filename, "r")
                try:
                    reference_codons.append(
                        IOTools.ReadMap(infile,
                                        has_header=True,
                                        map_functions=(str, float)))
                finally:
                    infile.close()

        ## print codon table differences
        E.info("difference between supplied codon usage preferences.")
        for x, a in enumerate(reference_codons):
            for y, b in enumerate(reference_codons):
                if x == y:
                    continue
                # calculate KL distance
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)
                E.info("tablediff\t%s\t%s\t%f" %
                       (options.filename_weights[x],
                        options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):
        """return a new counter object for *section*.

        The set of valid sections depends on ``options.seqtype``.
        """

        if options.seqtype == "na":
            if section == "length":
                return SequencePropertiesLength()
            elif section == "hid":
                return SequencePropertiesHid()
            elif section == "na":
                return SequencePropertiesNA()
            elif section == "aa":
                return SequencePropertiesAA()
            elif section == "degeneracy":
                return SequencePropertiesDegeneracy()
            elif section == "bias":
                return SequencePropertiesBias(reference_codons)
            elif section == "codons":
                return SequencePropertiesCodons()
            elif section == "codon-usage":
                return SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                return SequencePropertiesCodonTranslator()
            raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                return SequencePropertiesLength()
            elif section == "hid":
                return SequencePropertiesHid()
            elif section == "aa":
                return SequencePropertiesAminoAcids()
            raise ValueError("unknown section %s" % section)

        # previously this fell through to an unbound local variable
        raise ValueError("unknown sequence type %s" % options.seqtype)

    ## setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            E.warning("empty sequence %s" % cur_record.title)
            continue

        # fail with a readable message instead of an AttributeError on
        # None if the pattern does not match the title
        match = rx.search(cur_record.title)
        if match is None:
            raise ValueError(
                "could not extract identifier from '%s' with pattern '%s'" %
                (cur_record.title, options.regex_identifier))
        identifier = match.groups()[0]

        options.stdout.write("%s" % identifier)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    options.stdout.write("total")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getFields()))
    options.stdout.write("\n")

    E.Stop()

Example #16

Show file

File: matrix2stats.py Project: gsc0107/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads one or more matrices from stdin and writes chi-squared
    statistics to ``options.stdout``. Lines starting with ``#`` are
    dropped; lines starting with ``>`` separate consecutive matrices and
    supply the label written in front of each result row.

    Raises:
        ValueError: if ``pearson-chi-squared`` is requested without a
            ``--parameters`` value, or for a matrix that does not have
            exactly two rows.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: matrix2stats.py 2795 2009-09-16 15:29:23Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("chi-squared", "pearson-chi-squared"),
                      help="statistical methods to apply.")

    parser.add_option("-t", "--header-names", dest="headers", action="store_true",
                      help="matrix has row/column headers.")

    parser.add_option("--no-headers", dest="headers", action="store_false",
                      help="matrix has no row/column headers.")

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""input format for matrix."""  )

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("full", "sparse", "phylip"),
                      help="""output format for matrix."""  )

    parser.add_option("-p", "--parameters", dest="parameters", action="append", type="string",
                      help="parameters for various functions.")

    parser.add_option("-a", "--iteration", dest="iteration", type="choice",
                      choices=("pairwise", "all-vs-all"),
                      help="""how to compute stats [%default]."""  )

    parser.set_defaults(
        method="chi-squared",
        headers=True,
        value_format="%6.4f",
        pvalue_format="%6.4e",
        input_format="full",
        write_separators=True,
        parameters=[],
        iteration=None,
    )

    (options, args) = E.Start(parser)

    # comment lines are dropped before the input is split into matrices
    lines = [x for x in sys.stdin.readlines() if x[0] != "#"]

    # indices of the ">" separator lines, one per matrix
    chunks = [i for i, line in enumerate(lines) if line[0] == ">"]

    if not chunks:
        # a single unlabelled matrix: act as if a separator preceded line 0
        options.write_separators = False
        chunks = [-1]

    # sentinel so that chunks[x] + 1 : chunks[x + 1] spans the last matrix
    chunks.append(len(lines))

    ninput, noutput, nskipped = 0, 0, 0

    if options.write_separators:
        options.stdout.write("test\t")

    header_prefix = ""

    if options.method == "chi-squared":
        header_prefix = "observed\texpected"
        options.stdout.write("\t".join(
            (header_prefix, "n", "min", "max", "chi", "df", "P", "passed", "phi")) + "\n")

    elif options.method in ("pearson-chi-squared",):
        # NOTE(review): header_prefix is empty here, so the joined header
        # starts with an empty field that the data rows written further
        # down do not have - confirm whether consumers rely on this.
        options.stdout.write("column\t")
        options.stdout.write("\t".join(
            (header_prefix, "n", "prob", "obs", "exp", "chi", "df", "P", "passed", "phi")) + "\n")

        if not options.parameters:
            # raising a plain string is itself a TypeError; use a real
            # exception type so the message actually reaches the user.
            raise ValueError(
                "out of parameters - please supply probability or filename with probabilities.")

        param = options.parameters.pop(0)

        if options.write_separators:
            # labelled matrices: param names a file mapping label -> probability
            probabilities = IOTools.ReadMap(
                IOTools.openFile(param, "r"), map_functions=(str, float))
        else:
            # single matrix: param is the probability itself
            probability = float(param)

    for x in range(len(chunks) - 1):
        ninput += 1
        matrix, row_headers, col_headers = MatlabTools.readMatrix(
            StringIO("".join(lines[chunks[x] + 1:chunks[x + 1]])),
            format=options.input_format,
            headers=options.headers)
        nrows, ncols = matrix.shape

        if options.loglevel >= 2:
            options.stdlog.write("# read matrix: %i x %i, %i row titles, %i colum titles.\n" %
                                 (nrows, ncols, len(row_headers), len(col_headers)))

        if options.write_separators:
            # separator line without the leading ">" and the trailing newline
            options.stdout.write(lines[chunks[x]][1:-1] + "\t")

        # row index pairs to compare; stays empty when no iteration mode is set
        pairs = []
        if options.iteration == "pairwise":
            for row1 in range(0, len(row_headers)):
                for row2 in range(row1 + 1, len(row_headers)):
                    pairs.append((row1, row2))
        elif options.iteration == "all-vs-all":
            for row1 in range(0, len(row_headers)):
                for row2 in range(0, len(row_headers)):
                    if row1 == row2:
                        continue
                    pairs.append((row1, row2))

        if options.method == "chi-squared":

            for row1, row2 in pairs:
                row_header1 = row_headers[row1]
                row_header2 = row_headers[row2]
                try:
                    result = Stats.doChiSquaredTest(
                        numpy.vstack((matrix[row1], matrix[row2])))
                except ValueError:
                    # the test rejected this pair of rows; count and move on
                    nskipped += 1
                    continue

                noutput += 1
                options.stdout.write("\t".join((
                    "%s" % row_header1,
                    "%s" % row_header2,
                    "%i" % result.mSampleSize,
                    "%i" % min(matrix.flat),
                    "%i" % max(matrix.flat),
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)) + "\n")

        elif options.method == "pearson-chi-squared":

            if nrows != 2:
                raise ValueError("only implemented for 2xn table")

            if options.write_separators:
                # first whitespace-delimited token of the label selects the
                # probability for this matrix
                matrix_id = re.match(
                    r"(\S+)", lines[chunks[x]][1:-1]).groups()[0]
                probability = probabilities[matrix_id]

            for col in range(ncols):
                options.stdout.write("%s\t" % col_headers[col])
                result = Stats.doPearsonChiSquaredTest(
                    probability, sum(matrix[:, col]), matrix[0, col])
                options.stdout.write("\t".join((
                    "%i" % result.mSampleSize,
                    "%f" % probability,
                    "%i" % result.mObserved,
                    "%f" % result.mExpected,
                    options.value_format % result.mChiSquaredValue,
                    "%i" % result.mDegreesFreedom,
                    options.pvalue_format % result.mProbability,
                    "%s" % result.mSignificance,
                    options.value_format % result.mPhi)))
                if col < ncols - 1:
                    # start the next row and repeat the matrix label
                    options.stdout.write("\n")
                    if options.write_separators:
                        options.stdout.write(lines[chunks[x]][1:-1] + "\t")

            options.stdout.write("\n")

    E.info("# ninput=%i, noutput=%i, nskipped=%i\n" %
           (ninput, noutput, nskipped))

    E.Stop()

Example #17

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads trees from stdin, optionally loads a strain-to-species map and
    hands the input to ``processSpeciesTrees`` or ``processGeneTrees``
    depending on ``--species-tree``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: tree_strain2species.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--filename-synonyms", dest="filename_synonyms", type="string",
                      help="filename with synonyms. Use this to aggregate several strains for a species.")

    parser.add_option("--filename-genes", dest="output_filename_genes", type="string",
                      help="output filename with new gene names.")

    parser.add_option("--species-tree", dest="species_tree", action="store_true",
                      help="input tree are species trees. If not given, the trees are assumed to be gene trees.")

    parser.add_option("--merge-mode", dest="merge_mode", type="choice",
                      choices=("ignore", "add-mean", "add-max", "add-min"),
                      help="how to deal with branch lengths of merged nodes.")

    parser.set_defaults(
        filename_synonyms="map_strain2species",
        pattern_gene="J%06i",
        output_format="nh",
        separator="|",
        output_filename_genes=None,
        keep_old_names=False,
        species_tree=False,
        merge_mode="ignore",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    ########################################################################
    ########################################################################
    ########################################################################
    # read synonyms
    if options.filename_synonyms:
        # the context manager closes the file even if ReadMap raises
        with open(options.filename_synonyms, "r") as infile:
            map_strain2species = IOTools.ReadMap(infile)
    else:
        map_strain2species = {}

    # strip only the line terminator; slicing off the last character would
    # truncate a final line that has no trailing newline
    lines = [x.rstrip("\n") for x in sys.stdin.readlines()]

    # iterate over chunks
    # startswith() copes with blank lines, where indexing [0] would raise
    # IndexError; a real list is needed because it is appended to below
    chunks = [i for i, line in enumerate(lines) if line.startswith(">")]
    if not chunks:
        chunks = [0]
    chunks.append(len(lines))

    if options.species_tree:
        processSpeciesTrees(chunks, lines, map_strain2species, options)
    else:
        processGeneTrees(chunks, lines, map_strain2species, options)

    E.Stop()

Example #18

Show file

File: plot_duplications.py Project: lesheng/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated duplication records from stdin (cluster id,
    ``;``-separated locations, tree), lays them out on a
    ``DuplicationPlot`` and writes the plot to stdout. Locations on
    contigs missing from the ``--contig-sizes`` map are ignored; if no
    contig remains, the script stops without plotting.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--headers",
                      dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t",
                      "--title",
                      dest="title",
                      type="string",
                      help="page title.")
    parser.add_option("-f",
                      "--footer",
                      dest="footer",
                      type="string",
                      help="page footer.")
    parser.add_option("-c",
                      "--contig-sizes",
                      dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r",
                      "--radius",
                      dest="radius",
                      type="int",
                      help="radius.")
    parser.add_option("-i",
                      "--increment",
                      dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u",
                      "--url",
                      dest="url",
                      type="string",
                      help="string to build url for annotation.")
    parser.add_option("--min-contig",
                      dest="min_contig_size",
                      type="string",
                      help="minimum contig size to delineate.")

    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="minimum branch length.")

    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={
            'CG': "circle",
            'PG': "circle",
            'SG': "circle"
        },
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG",
                      "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Always bind the map: without --contig-sizes it used to be undefined
    # and the first lookup below died with a NameError.
    map_contig2size = {}
    if options.filename_contig_sizes:
        with open(options.filename_contig_sizes, "r") as infile:
            map_contig2size = IOTools.ReadMap(infile,
                                              map_functions=(str, int))

    # read data and get contigs that are used (i.e.: remove empty contigs)
    used_contigs = {}
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        for line in lines:
            if line[0] == "#":
                continue

            d = line[:-1].split("\t")

            cluster_id, in_locations, in_tree = d[:3]

            for location in in_locations.split(";"):
                gene_id, contig, strand, sbjct_from, sbjct_to = \
                    location.split(":")
                if contig not in map_contig2size:
                    continue
                used_contigs[contig] = 1
        # iterate over a snapshot of the keys: entries are deleted in the loop
        for contig in list(map_contig2size):
            if contig not in used_contigs:
                del map_contig2size[contig]

    contigs = sorted(map_contig2size)

    if not contigs:
        E.Stop()
        sys.exit(0)

    if options.sort_by_size:
        # stable sort: contigs of equal size keep their alphabetical order
        contigs.sort(key=lambda x: map_contig2size[x])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":

        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications

        for line in lines:
            if line[0] == "#":
                continue

            d = line[:-1].split("\t")

            cluster_id, in_locations, in_tree = d[:3]

            # mi/ma: smallest start and largest end plot position of the
            # cluster's locations; n: number of locations on known contigs
            mi, ma = 0, 0
            found = False
            n = 0
            cluster_contigs = {}
            for location in in_locations.split(";"):
                gene_id, contig, strand, sbjct_from, sbjct_to = \
                    location.split(":")
                if contig not in map_contig2size:
                    continue
                cluster_contigs[contig] = 1
                sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                if not mi:
                    mi = xi
                else:
                    mi = min(mi, xi)

                n += 1
                ma = max(ma, xa)
                found = True

            if not found:
                continue
            # True when all locations of the cluster lie on a single contig
            cis = len(cluster_contigs) == 1
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))
            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    # tuple order makes this sort by cis flag, then number of duplicates,
    # then position
    data.sort()

    # second initialisation, now that the number of entries is known
    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # start a new ring whenever the number of duplicates changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()

        last_ndups = ndups

        map_gene2location = {}
        for location in in_locations.split(";"):
            gene_id, contig, strand, sbjct_from, sbjct_to = \
                location.split(":")
            if contig not in map_contig2size:
                continue
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
            map_gene2location[gene_id] = (contig, strand, sbjct_from, sbjct_to)

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        s = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in s[:-1]:
            if len(children) == 1:
                continue
            # a real list, not a lazy map object, is handed to the plot
            c = [x.split(options.separator) for x in children]
            plot.addDuplication(c,
                                map_gene2location,
                                height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()

Example #19

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values, runs the statistical test selected with
    ``--method`` through R, optionally draws box/QQ/histogram plots, and
    writes the test result plus summary statistics of both samples to
    ``options.stdout``. Positional arguments are forwarded to the R test:
    ``key=value`` arguments as keywords, all others positionally.

    Raises:
        ValueError: if a paired test is requested for samples of
            different size.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        help=
        "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]",
        choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend",
                      dest="legend",
                      type="string",
                      help="legend for histograms."
                      "")
    parser.add_option("-f",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option(
        "-n",
        "--norm-test",
        dest="norm_test",
        action="store_true",
        help=
        """test if a set of values is normally distributed. Mean and variance
                       are calculated from the data.""")
    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="""number of bins (for plotting purposes only).""")
    parser.add_option("--bin-size",
                      dest="bin_size",
                      type="float",
                      help="""bin size for plot.""")
    parser.add_option("--min-value",
                      dest="min_value",
                      type="float",
                      help="""minimum_value for plot.""")
    parser.add_option("--max-value",
                      dest="max_value",
                      type="float",
                      help="""maximum_value for plot.""")
    parser.add_option("--skip-plot",
                      dest="plot",
                      action="store_false",
                      help="""skipping plotting.""")
    parser.add_option("--header-names",
                      dest="header",
                      type="string",
                      help="""header of value column [default=%default].""")
    parser.add_option("--title",
                      dest="title",
                      type="string",
                      help="""plot title [default=%default].""")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # split the positional arguments into keyword and plain arguments for
    # the R test call
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            # split on the first "=" only, so values may contain "="
            key, value = arg.split("=", 1)
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile_map:
            map_category2value = IOTools.ReadMap(infile_map,
                                                 map_functions=(str, float))
        # with a category map the raw entries are read as strings
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    try:
        values1, errors1 = IOTools.ReadList(infile1,
                                            map_function=f,
                                            map_category=map_category2value)
    finally:
        # only close what was opened here; stdin is left alone
        if options.filename_input1:
            infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        # compare against a random normal sample with the same size, mean
        # and standard deviation as the first input
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n"
            % (len(values1), mean, stddev))

        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        with open(options.filename_input2, "r") as infile2:
            values2, errors2 = IOTools.ReadList(
                infile2,
                map_function=f,
                map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError(
                "number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=False,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1,
                               values2,
                               paired=True,
                               correct=True,
                               *xargs,
                               **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn(
                "shapiro-wilk test only accepts < 5000 values, a random sample has been created."
            )
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be added at the end
            # after plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
          )

        # compute breaks:

        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        # NOTE(review): a --min-value/--max-value of 0 is falsy and counts
        # as "no explicit range" here - confirm whether that is intended.
        has_range = bool(options.min_value or options.max_value)

        extra_options = ""
        if options.num_bins and not has_range:
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and has_range:
            # NOTE(review): the largest break is
            # min_value + (num_bins - 1) * bin_size, which lies below
            # max_value - confirm that hist() accepts such breaks.
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [
                min_value + x * bin_size for x in range(options.num_bins)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [
                min_value + x * options.bin_size for x in range(num_bins + 1)
            ]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist( v1, freq=FALSE,           density=20, main='Relative frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        R("""h1 <- hist( v1, freq=TRUE,            density=20, main='Absolute frequency histogram' %s)"""
          % extra_options)
        R("""h2 <- hist( v2, freq=TRUE,  add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )"""
          % extra_options)
        if options.legend:
            R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2,
            max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))"""
              % ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    # summary statistics; keys are suffixed 1/2 to tell the samples apart
    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()

Example #20

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Copies a fasta stream from stdin to ``options.stdout`` and rewrites
    each header line. The identifier is split on ``options.separator``
    into species, optional transcript and gene; the species is translated
    through the ``--map`` file when one is given. Species missing from the
    map are counted as errors and written untranslated.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fasta2nj.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-m",
        "--map",
        dest="filename_map",
        type="string",
        help="filename with mapping of species ids to swissprot species ids.")

    parser.set_defaults(
        separator="|",
        filename_map=None,
    )

    (options, args) = E.Start(parser)

    # Always bind the map: without --map it used to be undefined and the
    # first header line died with a NameError.
    map_species2sp = {}
    if options.filename_map:
        with open(options.filename_map, "r") as infile:
            map_species2sp = IOTools.ReadMap(infile)

    ninput, noutput, nerrors = 0, 0, 0
    for line in sys.stdin:
        if line[0] == ">":
            ninput += 1

            identifier = re.match(">([^/ \t]+)", line[:-1]).groups()[0]
            data = identifier.split(options.separator)

            species = data[0]

            # NOTE(review): an identifier with a single field sets neither
            # gene nor transcript, so the write below fails with a
            # NameError or reuses the previous record's values - confirm
            # the intended handling.
            if len(data) == 2:
                gene = data[1]
                transcript = None
            elif len(data) >= 3:
                gene = data[2]
                transcript = data[1]

            if map_species2sp:
                try:
                    species = map_species2sp[species]
                except KeyError:
                    # a failed dict lookup raises KeyError, not IndexError;
                    # keep the original species name and count the miss
                    nerrors += 1
                    if options.loglevel >= 1:
                        options.stdlog.write("# could not map species %s\n" %
                                             species)
            if transcript:
                options.stdout.write(">%s_%s GENEID=%s\n" %
                                     (transcript, species, gene))
            else:
                options.stdout.write(">%s_%s\n" % (species, gene))
            noutput += 1
        else:
            # sequence lines pass through unchanged
            options.stdout.write(line)

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nerrors=%i\n" %
                             (ninput, noutput, nerrors))
    E.Stop()

Example #21

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two lists of values, compares them through R with the test
    selected by ``--method``, draws box/QQ/histogram plots and prints the
    test result to stdout.

    Raises:
        ValueError: if ``--method`` is neither ``ks`` nor ``mwu``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="string",
        help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a",
                      "--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="write hardcopy to file.",
                      metavar="FILE")
    parser.add_option("-1",
                      "--infile1",
                      dest="filename_input1",
                      type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2",
                      "--infile2",
                      dest="filename_input2",
                      type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p",
                      "--infile-map",
                      dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.Start(
        parser,
        add_pipe_options=True,
        add_psql_options=True,
    )

    # --method is a free-form string; reject unknown values up front rather
    # than failing later with a NameError on the unbound result
    if options.method not in ("ks", "mwu"):
        raise ValueError("unknown method %s" % options.method)

    map_category2value = {}
    if options.filename_input_map:
        with open(options.filename_input_map, "r") as infile_map:
            map_category2value = IOTools.ReadMap(infile_map,
                                                 map_functions=(str, float))

    with open(options.filename_input1, "r") as infile1:
        values1, errors1 = IOTools.ReadList(infile1,
                                            map_category=map_category2value)
    with open(options.filename_input2, "r") as infile2:
        values2, errors2 = IOTools.ReadList(infile2,
                                            map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );"""
      )

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')"""
      )
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')"""
      )
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)"""
      )

    # sys.stdout.write instead of the print statement: same output, and
    # valid syntax on both Python 2 and Python 3
    sys.stdout.write("## Results for %s" % result['method'] + "\n")
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        sys.stdout.write("%s %s\n" % (x, result[x]))

    if options.hardcopy:
        # close the graphics device opened by R.png above
        R.dev_off()

    E.Stop()

Example #22

Show file

File: codonbias_shuffle_fasta.py Project: santayana/cgat

def main(argv=None):
    """Shuffle FASTA sequences read from stdin and write them to stdout.

    *argv* defaults to sys.argv. Note that it is not forwarded to
    ``E.Start`` in this function, so option parsing relies on whatever
    ``E.Start`` uses by default.

    Shuffling modes:

    * default: the characters of each sequence are shuffled at random.
    * ``--codons``: the shuffle is repeated until no in-frame triplet
      is one of ``options.stop_codons``.
    * ``--conserve-aminos``: every codon is replaced by a randomly chosen
      codon listed for the same amino acid in
      ``Genomics.GetMapAA2Codons()``. If ``--biased-codon-usage`` is
      given, the choice is weighted: for each codon the weight is
      ``g + (u - g) * (1 - bias)``, where ``g`` is the frequency in the
      biased table and ``u`` the frequency in the bulk table
      (``--bulk-codon-usage``) or uniform usage if no bulk table is given.
      Codons whose amino acid is not in the map are dropped.

    Raises:
        ValueError: if ``--bias`` is non-zero together with
            ``--conserve-aminos`` but no ``--biased-codon-usage`` table was
            supplied (there is nothing to bias towards).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: shuffle_fasta.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-c",
        "--codons",
        dest="codons",
        action="store_true",
        help="make sure that shuffled sequences only contain valid codons.")

    parser.add_option("-a",
                      "--conserve-aminos",
                      dest="conserve_aminos",
                      action="store_true",
                      help="conserve amino acids.")

    parser.add_option(
        "-b",
        "--bias",
        dest="bias",
        type="float",
        help=
        "introduce bias into codon usage choice. Complete bias is 1.0, while no bias is 0.0."
    )

    parser.add_option(
        "-i",
        "--biased-codon-usage",
        dest="filename_biased_codon_usage",
        type="string",
        help="Filename with reference codon usage table for biased codon usage."
    )

    parser.add_option(
        "-u",
        "--bulk-codon-usage",
        dest="filename_bulk_codon_usage",
        type="string",
        help=
        "Filename with reference codon usage table for unbiased codon usage.")

    parser.set_defaults(
        codons=False,
        conserve_aminos=False,
        bias=0.0,
        filename_biased_codon_usage=None,
        filename_bulk_codon_usage=None,
        stop_codons=("TAG", "TAA", "TGA"),
        precision=10000,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # The weighted choice below needs the cumulative ranges that are only
    # built from the biased codon usage table. Fail with a clear message
    # instead of a NameError deep inside the main loop.
    if (options.conserve_aminos and options.bias
            and not options.filename_biased_codon_usage):
        raise ValueError(
            "--bias requires --biased-codon-usage when --conserve-aminos "
            "is set")

    iterator = FastaIterator.FastaIterator(sys.stdin)

    # get map of amino acids to codons
    map_aa2codons = Genomics.GetMapAA2Codons()

    # For codon based shuffling: build, per amino acid, the cumulative upper
    # bounds of each codon's interval on a scale of 0..precision. Bias
    # switches from completely biased (1.0) to unbiased (0.0); without a
    # bulk table unbiased means uniform usage.
    codon_ranges = None
    if options.filename_biased_codon_usage:

        # NOTE(review): assumes ReadMap consumes the whole file before
        # returning, so the handle can be closed right away - confirm.
        with open(options.filename_biased_codon_usage, "r") as infile:
            map_codon2frequency = IOTools.ReadMap(
                infile,
                map_functions=(str, float),
                has_header=True)

        map_codon2frequency_bulk = None
        if options.filename_bulk_codon_usage:
            with open(options.filename_bulk_codon_usage, "r") as infile:
                map_codon2frequency_bulk = IOTools.ReadMap(
                    infile,
                    map_functions=(str, float),
                    has_header=True)

        codon_ranges = {}
        for aa, codons in map_aa2codons.items():
            ncodons = len(codons)
            bounds = []
            upper = 0
            for codon in codons:
                if map_codon2frequency_bulk is not None:
                    u = map_codon2frequency_bulk[codon]
                else:
                    # uniform usage
                    u = 1.0 / ncodons

                g = map_codon2frequency[codon]
                f = g + (u - g) * (1.0 - options.bias)
                upper += f * options.precision
                bounds.append(upper)
            codon_ranges[aa] = bounds

    while 1:
        cur_record = iterator.next()

        # the loop relies on the iterator returning None once exhausted
        if cur_record is None:
            break

        sequence = re.sub(" ", "", cur_record.sequence)
        seq_length = len(sequence)

        if options.conserve_aminos:
            new_codons = []
            for start in range(0, seq_length, 3):
                codon = sequence[start:start + 3]
                aa = Genomics.MapCodon2AA(codon)
                if aa not in map_aa2codons:
                    continue
                synonyms = map_aa2codons[aa]
                if codon_ranges is not None:
                    # get random number from 0 to precision
                    v = random.randint(0, options.precision)
                    # find the corresponding interval; the last codon
                    # catches everything beyond the final bound
                    bounds = codon_ranges[aa]
                    last = len(synonyms) - 1
                    choice = 0
                    while choice < last and v >= bounds[choice]:
                        choice += 1
                else:
                    choice = random.randint(0, len(synonyms) - 1)
                new_codons.append(synonyms[choice])
            sequence = "".join(new_codons)
        else:
            residues = list(sequence)
            if options.codons:
                # Rejection sampling: reshuffle until no in-frame stop codon
                # is left. The triplet is joined into a string before the
                # lookup - a list slice never equals a string, so comparing
                # the raw slice would accept every shuffle. For long
                # sequences this may take many iterations.
                while 1:
                    random.shuffle(residues)
                    has_stop = any(
                        "".join(residues[start:start + 3])
                        in options.stop_codons
                        for start in range(0, seq_length, 3))
                    if not has_stop:
                        break
            else:
                random.shuffle(residues)
            sequence = "".join(residues)
        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))

    E.Stop()

Example #23

Show file

def main(argv=None):
    """Annotate clusters with the domain annotations of their genes.

    *argv* defaults to sys.argv. Note that it is not forwarded to
    ``E.Start`` in this function.

    Cluster identifiers are read from stdin; if none are given, all
    clusters found in the id-to-cluster map (``--filename-map``) are used
    in sorted order. For each cluster the member identifiers are split on
    ``options.separator`` into four fields; only members whose first field
    equals ``options.master_species`` contribute their third field (the
    gene) to the output.

    A tab-separated table is written to ``options.stdout`` with the columns
    ``cluster`` and ``genes`` plus, if the respective annotation map is
    non-empty, the columns produced by ``printAnnotations`` for interpro
    and pfam. Counters are written to ``options.stdlog`` when
    ``options.loglevel >= 1``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/annotate_clusters.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option(
        "-r",
        "--species-regex",
        dest="species_regex",
        type="string",
        help="regular expression to extractspecies from identifier.")

    parser.add_option(
        "--filename-map",
        dest="filename_map_id2cluster",
        type="string",
        help="filename with mapping information from id to cluster.")

    parser.add_option("--filename-interpro",
                      dest="filename_interpro",
                      type="string",
                      help="filename with interpro domain information.")

    parser.add_option("--filename-pfam",
                      dest="filename_pfam",
                      type="string",
                      help="filename with pfam domain information.")

    parser.set_defaults(
        master_species="dmel_vs_dmel4",
        separator="|",
        filename_map_id2cluster="input.map",
        filename_interpro="/home/andreas/projects/flies/data_1v5/interpro.list",
        filename_pfam="/home/andreas/projects/flies/data_1v5/pfam.list",
        write_no_annotation=True,
        separator_fields=";",
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    clusters, nerrors = IOTools.ReadList(sys.stdin)

    # NOTE(review): assumes ReadMap and the readAnnotation* helpers consume
    # their input completely before returning, so the handles can be closed
    # right away - confirm.
    with open(options.filename_map_id2cluster, "r") as infile:
        map_id2cluster, map_cluster2id = IOTools.ReadMap(
            infile, both_directions=True)

    # without explicit clusters on stdin, process every cluster in the map
    if not clusters:
        clusters = sorted(map_cluster2id)

    # Initialise both maps so that the truth tests below are well defined
    # even when a filename option has been set to an empty value.
    map_id2interpro = None
    if options.filename_interpro:
        with open(options.filename_interpro, "r") as infile:
            map_id2interpro = readAnnotationInterpro(infile)

    map_id2pfam = None
    if options.filename_pfam:
        with open(options.filename_pfam, "r") as infile:
            map_id2pfam = readAnnotationPfam(infile)

    ninput, noutput, nnomaster, nnoannotation = 0, 0, 0, 0
    nskipped = 0

    options.stdout.write("cluster\tgenes")

    if map_id2interpro:
        options.stdout.write("\tinterpro\tidescription")
    if map_id2pfam:
        options.stdout.write("\tpfam\tpdescription")
    options.stdout.write("\n")

    for cluster in clusters:

        ninput += 1
        if cluster not in map_cluster2id:
            if options.loglevel >= 1:
                options.stdlog.write("# cluster %s not in map.\n" % cluster)
            nskipped += 1
            continue

        # collect the genes of the master species in this cluster
        genes = set()
        for identifier in map_cluster2id[cluster]:
            # identifiers must consist of exactly four separated fields;
            # only the first (species) and third (gene) are used here
            species, _, gene, _ = identifier.split(options.separator)
            if species == options.master_species:
                genes.add(gene)

        if not genes:
            nnomaster += 1
            continue

        # key annotations by identifier so that each one is reported once
        # per cluster even if several genes carry it
        annotations_interpro = {}
        if map_id2interpro:
            for gene in genes:
                for annotation in map_id2interpro.get(gene, ()):
                    annotations_interpro[annotation.mIdentifier] = annotation

        annotations_pfam = {}
        if map_id2pfam:
            for gene in genes:
                for annotation in map_id2pfam.get(gene, ()):
                    annotations_pfam[annotation.mIdentifier] = annotation

        nannotations = max(len(annotations_pfam), len(annotations_interpro))

        if nannotations == 0 and not options.write_no_annotation:
            nnoannotation += 1
            continue

        # sort the genes so that the output does not depend on set ordering
        options.stdout.write("%s\t%s" % (cluster, ";".join(sorted(genes))))

        if map_id2interpro:
            printAnnotations(options.stdout, annotations_interpro, options)

        if map_id2pfam:
            printAnnotations(options.stdout, annotations_pfam, options)

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, nnomaster=%i, nnoannotation=%i\n"
            % (ninput, noutput, nskipped, nnomaster, nnoannotation))

    E.Stop()