Python Genomics.GetUniformCodonUsage Examples

Programming Language: Python

Namespace/Package Name: CGAT

Class/Type: Genomics

Method/Function: GetUniformCodonUsage

Examples at hotexamples.com: 3

Python Genomics.GetUniformCodonUsage - 3 examples found. These are the top rated real world Python examples of CGAT.Genomics.GetUniformCodonUsage extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

IsPositiveStrand(13)

IsNegativeStrand(10)

MapCodon2AA(10)

IsStopCodon(7)

CalculatePairIndices(7)

Alignment2PeptideAlignment(6)

GetHID(6)

ReadGenomicSequences(5)

CalculateCodonFrequenciesFromCounts(4)

CountGeneFeatures(4)

GetDegeneracy(4)

Alignment2ExonBoundaries(3)

GetUniformCodonUsage(3)

ReadContigSizes(3)

ParseFasta2Hash(2)

Protein2Wobble(2)

MaskStopCodons(2)

Alignment2CDNA(2)

CountCodons(2)

Alignment2String(2)

GetIntronType(2)

GetMapAA2Codons(1)

GetGenomicSequence(1)

GetDegenerateSites(1)

MapSequences(1)

CalculateCAIWeightsFromCounts(1)

ParseFasta2HashFromIndex(1)

AlignmentProtein2CDNA(1)

ReadClusters(1)

Example #1

Show file

def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--filename-weights",
        dest="filename_weights",
        type="string",
        help=
        "filename with codon frequencies. Multiple filenames can be separated by comma [default=%default]."
    )

    parser.add_option("-s",
                      "--sections",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "hid", "na", "aa", "degeneracy",
                               "bias", "codons", "codon-usage",
                               "codon-translator"),
                      help="which sections to output [default=%default]")

    parser.add_option(
        "-t",
        "--type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help=
        "type of sequence: na=nucleotides, aa=amino acids [default=%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help=
        "regular expression to extract identifier from fasta description line [default=%default]."
    )

    parser.set_defaults(
        filename_weights="uniform",
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
    )

    (options, args) = E.Start(parser, argv=argv)
    options.filename_weights = options.filename_weights.split(",")

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(open(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        ## print codon table differences
        E.info("difference between supplied codon usage preferences.")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y: continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon): continue
                    d += b[codon] * math.log(b[codon] / p)
                E.info("tablediff\t%s\t%s\t%f" %
                       (options.filename_weights[x],
                        options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):

        if options.seqtype == "na":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "na":
                s = SequencePropertiesNA()
            elif section == "aa":
                s = SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequencePropertiesDegeneracy()
            elif section == "bias":
                s = SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequencePropertiesLength()
            elif section == "hid":
                s = SequencePropertiesHid()
            elif section == "aa":
                s = SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    ## setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            E.warning("empty sequence %s" % cur_record.title)
            continue

        id = rx.search(cur_record.title).groups()[0]

        options.stdout.write("%s" % id)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    options.stdout.write("total")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getFields()))
    options.stdout.write("\n")

    E.Stop()

Example #2

Show file

File: fasta2table.py Project: zpeng1989/cgat

def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type="string",
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_option("-s",
                      "--section",
                      dest="sections",
                      type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa", "cpg",
                               "dn", "degeneracy", "gaps", "codons",
                               "codon-usage", "codon-translator",
                               "codon-bias"),
                      help="which sections to output [%default]")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type="choice",
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids [%default].")

    parser.add_option(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type="string",
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_option("--split-fasta-identifier",
                      dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")

    parser.add_option(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table"
        "[%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")
        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(IOTools.openFile(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):

        if options.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections requires sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if options.split_id is True:
            options.stdout.write("%s" % id.split()[0])
        else:
            options.stdout.write("%s" % id)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write("\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.Stop()

Example #3

Show file

File: codonbias_acai2tsv.py Project: lesheng/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: codonbias_acai2tsv.py 865 2007-01-15 13:44:43Z andreas $"
    )

    parser.add_option("-o",
                      "--input-file-trace",
                      dest="input_filename_trace",
                      type="string",
                      help="input filename for cai.",
                      metavar="FILE")

    parser.add_option("-e",
                      "--input-file-genes",
                      dest="input_filename_genes",
                      type="string",
                      help="input filename for genes information from cai.",
                      metavar="FILE")

    parser.add_option("-c",
                      "--input-file-codons",
                      dest="input_filename_codons",
                      type="string",
                      help="input filename for codon usage information.",
                      metavar="FILE")

    parser.add_option("--input-file-sequences",
                      dest="input_filename_sequences",
                      type="string",
                      help="input filename with sequences.",
                      metavar="FILE")

    parser.add_option("-t",
                      "--input-file-subset",
                      dest="input_filename_subset",
                      type="string",
                      help="input filename with subset.",
                      metavar="FILE")

    parser.add_option("--codon-table-format",
                      dest="codon_table_format",
                      type="choice",
                      choices=("list", "matrix"),
                      help="output options for output codon tables.")

    parser.add_option("--codon-table-type",
                      dest="codon_table_type",
                      type="choice",
                      choices=("counts", "frequencies", "weights",
                               "absolute-frequencies"),
                      help="type of codon table.")

    parser.add_option("-r",
                      "--reference",
                      dest="reference",
                      type="string",
                      help="dump CAI reference weights for species.")

    parser.add_option("-s",
                      "--select",
                      dest="select",
                      type="string",
                      help="fields to select from genes table.")

    parser.add_option("-m",
                      "--map",
                      dest="input_filename_map",
                      type="string",
                      help="filename with mapping information for gene names.",
                      metavar="FILE")

    parser.add_option("-i",
                      "--invert-map",
                      dest="invert_map",
                      action="store_true",
                      help="invert map.")

    parser.add_option(
        "-d",
        "--dominant-set",
        dest="dominant_set",
        type="float",
        help="only print out dominant set (# fraction of most biased genes).")

    parser.add_option(
        "--reverse-set",
        dest="reverse_set",
        action="store_true",
        help="print the reverse set, i.e., then non-dominant set.")

    parser.add_option(
        "-u",
        "--codon-usage",
        dest="codon_usage",
        type="string",
        help="print codon usage for the full/biased set of genes [full|biased]."
    )

    parser.add_option(
        "-w",
        "--weights",
        dest="weights",
        type="string",
        help=
        "print weights [final-list|final-matrix|random|compute|weights|frequencies|absolute-frequencies]."
    )

    parser.add_option("--weights-matrix2table",
                      dest="weights_matrix2table",
                      action="store_true",
                      help="convert a weights matrix to a weights table.")

    parser.add_option("--get-preferred-codons",
                      dest="get_preferred_codons",
                      type="string",
                      help="compute overview of preferred codons.")

    parser.set_defaults(input_filename="-",
                        input_filename_trace=None,
                        input_filename_genes=None,
                        input_filename_codons=None,
                        input_filename_map=None,
                        input_filename_subset=None,
                        input_filename_sequences=None,
                        invert_map=False,
                        select=None,
                        codon_usage=None,
                        weights=None,
                        revserse_set=False,
                        pseudocounts=1,
                        codon_table_format="list",
                        codon_table_type="weights",
                        weights_matrix2table=False,
                        random_size=1000,
                        get_preferred_codons=None,
                        dominant_set=0.0)

    (options, args) = E.Start(parser)
    if options.select:
        options.select = options.select.split(",")

    outfile = options.stdout

    ###################################################################
    # convert weights table to a codon table
    if options.weights_matrix2table:
        lines = options.stdin.readlines()
        data = []
        for line in lines:
            if line[0] == "#":
                continue
            data += list(map(float, line[:-1].split(",")))

        weights = {}
        x = 0
        for cc in OUTPUT_ORDER_CODON_MATRIX:
            for c in cc:
                weights[c] = data[x]
                x += 1

        outfile.write("CODON\tWEIGHT\n")
        codons = weights.keys()
        codons.sort()
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, weights[codon]))

        E.Stop()
        sys.exit(1)

    ###################################################################
    map_genes = {}

    if options.input_filename_map:
        data = map(
            lambda x: x[:-1].split("\t")[:2],
            filter(lambda x: x[0] != "#",
                   open(options.input_filename_map, "r").readlines()))

        for a, b in data:
            if options.invert_map:
                a, b = b, a
            map_genes[a] = b

    result = WrapperAdaptiveCAI.AdaptiveCAIResult()

    if options.input_filename_genes:
        gene_file = open(options.input_filename_genes, "r")
    else:
        gene_file = None

    if options.input_filename_codons:
        codon_file = open(options.input_filename_codons, "r")
    else:
        codon_file = None

    if options.input_filename_trace:
        trace_file = open(options.input_filename_trace, "r")
    else:
        trace_file = None

    if options.input_filename_subset:
        l, e = IOTools.ReadList(open(options.input_filename_subset, "r"))
        subset = set(l)
        if options.loglevel >= 1:
            options.stdlog.write("# read %i entries into subset from %s.\n" %
                                 (len(subset), options.input_filename_subset))
    else:
        subset = None

    result.Read(gene_file=gene_file,
                codon_file=codon_file,
                trace_file=trace_file)

    if gene_file:
        gene_file.close()
    if codon_file:
        codon_file.close()
    if trace_file:
        trace_file.close()

    if options.reference:
        if options.reference not in CODON_PREFERENCES:
            raise "unknown species %s: possibles species are: %s" % (
                options.reference, str(CODON_PREFERNCES.keys()))

        weights = Genomics.CalculateCAIWeightsFromCounts(
            CODON_PREFERENCES[options.reference], options.pseudocounts)

        for x in range(len(OUTPUT_ORDER_CODON_MATRIX)):
            outfile.write(",".join(
                map(lambda z: "%5.3f" % z, [
                    weights[codon.upper()]
                    for codon in OUTPUT_ORDER_CODON_MATRIX[x]
                ])))
            outfile.write("\n")

    if options.dominant_set and gene_file:
        cai_threshold = result.GetDominantThreshold(options.dominant_set)
    else:
        if options.reverse_set:
            cai_threshold = 1.0
        else:
            cai_threshold = 0.0

    if options.select:

        fields = []
        titles = []
        for x in options.select:
            f = re.match("(\S+) (AS|as) (\S+)", x)
            if f:
                fields.append(f.groups()[0].upper())
                titles.append(f.groups()[2])
            else:
                fields.append(x.upper())
                titles.append(x)

        outfile.write("GENENAME\t" + string.join(titles, "\t") + "\n")

        for genename, data in result.mGeneInfo.items():
            if genename in map_genes:
                genename = map_genes[genename]

            if options.reverse_set:
                if data["CAICLASS"] >= cai_threshold:
                    continue
            else:
                if data["CAICLASS"] < cai_threshold:
                    continue

            outfile.write(genename)
            for c in fields:
                outfile.write("\t%s" % str(data[c]))
            outfile.write("\n")

    if options.weights:

        format = options.codon_table_format

        if options.weights in ("compute-counts", "compute-weights",
                               "compute-frequencies"):
            # compute codon usage weights from a set of sequences
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for x in codons:
                counts[x] = 0

            if options.input_filename_sequences:
                sequences = Genomics.ReadPeptideSequences(open(
                    options.input_filename_sequences, "r"),
                                                          filter=subset)
                for key, sequence in sequences.items():
                    sequence = re.sub(" ", "", sequence)
                    if len(sequence) % 3 != 0:
                        raise "warning: sequence %s is not multiple of 3" % key
                    for codon in [
                            sequence[x:x + 3]
                            for x in range(0, len(sequence), 3)
                    ]:
                        counts[codon.upper()] += 1

            if options.weights == "compute-frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "compute-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            else:
                weights = counts

        elif options.weights in ("final-list", "final-matrix"):

            weights = result.mFinalWeights
            if options.weights == "final-list":
                format = "list"
            else:
                format = "matrix"

        elif options.weights == "random":
            # get random weights
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for x in codons:
                counts[x] = random.randint(1, options.random_size)

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            format = "matrix"

        elif options.weights == "biased":
            # get biased weights
            codons = Genomics.GetUniformCodonUsage()

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            format = "matrix"

        elif options.weights in ("uniform-weights", "uniform-frequencies"):
            # get uniform weights
            codons = Genomics.GetUniformCodonUsage()

            if options.weights == ("uniform-weights"):
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
                format = "matrix"
            else:
                weights = codons
                format = "list"

        elif options.weights in ("counts", "frequencies",
                                 "absolute-frequencies"):
            # get weights as frequencies
            # compute from scratch. In the caijava file, the absolute frequencey f / gene_length is
            # given. Thus the total number of codons is f * gene_length.
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():

                if options.reverse_set:
                    if data["CAICLASS"] >= cai_threshold:
                        continue
                else:
                    if data["CAICLASS"] < cai_threshold:
                        continue

                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.weights == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "counts":
                weights = counts
            elif options.weights == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do not
                # normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m

            format = "list"

        elif options.weights == "subset":

            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():

                found = genename in subset
                if (not found and not options.reverse_set) or (
                        found and options.reverse_set):
                    continue

                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.codon_table_type == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "counts":
                weights = counts
            if options.codon_table_type == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do not
                # normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m

        else:
            raise "unknown weights %s" % options.weights

        if format == "list":
            outfile.write("CODON\tWEIGHT\n")
            codons = weights.keys()
            codons.sort()
            for codon in codons:
                outfile.write("%s\t%f\n" % (codon, weights[codon]))

        elif format == "matrix":

            for x in range(len(OUTPUT_ORDER_CODON_MATRIX)):
                outfile.write(",".join(
                    map(lambda z: "%5.3f" % z, [
                        weights[codon.upper()]
                        for codon in OUTPUT_ORDER_CODON_MATRIX[x]
                    ])))
                outfile.write("\n")

    if options.codon_usage:
        outfile.write("CODON\tFREQUENCY\n")

        if options.codon_usage == "biased":
            usages = result.mCodonUsages[-1]
        elif options.codon_usage == "full":
            usages = result.mCodonUsages[0]
        elif options.codon_usage == "weights":
            usages = WrapperAdaptiveCAI.CalculateWeightsFromUsage(
                result.mCodonUsages[0])
        else:
            raise "unknown option '%s' for codon-usage." % options.codon_usage

        codons = usages.keys()
        codons.sort()
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, usages[codon]))

    E.Stop()