Python Genomics.MapCodon2AA 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: CGAT

클래스/타입: Genomics

메소드/함수: MapCodon2AA

hotexamples.com에서의 예제들: 10

Python Genomics.MapCodon2AA - 예제 10개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python CGAT.Genomics.MapCodon2AA의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가하면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

IsPositiveStrand(13)

IsNegativeStrand(10)

MapCodon2AA(10)

IsStopCodon(7)

CalculatePairIndices(7)

Alignment2PeptideAlignment(6)

GetHID(6)

ReadGenomicSequences(5)

CalculateCodonFrequenciesFromCounts(4)

CountGeneFeatures(4)

GetDegeneracy(4)

Alignment2ExonBoundaries(3)

GetUniformCodonUsage(3)

ReadContigSizes(3)

ParseFasta2Hash(2)

Protein2Wobble(2)

MaskStopCodons(2)

Alignment2CDNA(2)

CountCodons(2)

Alignment2String(2)

GetIntronType(2)

GetMapAA2Codons(1)

GetGenomicSequence(1)

GetDegenerateSites(1)

MapSequences(1)

CalculateCAIWeightsFromCounts(1)

ParseFasta2HashFromIndex(1)

AlignmentProtein2CDNA(1)

ReadClusters(1)

예제 #1

파일 보기

파일: codonbias_weights2tsv.py 프로젝트: lesheng/cgat

def WriteOverviewWeights(fields, table, options):
    """Report codons whose weight of 1.0 differs between two columns.

    Every pair of columns ``(x, y)`` with ``1 <= x < y < len(fields)`` is
    compared.  A row of *table* is recorded when exactly one of its two
    weights equals 1.0; rows are grouped by the amino acid that
    ``Genomics.MapCodon2AA`` returns for the codon in ``row[0]``.

    Arguments
    ---------
    fields : sequence
        Column labels; ``fields[x]`` and ``fields[y]`` are handed to the
        writer for each pair.
    table : iterable of sequences
        Rows of the form ``(codon, weight_1, weight_2, ...)``.
    options : object
        Passed through unchanged to the header/changes/output writers.
    """
    output = []
    WriteHeader(options)

    for x in range(1, len(fields) - 1):
        for y in range(x + 1, len(fields)):
            # amino acid -> list of (weight is 1.0 in x?, other weight, codon)
            changed = {}

            for row in table:
                codon = row[0]
                w1 = row[x]
                w2 = row[y]
                only_first = w1 == 1.0 and w2 != 1.0
                only_second = w1 != 1.0 and w2 == 1.0
                if not (only_first or only_second):
                    continue

                aa = Genomics.MapCodon2AA(codon)
                # store the weight from the column where it is not 1.0
                weight = w2 if only_first else w1
                changed.setdefault(aa, []).append((only_first, weight, codon))

            # The original passed the undefined name ``changes`` here, which
            # raised a NameError; the collected dictionary is ``changed``.
            # NOTE(review): a sibling function in this file calls
            # ``WriteChanges`` (not ``WriteChanged``) with the same
            # arguments - confirm which helper actually exists.
            output += WriteChanged(fields[x], fields[y], changed, options)

    WriteOutput(output, options)

예제 #2

파일 보기

파일: codemls2tsv.py 프로젝트: santayana/cgat

        def translate(s):
            """Overwrite ``s.mString`` with its codon-by-codon translation.

            The string is cut into consecutive chunks of three characters
            (the last chunk may be shorter) and each chunk is mapped with
            ``Genomics.MapCodon2AA``.
            """
            nucleotides = s.mString
            s.mString = "".join(
                Genomics.MapCodon2AA(nucleotides[start:start + 3])
                for start in range(0, len(nucleotides), 3))

예제 #3

파일 보기

    def loadSequence(self, sequence):
        """Load base properties and tally the amino acids encoded by *sequence*.

        After delegating to ``SequenceProperties.loadSequence``, the
        sequence is read in steps of three characters; each chunk is
        translated with ``Genomics.MapCodon2AA`` and counted in
        ``self.mCountsAA``.
        """
        SequenceProperties.loadSequence(self, sequence)

        # every letter of the extended protein alphabet starts at zero
        self.mCountsAA = dict.fromkeys(
            Bio.Alphabet.IUPAC.extended_protein.letters, 0)

        for start in range(0, len(sequence), 3):
            residue = Genomics.MapCodon2AA(sequence[start:start + 3])
            self.mCountsAA[residue] += 1

예제 #4

파일 보기

    def loadSequence(self, sequence, seqtype="na"):
        """Load base properties and tally the amino acids encoded by *sequence*.

        Delegates to ``SequenceProperties.loadSequence`` first.  The
        sequence must consist of complete codons; otherwise a
        ``ValueError`` is raised.  Each codon is translated with
        ``Genomics.MapCodon2AA`` and counted in ``self.mCountsAA``.
        """
        SequenceProperties.loadSequence(self, sequence, seqtype)

        total = len(sequence)
        if total % 3 != 0:
            raise ValueError(
                '''sequence length is not a multiple of 3 (length=%i)''' %
                (total))

        # every letter of the extended protein alphabet starts at zero
        self.mCountsAA = dict.fromkeys(
            Bio.Alphabet.IUPAC.extended_protein.letters, 0)

        for start in range(0, total, 3):
            residue = Genomics.MapCodon2AA(sequence[start:start + 3])
            self.mCountsAA[residue] += 1

예제 #5

파일 보기

def filterMali(mali, method="3rd"):
    """Restrict a codon multiple alignment, in place, to selected columns.

    Valid methods are

    3rd
        keep every third column, starting at column index 2.
    4d
        translate each sequence codon by codon, then keep one column for
        every codon column whose non-gap residues are a single amino acid
        with a degeneracy (``Genomics.DegeneracyAA``) of at least 4.

    The selection is applied by calling ``mali.takeColumns``; nothing is
    returned.

    Raises
    ------
    ValueError
        If *method* is not one of ``"3rd"`` or ``"4d"``.  (The original
        raised a plain string, which is itself a TypeError.)
    """
    if method not in ("3rd", "4d"):
        raise ValueError("unknown method %s" % method)

    if method == "3rd":
        columns = range(2, mali.getWidth(), 3)

    else:
        # method == "4d": build the translated alignment first
        trans_mali = Mali.Mali()
        for identifier, seq in mali.items():
            sequence = seq.mString
            length = len(sequence)
            peptide = [Genomics.MapCodon2AA(sequence[x:x + 3])
                       for x in range(0, length, 3)]
            trans_mali.addSequence(identifier, 0, length, "".join(peptide))

        # get four-fold (or higher) degenerate amino acids
        gap_chars = set(mali.mGapChars)
        columns = []
        for c, aa_column in enumerate(trans_mali.getColumns()):
            chars = set(aa_column).difference(gap_chars)
            if len(chars) != 1:
                continue

            char = list(chars)[0].upper()
            try:
                deg = Genomics.DegeneracyAA[char]
            except KeyError:
                # residue without a degeneracy entry: skip the column
                continue

            if deg >= 4:
                # NOTE(review): this selects the first position of the
                # codon (c * 3); the "3rd" filter uses offset 2 - confirm
                # whether c * 3 + 2 was intended.
                columns.append(c * 3)

    mali.takeColumns(columns)

예제 #6

파일 보기

파일: codonbias_weights2tsv.py 프로젝트: lesheng/cgat

def WriteOverviewFrequencies(fields, table, options):
    """Report amino acids whose preferred codon differs between two columns.

    Every pair of columns ``(x, y)`` with ``1 <= x < y < len(fields)`` is
    compared.  The rows of *table* are grouped by the amino acid that
    ``Genomics.MapCodon2AA`` returns for the codon in ``row[0]``.  Within
    each group the entry with the highest frequency is determined for
    each of the two columns; if the two entries differ, the amino acid is
    reported through ``WriteChanges``.

    Arguments
    ---------
    fields : sequence
        Column labels; ``fields[x]`` and ``fields[y]`` are handed to the
        writer for each pair.
    table : iterable of sequences
        Rows of the form ``(codon, frequency_1, frequency_2, ...)``.
    options : object
        Passed through unchanged to the header/changes/output writers.
    """
    WriteHeader(options)
    output = []

    for x in range(1, len(fields) - 1):
        for y in range(x + 1, len(fields)):
            # collect frequencies per amino acid
            frequencies = {}
            for row in table:
                codon = row[0]
                aa = Genomics.MapCodon2AA(codon)
                frequencies.setdefault(aa, []).append((codon, row[x], row[y]))

            changed = {}

            # sort for both genomes, and check if preference has changed
            for aa, codons in frequencies.items():
                # key-based sorts replace the Python-2-only
                # ``sort(lambda a, b: cmp(...))`` form; both are stable, so
                # ties resolve to the same last element.
                codons.sort(key=lambda entry: entry[1])
                pref_codon1 = codons[-1]
                codons.sort(key=lambda entry: entry[2])
                pref_codon2 = codons[-1]

                if pref_codon1 == pref_codon2:
                    continue

                changed[aa] = [(True, pref_codon1[2], pref_codon1[0]),
                               (False, pref_codon2[1], pref_codon2[0])]

            output += WriteChanges(fields[x], fields[y], changed, options)

    WriteOutput(output, options)

예제 #7

파일 보기

파일: codonbias_shuffle_fasta.py 프로젝트: santayana/cgat

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta records from stdin and writes one shuffled record per
    input record to ``options.stdout``.  Three modes are visible here:

    * ``--conserve-aminos``: every codon is replaced by a codon listed for
      the same amino acid in ``Genomics.GetMapAA2Codons()``; codons whose
      translation is not in that map are dropped.  The replacement is
      drawn uniformly, or from cumulative ranges built from the reference
      codon usage table(s) when ``--bias``/``--biased-codon-usage`` is set.
    * ``--codons``: the characters are shuffled repeatedly until no codon
      of the result is one of ``options.stop_codons``.
    * otherwise: the characters are shuffled once.

    Raises
    ------
    ValueError
        If ``--conserve-aminos`` is combined with ``--bias`` but no
        ``--biased-codon-usage`` table was given (previously a NameError).
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: shuffle_fasta.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option(
        "-c",
        "--codons",
        dest="codons",
        action="store_true",
        help="make sure that shuffled sequences only contain valid codons.")

    parser.add_option("-a",
                      "--conserve-aminos",
                      dest="conserve_aminos",
                      action="store_true",
                      help="conserve amino acids.")

    parser.add_option(
        "-b",
        "--bias",
        dest="bias",
        type="float",
        help=
        "introduce bias into codon usage choice. Complete bias is 1.0, while no bias is 0.0."
    )

    parser.add_option(
        "-i",
        "--biased-codon-usage",
        dest="filename_biased_codon_usage",
        type="string",
        help="Filename with reference codon usage table for biased codon usage."
    )

    parser.add_option(
        "-u",
        "--bulk-codon-usage",
        dest="filename_bulk_codon_usage",
        type="string",
        help=
        "Filename with reference codon usage table for unbiased codon usage.")

    parser.set_defaults(
        codons=False,
        conserve_aminos=False,
        bias=0.0,
        filename_biased_codon_usage=None,
        filename_bulk_codon_usage=None,
        stop_codons=("TAG", "TAA", "TGA"),
        precision=10000,
    )

    # NOTE(review): *argv* is not forwarded to E.Start here - confirm that
    # E.Start falls back to sys.argv on its own.
    (options, args) = E.Start(parser, add_pipe_options=True)

    iterator = FastaIterator.FastaIterator(sys.stdin)

    # get map of amino acids to codons
    map_aa2codons = Genomics.GetMapAA2Codons()

    # for codon based shuffling: build ranges based on strength of bias and
    # on reference codon usage.  Bias switches from completely biased to
    # unbiased. Unbiased is uniform usage.
    codon_ranges = None
    if options.filename_biased_codon_usage:

        # ``with`` closes the table files once they have been read
        with open(options.filename_biased_codon_usage, "r") as infile:
            map_codon2frequency = IOTools.ReadMap(infile,
                                                  map_functions=(str, float),
                                                  has_header=True)

        if options.filename_bulk_codon_usage:
            with open(options.filename_bulk_codon_usage, "r") as infile:
                map_codon2frequency_bulk = IOTools.ReadMap(
                    infile,
                    map_functions=(str, float),
                    has_header=True)

        codon_ranges = {}
        for aa in map_aa2codons.keys():
            synonyms = map_aa2codons[aa]
            bounds = []
            upper = 0
            for codon in synonyms:

                if options.filename_bulk_codon_usage:
                    u = map_codon2frequency_bulk[codon]
                else:
                    # uniform usage
                    u = 1.0 / len(synonyms)

                g = map_codon2frequency[codon]
                # interpolate between reference (bias=1) and bulk (bias=0)
                f = g + (u - g) * (1.0 - options.bias)
                upper += f * options.precision
                bounds.append(upper)
            codon_ranges[aa] = bounds

    while True:
        # NOTE(review): relies on the iterator's ``next()`` returning None
        # at the end of input - confirm against FastaIterator.
        cur_record = iterator.next()

        if cur_record is None:
            break

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if options.conserve_aminos:
            n = []
            for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                aa = Genomics.MapCodon2AA(codon)
                if aa not in map_aa2codons:
                    continue

                synonyms = map_aa2codons[aa]
                if options.bias or options.filename_biased_codon_usage:
                    if codon_ranges is None:
                        raise ValueError(
                            "--bias requires a reference codon usage table "
                            "(--biased-codon-usage)")
                    # get random number from 0 to precision
                    v = random.randint(0, options.precision)
                    # find the corresponding interval; the last codon
                    # catches everything beyond the final boundary
                    x = 0
                    while x < len(synonyms) - 1:
                        if v < codon_ranges[aa][x]:
                            break
                        x += 1
                else:
                    x = random.randint(0, len(synonyms) - 1)
                n.append(synonyms[x])
            sequence = "".join(n)
        else:
            sequence = list(sequence)
            if options.codons:
                # reshuffle until no stop codon is present.  Codons are
                # joined back into strings before the membership test: the
                # original compared list slices with the stop-codon
                # strings, which never matched.
                while True:
                    random.shuffle(sequence)
                    if not any("".join(sequence[x:x + 3]) in options.stop_codons
                               for x in range(0, l, 3)):
                        break
            else:
                random.shuffle(sequence)
            sequence = "".join(sequence)
        options.stdout.write(">%s\n%s\n" %
                             (cur_record.title, sequence))

    E.Stop()

예제 #8

파일 보기

def AlignCodonBased(seq_wobble,
                    seq_cds,
                    seq_peptide,
                    map_p2c,
                    options,
                    diag_width=2,
                    max_advance=2):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Due to alignlib this is all in one-based coordinates.
    Takes care of frameshifts.

    The result is written into *map_p2c*, which is cleared first and then
    filled with ``addPair(x, y, score)`` calls (x: position in
    *seq_wobble*, y: position in *seq_cds*).  Nothing is returned.

    Positions are walked in lock-step.  Whenever the substitution score of
    the current pair is not positive, a window around the current position
    is re-aligned with ``AlignExhaustive`` and the affected part of
    *map_p2c* is rebuilt from that local alignment.

    *seq_peptide* is only used for log output.  *diag_width* and
    *max_advance* are not used in this function.

    Raises ValueError if the same re-alignment window is requested twice
    in a row, if a re-aligned pair has a negative score, if the local
    alignment is too short away from the end of the wobble sequence, or
    if the current pair has a negative score after re-alignment.
    """

    map_p2c.clear()

    # gop/gep are assigned but not referenced again in this function
    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    # current positions: x in the wobble sequence, y in the cds
    y = 0
    x = 0

    # start coordinates of the previous re-alignment window; used to
    # detect a window that is requested twice in a row
    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)
        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        # score of the current wobble/cds residue pair
        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            # at the first position of a wobble codon, log the cds codon
            # starting at y, its translation and the target residue
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + seq_cds.asChar(y +
                                                       1) + seq_cds.asChar(y +
                                                                           2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[int(x / 3)]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr,
                 seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            # receives the local re-alignment of the window
            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous three codons and align
            # three codons for double frameshifts that span two codons and
            # produce two X's and six WWWWWW.

            # number of nucleotides to extend (should be multiple of 3)
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx (d plus the offset within the codon)
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0,
                          map_p2c.mapRowToCol(x_start, alignlib_lite.py_RIGHT))

            # the same window twice in a row means no progress was made
            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            # the window spans at most 2 * d positions in either sequence
            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment, "", tmp_map_p2c,
                            options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end,
                     str(
                         alignlib_lite.py_AlignmentFormatExplicit(
                             tmp_map_p2c, wobble_fragment, cds_fragment))))

                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)
            # ngap counts consecutive window rows without an aligned column
            ngap = 0
            last_x, last_y = None, None
            # copy the local alignment back, shifting window-relative
            # coordinates by the window start.  Note that x, y and xr are
            # overwritten here and keep the last aligned pair afterwards.
            for xxx in range(tmp_map_p2c.getRowFrom(), tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)

                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x),
                                        seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), cds: %i (%s)"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                               xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                # treat special case of double frameshifts. They might cause a peptide/wobble residue
                # to be eliminated and thus the translated sequences will differ.
                # simply delete the last residue between x and y and move to
                # next codon.
                # NOTE(review): if three unaligned rows occur before any
                # aligned pair, last_x is still None here - confirm that
                # this cannot happen.
                if ngap == 3:
                    map_p2c.removeRowRegion(last_x, last_x + 1)

                    last_x += 1
                    map_p2c.addPair(last_x, last_y)
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n"
                            % (last_x, seq_wobble.asChar(last_x), last_y,
                               seq_cds.asChar(last_y), xr,
                               seq_cds.asResidue(last_y), s))
                        options.stdlog.flush()
                    ngap = 0

            # exit condition if alignment is shorter than problematic residue
            # need to catch this to avoid infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            # this value is overwritten by the assignment right below
            s = 0

        # re-score the current pair; after a re-alignment x, y and xr refer
        # to the last pair copied from the local alignment
        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert (map_p2c.getRowTo() <= seq_wobble.getLength())
    assert (map_p2c.getColTo() <= seq_cds.getLength())

예제 #9

파일 보기

def ProcessResult(result, options, mali=None, prefix=None, p_value=None):
    """Write a report for *result* to ``options.stdout``.

    The report depends on ``options.method``:

    summary-slr
        one line with tree length, omega, kappa, log-likelihood, site
        counts and the numbers of positive/negative sites per threshold.
    summary-filtered
        one summary line; only sites whose first codon column in *mali*
        has ``mNChars`` equal to the number of sequences are counted.
    positive-site-table / negative-site-table / neutral-site-table
        one line per sequence listing the selected sites.
    positive-site-list / negative-site-list / neutral-site-list
        one line per sequence and selected site.

    Arguments
    ---------
    result : object
        Analysis result providing ``mSites`` and the summary attributes
        read below.
    options : object
        Provides ``method``, ``stdout`` and the per-method settings
        (``significance_threshold``, ``use_adjusted``,
        ``truncate_sites_list``, ``context_size``).
    mali : object, optional
        Codon multiple alignment; required for every method except
        ``summary-slr``.
    prefix : str, optional
        If given, written as the first column of every output line.
    p_value : optional
        Written in the table output; ``"na"`` is written when it is None.

    Returns
    -------
    A ``Result`` with the counts for ``summary-filtered``, otherwise None.

    Raises
    ------
    ValueError
        If the width of *mali* is not three times the number of sites.
        (The original raised a plain string, which is itself a TypeError.)
    """

    counts = None

    if options.method == "summary-slr":

        thresholds = "95%", "99%", "95% corrected", "99% corrected"

        if prefix:
            options.stdout.write("%s\t" % prefix)

        options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
            result.mTreeLength,
            result.mOmega,
            result.mKappa,
            result.mLogLikelihood,
            len(result.mSites),
            result.mNSitesSynonymous,
            result.mNSitesGaps + result.mNSitesSingleChar,
        ))
        options.stdout.write("\t".join(
            "%i" % result.mNPositiveSites[x][0] for x in thresholds))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            "%i" % result.mNNegativeSites[x] for x in thresholds))
        options.stdout.write("\n")

    elif options.method in ("summary-filtered", "positive-site-table",
                            "negative-site-table", "neutral-site-table",
                            "positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        mali_length = mali.getLength()
        mali_width = mali.getWidth()
        # a real list: the columns are accessed by index below, which a
        # lazy ``map`` object does not support on Python 3
        column_data = [
            Mali.MaliData(column, gap_chars="Nn", mask_chars="-.")
            for column in mali.getColumns()
        ]

        # sanity check: do lengths of mali and # of sites correspond
        if len(result.mSites) * 3 != mali_width:
            raise ValueError(
                "mali (%i) and # of sites (%i) do not correspond." % (
                    mali_width, len(result.mSites)))

        if options.method == "summary-filtered":
            # count sites, but filter with multiple alignment
            ntotal = 0
            npositive = 0
            nnegative = 0
            nneutral = 0
            nfiltered = 0
            nsynonymous = 0

            if prefix:
                options.stdout.write("%s\t" % prefix)

            for x, site in enumerate(result.mSites):
                column = column_data[x * 3]

                if column.mNChars != mali_length:
                    nfiltered += 1
                    continue

                if site.isPositive(options.significance_threshold,
                                   options.use_adjusted):
                    npositive += 1
                elif site.isNegative(options.significance_threshold,
                                     options.use_adjusted):
                    nnegative += 1

                if site.isSynonymous():
                    nsynonymous += 1

                ntotal += 1

            options.stdout.write(
                "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                (result.mTreeLength, result.mOmega, result.mKappa,
                 result.mLogLikelihood, len(result.mSites), nfiltered, ntotal,
                 nsynonymous, nnegative, npositive))
            counts = Result(nfiltered, ntotal, nsynonymous, nnegative,
                            npositive)

        else:
            # one of the *-site-table / *-site-list methods

            select_positive_sites = options.method in ("positive-site-table",
                                                       "positive-site-list")
            select_negative_sites = options.method in ("negative-site-table",
                                                       "negative-site-list")
            # NOTE(review): for the neutral-site-* methods neither flag is
            # set, so no site is ever kept - confirm that this is intended.

            # iterate over sites and output those under xxx selection
            identifiers = mali.getIdentifiers()
            chars_per_row = [[] for x in range(mali_length)]

            sites = []

            for col, site in enumerate(result.mSites):
                column = column_data[col * 3]

                if column.mNChars != mali_length:
                    continue

                keep = False

                if select_positive_sites and site.isPositive(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                elif select_negative_sites and site.isNegative(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                if not keep:
                    continue

                sites.append((col, site))

            # number of selected sites before any truncation
            nsites = len(sites)

            if options.truncate_sites_list:
                # truncate sites list, sort by significance.  The key-based
                # sort replaces the Python-2-only ``cmp`` form; both are
                # stable.
                sites.sort(key=lambda entry: entry[1].mPValue)
                sites = sites[:options.truncate_sites_list]

            for col, site in sites:
                xcol = col * 3

                for row in range(mali_length):
                    id = identifiers[row]
                    x = max(xcol - options.context_size * 3, 0)
                    y = min(xcol + 3 + options.context_size * 3, mali_width)
                    segment = mali[id][x:y]
                    codon = mali[id][xcol:xcol + 3]
                    pos = mali.getResidueNumber(id, xcol)
                    # floor division keeps an integer codon index on
                    # Python 3 as well
                    pos //= 3

                    # save as real-world coordinates
                    chars_per_row[row].append(
                        PositionInformation(
                            Genomics.MapCodon2AA(codon), pos + 1, xcol,
                            Genomics.TranslateDNA2Protein(segment).upper()))

            if p_value is not None:
                pp_value = p_value
            else:
                pp_value = "na"

            if options.method in ("positive-site-table", "negative-site-table",
                                  "neutral-site-table"):

                if options.context_size:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value, ";".join([
                                "%s%i in %s" %
                                (x.mAA, x.mSequencePosition, x.mContext)
                                for x in chars_per_row[row]
                            ])))
                else:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value, ";".join([
                                "%s%i" % (x.mAA, x.mSequencePosition)
                                for x in chars_per_row[row]
                            ])))

            elif options.method in ("positive-site-list", "negative-site-list",
                                    "neutral-site-list"):

                for row in range(mali_length):

                    if prefix:
                        xprefix = "%s\t%s" % (prefix, identifiers[row])
                    else:
                        xprefix = "%s" % (identifiers[row])

                    # x is the 1-based running number of the site in the row
                    for x, chars in enumerate(chars_per_row[row], 1):
                        options.stdout.write(
                            "%s\t%i\t%s\t%i\t%i\t%s\n" %
                            (xprefix, x, chars.mAA, chars.mSequencePosition,
                             chars.mMaliPosition, chars.mContext))

    options.stdout.flush()

    return counts

예제 #10

파일 보기

def main(argv=None):
    """Apply a chain of sequence-manipulation methods to FASTA records.

    Records are read from ``options.stdin``.  Every method given with
    ``--method`` is applied, in command-line order, to each record and the
    resulting title/sequence pair is written to ``options.stdout``.

    Some methods consume entries from the comma-separated ``--parameters``
    stack (in this order): ``apply-map`` (map file), ``map-codons`` (codon
    map file), ``mask-soft`` (hard-masked FASTA file),
    ``mask-codons``/``back-translate`` (second FASTA file) and finally
    ``build-map`` (output file name, empty for stdout).

    Records can be dropped via ``--include-pattern``, ``--exclude-pattern``,
    ``--sample-proportion`` and ``--filter-method`` (``min-length=N``,
    ``max-length=N``, ``id-file=PATH``).

    Args:
        argv: command line arguments; defaults to ``sys.argv``.
            NOTE(review): the value is not forwarded to ``E.Start`` here,
            matching the original code -- confirm whether that is intended.

    Raises:
        ValueError: on inconsistent input (sequence length not divisible
            by three, mismatching titles or lengths between paired
            streams, exhausted secondary stream, duplicate identifiers in
            ``build-map``, missing sample proportion).
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "remove-stops", "upper", "lower", "reverse-complement",
                 "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_option("-p",
                      "--parameters",
                      dest="parameters",
                      type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")

    parser.add_option("-x",
                      "--ignore-errors",
                      dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")

    parser.add_option("--sample-proportion",
                      dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")

    parser.add_option("--exclude-pattern",
                      dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--include-pattern",
                      dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")

    parser.add_option("--filter-method",
                      dest="filter_methods",
                      type="string",
                      action="append",
                      help="filtering methods to apply "
                      "[default = %default].")

    parser.add_option(
        "-t",
        "--sequence-type",
        dest="type",
        type="choice",
        choices=("aa", "na"),
        help="sequence type (aa or na) [%default]. This option determines "
        "which characters to use for masking [default = %default].")

    parser.add_option(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type="string",
        help="template for numerical identifier [default = %default] "
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    # The parameter stack: methods pop their argument from the front.
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_seq2nid = IOTools.ReadMap(inf)
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        with open(options.parameters[0], "r") as inf:
            map_codon2code = IOTools.ReadMap(inf)
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        # The stream is consumed lazily, one record per input record.
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            # Only membership is tested below, so keep the ids in a set.
            # Strip just the newline: slicing off the last character would
            # damage a final line that lacks a trailing newline.
            infile = IOTools.openFile(f.split("=", 1)[1])
            try:
                filter_id_list = set(line.rstrip("\n") for line in infile)
            finally:
                infile.close()

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    while True:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            # Methods are chained and may change the sequence length, so the
            # length has to be refreshed before each step.
            l = len(sequence)

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                # number of residues once gap characters are removed
                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        # skips this method only; later methods still run
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" % (
                        cur_record.title, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                # walk the aligned residues, emitting one codon (or a
                # triple gap) per alignment column
                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                # str.maketrans/str.translate replace the Python 2 only
                # string.maketrans/string.translate helpers.
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    # gaps are kept in the column buffer but do not count
                    # towards the codon
                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for column in c:
                                if column in options.gap_chars:
                                    new_sequence.append(column)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                # trailing columns that do not complete a codon
                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    # no more masked records: stop processing methods for
                    # this record and output it as it currently is
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)

                # Check lengths of unmasked and hard masked sequences are
                # the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (cur_record.title))

                # If the hard masked sequence differs, replace every masked
                # position (N) with the lowercase residue of the unmasked
                # version and uppercase everything else. Identical sequences
                # are passed through unchanged (previously they were
                # replaced by an empty string).
                if sequence != hm_sequence:
                    sequence = "".join(
                        x.lower() if y == "N" else x.upper()
                        for x, y in zip_longest(sequence, hm_sequence))

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                # the identifier is the first whitespace-delimited token
                id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                if id in map_seq2nid:
                    rest = cur_record.title[len(id):]
                    cur_record.title = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match(r"^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" % id)
                map_seq2nid[id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for pos in range(0, l, 3):
                    nm = len([ch for ch in seq[pos:pos + 3]
                              if ch in mask_chars])
                    # partially masked codon: mask it completely
                    if 0 < nm < 3:
                        seq[pos:pos + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                # An exhausted stream is reported as ValueError instead of
                # leaking a bare StopIteration.
                other_record = next(other_iterator, None)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        % (cur_record.title, len(other_sequence) * 3,
                           len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        # preserve the case of the amino acid mask character
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            # too short: drop the record instead of writing it anyway
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        # Earlier methods may have consumed every parameter; fall back to
        # stdout in that case instead of failing with an IndexError.
        p = options.parameters[0] if options.parameters else ""
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        try:
            outfile.write("old\tnew\n")
            for old_id, new_id in map_seq2nid.items():
                outfile.write("%s\t%s\n" % (old_id, new_id))
        finally:
            # never close stdout, only a file opened here
            if p:
                outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()