def ClusterByExonCorrespondence(lengths=None, peptide_sequences=None):

    # use None instead of a mutable default argument so that repeated
    # calls do not silently share the same length dictionary
    if lengths is None:
        lengths = {}

    exons = Exons.ReadExonBoundaries(sys.stdin)
    if param_loglevel >= 1:
        print "# read exons for %i transcripts" % len(exons)

    if not lengths:
        for k in exons:
            lengths[k] = (exons[k][0].mPeptideTo / 3) + 1
            for e in exons[k][1:]:
                lengths[k] = max(lengths[k], (e.mPeptideTo / 3) + 1)

        if param_loglevel >= 1:
            print "# lengths for %i transcripts" % len(lengths)

    map_region2transcript = {}
    map_transcript2region = {}
    map_transcript2transcript = {}

    ## build map of regions to transcripts
    for t in exons:
        map_transcript2region[t] = []
        for e in exons[t]:
            r = "%s-%s-%i-%i" % (e.mSbjctToken, e.mSbjctStrand,
                                 e.mGenomeFrom, e.mGenomeTo)
            if r not in map_region2transcript:
                map_region2transcript[r] = []
            map_region2transcript[r].append(t)
            map_transcript2region[t].append(r)

    ## build map of transcript to transcript
    map_transcript2transcript = {}
    for t in map_transcript2region:
        map_transcript2transcript[t] = []
        for r in map_transcript2region[t]:
            for tt in map_region2transcript[r]:
                map_transcript2transcript[t].append(tt)

    ## remove self-matches and collapse duplicates from the sorted lists
    for t in map_transcript2transcript:
        map_transcript2transcript[t].sort()
        l = None
        n = []
        for tt in map_transcript2transcript[t]:
            if t == tt:
                continue
            if l != tt:
                n.append(tt)
            l = tt
        map_transcript2transcript[t] = n

    ## cluster greedily, take longest transcript
    cluster_id = 1
    for t in map_transcript2region:
        if t not in map_transcript2transcript:
            continue
        cluster = CollectCluster(map_transcript2transcript, t)
        PrintCluster(cluster, cluster_id, lengths, peptide_sequences,
                     param_regex_preferred)
        cluster_id += 1

    if param_loglevel >= 1:
        print "# RESULT: %i transcripts in %i genes" % (
            len(map_transcript2region), cluster_id - 1)
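# A minimal, self-contained sketch of the clustering idea used above:
# transcripts become linked whenever they share an identical genomic exon
# region ("contig-strand-start-end" key), and linked transcripts are grouped
# transitively. The exon tuples and the flood fill below are illustrative
# stand-ins, not the CGAT data structures or CollectCluster itself.
def cluster_by_shared_exons(exons):
    """exons: dict of transcript -> list of (contig, strand, start, end)."""
    region2transcripts = {}
    for t, ee in exons.items():
        for contig, strand, start, end in ee:
            key = "%s-%s-%i-%i" % (contig, strand, start, end)
            region2transcripts.setdefault(key, []).append(t)

    neighbours = dict((t, set()) for t in exons)
    for members in region2transcripts.values():
        for t in members:
            neighbours[t].update(m for m in members if m != t)

    clusters, seen = [], set()
    for t in exons:
        if t in seen:
            continue
        stack, cluster = [t], set()
        while stack:                      # transitive closure via flood fill
            c = stack.pop()
            if c in cluster:
                continue
            cluster.add(c)
            stack.extend(neighbours[c] - cluster)
        seen.update(cluster)
        clusters.append(sorted(cluster))
    return clusters

# e.g. cluster_by_shared_exons({"t1": [("chr1", "+", 0, 100)],
#                               "t2": [("chr1", "+", 0, 100)],
#                               "t3": [("chr2", "+", 0, 50)]})
# -> [["t1", "t2"], ["t3"]] (cluster order depends on dict iteration order)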
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/exons2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method",
                      help="method to apply.",
                      type="choice",
                      choices=("remove-stop", ))
    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")
    parser.add_option("--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="work in forward coordinates.")

    parser.set_defaults(method=None,
                        forward_coordinates=False,
                        genome_file=None)

    (options, args) = E.Start(parser)

    if options.method == "remove-stop" and not options.genome_file:
        raise ValueError("please supply genome file for method %s" %
                         options.method)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
        exons = Exons.ReadExonBoundaries(sys.stdin, contig_sizes=contig_sizes)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    ninput, noutput, nremoved_stops, nremoved_exons = 0, 0, 0, 0

    for id, ee in exons.items():

        ninput += 1

        if options.loglevel >= 3:
            for e in ee:
                options.stdlog.write("# %s\n" % str(e))

        if options.method == "remove-stop":
            e = ee[-1]
            d = min(3, e.mPeptideTo - e.mPeptideFrom)
            if d < 3:
                ## the terminal codon is split across the last two exons
                codon2 = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                           e.mGenomeTo - d, e.mGenomeTo)
                prev_e = ee[-2]
                codon1 = fasta.getSequence(prev_e.mSbjctToken,
                                           prev_e.mSbjctStrand,
                                           prev_e.mGenomeTo - (3 - d),
                                           prev_e.mGenomeTo)
                codon = codon1 + codon2
            else:
                codon = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                          e.mGenomeTo - d, e.mGenomeTo)

            if codon.upper() in Genomics.StopCodons:
                if d < 3:
                    nremoved_exons += 1
                    d = 3 - d
                    del ee[-1]
                    e = ee[-1]
                e.mGenomeTo -= d
                e.mPeptideTo -= d
                nremoved_stops += 1
                if e.mGenomeTo == e.mGenomeFrom:
                    nremoved_exons += 1
                    del ee[-1]
                    e = ee[-1]

            assert (e.mGenomeTo > e.mGenomeFrom)
            assert (e.mPeptideTo > e.mPeptideFrom)

        if options.forward_coordinates:
            l = contig_sizes[ee[0].mSbjctToken]
            for e in ee:
                e.InvertGenomicCoordinates(l)

        for e in ee:
            options.stdout.write(str(e) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nremoved_stops=%i, nremoved_exons=%i\n" %
            (ninput, noutput, nremoved_stops, nremoved_exons))

    E.Stop()
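# A hedged sketch of the "remove-stop" bookkeeping above: the last codon of
# a CDS may be split across the final two exons, in which case removing a
# terminal stop shortens both of them. Exons are plain (start, end) tuples on
# the spliced CDS here; the real script works on Exons objects, genomic
# strands and an indexed genome, and the stop-codon set is an assumption.
STOP_CODONS = ("TAG", "TAA", "TGA")

def trim_trailing_stop(exon_bounds, cds):
    """exon_bounds: list of (start, end) on the spliced CDS; cds: its sequence."""
    if len(cds) < 3 or cds[-3:].upper() not in STOP_CODONS:
        return exon_bounds
    remaining = 3
    trimmed = list(exon_bounds)
    while remaining and trimmed:
        start, end = trimmed[-1]
        cut = min(remaining, end - start)
        remaining -= cut
        if end - cut == start:        # exon completely consumed by the stop
            trimmed.pop()
        else:
            trimmed[-1] = (start, end - cut)
    return trimmed

# trim_trailing_stop([(0, 4), (4, 6)], "ATGTAA") -> [(0, 3)]
# (one base of the stop sits in the last exon, two in the one before it)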
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")
    parser.add_option("-f", "--format", dest="format",
                      type="string",
                      help="output format [Default=%default]")
    parser.add_option("-e", "--expand", dest="expand",
                      action="store_true",
                      help="expand positions from peptide to nucleotide alignment [Default=%default]")
    parser.add_option("-m", "--map", dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")
    parser.add_option("-c", "--codons", dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")
    parser.add_option("--one-based-coordinates", dest="one_based_coordinates",
                      action="store_true",
                      help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")
    parser.add_option("--no-identical", dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")
    parser.add_option("-g", "--no-gaps", dest="no_gaps",
                      action="store_true",
                      help="remove all gaps from aligned sequences [Default=%default]")
    parser.add_option("-x", "--exons", dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")
    parser.add_option("-o", "--outfile", dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")
    parser.add_option("--min-length", dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")
    parser.add_option("--filter", dest="filename_filter",
                      type="string",
                      help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):
            infile = open(options.filename_filter, "r")
            iterator = FastaIterator.FastaIterator(infile)
            while 1:
                cur_record = iterator.next()
                if cur_record is None:
                    break
                record1 = cur_record
                cur_record = iterator.next()
                if cur_record is None:
                    break
                record2 = cur_record

                identifier1 = re.match("(\S+)", record1.title).groups()[0]
                identifier2 = re.match("(\S+)", record2.title).groups()[0]
                id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)
                if id not in map_pair2hids:
                    map_pair2hids[id] = []
                map_pair2hids[id].append(s)
            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}
    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col,
                alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New,
                alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()

        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                % (link.mQueryToken, link.mSbjctToken,
                   row_seq.getLength(), col_seq.getLength(),
                   str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(
                    map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mQueryToken]))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mSbjctToken]))
                options.stdlog.write(
                    "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq))
                raise ValueError("incomplete codons %i in pair %s - %s" %
                                 (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1,
                                               from2 + 1, to2)

                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                if mode not in counts:
                    counts[mode] = 0
                counts[mode] += 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            if mode not in counts:
                counts[mode] = 0
            counts[mode] += 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            map(lambda x, y: "%s=%i" % (x, y),
                counts.keys(), counts.values())))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
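# The --filter logic above keys previously written pairs by a hash of the two
# aligned sequences. A minimal sketch of that idea, using hashlib instead of
# the CGAT Genomics.GetHID helper (whose exact digest scheme is not shown in
# this excerpt):
def pair_key(id1, id2):
    return "%s-%s" % (id1, id2)

def pair_hid(seq1, seq2):
    import hashlib
    # assumption: any stable digest works for "have I written this pair before"
    return hashlib.md5((seq1 + ";" + seq2).encode("ascii")).hexdigest()

def is_new_pair(map_pair2hids, id1, id2, seq1, seq2):
    """True if this exact pair of sequences has not been written before."""
    hids = map_pair2hids.get(pair_key(id1, id2), ())
    return pair_hid(seq1, seq2) not in hids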
        if line[0] == "#":
            continue
        if line[0] == ">":
            continue

        a, b = line[:-1].split("\t")[:2]
        if b not in components:
            components[b] = []
        components[b].append(a)

    if param_loglevel >= 1:
        print "# read %i components." % len(components)

else:
    components = {'all': all_identifiers}

if param_filename_exons:
    exons = Exons.ReadExonBoundaries(open(param_filename_exons, "r"),
                                     filter=all_mali)
    if param_loglevel >= 2:
        print "# read %i exons." % len(exons)
else:
    exons = {}

print "# PREFIX\tsummary\tNSEQUENCES\tNASSIGNED\tNCLUSTERS\tNASSIGNED\tUNASSIGNED"
print "# PREFIX\tcluster\tNMEMBERS\tMEMBERS"
print "# PREFIX\tfragments\tNFRAGMENTS\tFRAGMENTS"
print "# PREFIX\tpide\tNPAIRS\tNAMIN\tNAMAX\tNAMEAN\tNAMEDIAN\tNASTDDEV\tAAMIN\tAAMAX\tAAMEAN\tAAMEDIAN\tAASTDDEV"
print string.join(("# PREFIX", "codons",
                   "NCLEAN", "NNOSTOPS",
                   "ALIGNED_MIN", "ALIGNED_MAX", "ALIGNED_MEAN",
                   "ALIGNED_MEDIAN", "ALIGNED_STDDEV",
                   "CODONS_MIN", "CODONS_MAX", "CODONS_MEAN",
                   "CODONS_MEDIAN", "CODONS_STDDEV",
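# A small sketch of the component map being built above: a two-column,
# tab-separated stream of (member, component) pairs becomes a dictionary of
# component -> members, with comment and FASTA-header lines skipped.
def read_components(lines):
    components = {}
    for line in lines:
        if not line or line[0] in "#>":
            continue
        fields = line.rstrip("\n").split("\t")[:2]
        if len(fields) < 2:
            continue
        member, component = fields
        components.setdefault(component, []).append(member)
    return components

# read_components(["a1\tc1\n", "a2\tc1\n", "b1\tc2\n"])
# -> {'c1': ['a1', 'a2'], 'c2': ['b1']}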
    elif o == "--report-step":
        param_report_step = int(a)

if len(args) > 0:
    print USAGE, "no arguments required."
    sys.exit(2)

print E.GetHeader()
print E.GetParams()
sys.stdout.flush()

if param_loglevel >= 1:
    print "# reading exon boundaries."
    sys.stdout.flush()

cds = Exons.ReadExonBoundaries(open(param_filename_cds, "r"))

if param_loglevel >= 1:
    print "# read %i cds" % (len(cds))
    sys.stdout.flush()

ninput, npairs, nskipped = 0, 0, 0

for line in sys.stdin:
    if line[0] == "#":
        continue
    if line[0] == ">":
        print line[:-1]
        continue

    ninput += 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")
    parser.add_option("-b", "--boundaries", dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")
    parser.add_option("-e", "--exons", dest="filename_exons",
                      type="string",
                      help="filename with exons (output).")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")
    parser.add_option("-w", "--write-notfound", dest="write_notfound",
                      action="store_true",
                      help="print exons for predictions not found in reference.")
    parser.add_option("-q", "--quality-pide", dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary=9,
        ## stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(
            open(options.filename_boundaries, "r"),
            do_invert=1,
            remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join(
            ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
             "reference_id", "reference_from", "reference_to",
             "reference_phase", "pidentity", "psimilarity", "nframeshifts",
             "ngaps", "nstopcodons", "is_ok",
             "genome_exon_from", "genome_exon_to")))
    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    options.stdout.write("%s\n" % "\t".join(
        ("prediction_id", "number", "dubious_exons", "boundaries_sum",
         "boundaries_max", "identical_exons", "inserted_exons",
         "deleted_exons", "inserted_introns", "deleted_introns",
         "truncated_Nterminus", "truncated_Cterminus",
         "deleted_Nexons", "deleted_Cexons",
         "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#":
            continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if peptide_sequences.has_key(entry.mQueryToken):

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength() < \
               entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken,
                    query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength() < \
                 entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken,
                    sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            E.debug("prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" %
                    (str(entry.mPredictionId),
                     entry.mPercentSimilarity,
                     entry.mPercentIdentity,
                     percent_similarity,
                     percent_identity))

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key(entry.mQueryToken):
            print "# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken)
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]
            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            min_pide = entry.mPercentIdentity * \
                options.quality_threshold_pide / 100

            in_sync = 0
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write("# current exons: %i and %i\n" %
                                         (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment(tmp_ali,
                                                entry.mMapPeptide2Translation,
                                                xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write(
                            "# WARNING: empty alignment %s\n" % str(
                                (ref_from, exon_from, ref_to, exon_to,
                                 xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity(
                            tmp_ali, query_sequence, sbjct_sequence) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity(
                            tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                if e < len(exons) - 1:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = exons[e + 1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (
                        ref_exons[r + 1].mPeptideFrom,
                        ref_exons[r + 1].mPeptideTo,
                        ref_exons[r + 1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken,
                                  exon_from, exon_to, exon_phase,
                                  exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.slipping_exon_boundary
                # apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                   ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                     exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max(ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation,
                            xquery_from, xquery_to)

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to,
                                     xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str(alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence))
                            percent_identity = alignlib_lite.calculatePercentIdentity(
                                tmp_ali, query_sequence, sbjct_sequence) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and \
                       dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and \
                       dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and \
                       dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and \
                       dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon:
                            nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and \
                         next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and \
                         next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom, exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment,
                            border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0:
                        this_e = 0
                    if ref_to == 0:
                        this_r = 0

                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            this_e, exon_from, exon_to, exon_phase,
                            this_r, ref_from, ref_to, ref_phase,
                            percent_identity, percent_similarity,
                            nframeshifts, ngaps, nstopcodons,
                            is_good_exon,
                            exon_genome_from, exon_genome_to,
                        )), "\t") + "\n")

            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            e, exon_from, exon_to, exon_phase,
                            0, 0, 0, 0,
                            0, 0, 0, 0, 0,
                            1,
                            exon_genome_from, exon_genome_to,
                        )), "\t") + "\n")

            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            0, 0, 0, 0,
                            r, ref_from, ref_to, ref_phase,
                            0, 0, 0, 0, 0,
                            0,
                            0, 0,
                        )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment)

                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            this_e, exon_from, exon_to, exon_phase,
                            0, 0, 0, 0,
                            percent_identity, percent_similarity,
                            nframeshifts, ngaps, nstopcodons,
                            1,
                            exon_genome_from, exon_genome_to,
                        )), "\t") + "\n")

        options.stdout.write("\t".join(map(str, (
            entry.mPredictionId,
            exons_num_exons,
            dubious_exons,
            exons_boundaries_sum,
            exons_boundaries_max,
            nidentical_exons,
            ninserted_exons,
            ndeleted_exons,
            ninserted_introns,
            ndeleted_introns,
            truncated_Nterminal_exon,
            truncated_Cterminal_exon,
            ndeleted_Nexons,
            ndeleted_Cexons,
            ninserted_Nexons,
            ninserted_Cexons))) + "\n")
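# A reduced sketch of the exon comparison policy used above: two exon
# boundaries count as matching if they differ by at most a "slippage"
# tolerance, and the tolerance drops to zero for exons shorter than the
# tolerance itself. Plain CDS coordinates only; no alignments involved.
def boundaries_match(exon, ref, slippage=9):
    """exon, ref: (from, to) CDS coordinates; returns True on a match."""
    (efrom, eto), (rfrom, rto) = exon, ref
    if eto - efrom <= slippage or rto - rfrom <= slippage:
        tolerance = 0  # small exons must match exactly
    else:
        tolerance = slippage
    return abs(efrom - rfrom) <= tolerance and abs(eto - rto) <= tolerance

# boundaries_match((0, 100), (3, 100))  -> True  (within 9 bases)
# boundaries_match((0, 8), (3, 8))      -> False (small exon, exact only)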
def ReadTranscriptsAndCds(transcript_ids1, transcript_ids2):

    if param_loglevel >= 1:
        print "# reading %i left and %i right transcripts" % (
            len(transcript_ids1), len(transcript_ids2))
        sys.stdout.flush()

    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds1 = Exons.ReadExonBoundaries(open(param_filename_cds1, "r"),
                                    filter=transcript_ids1,
                                    reset=True)
    cds2 = Exons.ReadExonBoundaries(open(param_filename_cds2, "r"),
                                    filter=transcript_ids2,
                                    reset=True)

    if param_loglevel >= 1:
        print "# read %i left and %i right cds" % (len(cds1), len(cds2))
        sys.stdout.flush()

    if param_loglevel >= 2:
        if len(cds1) != len(transcript_ids1):
            print "# missed in left: %s" % ":".join(
                set(transcript_ids1.keys()).difference(cds1.keys()))
        if len(cds2) != len(transcript_ids2):
            print "# missed in right: %s" % ":".join(
                set(transcript_ids2.keys()).difference(cds2.keys()))

    if param_loglevel >= 1:
        print "# reading genomic sequences."
        sys.stdout.flush()

    transcripts1 = {}
    if param_filename_transcripts1:
        if param_mode_genome1 == "indexed":
            transcripts1 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts1,
                filter=transcript_ids1)
        else:
            transcripts1 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts1, "r"),
                do_reverse=0,
                filter=transcript_ids1,
                mask=param_mask)

    transcripts2 = {}
    if param_filename_transcripts2:
        if param_mode_genome2 == "indexed":
            transcripts2 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts2,
                filter=transcript_ids2)
        else:
            transcripts2 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts2, "r"),
                do_reverse=0,
                filter=transcript_ids2,
                mask=param_mask)

    if param_loglevel >= 1:
        print "# read %i left and %i right transcript sequences" % (
            len(transcripts1), len(transcripts2))
        sys.stdout.flush()

    return transcripts1, transcripts2, cds1, cds2
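# The set arithmetic above is the whole "missed transcripts" report. A tiny
# runnable version, with plain dicts standing in for the CDS dictionaries:
def report_missed(requested, loaded):
    """IDs that were requested but are absent from what was actually loaded."""
    return sorted(set(requested).difference(loaded))

# report_missed({"t1": 1, "t2": 1}, {"t1": [1]}) -> ['t2']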
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")
    parser.add_option("--coordinate-format", dest="coordinate_format",
                      type="string",
                      help="input type of coordinates.")
    parser.add_option("--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="output forward coordinates.")
    parser.add_option("-e", "--extract-id", dest="extract_id",
                      type="string",
                      help="""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""")

    parser.set_defaults(
        coordinate_format="zero-forward",
        forward_coordinates=False,
        genome_file=None,
        extract_id=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile(options.extract_id)
    else:
        extract_id = None

    converter = IndexedFasta.getConverter(options.coordinate_format)

    exons = Exons.ReadExonBoundaries(sys.stdin,
                                     contig_sizes=contig_sizes,
                                     converter=converter,
                                     do_invert=True,
                                     format="gtf",
                                     gtf_extract_id=extract_id)

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and \
               e.mSbjctToken in contig_sizes and \
               e.mSbjctStrand == "-":
                l = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = l - e.mGenomeTo, l - e.mGenomeFrom
            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write("# Error: %s\n" % str(e))
                break
            options.stdout.write(str(e) + "\n")
            nexons += 1
        if has_error:
            nerrors += 1
            continue

    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" %
                             (ntranscripts, nexons, nerrors))

    E.Stop()
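# The forward-coordinate conversion above maps an interval given on the
# reverse strand into forward-strand coordinates by reflecting it at the
# contig length: [from, to) on "-" becomes [l - to, l - from) on "+".
# A worked example with a contig of length 1000:
def to_forward(start, end, contig_length):
    return contig_length - end, contig_length - start

# to_forward(100, 250, 1000) -> (750, 900)
# A negative result signals an interval that did not fit the contig, which
# is exactly the condition the loop above counts in `nerrors`.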
            continue
        else:
            nnotfound += 1

        new_results.append(entry)
        noutput += 1

    results = new_results

    if results:
        options.stdout.write(str(results) + "\n")

elif options.output_format == "exontable":

    if options.format == "exons":
        exons = Exons.ReadExonBoundaries(sys.stdin,
                                         contig_sizes=contig_sizes,
                                         delete_missing=True)
    else:
        raise ValueError("unknown format.")

    for k in exons.keys():
        ee = exons[k]
        id = 0
        for e in ee:
            id += 1
            print "\t".join(map(str, (e.mQueryToken, id,
                                      e.mPeptideFrom, e.mPeptideTo,
                                      e.frame,
                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      e.mGenomeFrom, e.mGenomeTo)))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # assumption: this excerpt does not show the parser being created, so a
    # generic one is instantiated here following the other scripts; the
    # original version string is not available.
    parser = E.OptionParser(usage=globals()["__doc__"])

    # the long option strings below carried a stray trailing "=" in the
    # excerpt ("--format=" etc.), which optparse rejects
    parser.add_option("-q", "--quality", dest="quality", type="string",
                      help="quality categories to take into account.")
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="input format [exons|gff|table]")
    parser.add_option("-e", "--exons", dest="tablename_exons", type="string",
                      help="table name with exons.")
    parser.add_option("-p", "--predictions", dest="tablename_predictions",
                      type="string",
                      help="table name with predictions.")
    parser.add_option("-n", "--non-redundant", dest="non_redundant",
                      action="store_true",
                      help="only non-redundant predictions.")
    parser.add_option("-s", "--schema", dest="schema", type="string",
                      help="schema to use.")

    parser.set_defaults(
        fields=["Id", "NumExons", "GeneLength", "MinExonLength",
                "MaxExonLength", "MinIntronLength", "MaxIntronLength"],
        tablename_exons="exons",
        tablename_predictions="predictions",
        quality=None,
        non_redundant=False,
        schema=None,
        tablename_redundant="redundant",
        tablename_quality="quality",
        format="exons",
    )

    (options, args) = E.Start(parser,
                              add_csv_options=True,
                              add_psql_options=True)

    if options.quality:
        options.quality = options.quality.split(",")

    if options.format == "table":
        dbhandle = pgdb.connect(options.psql_connection)
        exons = Exons.GetExonBoundariesFromTable(
            dbhandle,
            options.tablename_predictions,
            options.tablename_exons,
            non_redundant_filter=options.non_redundant,
            quality_filter=options.quality,
            table_name_quality=options.tablename_quality,
            table_name_redundant=options.tablename_redundant,
            schema=options.schema)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    stats = Exons.CalculateStats(exons)

    print "\t".join(options.fields)

    writer = csv.DictWriter(sys.stdout,
                            options.fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    for k, v in stats.items():
        v["Id"] = k
        writer.writerow(v)

    E.Stop()
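# Exons.CalculateStats is consumed above as a dict of per-gene records. A
# hedged sketch of the kind of statistics the field list implies, computed
# from sorted (start, end) exon intervals; the CGAT implementation may differ
# in details such as interval conventions.
def exon_stats(exons):
    """exons: sorted list of (start, end); returns a dict of summary fields."""
    exon_lengths = [end - start for start, end in exons]
    intron_lengths = [b[0] - a[1] for a, b in zip(exons, exons[1:])]
    return {
        "NumExons": len(exons),
        "GeneLength": exons[-1][1] - exons[0][0],
        "MinExonLength": min(exon_lengths),
        "MaxExonLength": max(exon_lengths),
        "MinIntronLength": min(intron_lengths) if intron_lengths else 0,
        "MaxIntronLength": max(intron_lengths) if intron_lengths else 0,
    }

# exon_stats([(0, 100), (200, 260)])
# -> NumExons=2, GeneLength=260, Min/MaxExonLength=60/100,
#    Min/MaxIntronLength=100/100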
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--master", dest="master", type="string",
                      help="master sequence.")
    parser.add_option("-p", "--master-pattern", dest="master_pattern",
                      type="string",
                      help="master pattern.")
    parser.add_option("--master-species", dest="master_species", type="string",
                      help="species to use as master sequences.")
    parser.add_option("-t", "--translate", dest="filename_translation",
                      type="string",
                      help="filename on where to store translated sequences.")
    parser.add_option("-e", "--exons", dest="filename_exons", type="string",
                      help="filename on where to exon information.")
    parser.add_option("-c", "--mark-codons", dest="mark_codons",
                      action="store_true",
                      help="mark codons.")
    parser.add_option("-i", "--ignore-case", dest="ignore_case",
                      action="store_true",
                      help="ignore case (otherwise: lowercase are unaligned chars).")
    parser.add_option("--remove-stops", dest="remove_stops",
                      action="store_true",
                      help="remove stop codons.")
    parser.add_option("--mask-stops", dest="mask_stops",
                      action="store_true",
                      help="mask stop codons.")
    parser.add_option("--mask-char", dest="mask_char", type="string",
                      help="masking character to use.")
    parser.add_option("-f", "--remove-frameshifts", dest="remove_frameshifts",
                      action="store_true",
                      help="remove columns corresponding to frameshifts.")
    parser.add_option("--mask-master", dest="mask_master",
                      action="store_true",
                      help="columns in master to be removed are masked to keep residue numbering.")
    parser.add_option("-s", "--split-exons", dest="split_exons",
                      action="store_true",
                      help="split columns aligned to different exons in the same gene.")
    parser.add_option("-a", "--target", dest="target", type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True

        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n"
                % (options.mask_char))

    ## 1. read multiple alignment in fasta format
    mali = Mali.Mali()
    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    identifiers = mali.getIdentifiers()
    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for id in identifiers:
            if re.search(options.master_pattern, id):
                masters.append(id)
    elif options.master_species:
        for id in identifiers:
            if options.master_species == id.split(options.separator)[0]:
                masters.append(id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         filter=set(identifiers),
                                         from_zero=True)

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    #################################################################################
    ## translate characters to upper/lower case according to exon info.
    #################################################################################
    if exons:
        for id in identifiers:
            if id in exons:
                mali.getSequence(id).mString = AddExonInformation(
                    mali[id], exons[id], mask_char=options.mask_char)
    elif options.ignore_case:
        ## convert all to uppercase
        mali.upper()

    #################################################################################
    ## untangle misaligned exons
    #################################################################################
    if exons and options.split_exons:

        ## first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                mali.writeToFile(open("log_mali1", "w"), format="fasta")

        SplitExons(mali, exons, options)

    #################################################################################
    ## remove frameshifts
    #################################################################################
    if options.remove_frameshifts:
        out_of_frame_columns = []
        if len(masters) == 1:
            frame_columns = GetFrameColumns(mali, masters[0],
                                            gap_chars=options.gap_chars)
        else:
            columns = []
            for id in masters:
                columns += GetFrameColumns(mali, id,
                                           gap_chars=options.gap_chars)
            if len(columns) == 0:
                columns += GetFrameColumns(mali, identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all columns by tuple. The "shortest" codon will be first:
            # (1,2,3) before (1,2,100), and (1,2,100) before (1,3,4).
            columns.sort(lambda x, y: cmp((x[0], x[2]), (y[0], y[2])))

            # select codons
            frame_columns = []
            last_codon = columns[0]
            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon:
                    continue

                # take first (shortest) codon in case of identical first residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    out_of_frame_columns += last_codon

                # if overlapping, but out of register: skip
                last_codon = codon

            frame_columns.append(last_codon)

        # build set of skipped columns
        frame_set = set()
        for column in frame_columns:
            for c in column:
                frame_set.add(c)

        # columns that contain a master sequence that is out of frame
        out_of_frame_set = set(out_of_frame_columns)
        out_of_frame_set = out_of_frame_set.difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

            if options.loglevel >= 5:
                options.stdlog.write("# frame columns: %i\n" %
                                     (len(frame_columns)))
                x = 0
                for column in frame_columns:
                    options.stdlog.write("# %i\t%s\n" %
                                         (x, ",".join(map(str, column))))
                    x += 1

            if options.loglevel >= 5:
                options.stdlog.write(
                    "# Out-of frame columns with residue of masters: %i\n" %
                    (len(out_of_frame_set)))
                options.stdlog.write("# %s" %
                                     ",".join(map(str, out_of_frame_columns)))

        mask_chars = (string.upper(options.mask_char),
                      string.lower(options.mask_char))

        to_delete = []

        ignore_case = exons or options.ignore_case

        for id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (id, len(sequence)))

            ## treat masters differently if they are only to be masked, not
            ## pruned.
            ## simple mask all characters that are to skipped
            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            codon = []
            chars = []

            is_master = id in masters

            for x in range(len(sequence)):
                c = sequence[x]

                ## delete columns that do not align to
                ## a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)
                if len(codon) % 3 == 0:
                    codon = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon, options)

                    if codon_is_aligned:
                        naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            ## mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))

##            else:
##                for a,b,c in frame_columns:
##                    codon = sequence[a] + sequence[b] + sequence[c]
##                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon( codon, options )
##                    if codon_is_aligned: naligned += 1
##                    if codon_is_all_gaps:
##                        fragments.append( options.gap_char * 3 )
##                        ngaps += 1
##                    elif codon_is_ok:
##                        ncodons += 1
##                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
##                            if options.remove_stops:
##                                fragments.append( options.gap_char * 3 )
##                            elif options.mask_stops:
##                                fragments.append( options.mask_char * 3 )
##                            else:
##                                fragments.append( codon )
##                            nstops += 1
##                        else:
##                            fragments.append( codon )
##                    else:
##                        fragments.append( options.gap_char * 3 )
##                        nmasked += 1

##                    if options.loglevel >= 7:
##                        options.stdlog.write("# %s: %i,%i,%i: codon=%s ok=%s is_aligned=%s\n" % (id,
##                                                                                                 a,b,c,
##                                                                                                 codon,
##                                                                                                 str(codon_is_ok),
##                                                                                                 str(codon_is_aligned) ))

            s = string.join(fragments, "")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n"
                    % (id, len(fragments), naligned, ncodons, nstops,
                       ngaps, nmasked))
                options.stdlog.flush()

            ## postpone deletion in order to not
            ## confuse the iteration of ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n" % id)
                to_delete.append(id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n" % id)
                to_delete.append(id)
            else:
                mali.setSequence(id, string.join(fragments, ""))

        for id in to_delete:
            del mali[id]

    for id in identifiers:
        if options.mark_codons:
            a = mali[id]
            f = lambda x: a[x:x + 3]
            s = string.join([f(x) for x in range(0, len(a), 3)], " ")
        else:
            s = mali[id]
        options.stdout.write(">%s\n%s\n" % (id, s))

    if options.filename_translation:
        outfile = open(options.filename_translation, "w")
        for id in mali.keys():
            outfile.write(">%s\n%s\n" %
                          (id, Genomics.TranslateDNA2Protein(mali[id])))
        outfile.close()

    E.Stop()
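# A compact sketch of GetFrameColumns-style logic referenced above: walk a
# master row of the alignment and emit one (i, j, k) column triple per three
# non-gap characters, i.e. the columns that hold one in-frame codon. The
# function name and behaviour here illustrate the idea, not the CGAT
# implementation.
def frame_columns(master_row, gap_chars="-."):
    columns, codon = [], []
    for x, c in enumerate(master_row):
        if c in gap_chars:
            continue
        codon.append(x)
        if len(codon) == 3:
            columns.append(tuple(codon))
            codon = []
    return columns

# frame_columns("AT-GC--A") -> [(0, 1, 3)]
# (columns 0, 1 and 3 carry the first codon; the trailing "CA" is incomplete)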
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--overlap", dest="overlap_residues", type="int",
                      help="overlap residues.")
    parser.add_option("-t", "--filter-tokens", dest="filename_filter_tokens",
                      type="string",
                      help="filename to filter tokens.")
    parser.add_option("-i", "--exon-identity", dest="exon_identity",
                      action="store_true",
                      help="exon identity.")
    parser.add_option("--exons", dest="filename_exons", type="string",
                      help="filename with exon information.")
    parser.add_option("-m", "--output-members", dest="filename_members",
                      type="string",
                      help="output filename with members.")
    parser.add_option("--overlap-id", dest="overlap_id", action="store_true",
                      help="overlap id.")
    parser.add_option("-s", "--remove-spanning",
                      dest="remove_spanning_predictions", action="store_true",
                      help="remove spanning predictions.")
    parser.add_option("-c", "--remove-complement",
                      dest="remove_complementary_predictions",
                      action="store_true",
                      help="remove complementary predictions.")
    parser.add_option("--remove-exon-swoppers", dest="remove_exon_swoppers",
                      action="store_true",
                      help="remove exon swoppers.")
    parser.add_option("--remove-gene-spanners", dest="remove_gene_spanners",
                      action="store_true",
                      help="remove gene spanners.")
    parser.add_option("--remove-suboptimal", dest="remove_suboptimal",
                      action="store_true",
                      help="remove suboptimal predictions.")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide information.")
    parser.add_option("--extended-peptides",
                      dest="filename_extended_peptides", type="string",
                      help="filename with peptide information - after extension.")
    parser.add_option("--test", dest="test_nids", type="string",
                      help="test nids.")
    ## filter options
    parser.add_option("--filter-transcripts",
                      dest="filter_filename_transcripts", type="string",
                      help="filename with transcripts that are used to filter.")
    parser.add_option("--filter-remove-spanning",
                      dest="filter_remove_spanning", action="store_true",
                      help="remove all transcripts that span the filter set.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")
    parser.add_option("--discard-large-clusters",
                      dest="discard_large_clusters", type="int",
                      help="if set discard clusters bigger than this size (patch) [default=%default].")

    parser.set_defaults(
        filename_members=None,
        filename_peptides=None,
        filename_extended_peptides=None,
        filename_exons=None,
        quality_hierarchy=("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP",
                           "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK"),
        ## Classes, where redundancy is removed by similarity. When exon
        ## structure is not conserved, I can't predict alternative splice
        ## variants, so remove the redundancy.
        quality_exclude_same=("UG", "UP", "UF", "BF", "UK"),
        quality_genes=("CG", "SG", "PG", "RG", "UG"),
        ## class that can be removed in spanning/complementary predictions
        quality_remove_dubious=("UG", "UP", "UF", "BF", "UK"),
        ## class that is required for defining exon swopper event
        quality_remove_exon_swopper=("CG", "PG"),
        ## class that will kept, in spite of being an exons swopper.
        quality_keep_exon_swopper=(),
        ## class that is required for removing gene spanners;
        ## note: one-element tuple - a bare ("CG") would be a plain string
        quality_remove_gene_spanners=("CG", ),
        ## class that will kept, in spite of being a gene spanner
        quality_keep_gene_spanners=(),
        ## class that is required for defining suboptimal matches
        quality_remove_suboptimal=("CG", "PG"),
        ## class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal=(),
        ## gap penalties
        gop=-10.0,
        gep=-1.0,
        ## maximum number of gaps to allow in alignment
        max_gaps=20,
        ## threshold of percent identity that allows to remove a prediction
        ## of a lower class.
        ## This allows for insertions/deletions
        min_identity=98,
        ## threshold of percent identity that allows to remove a prediction
        ## of a non-gene by a gene
        min_identity_non_genes=80,
        ## safety threshold: do not remove, if coverage of member is by x
        ## better than representative
        safety_pide=10,
        safety_coverage=10,
        overlap_id=False,
        remove_spanning_predictions=False,
        remove_exon_swoppers=False,
        remove_gene_spanners=False,
        remove_suboptimal=False,
        ## nids to use for testing
        test_nids=None,
        ## remove members with less than maximum coverage
        max_member_coverage=90,
        ## maximum allowable exon slippage
        max_slippage=9,
        ## minimum difference in identity for suboptimal predictions to be
        ## removed.
        suboptimal_min_identity_difference=10,
        ## filter options
        filter_filename_transcripts=None,
        filter_remove_spanning=True,
        filter_remove_spanning_both_strands=True,
        genome_file=None,
        discard_large_clusters=None)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.test_nids:
        options.test_nids = options.test_nids.split(",")

    # list of eliminated predictions
    eliminated_predictions = {}

    if options.filename_members:
        outfile_members = open(options.filename_members, "w")
    else:
        outfile_members = sys.stdout

    ######################################################
    # data
    ######################################################
    data = []

    class Entry:
        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int(gff['xstart'])
            self.mExtendedEnd = int(gff['xend'])
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']

    for gff in GTF.iterator(sys.stdin):
        data.append(Entry(gff))

    if options.loglevel >= 1:
        options.stdlog.write("# read %i transcripts.\n" % len(data))
        options.stdlog.flush()

    ######################################################
    # read peptide sequences
    ######################################################
    if options.loglevel >= 1:
        options.stdlog.write("# loading peptide databases ... ")
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta(options.filename_peptides)
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    # read extended peptide sequences
    ######################################################
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta(
            options.filename_extended_peptides)
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write("finished\n")
        options.stdlog.flush()

    ######################################################
    ## open genome file
    ######################################################
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ## reading exons, clustering and formatting them.
    ######################################################
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write("# reading exon boundaries ... ")
            options.stdlog.flush()

        ids = [x.transcript_id for x in data]

        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         contig_sizes=contig_sizes,
                                         filter=set(ids))

        if options.loglevel >= 1:
            options.stdlog.write("done - read exons for %i transcripts\n" %
                                 (len(exons)))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")

        # flag terminal exons
        Exons.SetRankToPositionFlag(exons)

        identity_map_cluster2transcripts, identity_map_transcript2cluster = \
            Exons.ClusterByExonIdentity(exons,
                                        max_terminal_num_exons=3,
                                        max_slippage=options.max_slippage,
                                        loglevel=options.loglevel)

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster = \
            Exons.ClusterByExonOverlap(exons,
                                       min_overlap=10,
                                       loglevel=options.loglevel)
    else:
        exons = {}

    ######################################################
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ## read filter transcripts and apply filters
    ######################################################
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write(
                "# reading exon boundaries for filter set ... ")
            options.stdlog.flush()

        filter_exons = Exons.ReadExonBoundaries(
            open(options.filter_filename_transcripts, "r"),
            delete_missing=True,
            contig_sizes=contig_sizes)

        if options.loglevel >= 1:
            options.stdlog.write("done - read exons for %i transcripts\n" %
                                 (len(filter_exons)))

        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts(
            exons,
            filter_exons,
            eliminated_predictions,
            contig_sizes,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n"
                % (n, time.time() - t))
            options.stdlog.flush()

    if options.remove_exon_swoppers and not exons:
        raise ValueError(
            "please specify exon table if using --remove-swoppers.")
    if options.remove_gene_spanners and not exons:
        raise ValueError(
            "please specify exon table if using --remove-gene-spanners.")

    ########################################################################################
    ## remove predictions spanning other predictions but do not overlap with
    ## them on an exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing gene spanners\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateGeneSpanners(data,
                                           eliminated_predictions,
                                           exons,
                                           options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write("# removed %i gene spanners in %i seconds\n" %
                                 (n, time.time() - t))
            options.stdlog.flush()

    ########################################################################################
    ## sort data by quality, length of prediction and coverage * pid
    if options.loglevel >= 1:
        options.stdlog.write("# sorting data\n")
        options.stdlog.flush()

    map2pos = {}
    for x in range(len(options.quality_hierarchy)):
        map2pos[options.quality_hierarchy[x]] = x

    data.sort(key=lambda x: (map2pos[x.mQuality],
                             len(extended_peptides[x.transcript_id]),
                             x.mQueryCoverage * x.mPid))

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write("# sorting data finished\n")
        options.stdlog.flush()

    ########################################################################################
    ## remove predictions joining two other complete non-overlapping
    ## predictions
    if options.remove_exon_swoppers and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing exon swoppers\n")
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers(
            data,
            eliminated_predictions,
            identity_map_transcript2cluster,
            identity_map_cluster2transcripts,
            map_prediction2data,
            exons,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write("# removed %i exon swoppers\n" % n)
            options.stdlog.flush()

    ########################################################################################
    ## remove suboptimal predictions
    if options.remove_suboptimal and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing suboptimal predictions\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions(
            data,
            eliminated_predictions,
            overlap_map_transcript2cluster,
            overlap_map_cluster2transcripts,
            map_prediction2data,
            exons,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i suboptimal predictions in %i seconds\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    ########################################################################################
    ## remove redundant predictions
    l = len(data)
    options.report_step = max(1, int(l / 100))
    t2 = time.time()

    last_quality = None
    qualities = []

    options.stdout.write("%s\t%s\n" % ("rep", "comment"))

    for x in range(len(data)):

        if options.loglevel >= 1:
            if x % options.report_step == 0:
                options.stdlog.write(
                    "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" %
                    (x + 1, l,
                     int(100 * (x + 1) / l),
                     len(eliminated_predictions), l,
                     100 * len(eliminated_predictions) / l,
                     time.time() - t2))
                options.stdlog.flush()

        rep = data[x]
        rep_id, rep_quality = rep.transcript_id, rep.mQuality

        if rep_id in eliminated_predictions:
            continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append(last_quality)
            last_quality = rep_quality

        if options.loglevel >= 2:
            options.stdlog.write("# processing prediction %s|%s\n" %
                                 (rep_id, rep_quality))
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            eliminated += EliminateRedundantEntriesByOverlap(
                rep,
                data[x + 1:],
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)
        else:
            eliminated += EliminateRedundantEntriesByRange(
                rep,
                data,
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)

        options.stdout.write("%s\t%i\n" % (rep_id, len(eliminated)))

        if outfile_members:
            outfile_members.write("%s\t%s\tm\n" % (str(rep_id), str(rep_id)))

        nrepresentatives += 1
        nmembers += PrintMembers(rep_id, outfile_members, eliminated,
                                 eliminated_by_method)

    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write(
        "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %
        (nrepresentatives, nmembers, neliminated,
         nrepresentatives + nmembers + neliminated))
    options.stdlog.write("# elimination by method:\n")

    for v, c in eliminated_by_method.items():
        options.stdlog.write("# method=%s, count=%i\n" % (v, c))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2cleaned_mali.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-m", "--genome-master", dest="genome_master",
                      type="string",
                      help="genome to use as master.")

    parser.add_option("-s", "--filename-removed", dest="filename_removed",
                      type="string",
                      help="output filename for deleted entries.")

    parser.add_option("-e", "--filename-exons", dest="filename_exons",
                      type="string",
                      help="filename with exon information.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="output filename of component summary.")

    parser.add_option("-c", "--filename-components", dest="filename_components",
                      type="string",
                      help="output filename for components.")

    parser.add_option("--min-percent-overlap", dest="min_percent_overlap",
                      type="float",
                      help="minimum percent overlap for splitting multiple alignment into components.")

    parser.add_option("--max-percent-overlap", dest="max_percent_overlap",
                      type="float",
                      help="maximum percent overlap for split genes.")

    parser.add_option("--min-genomic-distance", dest="min_genomic_distance",
                      type="int",
                      help="minimum genomic distance for adjacent genes to be considered dodgy.")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("joining", "split"),
                      help="""how to filter the alignment.
joining: remove joining transcripts (spindly genes)
split: remove split transcripts""")

    parser.add_option("-g", "--gene-mode", dest="gene_mode",
                      action="store_true",
                      help="""the aligned sequences are genes. This forces
the exon boundaries to be collated by genes.""")

    parser.set_defaults(
        genome_master=None,
        filename_removed=None,
        filename_components=None,
        filename_summary=None,
        filename_exons=None,
        mode="joining",
        input_format="fasta",
        output_format="fasta",
        max_percent_overlap=0,
        min_percent_overlap=0,
        gene_mode=False,
        separator="|")

    (options, args) = E.Start(parser)

    ###############################################################
    ###############################################################
    ###############################################################
    ## input
    ###############################################################
    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)
    all_identifiers = mali.getIdentifiers()

    if options.filename_exons:
        ## read exon boundaries and keep forward coordinates
        if options.gene_mode:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             from_zero=True)
            # collate exons by gene: identifiers appear to be of the form
            # species|transcript|gene, so fields 0 and 2 give the gene id
            gene_exons = {}
            for id, ee in exons.items():
                data = id.split(options.separator)
                new_id = options.separator.join((data[0], data[2]))
                if new_id not in gene_exons:
                    gene_exons[new_id] = []
                for e in ee:
                    e.mQueryToken = new_id
                gene_exons[new_id] += ee
            for id, ee in gene_exons.items():
                ee.sort(lambda x, y: cmp(x.mGenomeFrom, y.mGenomeFrom))
            exons = gene_exons
        else:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             filter=set(all_identifiers),
                                             from_zero=True)
    else:
        exons = {}
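
    ###############################################################
    ## note on the two filtering modes applied below: "joining" removes
    ## transcripts that bridge otherwise separate genes (spindly genes),
    ## "split" removes transcripts split over several fragments; see
    ## removeJoiningTranscripts and removeSplitTranscripts.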
    ###############################################################
    ###############################################################
    ###############################################################
    ## collect all transcripts for a species together with their
    ## aligned length
    ###############################################################
    map_species2transcripts = {}
    for id in mali.getIdentifiers():
        data = id.split(options.separator)
        species = data[0]
        if exons:
            l = exons[id][-1].mGenomeTo - exons[id][0].mGenomeFrom
        else:
            l = len(mali.getEntry(id).getSequence())
        try:
            map_species2transcripts[species].append((l, id))
        except KeyError:
            map_species2transcripts[species] = [(l, id)]

    if options.mode == "joining":
        mapped_transcripts = removeJoiningTranscripts(mali, exons,
                                                      map_species2transcripts,
                                                      options)
    elif options.mode == "split":
        mapped_transcripts = removeSplitTranscripts(mali, exons,
                                                    map_species2transcripts,
                                                    options)

    ###############################################################
    ###############################################################
    ###############################################################
    ## build overlap graph of the remaining sequences and split the
    ## multiple alignment into connected components
    ###############################################################
    graph = networkx.Graph()

    removed_transcripts = set(map(lambda x: x[0], mapped_transcripts))

    for t in all_identifiers:
        if t not in removed_transcripts:
            graph.add_node(t)

    for t1 in range(len(all_identifiers) - 1):
        transcript1 = all_identifiers[t1]
        if transcript1 in removed_transcripts:
            continue
        for t2 in range(t1 + 1, len(all_identifiers)):
            transcript2 = all_identifiers[t2]
            if transcript2 in removed_transcripts:
                continue
            overlap = getPercentOverlap(mali[transcript1], mali[transcript2])
            # note: the edge threshold is hard-coded here;
            # options.min_percent_overlap is not used
            if overlap > 5:
                graph.add_edge(transcript1, transcript2)

    ## compute components; materialize as a list because the components
    ## are iterated several times below
    components = list(networkx.connected_components(graph))

    ###############################################################
    ###############################################################
    ###############################################################
    ## output
    ###############################################################
    if options.filename_components:
        n = 1
        outfile = open(options.filename_components, "w")
        outfile.write("id\tcomponent\n")
        for component in components:
            for c in component:
                outfile.write("%s\t%i\n" % (c, n))
            n += 1
        outfile.close()

    if options.filename_removed and len(removed_transcripts) > 0:
        outfile = open(options.filename_removed, "w")
        outfile.write("removed\trepresentative\treason\n")
        for removed_transcript, rep_transcript, reason in mapped_transcripts:
            outfile.write("%s\t%s\t%s\n" %
                          (removed_transcript, rep_transcript, reason))
        outfile.close()

    if options.filename_summary:
        n = 1
        outfile = open(options.filename_summary, "w")
        outfile.write("component\tsize\tnspecies\tnmaster\n")
        for component in components:
            species = map(lambda x: x.split(options.separator)[0], component)
            # component id, size, number of distinct species, number of
            # sequences from the master genome
            outfile.write("%i\t%i\t%i\t%i\n" %
                          (n, len(component), len(set(species)),
                           len(filter(lambda x: x == options.genome_master,
                                      species))))
            n += 1
        outfile.close()

    for transcript in removed_transcripts:
        mali.deleteEntry(transcript)

    new_identifiers = mali.getIdentifiers()
    mali.removeGaps(minimum_gaps=len(new_identifiers))
    mali.writeToFile(options.stdout, format=options.output_format)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# input=%i, output=%i, removed=%i, ncomponents=%i\n" %
            (len(all_identifiers), len(new_identifiers),
             len(removed_transcripts), len(components)))
        options.stdlog.write(
            "# final component sizes: %s\n" %
            ",".join(map(lambda x: str(len(x)), components)))

    E.Stop()
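
######################################################
## Illustrative sketch (not part of the script): how the component
## splitting above works. Sequences become graph nodes; an edge joins two
## sequences whose aligned (non-gap) columns overlap by more than a cutoff;
## connected components then define the sub-alignments. The percent-overlap
## helper below is a simplified stand-in for getPercentOverlap.
######################################################
import networkx

def _sketch_percent_overlap(seq1, seq2, gap_chars="-."):
    """percent of alignment columns that are non-gap in both sequences,
    relative to the smaller number of non-gap columns (simplified)."""
    both = sum(1 for a, b in zip(seq1, seq2)
               if a not in gap_chars and b not in gap_chars)
    n1 = sum(1 for a in seq1 if a not in gap_chars)
    n2 = sum(1 for b in seq2 if b not in gap_chars)
    smaller = min(n1, n2)
    if smaller == 0:
        return 0.0
    return 100.0 * both / smaller

def _sketch_components(mali, min_percent_overlap=5):
    """mali: dict of id -> aligned sequence.
    Returns a list of connected components (each a sorted list of ids)."""
    graph = networkx.Graph()
    graph.add_nodes_from(mali)
    ids = list(mali)
    for i in range(len(ids) - 1):
        for j in range(i + 1, len(ids)):
            if _sketch_percent_overlap(mali[ids[i]],
                                       mali[ids[j]]) > min_percent_overlap:
                graph.add_edge(ids[i], ids[j])
    # list() so the components can be iterated more than once
    return [sorted(c) for c in networkx.connected_components(graph)]

## Example: two non-overlapping alignment blocks yield two components
## (component order may vary):
##   _sketch_components({"a": "AAAA----", "b": "CCAA----",
##                       "c": "----GGGG", "d": "----TTGG"})
##   -> [['a', 'b'], ['c', 'd']]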