Esempio n. 1
0
def ClusterByExonCorrespondence(lengths={}, peptide_sequences=None):

    exons = Exons.ReadExonBoundaries(sys.stdin)
    if param_loglevel >= 1:
        print "# read exons for %i transcripts" % len(exons)

    if not lengths:
        for k in exons:
            lengths[k] = (exons[k][0].mPeptideTo / 3) + 1
            for e in exons[k][1:]:
                lengths[k] = max(lengths[k], (e.mPeptideTo / 3) + 1)

        if param_loglevel >= 1:
            print "# lengths for %i transcripts" % len(lengths)

    map_region2transcript = {}
    map_transcript2region = {}
    map_transcript2transcript = {}
    ## build map of regions to transcripts
    for t in exons:
        map_transcript2region[t] = []
        for e in exons[t]:
            r = "%s-%s-%i-%i" % (e.mSbjctToken, e.mSbjctStrand, e.mGenomeFrom,
                                 e.mGenomeTo)
            if r not in map_region2transcript: map_region2transcript[r] = []
            map_region2transcript[r].append(t)
            map_transcript2region[t].append(r)

    ## build map of transcript to transcript
    map_transcript2transcript = {}

    for t in map_transcript2region:
        map_transcript2transcript[t] = []
        for r in map_transcript2region[t]:
            for tt in map_region2transcript[r]:
                map_transcript2transcript[t].append(tt)

    for t in map_transcript2transcript:
        map_transcript2transcript[t].sort()
        l = None
        n = []
        for tt in map_transcript2transcript[t]:
            if t == tt: continue
            if l != tt: n.append(tt)
            l = tt
        map_transcript2transcript[t] = n

    ## cluster greedily, take longest transcript
    cluster_id = 1
    for t in map_transcript2region:
        if t not in map_transcript2transcript: continue
        cluster = CollectCluster(map_transcript2transcript, t)
        PrintCluster(cluster, cluster_id, lengths, peptide_sequences,
                     param_regex_preferred)
        cluster_id += 1

    if param_loglevel >= 1:
        print "# RESULT: %i transcripts in %i genes" % (
            len(map_transcript2region), cluster_id - 1)
Esempio n. 2
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/exons2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      help="method to apply.",
                      type="choice",
                      choices=("remove-stop", ))

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--forward-coordinates",
                      dest="forward_coordinates",
                      action="store_true",
                      help="work in forward coordinates.")

    parser.set_defaults(method=None,
                        forward_coordinates=False,
                        genome_file=None)

    (options, args) = E.Start(parser)

    if options.method == "remove-stop" and not options.genome_file:
        raise "please supply genome file for method %s" % options.method

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
        exons = Exons.ReadExonBoundaries(sys.stdin, contig_sizes=contig_sizes)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    ninput, noutput, nremoved_stops, nremoved_exons = 0, 0, 0, 0
    for id, ee in exons.items():

        if options.loglevel >= 3:
            for e in ee:
                options.stdlog.write("# %s\n" % str(e))

        if options.method == "remove-stop":
            e = ee[-1]
            d = min(3, e.mPeptideTo - e.mPeptideFrom)
            if d < 3:
                codon2 = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                           e.mGenomeTo - d, e.mGenomeTo)
                prev_e = ee[-2]
                codon1 = fasta.getSequence(prev_e.mSbjctToken,
                                           prev_e.mSbjctStrand,
                                           prev_e.mGenomeTo - (3 - d),
                                           prev_e.mGenomeTo)
                codon = codon1 + codon2
            else:
                codon = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                          e.mGenomeTo - d, e.mGenomeTo)

            if codon.upper() in Genomics.StopCodons:

                if d < 3:
                    nremoved_exons += 1
                    d = 3 - d
                    del ee[-1]
                    e = ee[-1]

                e.mGenomeTo -= d
                e.mPeptideTo -= d
                nremoved_stops += 1

                if e.mGenomeTo == e.mGenomeFrom:
                    nremoved_exons += 1
                    del ee[-1]
                    e = ee[-1]

            assert (e.mGenomeTo > e.mGenomeFrom)
            assert (e.mPeptideTo > e.mPeptideFrom)

        if options.forward_coordinates:

            l = contig_sizes[ee[0].mSbjctToken]
            for e in ee:
                e.InvertGenomicCoordinates(l)

        for e in ee:
            options.stdout.write(str(e) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nremoved_stops=%i, nremoved_exons=%i\n" %
            (ninput, noutput, nremoved_stops, nremoved_exons))

    E.Stop()
Esempio n. 3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--sequences",
                      dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")

    parser.add_option("-f",
                      "--format",
                      dest="format",
                      type="string",
                      help="output format [Default=%default]")

    parser.add_option(
        "-e",
        "--expand",
        dest="expand",
        action="store_true",
        help=
        "expand positions from peptide to nucleotide alignment [Default=%default]"
    )

    parser.add_option("-m",
                      "--map",
                      dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c",
                      "--codons",
                      dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option(
        "--one-based-coordinates",
        dest="one_based_coordinates",
        action="store_true",
        help=
        "expect one-based coordinates. The default are zero based coordinates [Default=%default]."
    )

    parser.add_option("--no-identical",
                      dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option(
        "-g",
        "--no-gaps",
        dest="no_gaps",
        action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o",
                      "--outfile",
                      dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option(
        "--filter",
        dest="filename_filter",
        type="string",
        help=
        "given a set of previous alignments, only write new pairs [Default=%default]."
    )

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            infile = open(options.filename_filter, "r")

            iterator = FastaIterator.FastaIterator(infile)

            while 1:
                cur_record = iterator.next()
                if cur_record is None:
                    break

                record1 = cur_record

                cur_record = iterator.next()
                if cur_record is None:
                    break

                record2 = cur_record

                identifier1 = re.match("(\S+)", record1.title).groups()[0]
                identifier2 = re.match("(\S+)", record2.title).groups()[0]

                id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                if id not in map_pair2hids:
                    map_pair2hids[id] = []

                map_pair2hids[id].append(s)

            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli, link.mSbjctFrom,
            link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col, alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col, map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New, alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                %
                (link.mQueryToken, link.mSbjctToken, row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(
                    alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq)) + "\n")
        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mQueryToken]))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mSbjctToken]))
                options.stdlog.write("#\n%s\n" %
                                     alignlib_lite.py_AlignmentFormatExplicit(
                                         map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.

                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1, from2 + 1, to2)

                mode = Write(tmp1_map_row2col,
                             row_seq,
                             col_seq,
                             link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hid,
                             format=options.format)

                if mode not in counts:
                    counts[mode] = 0
                counts[mode] += 1

        else:
            mode = Write(map_row2col,
                         row_seq,
                         col_seq,
                         link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            if mode not in counts:
                counts[mode] = 0
            counts[mode] += 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            map(lambda x, y: "%s=%i" %
                (x, y), counts.keys(), counts.values())))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
Esempio n. 4
0
            if line[0] == "#":
                continue
            if line[0] == ">":
                continue
            a, b = line[:-1].split("\t")[:2]
            if b not in components:
                components[b] = []
            components[b].append(a)

        if param_loglevel >= 1:
            print "# read %i components." % len(components)
    else:
        components = {'all': all_identifiers}

    if param_filename_exons:
        exons = Exons.ReadExonBoundaries(
            open(param_filename_exons, "r"), filter=all_mali)
        if param_loglevel >= 2:
            print "# read %i exons." % len(exons)
    else:
        exons = {}

    print "# PREFIX\tsummary\tNSEQUENCES\tNASSIGNED\tNCLUSTERS\tNASSIGNED\tUNASSIGNED"
    print "# PREFIX\tcluster\tNMEMBERS\tMEMBERS"
    print "# PREFIX\tfragments\tNFRAGMENTS\tFRAGMENTS"

    print "# PREFIX\tpide\tNPAIRS\tNAMIN\tNAMAX\tNAMEAN\tNAMEDIAN\tNASTDDEV\tAAMIN\tAAMAX\tAAMEAN\tAAMEDIAN\tAASTDDEV"

    print string.join(("# PREFIX", "codons",
                       "NCLEAN", "NNOSTOPS",
                       "ALIGNED_MIN", "ALIGNED_MAX", "ALIGNED_MEAN", "ALIGNED_MEDIAN", "ALIGNED_STDDEV",
                       "CODONS_MIN", "CODONS_MAX", "CODONS_MEAN", "CODONS_MEDIAN", "CODONS_STDDEV",
Esempio n. 5
0
        elif o == "--report-step":
            param_report_step = int(a)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    print E.GetHeader()
    print E.GetParams()
    sys.stdout.flush()

    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds = Exons.ReadExonBoundaries(open(param_filename_cds, "r"))

    if param_loglevel >= 1:
        print "# read %i cds" % (len(cds))
        sys.stdout.flush()

    ninput, npairs, nskipped = 0, 0, 0

    for line in sys.stdin:
        if line[0] == "#":
            continue
        if line[0] == ">":
            print line[:-1]
            continue

        ninput += 1
Esempio n. 6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b",
                      "--boundaries",
                      dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename with exons (output).")

    parser.add_option("-p",
                      "--peptides",
                      dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")

    parser.add_option(
        "-w",
        "--write-notfound",
        dest="write_notfound",
        action="store_true",
        help="print exons for predictions not found in reference.")

    parser.add_option("-q",
                      "--quality-pide",
                      dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary=9,
        ## stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(open(
            options.filename_boundaries, "r"),
                                                             do_invert=1,
                                                             remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join(
            ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
             "reference_id", "reference_from", "reference_to",
             "reference_phase", "pidentity", "psimilarity", "nframeshifts",
             "ngaps", "nstopcodons", "is_ok", "genome_exon_from",
             "genome_exon_to")))

    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    options.stdout.write("%s\n" % "\t".join(
        ("prediction_id", "number", "dubious_exons", "boundaries_sum",
         "boundaries_max", "identical_exons", "inserted_exons",
         "deleted_exons", "inserted_introns", "deleted_introns",
         "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons",
         "deleted_Cexons", "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#": continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if peptide_sequences.has_key(entry.mQueryToken):

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken, query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength(
            ) < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken, sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation, query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            E.debug(
                "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f"
                %
                (str(entry.mPredictionId), entry.mPercentSimilarity,
                 entry.mPercentIdentity, percent_similarity, percent_identity))

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key(entry.mQueryToken):
            print "# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken)
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            in_sync = 0
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write("# current exons: %i and %i\n" %
                                         (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[
                    e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment(tmp_ali,
                                                entry.mMapPeptide2Translation,
                                                xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write(
                            "# WARNING: empty alignment %s\n" % str(
                                (ref_from, exon_from, ref_to, exon_to,
                                 xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity(
                            tmp_ali, query_sequence, sbjct_sequence) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity(
                            tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                if e < len(exons) - 1:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = exons[e + 1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (
                        ref_exons[r + 1].mPeptideFrom,
                        ref_exons[r + 1].mPeptideTo, ref_exons[r + 1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken, exon_from, exon_to,
                                  exon_phase, exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.splipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                         exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max(ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation,
                            xquery_from, xquery_to)

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to,
                                     xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str(
                                    alignlib_lite.AlignmentFormatExplicit(
                                        tmp_ali, query_sequence,
                                        sbjct_sequence))

                            percent_identity = alignlib_lite.calculatePercentIdentity(
                                tmp_ali, query_sequence, sbjct_sequence) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom -
                                                       1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto <= (
                                entry.mQueryLength -
                                entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and dto <= (
                            entry.mQueryLength -
                            entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(
                            ref_exons) - 1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon: nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon: ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon: ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom, exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment,
                            border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                this_r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                is_good_exon,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[
                    e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo,
                    ref_exons[r].frame, ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1
                if outfile_exons:
                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                0,
                                0,
                                0,
                                0,
                                r,
                                ref_from,
                                ref_to,
                                ref_phase,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                                0,
                            )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[
                        0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali, genomic_fragment)

                    outfile_exons.write(
                        string.join(
                            map(str, (
                                entry.mPredictionId,
                                this_e,
                                exon_from,
                                exon_to,
                                exon_phase,
                                0,
                                0,
                                0,
                                0,
                                percent_identity,
                                percent_similarity,
                                nframeshifts,
                                ngaps,
                                nstopcodons,
                                1,
                                exon_genome_from,
                                exon_genome_to,
                            )), "\t") + "\n")

        options.stdout.write("\t".join(
            map(str, (entry.mPredictionId, exons_num_exons, dubious_exons,
                      exons_boundaries_sum, exons_boundaries_max,
                      nidentical_exons, ninserted_exons, ndeleted_exons,
                      ninserted_introns, ndeleted_introns,
                      truncated_Nterminal_exon, truncated_Cterminal_exon,
                      ndeleted_Nexons, ndeleted_Cexons, ninserted_Nexons,
                      ninserted_Cexons))) + "\n")
Esempio n. 7
0
def ReadTranscriptsAndCds(transcript_ids1, transcript_ids2):

    if param_loglevel >= 1:
        print "# reading %i left and %i right transcripts" % (
            len(transcript_ids1), len(transcript_ids2))
        sys.stdout.flush()
    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds1 = Exons.ReadExonBoundaries(open(param_filename_cds1, "r"),
                                    filter=transcript_ids1,
                                    reset=True)
    cds2 = Exons.ReadExonBoundaries(open(param_filename_cds2, "r"),
                                    filter=transcript_ids2,
                                    reset=True)

    if param_loglevel >= 1:
        print "# read %i left and %i right cds" % (len(cds1), len(cds2))
        sys.stdout.flush()

    if param_loglevel >= 2:
        if len(cds1) != len(transcript_ids1):
            print "# missed in left:  %s" % ":".join(
                set(transcript_ids1.keys()).difference(cds1.keys()))
        if len(cds2) != len(transcript_ids2):
            print "# missed in right: %s" % ":".join(
                set(transcript_ids2.keys()).difference(cds2.keys()))

    if param_loglevel >= 1:
        print "# reading genomic sequences."
        sys.stdout.flush()

    transcripts1 = {}
    if param_filename_transcripts1:
        if param_mode_genome1 == "indexed":
            transcripts1 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts1, filter=transcript_ids1)
        else:
            transcripts1 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts1, "r"),
                do_reverse=0,
                filter=transcript_ids1,
                mask=param_mask)
    transcripts2 = {}
    if param_filename_transcripts2:
        if param_mode_genome2 == "indexed":
            transcripts2 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts2, filter=transcript_ids2)
        else:
            transcripts2 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts2, "r"),
                do_reverse=0,
                filter=transcript_ids2,
                mask=param_mask)
    if param_loglevel >= 1:
        print "# read %i left and %i right transcript sequences" % (
            len(transcripts1), len(transcripts2))
        sys.stdout.flush()

    return transcripts1, transcripts2, cds1, cds2
Esempio n. 8
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed)." )

    parser.add_option("--coordinate-format", dest="coordinate_format", type="string",
                      help="input type of coordinates." )

    parser.add_option("--forward-coordinates", dest="forward_coordinates", action="store_true",
                      help="output forward coordinates." )

    parser.add_option("-e", "--extract-id", dest="extract_id", type="string",
                      help="""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""" )

    parser.set_defaults(
        coordinate_format = "zero-forward",
        forward_coordinates = False,
        genome_file = None,
        extract_id = None )

    (options, args) = E.Start( parser )
    
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile( options.extract_id )
    else:
        extract_id = None

    converter = IndexedFasta.getConverter( options.coordinate_format )

    exons = Exons.ReadExonBoundaries( sys.stdin,
                                      contig_sizes = contig_sizes,
                                      converter = converter,
                                      do_invert = True,
                                      format = "gtf",
                                      gtf_extract_id = extract_id )

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and e.mSbjctToken in contig_sizes and \
                    e.mSbjctStrand == "-":
                l = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = l - e.mGenomeTo, l - e.mGenomeFrom

            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write( "# Error: %s\n" % str(e) )
                break

            options.stdout.write( str(e) + "\n" )
            nexons += 1
                
        if has_error:
            nerrors += 1
            continue
    
    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" % (ntranscripts, nexons, nerrors))
    
    E.Stop()
Esempio n. 9
0
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results
        if results:
            options.stdout.write(str(results) + "\n")

    elif options.output_format == "exontable":
        if options.format == "exons":
            exons = Exons.ReadExonBoundaries(sys.stdin,
                                             contig_sizes=contig_sizes,
                                             delete_missing=True)
        else:
            raise "unknown format."

        for k in exons.keys():
            ee = exons[k]

            id = 0
            for e in ee:
                id += 1
                print "\t".join(
                    map(str, (e.mQueryToken, id, e.mPeptideFrom, e.mPeptideTo,
                              e.frame, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              e.mGenomeFrom, e.mGenomeTo)))
Esempio n. 10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser.add_option("-q",
                      "--quality",
                      dest="quality",
                      type="string",
                      help="quality categories to take into account.")
    parser.add_option("-f",
                      "--format=",
                      dest="format",
                      type="string",
                      help="input format [exons|gff|table]")

    parser.add_option("-e",
                      "--exons=",
                      dest="tablename_exons",
                      type="string",
                      help="table name with exons.")
    parser.add_option("-p",
                      "--predictions=",
                      dest="tablename_predictions",
                      type="string",
                      help="table name with predictions.")
    parser.add_option("-n",
                      "--non-redundant",
                      dest="non_redundant",
                      action="store_true",
                      help="only non-redundant predictions.")
    parser.add_option("-s",
                      "--schema",
                      dest="schema",
                      type="string",
                      help="schema to use.")

    parser.set_defaults(
        fields=[
            "Id", "NumExons", "GeneLength", "MinExonLength", "MaxExonLength",
            "MinIntronLength", "MaxIntronLength"
        ],
        tablename_exons="exons",
        tablename_predictions="predictions",
        quality=None,
        non_redundant=False,
        schema=None,
        tablename_redundant="redundant",
        tablename_quality="quality",
        format="exons",
    )

    (options, args) = E.Start(parser,
                              add_csv_options=True,
                              add_psql_options=True)

    if options.quality:
        options.quality = options.quality.split(",")

    if options.format == "table":
        dbhandle = pgdb.connect(options.psql_connection)
        exons = Exons.GetExonBoundariesFromTable(
            dbhandle,
            options.tablename_predictions,
            options.tablename_exons,
            non_redundant_filter=options.non_redundant,
            quality_filter=options.quality,
            table_name_quality=options.tablename_quality,
            table_name_redundant=options.tablename_redundant,
            schema=options.schema)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    stats = Exons.CalculateStats(exons)

    print "\t".join(options.fields)

    writer = csv.DictWriter(sys.stdout,
                            options.fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    for k, v in stats.items():
        v["Id"] = k
        writer.writerow(v)

    E.Stop()
Esempio n. 11
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--master",
                      dest="master",
                      type="string",
                      help="master sequence.")

    parser.add_option("-p",
                      "--master-pattern",
                      dest="master_pattern",
                      type="string",
                      help="master pattern.")

    parser.add_option("--master-species",
                      dest="master_species",
                      type="string",
                      help="species to use as master sequences.")

    parser.add_option("-t",
                      "--translate",
                      dest="filename_translation",
                      type="string",
                      help="filename on where to store translated sequences.")

    parser.add_option("-e",
                      "--exons",
                      dest="filename_exons",
                      type="string",
                      help="filename on where to exon information.")

    parser.add_option("-c",
                      "--mark-codons",
                      dest="mark_codons",
                      action="store_true",
                      help="mark codons.")

    parser.add_option(
        "-i",
        "--ignore-case",
        dest="ignore_case",
        action="store_true",
        help="ignore case (otherwise: lowercase are unaligned chars).")

    parser.add_option("--remove-stops",
                      dest="remove_stops",
                      action="store_true",
                      help="remove stop codons.")

    parser.add_option("--mask-stops",
                      dest="mask_stops",
                      action="store_true",
                      help="mask stop codons.")

    parser.add_option("--mask-char",
                      dest="mask_char",
                      type="string",
                      help="masking character to use.")

    parser.add_option("-f",
                      "--remove-frameshifts",
                      dest="remove_frameshifts",
                      action="store_true",
                      help="remove columns corresponding to frameshifts.")

    parser.add_option(
        "--mask-master",
        dest="mask_master",
        action="store_true",
        help=
        "columns in master to be removed are masked to keep residue numbering."
    )

    parser.add_option(
        "-s",
        "--split-exons",
        dest="split_exons",
        action="store_true",
        help="split columns aligned to different exons in the same gene.")

    parser.add_option("-a",
                      "--target",
                      dest="target",
                      type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True

        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n"
                % (options.mask_char))

    ## 1. read multiple alignment in fasta format
    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        raise "empty multiple alignment"

    identifiers = mali.getIdentifiers()

    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for id in identifiers:
            if re.search(options.master_pattern, id):
                masters.append(id)
    elif options.master_species:
        for id in identifiers:
            if options.master_species == id.split(options.separator)[0]:
                masters.append(id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         filter=set(identifiers),
                                         from_zero=True)

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    #################################################################################
    #################################################################################
    #################################################################################
    ## translate characters to upper/lower case according to exon info.
    #################################################################################
    if exons:
        for id in identifiers:
            if id in exons:
                mali.getSequence(id).mString = AddExonInformation(
                    mali[id], exons[id], mask_char=options.mask_char)

    elif options.ignore_case:
        ## convert all to uppercase
        mali.upper()

    #################################################################################
    #################################################################################
    #################################################################################
    ## untangle misaligned exons
    #################################################################################
    if exons and options.split_exons:

        ## first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                mali.writeToFile(open("log_mali1", "w"), format="fasta")

        SplitExons(mali, exons, options)

    #################################################################################
    #################################################################################
    #################################################################################
    ## remove frameshifts
    #################################################################################
    if options.remove_frameshifts:
        out_of_frame_columns = []
        if len(masters) == 1:

            frame_columns = GetFrameColumns(mali,
                                            masters[0],
                                            gap_chars=options.gap_chars)

        else:

            columns = []

            for id in masters:
                columns += GetFrameColumns(mali,
                                           id,
                                           gap_chars=options.gap_chars)

            if len(columns) == 0:
                columns += GetFrameColumns(mali,
                                           identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all columns by tuple. The "shortest" codon will be first: (1,2,3) before (1,2,100),
            # and (1,2,100) before (1,3,4).
            columns.sort(lambda x, y: cmp((x[0], x[2]), (y[0], y[2])))

            # select codons
            frame_columns = []
            last_codon = columns[0]

            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon: continue

                # take first (shortest) codon in case of identical first residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    out_of_frame_columns += last_codon

                # if overlapping, but out of register: skip
                last_codon = codon

            frame_columns.append(last_codon)

        # build set of skipped columns
        frame_set = set()
        for column in frame_columns:
            for c in column:
                frame_set.add(c)

        # columns that contain a master sequence that is out of
        # frame
        out_of_frame_set = set(out_of_frame_columns)
        out_of_frame_set = out_of_frame_set.difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

            if options.loglevel >= 5:
                options.stdlog.write("# frame columns: %i\n" %
                                     (len(frame_columns)))
                x = 0
                for column in frame_columns:
                    options.stdlog.write("# %i\t%s\n" %
                                         (x, ",".join(map(str, column))))
                    x += 1

            if options.loglevel >= 5:
                options.stdlog.write(
                    "# Out-of frame columns with residue of masters: %i\n" %
                    (len(out_of_frame_set)))
                options.stdlog.write("# %s" %
                                     ",".join(map(str, out_of_frame_columns)))

        mask_chars = (string.upper(options.mask_char),
                      string.lower(options.mask_char))

        to_delete = []

        ignore_case = exons or options.ignore_case

        for id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (id, len(sequence)))

            ## treat masters differently if they are only to be masked, not
            ## pruned.
            ## simple mask all characters that are to skipped
            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            codon = []
            chars = []

            is_master = id in masters

            for x in range(len(sequence)):
                c = sequence[x]

                ## delete columns that do not align to
                ## a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)
                if len(codon) % 3 == 0:
                    codon = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon, options)

                    if codon_is_aligned: naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            ## mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))


##             else:

##                 for a,b,c in frame_columns:

##                     codon = sequence[a] + sequence[b] + sequence[c]

##                     codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon( codon, options )

##                     if codon_is_aligned: naligned += 1

##                     if codon_is_all_gaps:
##                         fragments.append( options.gap_char * 3 )
##                         ngaps += 1
##                     elif codon_is_ok:
##                         ncodons += 1
##                         if string.upper(codon) in ("TAG", "TAA", "TGA"):
##                             if options.remove_stops:
##                                 fragments.append( options.gap_char * 3 )
##                             elif options.mask_stops:
##                                 fragments.append( options.mask_char * 3 )
##                             else:
##                                 fragments.append( codon )
##                             nstops += 1
##                         else:
##                             fragments.append( codon )
##                     else:
##                         fragments.append( options.gap_char * 3 )
##                         nmasked += 1

##                     if options.loglevel >= 7:
##                         options.stdlog.write("# %s: %i,%i,%i: codon=%s ok=%s is_aligned=%s\n" % (id,
##                                                                                                  a,b,c,
##                                                                                                  codon,
##                                                                                                  str(codon_is_ok),
##                                                                                                  str(codon_is_aligned) ))

            s = string.join(fragments, "")
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n"
                    % (id, len(fragments), naligned, ncodons, nstops, ngaps,
                       nmasked))
                options.stdlog.flush()

            ## postpone deletion in order to not
            ## confuse the iteration of ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n"
                    % id)
                to_delete.append(id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n"
                    % id)
                to_delete.append(id)
            else:
                mali.setSequence(id, string.join(fragments, ""))

        for id in to_delete:
            del mali[id]

    for id in identifiers:
        if options.mark_codons:
            a = mali[id]
            f = lambda x: a[x:x + 3]
            s = string.join([f(x) for x in range(0, len(a), 3)], " ")
        else:
            s = mali[id]
        options.stdout.write(">%s\n%s\n" % (id, s))

    if options.filename_translation:
        outfile = open(options.filename_translation, "w")
        for id in mali.keys():
            outfile.write(">%s\n%s\n" %
                          (id, Genomics.TranslateDNA2Protein(mali[id])))
        outfile.close()

    E.Stop()
Esempio n. 12
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-o", "--overlap", dest="overlap_residues", type="int",
                       help="overlap residues.")
    parser.add_option( "-t", "--filter-tokens", dest="filename_filter_tokens", type="string",
                       help="filename to filter tokens." )
    parser.add_option( "-i", "--exon-identity", dest="exon_identity", action="store_true",
                       help="exon identity." )
    parser.add_option( "--exons", dest="filename_exons", type="string",
                       help="filename with exon information." )
    parser.add_option( "-m", "--output-members", dest="filename_members", type="string",
                       help="output filename with members." )
    parser.add_option( "--overlap-id", dest="overlap_id", action="store_true",
                       help="overlap id." )
    parser.add_option( "-s", "--remove-spanning", dest="remove_spanning_predictions", action="store_true",
                       help="remove spanning predictions." )
    parser.add_option( "-c", "--remove-complement", dest="remove_complementary_predictions", action="store_true",
                       help="remove complementary predictions." )
    parser.add_option( "--remove-exon-swoppers", dest="remove_exon_swoppers", action="store_true",
                       help="remove exon swoppers." )
    parser.add_option( "--remove-gene-spanners", dest="remove_gene_spanners", action="store_true",
                       help="remove gene spanners." )
    parser.add_option( "--remove-suboptimal", dest="remove_suboptimal", action="store_true",
                       help="remove suboptimal predictions." )
    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide information." )
    parser.add_option( "--extended-peptides", dest="filename_extended_peptides", type="string",
                       help="filename with peptide information - after extension." )
    
    parser.add_option( "--test", dest="test_nids", type="string",
                       help="test nids." )
    ## filter options
    parser.add_option( "--filter-transcripts", dest="filter_filename_transcripts", type="string",
                       help="filename with transcripts that are used to filter." )
    parser.add_option( "--filter-remove-spanning", dest="filter_remove_spanning", action="store_true",
                       help="remove all transcripts that span the filter set." )
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )
    parser.add_option( "--discard-large-clusters", dest="discard_large_clusters", type="int",
                       help="if set discard clusters bigger than this size (patch) [default=%default]." )
    

    parser.set_defaults(
        filename_members = None,
        filename_peptides = None,
        filename_extended_peptides = None,
        filename_exons = None,
        quality_hierarchy = ("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK" ),
        ## Classes, where redundancy is removed by similarity. When exon structure
        ## is not conserved, I can't predict alternative splice variants, so remove
        ## the redundancy.
        quality_exclude_same = ( "UG", "UP", "UF", "BF", "UK" ),
        quality_genes = ("CG", "SG", "PG", "RG", "UG"),
        ## class that can be removed in spanning/complementary predictions
        quality_remove_dubious = ( "UG", "UP", "UF", "BF", "UK" ),
        ## class that is required for defining exon swopper event
        quality_remove_exon_swopper = ("CG", "PG"),
        ## class that will kept, in spite of being an exons swopper.
        quality_keep_exon_swopper = (),
        ## class that is required for removing gene spanners
        quality_remove_gene_spanners = ("CG"),
        ## class that will kept, in spite of being a gene spanner
        quality_keep_gene_spanners = (),
        ## class that is required for defining suboptimal matches
        quality_remove_suboptimal = ("CG", "PG" ),
        ## class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal = (),
        ## gap penalties
        gop = -10.0,
        gep = -1.0,
        ## maximum number of gaps to allow in alignment
        max_gaps = 20,
        ## threshold of percent identity that allows to remove a prediction
        ## of a lower class.
        ## This allows for insertions/deletions
        min_identity = 98,
        ## threshold of percent identity that allows to remove a prediction
        ## of a non-gene by a gene
        min_identity_non_genes = 80,
        ## safety threshold: do not remove, if coverage of member is by x better
        ## than representative
        safety_pide = 10,
        safety_coverage = 10,
        overlap_id = False,
        remove_spanning_predictions = False,
        remove_exon_swoppers = False,
        remove_gene_spanners = False,
        remove_suboptimal = False,
        ## nids to use for testing
        test_nids = None,
        ## remove members with less than maximum coverage
        max_member_coverage = 90,
        ## maximum allowable exon slippage
        max_slippage = 9,
        ## minimum difference in identity for suboptimal predictions to be removed.
        suboptimal_min_identity_difference = 10,
        ## filter options
        filter_filename_transcripts = None,
        filter_remove_spanning = True,
        filter_remove_spanning_both_strands = True,
        genome_file = None,
        discard_large_clusters = None )
    
    (options, args) = E.Start( parser, add_psql_options = True )    

    if options.test_nids: options.test_nids = options.test_nids.split(",")

    # list of eliminated predictions
    eliminated_predictions = {}
    
    if options.filename_members:
        outfile_members = open( options.filename_members, "w" )
    else:
        outfile_members = sys.stdout

    ######################################################
    ######################################################
    ######################################################        
    # data
    ######################################################    
    data = []

    class Entry:
        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int( gff['xstart'] )
            self.mExtendedEnd = int( gff['xend'] )
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']
            
    for gff in GTF.iterator( sys.stdin ):
        data.append( Entry(gff) )

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i transcripts.\n" % len(data) )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################        
    # read peptide sequences
    ######################################################    
    if options.loglevel >= 1:
        options.stdlog.write( "# loading peptide databases ... " )
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta( options.filename_peptides )
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    ######################################################
    ######################################################        
    # read extended peptide sequences
    ######################################################    
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta( options.filename_extended_peptides )
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write( "finished\n" )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################        
    ## open genome file
    ######################################################        
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ######################################################
    ######################################################        
    ## reading exons, clustering and formatting them.
    ######################################################        
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries ... " )
            options.stdlog.flush()
            
        ids = [ x.transcript_id for x in data ] 

        exons = Exons.ReadExonBoundaries( open( options.filename_exons, "r"),
                                          contig_sizes = contig_sizes,
                                          filter = set(ids) )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(exons) ))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")
            
        # flag terminal exons
        Exons.SetRankToPositionFlag( exons )

        identity_map_cluster2transcripts, identity_map_transcript2cluster =\
                                          Exons.ClusterByExonIdentity( exons,
                                                                       max_terminal_num_exons = 3,
                                                                       max_slippage= options.max_slippage,
                                                                       loglevel = options.loglevel )

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster =\
                                         Exons.ClusterByExonOverlap( exons,
                                                                     min_overlap = 10,
                                                                     loglevel = options.loglevel )
    else:
        exons = {}

    ######################################################        
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ######################################################
    ######################################################
    ## read filter transcripts and apply filters
    ######################################################        
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries for filter set ... " )
            options.stdlog.flush()
            
        filter_exons = Exons.ReadExonBoundaries( open( options.filter_filename_transcripts, "r" ),
                                                 delete_missing = True,
                                                 contig_sizes = contig_sizes )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(filter_exons)) )
        
        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts( exons,
                                                            filter_exons,
                                                            eliminated_predictions,
                                                            contig_sizes,
                                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n" % (n, time.time()-t ))
            options.stdlog.flush()

    if options.remove_exon_swoppers and not exons:
        raise ValueError( "please specify exon table if using --remove-swoppers." )
    if options.remove_gene_spanners and not exons:
        raise ValueError( "please specify exon table if using --remove-gene-spanners." )

    ########################################################################################
    ## remove predictions spanning other predictions but do not overlap with them on an exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# removing gene spanners\n" )
            options.stdlog.flush()
            
        t = time.time()
        eliminated = EliminateGeneSpanners( data,
                                            eliminated_predictions,
                                            exons,
                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i gene spanners in %i seconds\n" % (n, time.time()-t ))
            options.stdlog.flush()
            
    ########################################################################################
    ## sort data by quality, length of prediction and coverage * pid

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data\n" )
        options.stdlog.flush()

    map2pos = {}
    for x in range(len(options.quality_hierarchy)):
        map2pos[options.quality_hierarchy[x]] = x

    data.sort( key = lambda x: (map2pos[x.mQuality], len(extended_peptides[x.transcript_id]), x.mQueryCoverage * x.mPid ) )

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data finished\n" )
        options.stdlog.flush()

    ########################################################################################
    ## remove predictions joining two other complete non-overlapping predictions
    if options.remove_exon_swoppers and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing exon swoppers\n" )
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers( data,
                                            eliminated_predictions,
                                            identity_map_transcript2cluster,
                                            identity_map_cluster2transcripts,
                                            map_prediction2data,
                                            exons,
                                            options )
        
        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i exon swoppers\n" % n )
            options.stdlog.flush()

    ########################################################################################
    ## remove suboptimal predictions
    if options.remove_suboptimal and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing suboptimal predictions\n" )
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions( data,
                                                     eliminated_predictions,
                                                     overlap_map_transcript2cluster,
                                                     overlap_map_cluster2transcripts,
                                                     map_prediction2data,
                                                     exons,
                                                     options )
        
        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i suboptimal predictions in %i seconds\n" % (n, time.time()-t) )
            options.stdlog.flush()        

    ########################################################################################
    ## remove redundant predictions
    l = len(data)
        
    options.report_step = max(1, int(l / 100))

    t2= time.time()

    last_quality = None
    qualities = []

    options.stdout.write( "%s\t%s\n" % ("rep", "comment") )
    
    for x in range(len(data)):

        if options.loglevel >= 1:
            if x % options.report_step  == 0:
                options.stdlog.write( "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" % \
                                      (x+1, l,
                                       int(100 * (x+1) / l),
                                       len(eliminated_predictions), l,
                                       100 * len(eliminated_predictions) / l,
                                       time.time() - t2 ) )
                                                                    
                options.stdlog.flush()
                
        rep = data[x]

        rep_id, rep_quality = rep.transcript_id, rep.mQuality
        
        if rep_id in eliminated_predictions: continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append( last_quality )
            last_quality = rep_quality
        
        if options.loglevel >= 2:
            options.stdlog.write( "# processing prediction %s|%s\n" % (rep_id, rep_quality) )
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            eliminated += EliminateRedundantEntriesByOverlap( rep,
                                                              data[x+1:],
                                                              eliminated_predictions,
                                                              options,
                                                              peptides, 
                                                              extended_peptides,
                                                              filter_quality = qualities,
                                                              this_quality = rep_quality )
                                                              
        else:
            eliminated += EliminateRedundantEntriesByRange( rep,
                                                            data,
                                                            eliminated_predictions,
                                                            options,
                                                            peptides, 
                                                            extended_peptides,
                                                            filter_quality = qualities,
                                                            this_quality = rep_quality )

        options.stdout.write( "%s\t%i\n" % (rep_id, len(eliminated)) )

        if outfile_members:
            outfile_members.write( "%s\t%s\tm\n" % (str(rep_id), str(rep_id)))
            nrepresentatives += 1
            nmembers += PrintMembers( rep_id, outfile_members, eliminated, eliminated_by_method )            

    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write( "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %\
                          (nrepresentatives, nmembers, neliminated,
                           nrepresentatives+nmembers+neliminated ) )
    
    options.stdlog.write( "# elimination by method:\n" )
    
    for v,c in eliminated_by_method.items():
        options.stdlog.write( "# method=%s, count=%i\n" % (v, c) )

    E.Stop()
Esempio n. 13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2cleaned_mali.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("-m",
                      "--genome-master",
                      dest="genome_master",
                      type="string",
                      help="genome to use as master.")

    parser.add_option("-s",
                      "--filename-removed",
                      dest="filename_removed",
                      type="string",
                      help="output filename for deleted entries.")

    parser.add_option("-e",
                      "--filename-exons",
                      dest="filename_exons",
                      type="string",
                      help="filename on where to exon information.")

    parser.add_option("-u",
                      "--filename-summary",
                      dest="filename_summary",
                      type="string",
                      help="output filename of component summary.")

    parser.add_option("-c",
                      "--filename-components",
                      dest="filename_components",
                      type="string",
                      help="output filename for components.")

    parser.add_option(
        "--min-percent-overlap",
        dest="min_percent_overlap",
        type="float",
        help=
        "minimum percent overlap for splitting multiple alignment into components."
    )

    parser.add_option("--max-percent-overlap",
                      dest="max_percent_overlap",
                      type="float",
                      help="maximum percent overlap for split genes.")

    parser.add_option(
        "--min-genomic-distance",
        dest="min_genomic_distance",
        type="int",
        help=
        "minimum genomic distance for adjacent genes to be considered dodgy.")

    parser.add_option("-o",
                      "--mode",
                      dest="mode",
                      type="choice",
                      choices=("joining", "split"),
                      help="""how to filter the alignment.
                      joining: remove joining transcripts (spindly genes)
                      split:  remove split transcripts""")

    parser.add_option(
        "-g",
        "--gene-mode",
        dest="gene_mode",
        action="store_true",
        help=
        """the aligned sequences are genes. This forces the exon boundaries to
                      collated by genes.""")

    parser.set_defaults( \
        genome_master = None,
        filename_removed = None,
        filename_components = None,
        filename_summary = None,
        filename_exons = None,
        mode="joining",
        input_format = "fasta",
        output_format = "fasta",
        max_percent_overlap = 0,
        min_percent_overlap = 0,
        gene_mode = False,
        separator = "|")

    (options, args) = E.Start(parser)

    ###############################################################
    ###############################################################
    ###############################################################
    ## input
    ###############################################################

    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)
    all_identifiers = mali.getIdentifiers()

    if options.filename_exons:
        ## read exon boundaries and keep forward coordinates

        if options.gene_mode:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             from_zero=True)

            gene_exons = {}
            for id, ee in exons.items():
                data = id.split(options.separator)
                new_id = options.separator.join((data[0], data[2]))
                if new_id not in gene_exons: gene_exons[new_id] = []
                for e in ee:
                    e.mQueryToken = new_id
                gene_exons[new_id] += ee
            for id, ee in gene_exons.items():
                ee.sort(lambda x, y: cmp(x.mGenomeFrom, y.mGenomeFrom))
            exons = gene_exons

        else:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             filter=set(all_identifiers),
                                             from_zero=True)

    else:
        exons = {}

    ###############################################################
    ###############################################################
    ###############################################################
    ## collect all transcripts for a species together with their
    ## aligned length
    ###############################################################
    map_species2transcripts = {}

    for id in mali.getIdentifiers():
        data = id.split(options.separator)

        species = data[0]

        if exons:
            l = exons[id][-1].mGenomeTo - exons[id][0].mGenomeFrom
        else:
            l = len(mali.getEntry(id).getSequence())

        try:
            map_species2transcripts[species].append((l, id))
        except KeyError:
            map_species2transcripts[species] = [(l, id)]

    if options.mode == "joining":
        mapped_transcripts = removeJoiningTranscripts(mali, exons,
                                                      map_species2transcripts,
                                                      options)

    elif options.mode == "split":
        mapped_transcripts = removeSplitTranscripts(mali, exons,
                                                    map_species2transcripts,
                                                    options)

    ###############################################################
    ###############################################################
    ###############################################################
    ## now build overlap graph of remaining sequences split multiple
    ## alignment in components.
    ## Compute reciprocal best match graph
    ###############################################################
    graph = networkx.Graph()

    removed_transcripts = set(map(lambda x: x[0], mapped_transcripts))

    for t in all_identifiers:
        if t not in removed_transcripts:
            graph.add_node(t)

    for t1 in range(len(all_identifiers) - 1):
        transcript1 = all_identifiers[t1]
        if transcript1 in removed_transcripts: continue

        for t2 in range(t1 + 1, len(all_identifiers)):
            transcript2 = all_identifiers[t2]
            if transcript2 in removed_transcripts: continue

            overlap = getPercentOverlap(mali[transcript1], mali[transcript2])
            if overlap > 5:
                graph.add_edge(transcript1, transcript2)

    ## compute components
    components = networkx.connected_components(graph)

    ###############################################################
    ###############################################################
    ###############################################################
    ## output
    ###############################################################
    if options.filename_components:
        n = 1
        outfile = open(options.filename_components, "w")

        outfile.write("id\tcomponent\n")
        for component in components:
            for c in component:
                outfile.write("%s\t%i\n" % (c, n))
            n += 1
        outfile.close()

    if options.filename_removed and len(removed_transcripts) > 0:
        outfile = open(options.filename_removed, "w")
        outfile.write("removed\trepresentative\treason\n")
        for removed_transcript, rep_transcript, reason in mapped_transcripts:
            outfile.write("%s\t%s\t%s\n" %
                          (removed_transcript, rep_transcript, reason))
        outfile.close()

    if options.filename_summary:
        n = 1
        outfile = open(options.filename_summary, "w")
        outfile.write("component\tsize\tnspecies\tnmaster\n")
        for component in components:
            species = map(lambda x: x.split(options.separator)[0], component)
            outfile.write(
                "%i\t%i\t%i\t%i\t%i\n" %
                (n, len(component), len(species),
                 len(filter(lambda x: x == options.genome_master, species))))

            n += 1

    for transcript in removed_transcripts:
        mali.deleteEntry(transcript)

    new_identifiers = mali.getIdentifiers()

    mali.removeGaps(minimum_gaps=len(new_identifiers))

    mali.writeToFile(options.stdout, format=options.output_format)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# input=%i, output=%i, removed=%i, ncomponents=%i\n" %
            (len(all_identifiers), len(new_identifiers),
             len(removed_transcripts), len(components)))
        options.stdlog.write("# final component sizes: %s\n" %
                             ",".join(map(lambda x: str(len(x)), components)))

    E.Stop()