def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """Check whether the query aligns to at least one of the other tokens.

    Returns True as soon as a local alignment between the query peptide
    and one of the sbjct peptides scores above
    param_min_alignment_score; returns False if none does.

    NOTE(review): despite the historical wording "aligns to all others",
    the code returns True on the FIRST sufficiently scoring alignment,
    i.e. it checks "any", not "all".

    A query with no entry in peptide_sequences is accepted (True), as
    are sbjct tokens without sequences (they are skipped).
    """
    if param_loglevel >= 3:
        print "# checking query %s and sbjcts %s" % (query_token, str(other_tokens))
        sys.stdout.flush()

    # no sequence for the query: nothing to check, accept.
    if query_token not in peptide_sequences:
        return True

    result = alignlib_lite.makeAlignmentVector()
    # local Smith-Waterman-style alignment with gop=-10.0, gep=-1.0
    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL,
                                                  -10.0, -1.0)
    row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token])

    for x in other_tokens:
        # skip sbjcts without a known peptide sequence
        if x not in peptide_sequences:
            continue
        col_seq = alignlib_lite.makeSequence(peptide_sequences[x])
        alignator.align(result, row_seq, col_seq)
        if param_loglevel >= 5:
            print "# %s - %s = %f" % (query_token, x, result.getScore())
        # accept on the first alignment above threshold
        if result.getScore() > param_min_alignment_score:
            return True

    return False
def CheckAlignments(peptide_sequences, query_token, other_tokens): """check wether query aligns to all others. """ if param_loglevel >= 3: print "# checking query %s and sbjcts %s" % (query_token, str(other_tokens)) sys.stdout.flush() if query_token not in peptide_sequences: return True result = alignlib_lite.makeAlignmentVector() alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0) row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token]) for x in other_tokens: if x not in peptide_sequences: continue col_seq = alignlib_lite.makeSequence(peptide_sequences[x]) alignator.align(result, row_seq, col_seq) if param_loglevel >= 5: print "# %s - %s = %f" % (query_token, x, result.getScore()) if result.getScore() > param_min_alignment_score: return True return False
def FilterConflicts(old_predictions, new_predictions, removed_predictions, min_overlap, peptide_sequences): """remove conflicts. Remove overlapping entries between different queries. Only remove those sequences, which are alignable. If they are alignable, take the sequence with the highest score and highest coverage. (Take both, if score and coverage are not correlated.) """ ########################################################################## # sort predictions by genomic region if isinstance(old_predictions, PredictionFile.PredictionFile): old_predictions.sort( ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo')) else: old_predictions.sort(lambda x, y: cmp((x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.mSbjctGenomeTo), (y.mSbjctToken, y.mSbjctStrand, y.mSbjctGenomeFrom, y.mSbjctGenomeTo))) ########################################################################## # filter predictions and resolve conflicts based on genomic overlap # deleted segments are put in a temporary storage space. 
alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep) result = alignlib_lite.makeAlignmentVector() alignments = {} noverlaps = 0 nredundants = 0 nnew = 0 last_prediction = None for this_prediction in old_predictions: try: (this_query_peptide, this_query_status, this_query_gene, this_query_transcript) = \ re.split("\s+", this_prediction.mQueryToken) except ValueError: this_query_gene = None if not last_prediction: last_prediction = this_prediction last_query_gene = this_query_gene continue overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) -\ max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom) union = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) -\ min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom) # resolve overlap between different genes if overlap > 0 and \ (last_query_gene != this_query_gene or last_query_gene is None): noverlaps += 1 relative_overlap = 100 * overlap / union # Start conflict resolution, if overlap is above threshold. # Keep higher scoring segment. # # Check if queries are homologous. if relative_overlap >= param_max_percent_overlap: if peptide_sequences: if last_prediction.mQueryToken < this_prediction.mQueryToken: key = "%s-%s" % (last_prediction.mQueryToken, this_prediction.mQueryToken) else: key = "%s-%s" % (this_prediction.mQueryToken, last_prediction.mQueryToken) if not alignments.has_key(key): result.clear() alignator.align(result, alignlib_lite.makeSequence( peptide_sequences[this_prediction.mQueryToken]), alignlib_lite.makeSequence(peptide_sequences[last_prediction.mQueryToken])) alignments[key] = result.getScore() if result.getScore() >= param_min_score_overlap: nredundants += 1 if alignments[key] >= param_min_score_overlap: is_overlap = 1 else: is_overlap = 0 else: is_overlap = 1 else: is_overlap = 0 else: is_overlap = 0 if is_overlap: # take best prediction. 
If difference is very small, set # difference to 0 (difference does not matter). In this case, # the first prediction is taken. d1 = last_prediction.mQueryCoverage - \ this_prediction.mQueryCoverage if float(abs(d1)) / float(last_prediction.mQueryCoverage) < param_conflicts_min_difference: d1 = 0 d2 = last_prediction.score - this_prediction.score if float(abs(d2)) / float(this_prediction.score) < param_conflicts_min_difference: d2 = 0 if d1 >= 0 and d2 >= 0: if param_loglevel >= 2: print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (last_prediction.mPredictionId, last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom, overlap, relative_overlap, str(this_prediction)) if param_benchmarks: if CheckBenchmark(this_prediction, last_prediction): print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (overlap, relative_overlap, str(last_prediction)) removed_predictions.append(this_prediction) continue elif d1 <= 0 and d2 <= 0: if param_loglevel >= 2: print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (this_prediction.mPredictionId, this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom, overlap, relative_overlap, str(last_prediction)) if param_benchmarks: if CheckBenchmark(last_prediction, this_prediction): print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (overlap, relative_overlap, str(this_prediction)) removed_predictions.append(last_prediction) last_prediction = this_prediction last_query_gene = this_query_gene continue else: if param_loglevel >= 2: print "# CONFLICT: non-correlated score/coverage. 
Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \ (this_prediction.mPredictionId, this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom, this_prediction.score, this_prediction.mQueryCoverage, this_prediction.mPercentIdentity, last_prediction.mPredictionId, last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom, last_prediction.score, last_prediction.mQueryCoverage, last_prediction.mPercentIdentity) new_predictions.append(last_prediction) nnew += 1 last_query_gene = this_query_gene last_prediction = this_prediction new_predictions.append(last_prediction) nnew += 1 if param_loglevel >= 1: print "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \ (len(alignments), noverlaps, nredundants) return nnew
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2, peptides2, cds2): """sort out ortholog relationships between transcripts of orthologous genes. Orthologs have: the same number of exons compatible intron/exon boundaries For the remaining transcript pairs, take reciprocal bet hits. I see the following: 0: 0(100%), 1: 0(94%), 2: 0,1(100%) 0: 0(100%), 1: 0,1,2(100%) Selecting 1-0 first, would result in a suboptimal match, because one transcript is longer than the other, while matching up 0-0 and 2-1 would be better. Objective function: it is the maximal matching/assignment problem. Use greedy implementation instead. Assign as much as possible according to descending weights. """ alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) # for long sequence: use dot alignment with tuple size of three dottor = alignlib_lite.makeAlignatorTuples(3) alignator_dots = alignlib_lite.makeAlignatorDotsSquared( param_gop, param_gep, dottor) seqs1 = map(lambda x: alignlib_lite.makeSequence(peptides1[x[0]]), transcripts1) seqs2 = map(lambda x: alignlib_lite.makeSequence(peptides2[x[0]]), transcripts2) if param_loglevel >= 4: print "# building sequence 1" for i in range(len(seqs1)): if not cds1.has_key(transcripts1[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# building sequence 2" for i in range(len(seqs2)): if not cds2.has_key(transcripts2[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# all-vs-all alignment" # do all versus all alignment alis1 = [] alis2 = [] for i in range(len(seqs1)): alis1.append([]) for i in range(len(seqs2)): alis2.append([]) if param_loglevel >= 3: print "#################################" for i in range(len(seqs1)): for cd in cds1[transcripts1[i][0]]: print "#", str(cd) print "# versus" for i in range(len(seqs2)): for cd in cds2[transcripts2[i][0]]: print "#", str(cd) 
sys.stdout.flush() weights = {} for i in range(len(seqs1)): prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[ i] for j in range(len(seqs2)): prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[ j] map_a2b = alignlib_lite.makeAlignmentVector() m = seqs1[i].getLength() * seqs2[j].getLength() if param_loglevel >= 3: print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\ (i, j, prediction_id1, seqs1[ i].getLength(), prediction_id2, seqs2[j].getLength()) sys.stdout.flush() if m > param_max_matrix_size: # switch to tuple alignment if sequences are too large if param_loglevel >= 2: print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % ( seqs1[i].getLength(), seqs2[j].getLength()) sys.stdout.flush() alignator_dots.align(map_a2b, seqs1[i], seqs2[j]) else: alignator.align(map_a2b, seqs1[i], seqs2[j]) coverage_a = 100.0 * \ (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \ seqs1[i].getLength() coverage_b = 100.0 * \ (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \ seqs2[j].getLength() # get copy of cds, but only those overlapping with alignment c1 = Exons.GetExonsRange( cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3, (map_a2b.getRowTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) c2 = Exons.GetExonsRange( cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3, (map_a2b.getColTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) # check exon boundaries, look at starts, skip first exon def MyMap(a, x): while x <= a.getRowTo(): c = a.mapRowToCol(x) if c: return c x += 1 else: return 0 mapped_boundaries = map( lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:]) mapped_boundaries.sort() reference_boundaries = map(lambda x: x.mPeptideFrom / 3 + 1, c2[1:]) reference_boundaries.sort() nmissed_cmp2ref = Exons.CountMissedBoundaries( 
mapped_boundaries, reference_boundaries, param_boundaries_max_slippage) nmissed_ref2cmp = Exons.CountMissedBoundaries( reference_boundaries, mapped_boundaries, param_boundaries_max_slippage) min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp) # set is_ok for the whole thing # no intron: is ok is_ok = 0 if (len(c1) == 1 and len(c2) == 1): is_ok = 1 else: # allow for missed boundaries, if param_boundaries_allow_missed # > 0 if min_nmissed == 0: is_ok = 1 else: if param_boundaries_allow_missed and \ len(mapped_boundaries) >= param_boundaries_allow_missed and \ min_nmissed <= param_boundaries_max_missed: is_ok = 1 cc = min(coverage_a, coverage_b) if cc >= param_min_coverage: is_ok_coverage = 1 else: is_ok_coverage = 0 # check for missing introns is_ok_exons = 1 if abs(len(c1) - len(c2)) != 0: if param_missing_max_missing: if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or (min(len(c1), len(c2)) < param_missing_min_present)): is_ok_exons = 0 else: is_ok_exons = 0 if param_loglevel >= 3: print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \ "boundaries_ok=", is_ok, \ "nexons_ok=", is_ok_exons, \ "missed_c2r=", nmissed_cmp2ref, \ "missed_r2c=", nmissed_ref2cmp, \ "min_cov=", cc, \ "mapped=", mapped_boundaries, \ "reference=", reference_boundaries print "#", string.join( map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b), map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") sys.stdout.flush() # dump out pairs for method in param_write_pairs: if method == "all": print string.join( map(str, ("pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength(), map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b, nmissed_cmp2ref, mapped_boundaries, nmissed_ref2cmp, reference_boundaries, i, j, len(c1), len(c2), cc, is_ok, is_ok_exons, 
is_ok_coverage)), "\t") elif method == "alignment": print string.join( map(str, ("pair", method, prediction_id1, prediction_id2, map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") elif method == "location": print string.join( map(str, ("pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength())), "\t") if not is_ok_exons: if param_loglevel >= 4: print "# rejected %i and %i: too many exons difference." % ( i, j) continue if param_check_exon_boundaries: if not is_ok: continue if cc < param_min_coverage: continue if not weights.has_key(cc): weights[cc] = [] alis1[i].append((coverage_a, j)) alis2[j].append((coverage_b, i)) weights[cc].append((i, j, map_a2b)) # sort out alignments ww = weights.keys() ww.sort() ww.reverse() pairs = [] assigned1 = {} assigned2 = {} if param_loglevel >= 3: print "# alis1=", alis1 print "# alis2=", alis2 print "# --------------------------------------" for w in ww: for i, j, map_a2b in weights[w]: if not assigned1.has_key(i) and not assigned2.has_key(j): pairs.append((transcripts1[i], transcripts2[j], w, map_a2b)) assigned1[i] = 1 assigned2[j] = 1 if len(assigned1) == len(transcripts1): break if len(assigned2) == len(transcripts2): break return pairs
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome.") parser.add_option("-b", "--boundaries", dest="filename_boundaries", type="string", help="filename with exon boundaries.") parser.add_option("-e", "--exons", dest="filename_exons", type="string", help="filename with exons (output).") parser.add_option("-p", "--peptides", dest="filename_peptides", type="string", help="filename with peptide sequences.") parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true", help="print exons for predictions not found in reference.") parser.add_option("-q", "--quality-pide", dest="quality_threshold_pide", type="int", help="quality threshold (pide) for exons.") parser.set_defaults( genome_file="genome", filename_boundaries=None, filename_exons=None, filename_peptides=None, quality_threshold_pide=0, write_notfound=False, ## allowed number of nucleotides for exon boundaries to ## be considered equivalent. slipping_exon_boundary=9, ## stop codons to search for stop_codons=("TAG", "TAA", "TGA"), ) (options, args) = E.Start(parser, add_pipe_options=True) if len(args) > 0: print USAGE, "no arguments required." 
sys.exit(2) reference_exon_boundaries = {} if options.filename_boundaries: reference_exon_boundaries = Exons.ReadExonBoundaries(open( options.filename_boundaries, "r"), do_invert=1, remove_utr=1) E.info("read exon boundaries for %i queries" % len(reference_exon_boundaries)) if options.filename_exons: outfile_exons = open(options.filename_exons, "w") outfile_exons.write("%s\n" % "\t".join( ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame", "reference_id", "reference_from", "reference_to", "reference_phase", "pidentity", "psimilarity", "nframeshifts", "ngaps", "nstopcodons", "is_ok", "genome_exon_from", "genome_exon_to"))) else: outfile_exons = None if options.filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r")) E.info("read peptide sequences for %i queries" % len(peptide_sequences)) else: peptide_sequences = {} entry = PredictionParser.PredictionParserEntry() last_filename_genome = None nfound, nmissed_exons, nmissed_length = 0, 0, 0 nempty_alignments = 0 fasta = IndexedFasta.IndexedFasta(options.genome_file) options.stdout.write("%s\n" % "\t".join( ("prediction_id", "number", "dubious_exons", "boundaries_sum", "boundaries_max", "identical_exons", "inserted_exons", "deleted_exons", "inserted_introns", "deleted_introns", "truncated_Nterminus", "truncated_Cterminus", "deleted_Nexons", "deleted_Cexons", "inserted_Nexons", "inserted_Cexons"))) for line in sys.stdin: if line[0] == "#": continue try: entry.Read(line) except ValueError, msg: print "# parsing failed with msg %s in line %s" % (msg, line[:-1]) sys.exit(1) exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome, query_from=entry.mQueryFrom, sbjct_from=entry.mSbjctGenomeFrom, add_stop_codon=0) if exons[-1][4] != entry.mSbjctGenomeTo: print "# WARNING: discrepancy in exon calculation!!!" 
for e in exons: print "#", str(e) print "#", str(entry) if options.loglevel >= 5: for e in exons: print "#", str(e) genomic_fragment = fasta.getSequence(entry.mSbjctToken, entry.mSbjctStrand, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo) skip = False if peptide_sequences.has_key(entry.mQueryToken): query_sequence = alignlib_lite.makeSequence( peptide_sequences[entry.mQueryToken]) sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation) percent_similarity, percent_identity = 0, 0 if query_sequence.getLength( ) < entry.mMapPeptide2Translation.getRowTo(): print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken, query_sequence.getLength(), entry.mMapPeptide2Translation.getRowTo()) sys.stdout.flush() nmissed_length += 1 skip = True elif sbjct_sequence.getLength( ) < entry.mMapPeptide2Translation.getColTo(): print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken, sbjct_sequence.getLength(), entry.mMapPeptide2Translation.getColTo()) sys.stdout.flush() nmissed_length += 1 skip = True else: alignlib_lite.rescoreAlignment( entry.mMapPeptide2Translation, query_sequence, sbjct_sequence, alignlib_lite.makeScorer(query_sequence, sbjct_sequence)) percent_identity = alignlib_lite.calculatePercentIdentity( entry.mMapPeptide2Translation, query_sequence, sbjct_sequence) * 100 percent_similarity = alignlib_lite.calculatePercentSimilarity( entry.mMapPeptide2Translation) * 100 E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (str(entry.mPredictionId), entry.mPercentSimilarity, entry.mPercentIdentity, percent_similarity, percent_identity)) else: query_sequence = None sbjct_sequence = None # default values exons_num_exons = "na" exons_boundaries_sum = "na" exons_boundaries_max = "na" dubious_exons = "na" ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0 truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0 ndeleted_Nexons, 
ndeleted_Cexons = 0, 0 ninserted_Nexons, ninserted_Cexons = 0, 0 exons_offset = exons[0][3] if not reference_exon_boundaries.has_key(entry.mQueryToken): print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken) sys.stdout.flush() nmissed_exons += 1 skip = True if not skip: nfound += 1 ref_exons = reference_exon_boundaries[entry.mQueryToken] ref_exons_offset = ref_exons[0].mGenomeFrom exons_num_exons = len(ref_exons) - len(exons) exons_boundaries_sum = 0 exons_phase = 0 exons_boundaries_max = 0 dubious_exons = 0 inserted_exons = 0 temp_inserted_exons = 0 if options.loglevel >= 3: for e in exons: options.stdlog.write("# %s\n" % str(e)) for e in ref_exons: options.stdlog.write("# %s\n" % str(e)) min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100 in_sync = 0 e, r = 0, 0 while e < len(exons) and r < len(ref_exons): this_e, this_r = e + 1, r + 1 percent_identity = 0 percent_similarity = 0 is_good_exon = 0 if options.loglevel >= 4: options.stdlog.write("# current exons: %i and %i\n" % (e, r)) sys.stdout.flush() exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[ e][0:6] ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = ( ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo, ref_exons[r].frame, ref_exons[r].mGenomeFrom, ref_exons[r].mGenomeTo) ref_genome_from -= ref_exons_offset ref_genome_to -= ref_exons_offset ## get percent identity for exon exon_percent_identity = 0 exon_percent_similarity = 0 if query_sequence and sbjct_sequence: tmp_ali = alignlib_lite.makeAlignmentVector() xquery_from = exon_from / 3 xquery_to = exon_to / 3 alignlib_lite.copyAlignment(tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to) if tmp_ali.getLength() == 0: options.stdlog.write( "# WARNING: empty alignment %s\n" % str( (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to))) nempty_alignments += 1 else: if options.loglevel >= 5: options.stdlog.write("# %s\n" % str( 
alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence))) exon_percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali, query_sequence, sbjct_sequence) * 100 exon_percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali) * 100 if exon_percent_identity >= min_pide: is_good_exon = 1 else: is_good_exon = 0 if e < len(exons) - 1: (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e + 1][0:6] else: (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, [] if r < len(ref_exons) - 1: next_ref_from, next_ref_to, next_ref_phase = ( ref_exons[r + 1].mPeptideFrom, ref_exons[r + 1].mPeptideTo, ref_exons[r + 1].frame) else: next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0 if options.loglevel >= 2: options.stdlog.write("# %s\n" % "\t".join( map(str, (entry.mQueryToken, exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, ref_from, ref_to, ref_phase)))) sys.stdout.flush() # beware of small exons. # if less than options.slipping_exon_boundary: boundary is 0 # check if end is more than options.splipping_exon_boundary apart as well. 
if exon_to - exon_from <= options.slipping_exon_boundary or \ ref_to - ref_from <= options.slipping_exon_boundary: boundary = 0 else: boundary = options.slipping_exon_boundary if ref_to <= exon_from + boundary and \ ref_to <= exon_to - options.slipping_exon_boundary: ## no overlap is_good_exon = 0 if e == 0: ndeleted_Nexons += 1 else: ndeleted_exons += 1 r += 1 exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0 overlap = 0 elif exon_to <= ref_from + boundary and \ exon_to <= ref_to - options.slipping_exon_boundary: ## no overlap is_good_exon = 0 if r == 0: ninserted_Nexons += 1 else: ninserted_exons += 1 e += 1 ref_from, ref_to, ref_phase = 0, 0, 0 overlap = 0 else: ## overlap overlap = 1 dfrom = int(math.fabs(exon_from - ref_from)) dto = int(math.fabs(exon_to - ref_to)) ## get percent identity for overlapping fragment if query_sequence and sbjct_sequence: ## this the problem tmp_ali = alignlib_lite.makeAlignmentVector() xquery_from = max(ref_from / 3, exon_from / 3) xquery_to = min(ref_to / 3, exon_to / 3) alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to) if tmp_ali.getLength() == 0: options.stdlog.write( "# warning: empty alignment %s\n" % str( (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to))) percent_identity = 0 percent_similarity = 0 else: if options.loglevel >= 5: print str( alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence)) percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali, query_sequence, sbjct_sequence) * 100 percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali) * 100 if percent_identity >= min_pide: is_good_exon = 1 else: is_good_exon = 0 dubious_exons += 1 ## adjust regions for terminal exons if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0: if is_good_exon: truncated_Nterminal_exon = dfrom dfrom = 0 ## truncated terminal exons if e == len(exons) - 1 and r == len( ref_exons) - 1 
and dto <= ( entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0: if is_good_exon: truncated_Cterminal_exon = dto dto = 0 ## do not count deviations for terminal query exons if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0: dfrom = 0 if e == len(exons) - 1 and dto <= ( entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0: dto = 0 ## permit difference of one codon (assumed to be stop) if e == len(exons) - 1 and r == len( ref_exons) - 1 and dto == 3: dto = 0 ## deal with different boundary conditions: if dfrom == 0 and dto == 0: if is_good_exon: nidentical_exons += 1 e += 1 r += 1 ## next exon within this ref_exon elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary: if is_good_exon: ninserted_introns += 1 e += 1 in_sync = 1 dto = 0 ## next ref_exon within this exon elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary: if is_good_exon: ndeleted_introns += 1 r += 1 in_sync = 1 dto = 0 else: e += 1 r += 1 if in_sync: dfrom = 0 if is_good_exon: exons_boundaries_sum += dfrom + dto exons_boundaries_max = max(dfrom, exons_boundaries_max) exons_boundaries_max = max(dto, exons_boundaries_max) ########################################################### ## count inserted/deleted introns and misplaced boundaries ## ## if exon and next_exon in ref_exon: inserted intron ## if ref_exon and next_ref_exon in exon: deleted intron if outfile_exons: if genomic_fragment and exon_genome_to: nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom, exon_ali, genomic_fragment, border_stop_codon=0) else: nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0 if exon_to == 0: this_e = 0 if ref_to == 0: this_r = 0 outfile_exons.write( string.join( map(str, ( entry.mPredictionId, this_e, exon_from, exon_to, exon_phase, this_r, ref_from, ref_to, ref_phase, percent_identity, percent_similarity, 
nframeshifts, ngaps, nstopcodons, is_good_exon, exon_genome_from, exon_genome_to, )), "\t") + "\n") while e < len(exons): exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[ e][0:5] e += 1 ninserted_Cexons += 1 if outfile_exons: outfile_exons.write( string.join( map(str, ( entry.mPredictionId, e, exon_from, exon_to, exon_phase, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, exon_genome_from, exon_genome_to, )), "\t") + "\n") while r < len(ref_exons): ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = ( ref_exons[r].mPeptideFrom, ref_exons[r].mPeptideTo, ref_exons[r].frame, ref_exons[r].mGenomeFrom, ref_exons[r].mGenomeTo) ndeleted_Cexons += 1 ref_genome_from -= ref_exons_offset ref_genome_to -= ref_exons_offset r += 1 if outfile_exons: outfile_exons.write( string.join( map(str, ( entry.mPredictionId, 0, 0, 0, 0, r, ref_from, ref_to, ref_phase, 0, 0, 0, 0, 0, 0, 0, 0, )), "\t") + "\n") else: if options.write_notfound: this_e = 0 ## use prediction's identity/similarity for exons. 
## This will still then flag stop-codons in later analysis percent_identity = entry.mPercentIdentity percent_similarity = entry.mPercentSimilarity for exon in exons: this_e += 1 exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[ 0:6] if genomic_fragment: nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom, exon_ali, genomic_fragment) outfile_exons.write( string.join( map(str, ( entry.mPredictionId, this_e, exon_from, exon_to, exon_phase, 0, 0, 0, 0, percent_identity, percent_similarity, nframeshifts, ngaps, nstopcodons, 1, exon_genome_from, exon_genome_to, )), "\t") + "\n") options.stdout.write("\t".join( map(str, (entry.mPredictionId, exons_num_exons, dubious_exons, exons_boundaries_sum, exons_boundaries_max, nidentical_exons, ninserted_exons, ndeleted_exons, ninserted_introns, ndeleted_introns, truncated_Nterminal_exon, truncated_Cterminal_exon, ndeleted_Nexons, ndeleted_Cexons, ninserted_Nexons, ninserted_Cexons))) + "\n")
alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) for q1, q2 in pairs: ninput += 1 if param_loglevel >= 1: print "# processing %s and %s" % (q1, q2) if q1 in transcripts1 and q2 in transcripts2: map_a2b = alignlib_lite.makeAlignmentVector() alignator.align(map_a2b, alignlib_lite.makeSequence(peptides1[q1]), alignlib_lite.makeSequence(peptides2[q2])) if map_a2b.getLength() == 0: if param_loglevel >= 1: print "# Alignment failed between %s and %s" % (q1, q2) sys.stdout.flush() ntotal_errors += 1 continue nerrors, nskipped = WriteExons(q1, peptides1[q1], cds1[q1], transcripts1[q1], q2, peptides2[q2], cds2[q2], transcripts2[q2], map_a2b)
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2, transcript2, peptide_map_a2b): if param_loglevel >= 3: for cd in cds1: print "#", str(cd) for cd in cds2: print "#", str(cd) print "# peptide_map_a2b", str( alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)) sys.stdout.flush() dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2) if len(cds1) != len(cds2): if param_loglevel >= 4: print "" # WARNING: different number of exons!" seq1 = alignlib_lite.makeSequence(transcript1) seq2 = alignlib_lite.makeSequence(transcript2) tmp_map_a2b = alignlib_lite.makeAlignmentVector() dialign = WrapperDialign.Dialign("-n") dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8") dba = WrapperDBA.DBA() #clustal = WrapperClustal.Clustal() matrix, gop, gep = global_substitution_matrix alignator_nw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix) alignator_sw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix) # concatenated alignments for exons: # 1: only the common parts ali_common1 = "" ali_common2 = "" e1, e2 = 0, 0 while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom(): e1 += 1 while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom(): e2 += 1 nskipped, nerrors = 0, 0 if param_loglevel >= 5: nmapped = 0 for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1): if dna_map_a2b.mapRowToCol(x) >= 0: nmapped += 1 print "# nmapped=", nmapped print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)) # declare alignments used map_intron_a2b = alignlib_lite.makeAlignmentVector() result = Exons.CompareGeneStructures(cds1, cds2, map_cmp2ref=peptide_map_a2b) if param_loglevel >= 2: print result.Pretty("#") nskipped_exons, nskipped_introns = 0, 0 last_e1, last_e2 = None, None for link in result.mEquivalences: if link.mCoverage <= param_min_exon_coverage: nskipped_exons += 1 continue e1, e2 = link.mId1, link.mId2 c1 = cds1[e1] c2 = cds2[e2] 
exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo] exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo] ####################################################################### # write unaligned exons if param_write_exons: pair = AlignedPairs.UnalignedPair() pair.mCategory = "exon" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) pair.mLen1 = len(exon_fragment1) pair.mSequence1 = exon_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum2 = len(cds2) pair.mLen2 = len(exon_fragment2) pair.mSequence2 = exon_fragment2 pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo, pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo, print str(pair) sys.stdout.flush() ####################################################################### # build alignment for overlap of both exons # tmp_map_a2b.clear() # alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b, # c1.mGenomeFrom + 1, c1.mGenomeTo ) # if param_loglevel >= 5: # print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo) # for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"): # print "#", x # if tmp_map_a2b.getLength() == 0: # if param_loglevel >= 1: # print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \ ## (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2) # print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\ ## peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(peptide_map_a2b) # print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\ ## dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(dna_map_a2b) # for cd in cds1: print "##", str(cd) # for cd in cds2: print "##", str(cd) ## nerrors += 1 # continue ## data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b ).split("\n")) # if "caligned" in param_write_exons : # print 
"exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1, ## token2, e2, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) ## ali_common1 += data[0][1] ## ali_common2 += data[1][1] ####################################################################### # write alignment of introns for orthologous introns # orthologous introns are between orthologous exons if param_write_introns: if last_e1 is not None: if e1 - last_e1 != 1 or e2 - last_e2 != 1: nskipped_introns += 1 else: pair = AlignedPairs.UnalignedPair() intron_from1 = cds1[e1 - 1].mGenomeTo intron_to1 = cds1[e1].mGenomeFrom intron_from2 = cds2[e2 - 1].mGenomeTo intron_to2 = cds2[e2].mGenomeFrom intron_fragment1 = transcript1[intron_from1:intron_to1] intron_fragment2 = transcript2[intron_from2:intron_to2] if len(intron_fragment1) == 0 or len( intron_fragment2) == 0: print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\ (intron_from1, intron_to1, len(transcript1), intron_from2, intron_to2, len(transcript2)) continue pair.mCategory = "intron" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) - 1 pair.mLen1 = len(intron_fragment1) pair.mFrom1 = intron_from1 pair.mTo1 = intron_to1 pair.mSequence1 = intron_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum1 = len(cds2) - 1 pair.mLen2 = len(intron_fragment2) pair.mFrom2 = intron_from2 pair.mTo2 = intron_to2 pair.mSequence2 = intron_fragment2 if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \ (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \ (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \ (param_max_intron_length and len(intron_fragment2) > param_max_intron_length): if param_loglevel >= 1: print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\ (token1, e1, token2, e2, len(intron_fragment1), len(intron_fragment2)) sys.stdout.flush() nskipped += 1 
print str(pair) # else: ## anchored_from1 = intron_from1 - param_extend_introns ## anchored_to1 = intron_to1 + param_extend_introns ## anchored_from2 = intron_from2 - param_extend_introns ## anchored_to2 = intron_to2 + param_extend_introns ## anchored_fragment1 = transcript1[anchored_from1:anchored_to1] ## anchored_fragment2 = transcript2[anchored_from2:anchored_to2] # for method in param_write_introns: # if param_loglevel >= 2: # print "## aligning with method %s" % method # sys.stdout.flush # map_intron_a2b.clear() # if method == "unaligned": ## from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2 # elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"): ## tmp_intron_a2b = alignlib_lite.makeAlignmentVector() # if param_loglevel >= 1: # print "# aligning with method %s two fragments of length %i and %i" % (method, # len(anchored_fragment1), # len(anchored_fragment2)) # sys.stdout.flush() # if method == "dialigned": ## result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dialignedlgs": ## result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dbaligned": ## result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "clusaligned": ## result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # if not result or result.getLength() == 0: # if param_loglevel >= 1: # print "# Error: empty intron alignment" # sys.stdout.flush() ## nerrors += 1 # continue ## tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 ) # alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b, ## intron_from1 + 1, intron_to1, # intron_from2 + 1, intron_to2 ) # elif method == "nwaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignator_nw.Align( seq1, seq2, map_intron_a2b ) # 
seq1.useFullLength() # seq2.useFullLength() # elif method == "swaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw ) # seq1.useFullLength() # seq2.useFullLength() # else: ## raise "unknown method %s" % method # if map_intron_a2b.getLength() > 0: # if param_compress: ## from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo() ## from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo() ## ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b ) # else: # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b ).split("\n")) # if len(data) < 2: ## data=[ ( 0, "", 0), (0, "", 0)] ## from1, ali1, to1 = data[0] ## from2, ali2, to2 = data[1] # print string.join(map(str, ("intron", # method, ## token1, e1, len(cds1) - 1, len(intron_fragment1), ## token2, e2, len(cds2) - 1, len(intron_fragment2), # map_intron_a2b.getNumGaps(), # map_intron_a2b.getLength(), ## map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(), ## from1, to1, ali1, ## from2, to2, ali2, ## intron_from1, intron_to1, # intron_from2, intron_to2)), "\t") # sys.stdout.flush() last_e1, last_e2 = e1, e2 ########################################################################## # write concatenated exons # for method in param_write_exons: # if method == "common": # print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## 0, 0, ## 0, 0, # ali_common1, ali_common2 ) # elif method == "exons": # Write full alignment without gaps. # This will not care about exon boundaries and gaps. 
# data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) # try: ## from1, s1, to1, from2, s2, to2 = data[0] + data[1] # except ValueError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # except IndexError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # if from1: # if len(s1) != len(s2): # print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2)) ## nerrors += 1 ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" # else: ## a1, a2 = [], [] # for x in range( min(len(s1), len(s2)) ): # if s1[x] != "-" and s2[x] != "-": ## a1.append( s1[x] ) ## a2.append( s2[x] ) ## s1 = string.join(a1, "") ## s2 = string.join(a2, "") # print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0, ## token2, 0, ## from1, to1, ## from2, to2, # s1, s2 ) ) # elif method == "full": # write full alignment (do not care about exon boundaries) # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) ## if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)] # print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) if param_loglevel >= 3: print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons, nskipped_introns) return nerrors, nskipped
def ProcessRegion(predictions, region_id, region,
                  peptide_sequences=None,
                  filter_queries=None):
    """process a set of matches to a region.

    Resolve a region according to homology: predictions are sorted by
    descending score and clustered by single linkage.  Two predictions
    join the same cluster when they overlap on the genomic sequence and
    their peptide sequences align with sufficient score, coverage and
    identity (thresholds taken from the module-level ``options``).
    Pairwise alignment verdicts are memoized in ``global_alignments``.

    Each cluster is written out via PrintEdges() under a fresh
    region_id.  Predictions whose query token appears in
    *filter_queries* are counted as skipped instead of emitted.

    If *peptide_sequences* is empty or None, nothing is resolved or
    printed.

    returns (region_id, noutput, nskipped) where region_id is the last
    region id used.
    """
    # avoid a shared mutable default argument; {} was only ever read
    if filter_queries is None:
        filter_queries = {}

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write(
            "# resolving %i predictions in region %s\n" % (
                len(predictions), str(region)))
        sys.stdout.flush()

    # highest-scoring prediction first: it becomes the cluster seed
    predictions.sort(lambda x, y: cmp(x.score, y.score))
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    # map_sequence2cluster[i] == i <=> prediction i is still unclustered
    map_sequence2cluster = range(0, len(predictions))

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):

            if options.loglevel >= 5:
                options.stdlog.write("# filtering from %i with prediction %i: %s\n" % (
                    x, predictions[x].mPredictionId,
                    predictions[x].mQueryToken))
                sys.stdout.flush()

            if map_sequence2cluster[x] != x:
                # already absorbed into an earlier cluster
                continue

            region_id += 1
            edges = []

            if predictions[x].mQueryToken not in filter_queries:
                edges.append(predictions[x])
            else:
                nskipped += 1

            for y in range(x + 1, len(predictions)):

                if map_sequence2cluster[y] != y:
                    continue

                # canonical (sorted) key for the memoization cache
                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken,
                                     predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken,
                                     predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic
                # sequence
                if min(predictions[x].mSbjctGenomeTo, predictions[y].mSbjctGenomeTo) - \
                        max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" % (
                            predictions[x].mPredictionId,
                            predictions[y].mPredictionId))
                        sys.stdout.flush()
                    continue

                # 'key in dict' replaces the deprecated dict.has_key()
                if key not in global_alignments:
                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]

                    result.clear()
                    s1 = alignlib_lite.makeSequence(seq1)
                    s2 = alignlib_lite.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    # NOTE: Python 2 integer division - coverages are
                    # truncated to whole percent
                    c1 = 100 * \
                        (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * \
                        (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib_lite.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are
                    # homologous
                    if result.getScore() >= options.overlap_min_score and \
                            max_cov >= options.overlap_max_coverage and \
                            min_cov >= options.overlap_min_coverage and \
                            identity >= options.overlap_min_identity:
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" % (
                            key,
                            result.getScore(),
                            identity, c1, c2, min_cov, max_cov,
                            global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:
                        edges.append(predictions[y])
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version = "%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $", usage = globals()["__doc__"]) parser.add_option( "-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic data (indexed)." ) parser.add_option( "-c", "--cds", dest="filename_cds", type="string", help="filename with cds seguences." ) parser.add_option( "-f", "--format", dest="format", type="choice", choices=("paired_fasta", ), help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format" ) parser.set_defaults( genome_file = "genome", filename_cds = "cds.fasta", format = "paired_fasta", filename_suffix = ".fasta", filename_prefix = "", ) (options, args) = E.Start( parser, add_psql_options = True ) if len(args) > 0: print USAGE, "no arguments required." sys.exit(1) fasta = IndexedFasta.IndexedFasta( options.genome_file ) ## reading CDS sequences if options.filename_cds: cds_sequences = Genomics.ReadPeptideSequences( open(options.filename_cds, "r") ) else: cds_sequences = {} if options.loglevel >= 1: options.stdlog.write( "# read %i CDS sequences\n" % len(cds_sequences) ) last_filename_genome = None p = PredictionParser.PredictionParserEntry() ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0 for line in options.stdin: if line[0] == "#": continue if line[0] == '"': continue p.Read(line) ninput += 1 genomic_fragment = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo ) if len(genomic_fragment) == 0: raise "ERROR: empty fragment %s:%s for line" % (p.mSbjctGenomeFrom, p.mSbjctGenomeTo), line try: cds_fragment = cds_sequences[p.mQueryToken] except KeyError: options.stdlog.write( "# ERROR: cds not found: query %s.\n" % p.mQueryToken ) continue map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA( 
p.mMapPeptide2Genome, query_from = p.mQueryFrom, sbjct_from = 0, genome = genomic_fragment ) ## check for errors: if map_query2sbjct.getRowTo() != p.mQueryTo * 3: options.stdlog.write( "# ERROR: boundary shift in query at line %s\n# %i %i\n" % (line, map_query2sbjct.getRowTo(), p.mQueryTo * 3 ) ) if map_query2sbjct.getColTo() > len(genomic_fragment): options.stdlog.write( "# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %\ (line, len(genomic_fragment), map_query2sbjct.getColTo()) ) options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment )) nlength += 1 continue if map_query2sbjct.getRowTo() > len(cds_fragment): options.stdlog.write( "# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %\ (line, len(cds_fragment), map_query2sbjct.getRowTo()) ) options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment )) nlength += 1 continue cds_seq = alignlib_lite.makeSequence( cds_fragment ) genomic_seq = alignlib_lite.makeSequence( genomic_fragment ) f = alignlib_lite.AlignmentFormatExplicit( map_query2sbjct, cds_seq, genomic_seq ) row_ali = f.mRowAlignment col_ali = f.mColAlignment row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(row_ali, col_ali) row_ali = Genomics.MaskStopCodons( row_ali ) col_ali = Genomics.MaskStopCodons( col_ali ) if len(row_ali) != len(col_ali): options.stdlog.write( "# ERROR: wrong alignment lengths.\n" ) sys.exit(1) if len(row_ali) % 3 or len(col_ali) % 3: options.stdlog.write( "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line ) options.stdlog.write( "# %6i %s\n# %6i %s\n" % (len(row_ali), str(row_ali), len(col_ali), str(col_ali) ) ) n3 += 1 input = re.sub( "[-X]", "", p.mTranslation ) ref = re.sub( "[-X]", "", Genomics.TranslateDNA2Protein( col_ali ) ) if input != ref: if options.loglevel >= 1: options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# 
%6i %s\n" % (p.mPredictionId, p.mQueryToken, len(input), input, len(ref), ref ) ) nsanity += 1 continue options.stdout.write( ">%s\n%s\n" % (p.mPredictionId, row_ali) ) options.stdout.write( ">%s_vs_%s_%s_%i_%i\n%s\n" % \ (p.mQueryToken, p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali) ) noutput += 1 if options.loglevel >= 1: options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (ninput, noutput, nsanity, nlength, n3) ) E.Stop()
if r.mGenomeTo < t.mGenomeFrom: rr += 1 continue elif t.mGenomeTo < r.mGenomeFrom: tt += 1 continue overlap += (min(r.mGenomeTo, t.mGenomeTo) - max(r.mGenomeFrom, t.mGenomeFrom)) rr += 1 tt += 1 if overlap == 0: continue map_reference2target.clear() row = alignlib_lite.makeSequence(reference.mTranslation) col = alignlib_lite.makeSequence(target.mTranslation) alignator.align(map_reference2target, row, col) f = alignlib_lite.AlignmentFormatEmissions(map_reference2target) row_ali, col_ali = f.mRowAlignment, f.mColAlignment pidentity = 100.0 * \ alignlib_lite.calculatePercentIdentity( map_reference2target, row, col) psimilarity = 100.0 * \ alignlib_lite.calculatePercentSimilarity(map_reference2target) union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ min(reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom) inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ max(reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom)
def EliminateRedundantEntries(rep,
                              data,
                              eliminated_predictions,
                              options,
                              peptides,
                              extended_peptides,
                              filter_quality=None,
                              this_quality=None):
    """eliminate redundant entries in a set.

    *rep* is the representative prediction; *data* are the candidate
    member predictions.  A member is eliminated (recorded in
    *eliminated_predictions* as member_id -> rep_id and returned in the
    result list) with one of these codes:

    * "i": its extended peptide is identical to the representative's
    * "p": its extended peptide is a substring of the representative's
    * "h": sufficiently identical by alignment (>= options.min_identity)
    * "l": identical above options.min_identity_non_genes, where the
      representative is a gene and the member is not

    Members that look better than the representative (higher coverage /
    pid within the safety margins, too many gaps, or low member
    coverage of the alignment) are kept with a warning instead.

    returns the list of (member_id, code) tuples.

    NOTE(review): *filter_quality* is accepted but never read in this
    function - presumably filtering happens in the caller; confirm.
    """
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL,
                                                  options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = (entry.transcript_id,
                                                      entry.mQueryCoverage,
                                                      entry.mPid,
                                                      entry.mQuality,)

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write("# processing: id=%s class=%s\n" %
                                 (mem_id, mem_quality))

        # already eliminated in an earlier call/iteration
        if mem_id in eliminated_predictions:
            continue

        if mem_extended_seq == rep_extended_seq:
            # identical extended peptide
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "i"))

        elif mem_extended_seq in rep_extended_seq:
            # member is contained in the representative (substring)
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "p"))

        else:
            if mem_quality != this_quality or \
                    mem_quality in options.quality_exclude_same:

                seq1 = alignlib_lite.makeSequence(str(rep_seq))
                seq2 = alignlib_lite.makeSequence(str(mem_seq))

                alignator.align(result, seq1, seq2)

                if options.loglevel >= 5:
                    options.stdlog.write("# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(
                        result, seq1, seq2))

                pidentity = 100 * \
                    alignlib_lite.calculatePercentIdentity(
                        result, seq1, seq2)

                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write("# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                                         (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

                if pidentity >= options.min_identity:

                    keep = False
                    # do not eliminate members that might be better
                    # predictions than the representative
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    elif num_gaps >= options.max_gaps and \
                            mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    elif mem_coverage >= rep_coverage - options.safety_coverage and \
                            100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                        # NOTE: Python 2 integer division - the member
                        # coverage percentage is truncated
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "h"))

                elif pidentity >= options.min_identity_non_genes and \
                        this_quality in options.quality_genes and \
                        mem_quality not in options.quality_genes:

                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "l"))

    return eliminated
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2, transcript2, peptide_map_a2b): if param_loglevel >= 3: for cd in cds1: print "#", str(cd) for cd in cds2: print "#", str(cd) print "# peptide_map_a2b", str(alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)) sys.stdout.flush() dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2) if len(cds1) != len(cds2): if param_loglevel >= 4: print "" # WARNING: different number of exons!" seq1 = alignlib_lite.makeSequence(transcript1) seq2 = alignlib_lite.makeSequence(transcript2) tmp_map_a2b = alignlib_lite.makeAlignmentVector() dialign = WrapperDialign.Dialign("-n") dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8") dba = WrapperDBA.DBA() #clustal = WrapperClustal.Clustal() matrix, gop, gep = global_substitution_matrix alignator_nw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix) alignator_sw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix) # concatenated alignments for exons: # 1: only the common parts ali_common1 = "" ali_common2 = "" e1, e2 = 0, 0 while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom(): e1 += 1 while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom(): e2 += 1 nskipped, nerrors = 0, 0 if param_loglevel >= 5: nmapped = 0 for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1): if dna_map_a2b.mapRowToCol(x) >= 0: nmapped += 1 print "# nmapped=", nmapped print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)) # declare alignments used map_intron_a2b = alignlib_lite.makeAlignmentVector() result = Exons.CompareGeneStructures( cds1, cds2, map_cmp2ref=peptide_map_a2b) if param_loglevel >= 2: print result.Pretty("#") nskipped_exons, nskipped_introns = 0, 0 last_e1, last_e2 = None, None for link in result.mEquivalences: if link.mCoverage <= param_min_exon_coverage: nskipped_exons += 1 continue e1, e2 = link.mId1, link.mId2 c1 = cds1[e1] c2 = cds2[e2] 
exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo] exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo] ####################################################################### # write unaligned exons if param_write_exons: pair = AlignedPairs.UnalignedPair() pair.mCategory = "exon" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) pair.mLen1 = len(exon_fragment1) pair.mSequence1 = exon_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum2 = len(cds2) pair.mLen2 = len(exon_fragment2) pair.mSequence2 = exon_fragment2 pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo, pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo, print str(pair) sys.stdout.flush() ####################################################################### # build alignment for overlap of both exons # tmp_map_a2b.clear() # alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b, # c1.mGenomeFrom + 1, c1.mGenomeTo ) # if param_loglevel >= 5: # print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo) # for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"): # print "#", x # if tmp_map_a2b.getLength() == 0: # if param_loglevel >= 1: # print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \ ## (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2) # print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\ ## peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(peptide_map_a2b) # print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\ ## dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(dna_map_a2b) # for cd in cds1: print "##", str(cd) # for cd in cds2: print "##", str(cd) ## nerrors += 1 # continue ## data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b ).split("\n")) # if "caligned" in param_write_exons : # print 
"exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1, ## token2, e2, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) ## ali_common1 += data[0][1] ## ali_common2 += data[1][1] ####################################################################### # write alignment of introns for orthologous introns # orthologous introns are between orthologous exons if param_write_introns: if last_e1 is not None: if e1 - last_e1 != 1 or e2 - last_e2 != 1: nskipped_introns += 1 else: pair = AlignedPairs.UnalignedPair() intron_from1 = cds1[e1 - 1].mGenomeTo intron_to1 = cds1[e1].mGenomeFrom intron_from2 = cds2[e2 - 1].mGenomeTo intron_to2 = cds2[e2].mGenomeFrom intron_fragment1 = transcript1[intron_from1:intron_to1] intron_fragment2 = transcript2[intron_from2:intron_to2] if len(intron_fragment1) == 0 or len(intron_fragment2) == 0: print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\ (intron_from1, intron_to1, len(transcript1), intron_from2, intron_to2, len(transcript2)) continue pair.mCategory = "intron" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) - 1 pair.mLen1 = len(intron_fragment1) pair.mFrom1 = intron_from1 pair.mTo1 = intron_to1 pair.mSequence1 = intron_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum1 = len(cds2) - 1 pair.mLen2 = len(intron_fragment2) pair.mFrom2 = intron_from2 pair.mTo2 = intron_to2 pair.mSequence2 = intron_fragment2 if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \ (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \ (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \ (param_max_intron_length and len(intron_fragment2) > param_max_intron_length): if param_loglevel >= 1: print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\ (token1, e1, token2, e2, len(intron_fragment1), len(intron_fragment2)) sys.stdout.flush() nskipped += 1 
print str(pair) # else: ## anchored_from1 = intron_from1 - param_extend_introns ## anchored_to1 = intron_to1 + param_extend_introns ## anchored_from2 = intron_from2 - param_extend_introns ## anchored_to2 = intron_to2 + param_extend_introns ## anchored_fragment1 = transcript1[anchored_from1:anchored_to1] ## anchored_fragment2 = transcript2[anchored_from2:anchored_to2] # for method in param_write_introns: # if param_loglevel >= 2: # print "## aligning with method %s" % method # sys.stdout.flush # map_intron_a2b.clear() # if method == "unaligned": ## from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2 # elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"): ## tmp_intron_a2b = alignlib_lite.makeAlignmentVector() # if param_loglevel >= 1: # print "# aligning with method %s two fragments of length %i and %i" % (method, # len(anchored_fragment1), # len(anchored_fragment2)) # sys.stdout.flush() # if method == "dialigned": ## result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dialignedlgs": ## result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dbaligned": ## result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "clusaligned": ## result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # if not result or result.getLength() == 0: # if param_loglevel >= 1: # print "# Error: empty intron alignment" # sys.stdout.flush() ## nerrors += 1 # continue ## tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 ) # alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b, ## intron_from1 + 1, intron_to1, # intron_from2 + 1, intron_to2 ) # elif method == "nwaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignator_nw.Align( seq1, seq2, map_intron_a2b ) # 
seq1.useFullLength() # seq2.useFullLength() # elif method == "swaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw ) # seq1.useFullLength() # seq2.useFullLength() # else: ## raise "unknown method %s" % method # if map_intron_a2b.getLength() > 0: # if param_compress: ## from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo() ## from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo() ## ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b ) # else: # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b ).split("\n")) # if len(data) < 2: ## data=[ ( 0, "", 0), (0, "", 0)] ## from1, ali1, to1 = data[0] ## from2, ali2, to2 = data[1] # print string.join(map(str, ("intron", # method, ## token1, e1, len(cds1) - 1, len(intron_fragment1), ## token2, e2, len(cds2) - 1, len(intron_fragment2), # map_intron_a2b.getNumGaps(), # map_intron_a2b.getLength(), ## map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(), ## from1, to1, ali1, ## from2, to2, ali2, ## intron_from1, intron_to1, # intron_from2, intron_to2)), "\t") # sys.stdout.flush() last_e1, last_e2 = e1, e2 ########################################################################## # write concatenated exons # for method in param_write_exons: # if method == "common": # print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## 0, 0, ## 0, 0, # ali_common1, ali_common2 ) # elif method == "exons": # Write full alignment without gaps. # This will not care about exon boundaries and gaps. 
# data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) # try: ## from1, s1, to1, from2, s2, to2 = data[0] + data[1] # except ValueError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # except IndexError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # if from1: # if len(s1) != len(s2): # print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2)) ## nerrors += 1 ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" # else: ## a1, a2 = [], [] # for x in range( min(len(s1), len(s2)) ): # if s1[x] != "-" and s2[x] != "-": ## a1.append( s1[x] ) ## a2.append( s2[x] ) ## s1 = string.join(a1, "") ## s2 = string.join(a2, "") # print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0, ## token2, 0, ## from1, to1, ## from2, to2, # s1, s2 ) ) # elif method == "full": # write full alignment (do not care about exon boundaries) # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) ## if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)] # print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) if param_loglevel >= 3: print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons, nskipped_introns) return nerrors, nskipped
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2, peptides2, cds2): """sort out ortholog relationships between transcripts of orthologous genes. Orthologs have: the same number of exons compatible intron/exon boundaries For the remaining transcript pairs, take reciprocal bet hits. I see the following: 0: 0(100%), 1: 0(94%), 2: 0,1(100%) 0: 0(100%), 1: 0,1,2(100%) Selecting 1-0 first, would result in a suboptimal match, because one transcript is longer than the other, while matching up 0-0 and 2-1 would be better. Objective function: it is the maximal matching/assignment problem. Use greedy implementation instead. Assign as much as possible according to descending weights. """ alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) # for long sequence: use dot alignment with tuple size of three dottor = alignlib_lite.makeAlignatorTuples(3) alignator_dots = alignlib_lite.makeAlignatorDotsSquared( param_gop, param_gep, dottor) seqs1 = map(lambda x: alignlib_lite.makeSequence( peptides1[x[0]]), transcripts1) seqs2 = map(lambda x: alignlib_lite.makeSequence( peptides2[x[0]]), transcripts2) if param_loglevel >= 4: print "# building sequence 1" for i in range(len(seqs1)): if not cds1.has_key(transcripts1[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# building sequence 2" for i in range(len(seqs2)): if not cds2.has_key(transcripts2[i][0]): if param_loglevel >= 4: print "# %s not found" % transcripts1[i][0] if param_loglevel >= 4: print "# all-vs-all alignment" # do all versus all alignment alis1 = [] alis2 = [] for i in range(len(seqs1)): alis1.append([]) for i in range(len(seqs2)): alis2.append([]) if param_loglevel >= 3: print "#################################" for i in range(len(seqs1)): for cd in cds1[transcripts1[i][0]]: print "#", str(cd) print "# versus" for i in range(len(seqs2)): for cd in cds2[transcripts2[i][0]]: print "#", str(cd) 
sys.stdout.flush() weights = {} for i in range(len(seqs1)): prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[ i] for j in range(len(seqs2)): prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[ j] map_a2b = alignlib_lite.makeAlignmentVector() m = seqs1[i].getLength() * seqs2[j].getLength() if param_loglevel >= 3: print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\ (i, j, prediction_id1, seqs1[ i].getLength(), prediction_id2, seqs2[j].getLength()) sys.stdout.flush() if m > param_max_matrix_size: # switch to tuple alignment if sequences are too large if param_loglevel >= 2: print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (seqs1[i].getLength(), seqs2[j].getLength()) sys.stdout.flush() alignator_dots.align(map_a2b, seqs1[i], seqs2[j]) else: alignator.align(map_a2b, seqs1[i], seqs2[j]) coverage_a = 100.0 * \ (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \ seqs1[i].getLength() coverage_b = 100.0 * \ (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \ seqs2[j].getLength() # get copy of cds, but only those overlapping with alignment c1 = Exons.GetExonsRange(cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3, (map_a2b.getRowTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) c2 = Exons.GetExonsRange(cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3, (map_a2b.getColTo()) * 3 + 1, full=False, min_overlap=param_min_alignment_exon_overlap, min_exon_size=param_min_exon_size) # check exon boundaries, look at starts, skip first exon def MyMap(a, x): while x <= a.getRowTo(): c = a.mapRowToCol(x) if c: return c x += 1 else: return 0 mapped_boundaries = map( lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:]) mapped_boundaries.sort() reference_boundaries = map( lambda x: x.mPeptideFrom / 3 + 1, c2[1:]) reference_boundaries.sort() nmissed_cmp2ref = Exons.CountMissedBoundaries( 
mapped_boundaries, reference_boundaries, param_boundaries_max_slippage) nmissed_ref2cmp = Exons.CountMissedBoundaries( reference_boundaries, mapped_boundaries, param_boundaries_max_slippage) min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp) # set is_ok for the whole thing # no intron: is ok is_ok = 0 if (len(c1) == 1 and len(c2) == 1): is_ok = 1 else: # allow for missed boundaries, if param_boundaries_allow_missed # > 0 if min_nmissed == 0: is_ok = 1 else: if param_boundaries_allow_missed and \ len(mapped_boundaries) >= param_boundaries_allow_missed and \ min_nmissed <= param_boundaries_max_missed: is_ok = 1 cc = min(coverage_a, coverage_b) if cc >= param_min_coverage: is_ok_coverage = 1 else: is_ok_coverage = 0 # check for missing introns is_ok_exons = 1 if abs(len(c1) - len(c2)) != 0: if param_missing_max_missing: if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or (min(len(c1), len(c2)) < param_missing_min_present)): is_ok_exons = 0 else: is_ok_exons = 0 if param_loglevel >= 3: print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \ "boundaries_ok=", is_ok, \ "nexons_ok=", is_ok_exons, \ "missed_c2r=", nmissed_cmp2ref, \ "missed_r2c=", nmissed_ref2cmp, \ "min_cov=", cc, \ "mapped=", mapped_boundaries, \ "reference=", reference_boundaries print "#", string.join(map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b), map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") sys.stdout.flush() # dump out pairs for method in param_write_pairs: if method == "all": print string.join(map(str, ( "pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[ i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[ j].getLength(), map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b, nmissed_cmp2ref, mapped_boundaries, nmissed_ref2cmp, reference_boundaries, i, j, len(c1), len(c2), cc, is_ok, is_ok_exons, 
is_ok_coverage)), "\t") elif method == "alignment": print string.join(map(str, ( "pair", method, prediction_id1, prediction_id2, map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(), map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t") elif method == "location": print string.join(map(str, ( "pair", method, prediction_id1, prediction_id2, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[ i].getLength(), sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength())), "\t") if not is_ok_exons: if param_loglevel >= 4: print "# rejected %i and %i: too many exons difference." % (i, j) continue if param_check_exon_boundaries: if not is_ok: continue if cc < param_min_coverage: continue if not weights.has_key(cc): weights[cc] = [] alis1[i].append((coverage_a, j)) alis2[j].append((coverage_b, i)) weights[cc].append((i, j, map_a2b)) # sort out alignments ww = weights.keys() ww.sort() ww.reverse() pairs = [] assigned1 = {} assigned2 = {} if param_loglevel >= 3: print "# alis1=", alis1 print "# alis2=", alis2 print "# --------------------------------------" for w in ww: for i, j, map_a2b in weights[w]: if not assigned1.has_key(i) and not assigned2.has_key(j): pairs.append((transcripts1[i], transcripts2[j], w, map_a2b)) assigned1[i] = 1 assigned2[j] = 1 if len(assigned1) == len(transcripts1): break if len(assigned2) == len(transcripts2): break return pairs
alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0) for q1, q2 in pairs: ninput += 1 if param_loglevel >= 1: print "# processing %s and %s" % (q1, q2) if q1 in transcripts1 and q2 in transcripts2: map_a2b = alignlib_lite.makeAlignmentVector() alignator.align(map_a2b, alignlib_lite.makeSequence(peptides1[q1]), alignlib_lite.makeSequence(peptides2[q2])) if map_a2b.getLength() == 0: if param_loglevel >= 1: print "# Alignment failed between %s and %s" % (q1, q2) sys.stdout.flush() ntotal_errors += 1 continue nerrors, nskipped = WriteExons(q1, peptides1[q1], cds1[q1], transcripts1[q1], q2, peptides2[q2], cds2[ q2], transcripts2[q2], map_a2b)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene predictions from stdin, extracts the matching genomic and CDS
    fragments, builds the peptide-to-genome alignment as a pair of gapped
    strings, sanity-checks the translation and writes the pair as two FASTA
    records to stdout.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("-c", "--cds-gtf-file", dest="filename_cds", type="string",
                      help="filename with cds seguences.")

    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("paired_fasta", ),
                      help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format")

    parser.set_defaults(
        genome_file="genome",
        filename_cds="cds.fasta",
        format="paired_fasta",
        filename_suffix=".fasta",
        filename_prefix="",
    )

    (options, args) = E.Start(parser, add_database_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(1)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # reading CDS sequences
    if options.filename_cds:
        cds_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_cds, "r"))
    else:
        cds_sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i CDS sequences\n" % len(cds_sequences))

    last_filename_genome = None

    p = PredictionParser.PredictionParserEntry()

    # counters: read, written, failed sanity check, length mismatch,
    # alignment length not a multiple of 3
    ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0

    for line in options.stdin:

        # skip comment lines and quoted header lines
        if line[0] == "#":
            continue
        if line[0] == '"':
            continue

        p.Read(line)

        ninput += 1

        genomic_fragment = fasta.getSequence(p.mSbjctToken, p.mSbjctStrand,
                                             p.mSbjctGenomeFrom, p.mSbjctGenomeTo)

        # NOTE(review): Python 2 string exception — raising a string fails
        # with TypeError on Python 2.6+; should be a real exception class.
        if len(genomic_fragment) == 0:
            raise "ERROR: empty fragment %s:%s for line" % (
                p.mSbjctGenomeFrom, p.mSbjctGenomeTo), line

        try:
            cds_fragment = cds_sequences[p.mQueryToken]
        except KeyError:
            options.stdlog.write(
                "# ERROR: cds not found: query %s.\n" % p.mQueryToken)
            continue

        map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA(p.mMapPeptide2Genome,
                                                                    query_from=p.mQueryFrom,
                                                                    sbjct_from=0,
                                                                    genome=genomic_fragment)

        # check for errors:
        if map_query2sbjct.getRowTo() != p.mQueryTo * 3:
            options.stdlog.write("# ERROR: boundary shift in query at line %s\n# %i %i\n" % (
                line, map_query2sbjct.getRowTo(), p.mQueryTo * 3))

        if map_query2sbjct.getColTo() > len(genomic_fragment):
            options.stdlog.write("# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %
                                 (line, len(genomic_fragment), map_query2sbjct.getColTo()))
            options.stdlog.write(
                "# cds %s\n# genomic %s\n" % (str(cds_fragment), genomic_fragment))
            nlength += 1
            continue

        if map_query2sbjct.getRowTo() > len(cds_fragment):
            options.stdlog.write("# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %
                                 (line, len(cds_fragment), map_query2sbjct.getRowTo()))
            options.stdlog.write(
                "# cds %s\n# genomic %s\n" % (str(cds_fragment), genomic_fragment))
            nlength += 1
            continue

        cds_seq = alignlib_lite.makeSequence(cds_fragment)
        genomic_seq = alignlib_lite.makeSequence(genomic_fragment)

        # render the alignment as two gapped strings (row = cds, col = genome)
        f = alignlib_lite.AlignmentFormatExplicit(
            map_query2sbjct, cds_seq, genomic_seq)
        row_ali = f.mRowAlignment
        col_ali = f.mColAlignment

        row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(
            row_ali, col_ali)

        row_ali = Genomics.MaskStopCodons(row_ali)
        col_ali = Genomics.MaskStopCodons(col_ali)

        if len(row_ali) != len(col_ali):
            options.stdlog.write("# ERROR: wrong alignment lengths.\n")
            sys.exit(1)

        # NOTE(review): n3 is counted here but the entry is NOT skipped —
        # non-multiple-of-3 alignments are still written out. TODO confirm
        # this is intended.
        if len(row_ali) % 3 or len(col_ali) % 3:
            options.stdlog.write(
                "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line)
            options.stdlog.write("# %6i %s\n# %6i %s\n" % (
                len(row_ali), str(row_ali),
                len(col_ali), str(col_ali)))
            n3 += 1

        # sanity check: the back-translated genomic alignment must reproduce
        # the predicted translation (gaps and masked X residues ignored).
        # NOTE(review): 'input' shadows the builtin.
        input = re.sub("[-X]", "", p.mTranslation)
        ref = re.sub("[-X]", "", Genomics.TranslateDNA2Protein(col_ali))

        if input != ref:
            if options.loglevel >= 1:
                options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# %6i %s\n" % (p.mPredictionId,
                                                                                                  p.mQueryToken,
                                                                                                  len(input),
                                                                                                  input,
                                                                                                  len(ref),
                                                                                                  ref))
            nsanity += 1
            continue

        options.stdout.write(">%s\n%s\n" % (p.mPredictionId, row_ali))
        options.stdout.write(">%s_vs_%s_%s_%i_%i\n%s\n" % (
            p.mQueryToken, p.mSbjctToken, p.mSbjctStrand,
            p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali))
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (
            ninput, noutput, nsanity, nlength, n3))

    E.Stop()
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene predictions from stdin and compares each prediction's exon
    structure against reference exon boundaries, counting identical,
    inserted, deleted and truncated exons/introns; writes one summary row
    per prediction and optionally a per-exon table.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
                                    usage = globals()["__doc__"] )

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome." )

    parser.add_option( "-b", "--boundaries", dest="filename_boundaries", type="string",
                       help="filename with exon boundaries." )

    parser.add_option( "-e", "--exons", dest="filename_exons", type="string",
                       help="filename with exons (output)." )

    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide sequences." )

    parser.add_option( "-w", "--write-notfound", dest="write_notfound", action="store_true",
                       help="print exons for predictions not found in reference." )

    parser.add_option( "-q", "--quality-pide", dest="quality_threshold_pide", type="int",
                       help="quality threshold (pide) for exons." )

    parser.set_defaults(
        genome_file = "genome",
        filename_boundaries = None,
        filename_exons = None,
        filename_peptides = None,
        quality_threshold_pide = 0,
        write_notfound = False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary = 9,
        ## stop codons to search for
        stop_codons = ("TAG", "TAA", "TGA"), )

    (options, args) = E.Start( parser, add_pipe_options = True )

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    # reference exon boundaries, keyed by query token
    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries( open( options.filename_boundaries, "r"),
                                                              do_invert = 1,
                                                              remove_utr = 1)
        E.info( "read exon boundaries for %i queries" % len(reference_exon_boundaries) )

    if options.filename_exons:
        outfile_exons = open( options.filename_exons, "w")
        outfile_exons.write( "%s\n" % "\t".join( (
            "prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
            "reference_id", "reference_from", "reference_to", "reference_phase",
            "pidentity", "psimilarity",
            "nframeshifts", "ngaps", "nstopcodons",
            "is_ok", "genome_exon_from", "genome_exon_to") ) )
    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r") )
        E.info("read peptide sequences for %i queries" % len(peptide_sequences) )
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta( options.genome_file )

    # per-prediction summary header
    options.stdout.write( "%s\n" % "\t".join( (
        "prediction_id",
        "number",
        "dubious_exons",
        "boundaries_sum",
        "boundaries_max",
        "identical_exons",
        "inserted_exons",
        "deleted_exons",
        "inserted_introns",
        "deleted_introns",
        "truncated_Nterminus",
        "truncated_Cterminus",
        "deleted_Nexons",
        "deleted_Cexons",
        "inserted_Nexons",
        "inserted_Cexons" ) ) )

    for line in sys.stdin:

        if line[0] == "#": continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        # exon list derived from the peptide-to-genome alignment:
        # tuples of (peptide_from, peptide_to, phase, genome_from,
        # genome_to, alignment) — presumably; verify against
        # Genomics.Alignment2ExonBoundaries
        exons = Genomics.Alignment2ExonBoundaries( entry.mMapPeptide2Genome,
                                                   query_from = entry.mQueryFrom,
                                                   sbjct_from = entry.mSbjctGenomeFrom,
                                                   add_stop_codon = 0 )

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence( entry.mSbjctToken, entry.mSbjctStrand,
                                              entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo )

        skip = False
        if peptide_sequences.has_key( entry.mQueryToken ):

            query_sequence = alignlib_lite.makeSequence(peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            # sequences shorter than the alignment extent indicate stale
            # input; skip those predictions
            if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % ( entry.mQueryToken,
                                                                             query_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % ( entry.mSbjctToken,
                                                                             sbjct_sequence.getLength(),
                                                                             entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                # rescore the stored alignment against the actual sequences
                alignlib_lite.rescoreAlignment( entry.mMapPeptide2Translation,
                                                query_sequence,
                                                sbjct_sequence,
                                                alignlib_lite.makeScorer( query_sequence, sbjct_sequence ) )
                percent_identity = alignlib_lite.calculatePercentIdentity( entry.mMapPeptide2Translation,
                                                                           query_sequence,
                                                                           sbjct_sequence ) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity( entry.mMapPeptide2Translation ) * 100

            E.debug( "prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" % (
                str(entry.mPredictionId),
                entry.mPercentSimilarity,
                entry.mPercentIdentity,
                percent_similarity,
                percent_identity ) )

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0,0,0,0,0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0,0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key( entry.mQueryToken ):
            print "# WARNING: sequence %s has no exon boundaries" % ( entry.mQueryToken )
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write( "# %s\n" % str(e) )
                for e in ref_exons:
                    options.stdlog.write( "# %s\n" % str(e) )

            # minimum exon pide to count an exon as "good"
            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            # walk both exon lists in parallel with cursors e (prediction)
            # and r (reference)
            in_sync = 0
            e,r = 0,0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e+1, r+1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write( "# current exons: %i and %i\n" % (e, r) )
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write( "# WARNING: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write( "# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) ) )
                        exon_percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali,
                                                                                        query_sequence,
                                                                                        sbjct_sequence ) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali ) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                # look-ahead values for the next exon on either side
                if e < len(exons) -1 :
                    (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e+1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase, next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (ref_exons[r+1].mPeptideFrom,
                                                                  ref_exons[r+1].mPeptideTo,
                                                                  ref_exons[r+1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write( "# %s\n" % "\t".join( map(str, (entry.mQueryToken,
                                                                          exon_from, exon_to, exon_phase,
                                                                          exon_genome_from, exon_genome_to,
                                                                          ref_from, ref_to, ref_phase ))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.splipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap: reference exon lies before prediction exon
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                     exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap: prediction exon lies before reference exon
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1

                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max( ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment( tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to )

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write( "# warning: empty alignment %s\n" % str((ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to )))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str( alignlib_lite.AlignmentFormatExplicit( tmp_ali, query_sequence, sbjct_sequence ) )
                            percent_identity = alignlib_lite.calculatePercentIdentity( tmp_ali,
                                                                                       query_sequence,
                                                                                       sbjct_sequence ) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity( tmp_ali ) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons)-1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons)-1 and r == len(ref_exons)-1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon:
                            nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max( dfrom, exons_boundaries_max )
                        exons_boundaries_max = max( dto, exons_boundaries_max )

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment,
                                                                                                                       border_stop_codon = 0 )
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    # exon id 0 flags a side with no exon in this row
                    if exon_to == 0: this_e = 0
                    if ref_to == 0: this_r = 0
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                this_r, ref_from, ref_to, ref_phase,
                                                                percent_identity,
                                                                percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                is_good_exon,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

            # trailing prediction exons beyond the last reference exon
            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                0, 0,
                                                                0, 0, 0,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

            # trailing reference exons beyond the last prediction exon
            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (ref_exons[r].mPeptideFrom,
                                                                               ref_exons[r].mPeptideTo,
                                                                               ref_exons[r].frame,
                                                                               ref_exons[r].mGenomeFrom,
                                                                               ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1

                if outfile_exons:
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                0, 0, 0, 0,
                                                                r, ref_from, ref_to, ref_phase,
                                                                0, 0,
                                                                0, 0, 0,
                                                                0,
                                                                0, 0,
                                                                )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                # NOTE(review): this branch writes to outfile_exons without
                # checking it is not None, and the Genomics.CountGeneFeatures
                # results are unbound when genomic_fragment is empty — both
                # would raise if exercised; TODO confirm.
                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures( exon_genome_from - entry.mSbjctGenomeFrom,
                                                                                                                       exon_ali,
                                                                                                                       genomic_fragment )
                    outfile_exons.write( string.join( map(str, (entry.mPredictionId,
                                                                this_e, exon_from, exon_to, exon_phase,
                                                                0, 0, 0, 0,
                                                                percent_identity, percent_similarity,
                                                                nframeshifts, ngaps, nstopcodons,
                                                                1,
                                                                exon_genome_from, exon_genome_to,
                                                                )), "\t") + "\n")

        # one summary row per prediction
        options.stdout.write( "\t".join(map(str, (entry.mPredictionId,
                                                  exons_num_exons,
                                                  dubious_exons,
                                                  exons_boundaries_sum,
                                                  exons_boundaries_max,
                                                  nidentical_exons,
                                                  ninserted_exons,
                                                  ndeleted_exons,
                                                  ninserted_introns,
                                                  ndeleted_introns,
                                                  truncated_Nterminal_exon,
                                                  truncated_Cterminal_exon,
                                                  ndeleted_Nexons,
                                                  ndeleted_Cexons,
                                                  ninserted_Nexons,
                                                  ninserted_Cexons))) + "\n" )
def PrintCluster(cluster, cluster_id, lengths, peptide_sequences=None, regex_preferred=None): """print a cluster. Take longest sequence as representative. If preferred is given, only take genes matching preferred identifier. """ if regex_preferred: rx = re.compile(regex_preferred) else: rx = None max_al = 0 max_pl = 0 rep_a = None rep_p = None for c in cluster: l = 0 if c in lengths: l = lengths[c] if l > max_al: max_al = l rep_a = c if rx and rx.search(c) and l > max_pl: max_pl = l rep_p = c if max_pl > 0: max_l = max_pl rep = rep_p else: max_l = max_al rep = rep_a for mem in cluster: l = 0 if mem in lengths: l = lengths[mem] if peptide_sequences: map_rep2mem = alignlib_lite.makeAlignmentVector() if rep == mem and rep in lengths: alignlib_lite.addDiagonal2Alignment( map_rep2mem, 1, lengths[rep], 0) elif mem in peptide_sequences and \ rep in peptide_sequences: alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0) alignator.align(map_rep2mem, alignlib_lite.makeSequence( peptide_sequences[rep]), alignlib_lite.makeSequence(peptide_sequences[mem])) f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem) print string.join(map(str, (rep, mem, l, f)), "\t") else: print string.join(map(str, (rep, mem, l)), "\t") sys.stdout.flush() return cluster_id
def FilterConflicts(old_predictions, new_predictions, removed_predictions, min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries.

    Only remove those sequences, which are alignable. If they are
    alignable, take the sequence with the highest score and highest
    coverage. (Take both, if score and coverage are not correlated.)

    old_predictions: input predictions, sorted in place by genomic region.
    new_predictions: list receiving the kept predictions.
    removed_predictions: list receiving the discarded predictions.
    min_overlap: unused in the visible body (kept for interface
        compatibility).
    peptide_sequences: mapping query token -> peptide sequence; when given,
        conflicting queries are only treated as redundant if their peptides
        align above param_min_score_overlap.

    Returns the number of predictions appended to new_predictions.
    """
    ##########################################################################
    # sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(
            ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp((x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.mSbjctGenomeTo),
                                              (y.mSbjctToken, y.mSbjctStrand, y.mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()
    # cache of pairwise peptide alignment scores, keyed by sorted token pair
    alignments = {}
    noverlaps = 0
    nredundants = 0
    nnew = 0
    last_prediction = None

    for this_prediction in old_predictions:
        # extract the gene id from the query token; tokens without four
        # whitespace-separated fields get no gene id
        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                re.split("\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if not last_prediction:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
            max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
            min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        # resolve overlap between different genes
        if overlap > 0 and \
                (last_query_gene != this_query_gene or last_query_gene == None):

            noverlaps += 1

            # NOTE(review): integer division under Python 2, so
            # relative_overlap is truncated to whole percent
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:
                if peptide_sequences:
                    # canonical key: lexicographically smaller token first
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken, this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken, last_prediction.mQueryToken)

                    if not alignments.has_key(key):
                        result.clear()
                        alignator.align(result,
                                        alignlib_lite.makeSequence(peptide_sequences[this_prediction.mQueryToken]),
                                        alignlib_lite.makeSequence(peptide_sequences[last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                    else:
                        is_overlap = 0
                else:
                    # no peptides available: treat any large overlap as conflict
                    is_overlap = 1
            else:
                is_overlap = 0
        else:
            is_overlap = 0

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - this_prediction.mQueryCoverage
            # NOTE(review): d1 is normalized by last's coverage but d2 by
            # this's score — looks inconsistent; confirm intent before changing.
            if float(abs(d1)) / float(last_prediction.mQueryCoverage) < param_conflicts_min_difference:
                d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if float(abs(d2)) / float(this_prediction.score) < param_conflicts_min_difference:
                d2 = 0
            if d1 >= 0 and d2 >= 0:
                # last wins (or tie): drop this_prediction
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        last_prediction.mPredictionId,
                        last_prediction.mQueryToken,
                        last_prediction.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(this_prediction))
                # NOTE(review): original indentation was lost; the benchmark
                # check is assumed to sit beside the loglevel check — confirm.
                if param_benchmarks:
                    if CheckBenchmark(this_prediction, last_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(last_prediction))
                removed_predictions.append(this_prediction)
                continue
            elif d1 <= 0 and d2 <= 0:
                # this wins: drop last_prediction and advance
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        this_prediction.mPredictionId,
                        this_prediction.mQueryToken,
                        this_prediction.mSbjctGenomeFrom,
                        overlap, relative_overlap,
                        str(last_prediction))
                if param_benchmarks:
                    if CheckBenchmark(last_prediction, this_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(this_prediction))
                removed_predictions.append(last_prediction)
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                # score and coverage disagree: keep both predictions
                if param_loglevel >= 2:
                    print "# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                        (this_prediction.mPredictionId,
                         this_prediction.mQueryToken,
                         this_prediction.mSbjctGenomeFrom,
                         this_prediction.score,
                         this_prediction.mQueryCoverage,
                         this_prediction.mPercentIdentity,
                         last_prediction.mPredictionId,
                         last_prediction.mQueryToken,
                         last_prediction.mSbjctGenomeFrom,
                         last_prediction.score,
                         last_prediction.mQueryCoverage,
                         last_prediction.mPercentIdentity)

        # no conflict (or both kept): emit last and advance the window
        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    # flush the final pending prediction
    new_predictions.append(last_prediction)
    nnew += 1

    if param_loglevel >= 1:
        print "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
            (len(alignments), noverlaps, nredundants)

    return nnew
print "#", cds_fragment print "# genomic" print "#", genomic_fragment continue if map_query2sbjct.getRowTo() > len(cds_fragment): print "# ERROR: length mismatch: cds fragment (%i) shorter than last aligned residue (%i)" %\ (len(cds_fragment), map_query2sbjct.getRowTo()) print "#", line print "# cds" print "#", cds_fragment print "# genomic" print "#", genomic_fragment continue cds_seq = alignlib_lite.makeSequence(cds_fragment) genomic_seq = alignlib_lite.makeSequence(genomic_fragment) data = map(lambda x: string.split(x, "\t"), string.split(alignlib_lite.writePairAlignment(cds_seq, genomic_seq, map_query2sbjct), "\n")) row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment( data[0][1], data[1][1]) row_ali = Genomics.MaskStopCodons(row_ali) col_ali = Genomics.MaskStopCodons(col_ali) if len(row_ali) != len(col_ali): print "# ERROR: wrong alignment lengths."
if r.mGenomeTo < t.mGenomeFrom: rr += 1 continue elif t.mGenomeTo < r.mGenomeFrom: tt += 1 continue overlap += (min(r.mGenomeTo, t.mGenomeTo) - max(r.mGenomeFrom, t.mGenomeFrom)) rr += 1 tt += 1 if overlap == 0: continue map_reference2target.clear() row = alignlib_lite.makeSequence(reference.mTranslation) col = alignlib_lite.makeSequence(target.mTranslation) alignator.align(map_reference2target, row, col) f = alignlib_lite.AlignmentFormatEmissions(map_reference2target) row_ali, col_ali = f.mRowAlignment, f.mColAlignment pidentity = 100.0 * alignlib_lite.calculatePercentIdentity( map_reference2target, row, col) psimilarity = 100.0 * alignlib_lite.calculatePercentSimilarity( map_reference2target) union = max( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ min( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom ) inter = min( reference.mSbjctGenomeTo, target.mSbjctGenomeTo) - \ max( reference.mSbjctGenomeFrom, target.mSbjctGenomeFrom )
def ProcessRegion(predictions, region_id, region, peptide_sequences=None, filter_queries={}):
    """process a set of matches to a region.

    resolve region according to homology: predictions are greedily clustered
    by pairwise peptide homology (alignment score, coverage and identity
    thresholds from options), and each cluster's edges are written out via
    PrintEdges.

    predictions: matches falling into this region; sorted in place by
        descending score.
    region_id: running cluster counter; incremented per cluster started.
    region: region descriptor, used for logging and passed to PrintEdges.
    peptide_sequences: mapping query token -> peptide; without it nothing
        is clustered or printed.
    filter_queries: query tokens to exclude from output (counted as skipped).
        NOTE(review): mutable default argument — benign here since it is
        only read, but fragile if the function ever mutates it.

    Returns (region_id, noutput, nskipped).
    """
    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write("# resolving %i predictions in region %s\n" % (
            len(predictions), str(region)))
        sys.stdout.flush()

    # process highest-scoring predictions first so the best match seeds
    # each cluster
    predictions.sort(lambda x, y: cmp(x.score, y.score))
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    cluster = []

    # map_sequence2cluster[i] == i marks an unclustered prediction; joining
    # a cluster stores the seed's index instead
    map_sequence2cluster = range(0, len(predictions))
    edges = []

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):
            if options.loglevel >= 5:
                options.stdlog.write("# filtering from %i with prediction %i: %s\n" % (
                    x, predictions[x].mPredictionId, predictions[x].mQueryToken))
                sys.stdout.flush()

            if map_sequence2cluster[x] != x:
                # already assigned to an earlier cluster
                continue

            region_id += 1
            edges = []

            if predictions[x].mQueryToken not in filter_queries:
                edges.append(predictions[x])
            else:
                nskipped += 1

            for y in range(x + 1, len(predictions)):

                if map_sequence2cluster[y] != y:
                    continue

                # canonical cache key: lexicographically smaller token first
                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken, predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken, predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic sequence
                if min(predictions[x].mSbjctGenomeTo, predictions[y].mSbjctGenomeTo) - \
                        max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" % (
                                predictions[x].mPredictionId, predictions[y].mPredictionId))
                        sys.stdout.flush()
                    continue

                # global_alignments is a module-level cache of homology
                # verdicts shared across regions
                if not global_alignments.has_key(key):

                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]
                    result.clear()
                    s1 = alignlib_lite.makeSequence(seq1)
                    s2 = alignlib_lite.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    # coverage of each sequence by the aligned range, in
                    # whole percent (Python 2 integer division)
                    c1 = 100 * (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib_lite.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are homologous
                    if result.getScore() >= options.overlap_min_score and \
                            max_cov >= options.overlap_max_coverage and \
                            min_cov >= options.overlap_min_coverage and \
                            identity >= options.overlap_min_identity:
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" % (
                                key, result.getScore(), identity, c1, c2,
                                min_cov, max_cov, global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    # homologous: fold y into x's cluster
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:
                        edges.append(predictions[y])
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
def PrintCluster(cluster, cluster_id, lengths, peptide_sequences=None, regex_preferred=None): """print a cluster. Take longest sequence as representative. If preferred is given, only take genes matching preferred identifier. """ if regex_preferred: rx = re.compile(regex_preferred) else: rx = None max_al = 0 max_pl = 0 rep_a = None rep_p = None for c in cluster: l = 0 if c in lengths: l = lengths[c] if l > max_al: max_al = l rep_a = c if rx and rx.search(c) and l > max_pl: max_pl = l rep_p = c if max_pl > 0: max_l = max_pl rep = rep_p else: max_l = max_al rep = rep_a for mem in cluster: l = 0 if mem in lengths: l = lengths[mem] if peptide_sequences: map_rep2mem = alignlib_lite.makeAlignmentVector() if rep == mem and rep in lengths: alignlib_lite.addDiagonal2Alignment(map_rep2mem, 1, lengths[rep], 0) elif mem in peptide_sequences and \ rep in peptide_sequences: alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0) alignator.align( map_rep2mem, alignlib_lite.makeSequence(peptide_sequences[rep]), alignlib_lite.makeSequence(peptide_sequences[mem])) f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem) print string.join(map(str, (rep, mem, l, f)), "\t") else: print string.join(map(str, (rep, mem, l)), "\t") sys.stdout.flush() return cluster_id
def EliminateRedundantEntries(rep,
                              data,
                              eliminated_predictions,
                              options,
                              peptides,
                              extended_peptides,
                              filter_quality=None,
                              this_quality=None):
    """eliminate redundant entries in a set.

    Compare each member of data against the representative rep and mark
    members as redundant when their (extended) peptide is identical ("i"),
    a substring ("p"), a sufficiently identical homolog ("h"), or a
    lower-quality near-match of a gene-quality representative ("l").

    rep: representative prediction; members are compared against it.
    data: candidate predictions to test for redundancy.
    eliminated_predictions: mapping member id -> representative id; updated
        in place and also used to skip already-eliminated members.
    options: provides gop/gep, loglevel, identity/coverage thresholds and
        quality class sets.
    peptides / extended_peptides: mappings transcript id -> sequence.
    filter_quality: unused in the visible body (kept for interface
        compatibility).
    this_quality: quality class of the representative.

    Returns a list of (member_id, code) tuples for the eliminated members.
    """
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = (entry.transcript_id,
                                                      entry.mQueryCoverage,
                                                      entry.mPid,
                                                      entry.mQuality)

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write("# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        if mem_extended_seq == rep_extended_seq:
            # identical extended peptide
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "i"))
        elif mem_extended_seq in rep_extended_seq:
            # member is a proper substring of the representative
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "p"))
        else:
            # different sequence: decide by alignment, but only across
            # quality classes unless the class allows same-class removal
            if mem_quality != this_quality or \
                    mem_quality in options.quality_exclude_same:

                seq1 = alignlib_lite.makeSequence(str(rep_seq))
                seq2 = alignlib_lite.makeSequence(str(mem_seq))
                alignator.align(result, seq1, seq2)

                if options.loglevel >= 5:
                    options.stdlog.write("# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(result, seq1, seq2))

                pidentity = 100 * alignlib_lite.calculatePercentIdentity(result, seq1, seq2)

                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write("# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %
                                         (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage))

                if pidentity >= options.min_identity:

                    keep = False
                    # safety checks: do not remove a member that might be
                    # better than the representative
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    elif num_gaps >= options.max_gaps and \
                            mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    elif mem_coverage >= rep_coverage - options.safety_coverage and \
                            100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        # homologous and safely worse: eliminate
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "h"))

                elif pidentity >= options.min_identity_non_genes and \
                        this_quality in options.quality_genes and \
                        mem_quality not in options.quality_genes:
                    # lower identity accepted when a gene-quality rep
                    # absorbs a non-gene member
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                            rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write("# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %
                                             (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid))
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "l"))

    return eliminated