Example #1
def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """check wether query aligns to all others.
    """

    if param_loglevel >= 3:
        print "# checking query %s and sbjcts %s" % (query_token,
                                                     str(other_tokens))
        sys.stdout.flush()

    if query_token not in peptide_sequences:
        return True

    result = alignlib_lite.makeAlignmentVector()
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
    row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token])

    for x in other_tokens:
        if x not in peptide_sequences:
            continue
        col_seq = alignlib_lite.makeSequence(peptide_sequences[x])
        alignator.align(result, row_seq, col_seq)
        if param_loglevel >= 5:
            print "# %s - %s = %f" % (query_token, x, result.getScore())
        if result.getScore() > param_min_alignment_score:
            return True

    return False
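
A minimal sketch of the same early-exit pattern, with a toy scorer standing in for the alignlib_lite local alignment; toy_score, any_alignable, and the 80.0 threshold are invented for this illustration.

def toy_score(a, b):
    # crude stand-in: percent of matching positions over the shorter sequence
    return 100.0 * sum(x == y for x, y in zip(a, b)) / min(len(a), len(b))

def any_alignable(sequences, query, others, min_score=80.0):
    # mirrors CheckAlignments: a missing query counts as alignable, and the
    # first score above the threshold is enough
    if query not in sequences:
        return True
    for other in others:
        if other not in sequences:
            continue
        if toy_score(sequences[query], sequences[other]) > min_score:
            return True
    return False

seqs = {"q": "MKTAYIAKQR", "s1": "MKTAYIAKQQ", "s2": "GGGGGGGGGG"}
assert any_alignable(seqs, "q", ["s1", "s2"])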
Example #2
def CheckAlignments(peptide_sequences, query_token, other_tokens):
    """check wether query aligns to all others.
    """

    if param_loglevel >= 3:
        print "# checking query %s and sbjcts %s" % (query_token, str(other_tokens))
        sys.stdout.flush()

    if query_token not in peptide_sequences:
        return True

    result = alignlib_lite.makeAlignmentVector()
    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL,
                                                  -10.0, -1.0)
    row_seq = alignlib_lite.makeSequence(peptide_sequences[query_token])

    for x in other_tokens:
        if x not in peptide_sequences:
            continue
        col_seq = alignlib_lite.makeSequence(peptide_sequences[x])
        alignator.align(result, row_seq, col_seq)
        if param_loglevel >= 5:
            print "# %s - %s = %f" % (query_token, x, result.getScore())
        if result.getScore() > param_min_alignment_score:
            return True

    return False
Example #3
def FilterConflicts(old_predictions, new_predictions, removed_predictions,
                    min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries.

    Only remove those sequences that are alignable.

    If they are alignable, take the sequence with the highest score and highest
    coverage (take both if score and coverage are not correlated).
    """
    ##################################################################################################
    ## sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(('mSbjctToken', 'mSbjctStrand',
                              'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp(
            (x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.
             mSbjctGenomeTo), (y.mSbjctToken, y.mSbjctStrand, y.
                               mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##################################################################################################
    ## filter predictions and resolve conflicts based on genomic overlap
    ## deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()
    alignments = {}
    noverlaps = 0
    nredundants = 0

    nnew = 0
    last_prediction = None

    for this_prediction in old_predictions:
        try:
            this_query_peptide, this_query_status, this_query_gene, this_query_transcript = \
                                re.split("\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if not last_prediction:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  max(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)
        union   = max(last_prediction.mSbjctGenomeTo, this_prediction.mSbjctGenomeTo) - \
                  min(last_prediction.mSbjctGenomeFrom, this_prediction.mSbjctGenomeFrom)

        # resolve overlap between different genes
        if overlap > 0 and \
               (last_query_gene != this_query_gene or last_query_gene == None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:

                if peptide_sequences:
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken,
                                         this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken,
                                         last_prediction.mQueryToken)

                    if not alignments.has_key(key):
                        result.clear()
                        alignator.align(
                            result,
                            alignlib_lite.makeSequence(peptide_sequences[
                                this_prediction.mQueryToken]),
                            alignlib_lite.makeSequence(peptide_sequences[
                                last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                    else:
                        is_overlap = 0
                else:
                    is_overlap = 1
            else:
                is_overlap = 0
        else:
            is_overlap = 0

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - this_prediction.mQueryCoverage
            if float(abs(d1)) / float(last_prediction.mQueryCoverage
                                      ) < param_conflicts_min_difference:
                d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if float(abs(d2)) / float(
                    this_prediction.score) < param_conflicts_min_difference:
                d2 = 0
            if d1 >= 0 and d2 >= 0:
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        last_prediction.mPredictionId,
                        last_prediction.mQueryToken,
                        last_prediction.mSbjctGenomeFrom, overlap,
                        relative_overlap, str(this_prediction))
                if param_benchmarks:
                    if CheckBenchmark(this_prediction, last_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(last_prediction))

                removed_predictions.append(this_prediction)
                continue
            elif d1 <= 0 and d2 <= 0:
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (
                        this_prediction.mPredictionId,
                        this_prediction.mQueryToken,
                        this_prediction.mSbjctGenomeFrom, overlap,
                        relative_overlap, str(last_prediction))
                if param_benchmarks:
                    if CheckBenchmark(last_prediction, this_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (
                            overlap, relative_overlap, str(this_prediction))
                removed_predictions.append(last_prediction)
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                if param_loglevel >= 2:
                    print "# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom,
                           this_prediction.score, this_prediction.mQueryCoverage,
                           this_prediction.mPercentIdentity,
                           last_prediction.mPredictionId,
                           last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom,
                           last_prediction.score, last_prediction.mQueryCoverage,
                           last_prediction.mPercentIdentity)

        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    new_predictions.append(last_prediction)
    nnew += 1

    if param_loglevel >= 1:
        print "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
              (len(alignments), noverlaps, nredundants)

    return nnew
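
The overlap arithmetic and the coverage/score tie-break above can be illustrated on toy numbers. The names relative_overlap and pick_winner and the 0.1 threshold are invented for this sketch; min_difference stands in for param_conflicts_min_difference.

def relative_overlap(a_from, a_to, b_from, b_to):
    # percent overlap of two intervals relative to their union, as above
    overlap = min(a_to, b_to) - max(a_from, b_from)
    union = max(a_to, b_to) - min(a_from, b_from)
    return 100 * overlap // union if overlap > 0 else 0

def pick_winner(last, this, min_difference=0.1):
    # last/this are (coverage, score) pairs; small differences are zeroed out
    d1 = last[0] - this[0]
    if abs(d1) / float(last[0]) < min_difference:
        d1 = 0
    d2 = last[1] - this[1]
    if abs(d2) / float(this[1]) < min_difference:
        d2 = 0
    if d1 >= 0 and d2 >= 0:
        return "last"
    elif d1 <= 0 and d2 <= 0:
        return "this"
    return "both"                 # score and coverage disagree: keep both

assert relative_overlap(100, 200, 150, 250) == 33
assert pick_winner((90, 500.0), (60, 300.0)) == "last"
assert pick_winner((60, 300.0), (90, 500.0)) == "this"
assert pick_winner((90, 300.0), (60, 500.0)) == "both"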
Example #4
def ProcessRegion(predictions,
                  region_id,
                  region,
                  peptide_sequences=None,
                  filter_queries={}):
    """process a set of matches to a region.

    Resolve the region according to homology.
    """

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n"
        )
        options.stdlog.write("# resolving %i predictions in region %s\n" %
                             (len(predictions), str(region)))
        sys.stdout.flush()

    predictions.sort(lambda x, y: cmp(x.score, y.score))
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    cluster = []

    map_sequence2cluster = range(0, len(predictions))
    edges = []

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):
            if options.loglevel >= 5:
                options.stdlog.write(
                    "# filtering from %i with prediction %i: %s\n" %
                    (x, predictions[x].mPredictionId,
                     predictions[x].mQueryToken))
                sys.stdout.flush()

            if map_sequence2cluster[x] != x:
                continue

            region_id += 1
            edges = []

            if predictions[x].mQueryToken not in filter_queries:
                edges.append(predictions[x])
            else:
                nskipped += 1

            for y in range(x + 1, len(predictions)):

                if map_sequence2cluster[y] != y:
                    continue

                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken,
                                     predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken,
                                     predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic sequence
                if min(predictions[x].mSbjctGenomeTo,   predictions[y].mSbjctGenomeTo) - \
                   max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n"
                            % (predictions[x].mPredictionId,
                               predictions[y].mPredictionId))
                        sys.stdout.flush()
                    continue

                if not global_alignments.has_key(key):

                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]
                    result.clear()
                    s1 = alignlib_lite.makeSequence(seq1)
                    s2 = alignlib_lite.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    c1 = 100 * \
                        (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * \
                        (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib_lite.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are homologous
                    if result.getScore() >= options.overlap_min_score and \
                       max_cov >= options.overlap_max_coverage and \
                       min_cov >= options.overlap_min_coverage and \
                       identity >= options.overlap_min_identity:
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write(
                            "# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n"
                            % (key, result.getScore(), identity, c1, c2,
                               min_cov, max_cov, global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:
                        edges.append(predictions[y])
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
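
The map_sequence2cluster bookkeeping above is a greedy leader clustering: each unassigned prediction seeds a cluster and absorbs every later unassigned prediction that passes the pairwise homology test against the seed. A minimal sketch with a made-up predicate in place of the alignment test:

def cluster_by_leader(items, are_homologous):
    # owner[i] == i means item i still seeds its own cluster
    owner = list(range(len(items)))
    clusters = []
    for x in range(len(items)):
        if owner[x] != x:
            continue              # already absorbed by an earlier seed
        members = [items[x]]
        for y in range(x + 1, len(items)):
            if owner[y] != y:
                continue
            if are_homologous(items[x], items[y]):
                owner[y] = x
                members.append(items[y])
        clusters.append(members)
    return clusters

# toy predicate: same first letter counts as "homologous"
same_initial = lambda a, b: a[0] == b[0]
assert cluster_by_leader(["apple", "ant", "bee", "bat"], same_initial) == \
    [["apple", "ant"], ["bee", "bat"]]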
Example #5
def EliminateRedundantEntries(
    rep, data, eliminated_predictions, options, peptides, extended_peptides, filter_quality=None, this_quality=None
):
    """eliminate redundant entries in a set."""

    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull(alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = (
            entry.transcript_id,
            entry.mQueryCoverage,
            entry.mPid,
            entry.mQuality,
        )

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write("# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        if mem_id in eliminated_predictions:
            continue

        if mem_extended_seq == rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "i"))

        elif mem_extended_seq in rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append((mem_id, "p"))

        else:
            if mem_quality != this_quality or mem_quality in options.quality_exclude_same:

                seq1 = alignlib_lite.makeSequence(str(rep_seq))
                seq2 = alignlib_lite.makeSequence(str(mem_seq))

                alignator.align(result, seq1, seq2)

                if options.loglevel >= 5:
                    options.stdlog.write("# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit(result, seq1, seq2))

                pidentity = 100 * alignlib_lite.calculatePercentIdentity(result, seq1, seq2)

                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write(
                        "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n"
                        % (mem_id, mem_quality, pidentity, rep_coverage, mem_coverage)
                    )

                if pidentity >= options.min_identity:

                    keep = False
                    if rep_coverage < mem_coverage - options.safety_coverage or rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    elif num_gaps >= options.max_gaps and mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    elif (
                        mem_coverage >= rep_coverage - options.safety_coverage
                        and 100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage
                    ):
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write(
                            "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n"
                            % (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid)
                        )
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "h"))

                elif (
                    pidentity >= options.min_identity_non_genes
                    and this_quality in options.quality_genes
                    and mem_quality not in options.quality_genes
                ):
                    if rep_coverage < mem_coverage - options.safety_coverage or rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write(
                            "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n"
                            % (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid)
                        )
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append((mem_id, "l"))

    return eliminated
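
The first two branches above drop trivially redundant members before any alignment is run; a minimal sketch of just that step, with invented sequences (the codes "i" and "p" follow the function above; the alignment-based branches are omitted):

def classify_trivial_redundancy(rep_extended_seq, mem_extended_seq):
    if mem_extended_seq == rep_extended_seq:
        return "i"                # identical to the representative
    if mem_extended_seq in rep_extended_seq:
        return "p"                # contained in the representative
    return None                   # would need the alignment-based tests

assert classify_trivial_redundancy("MKTAYIAKQR", "MKTAYIAKQR") == "i"
assert classify_trivial_redundancy("MKTAYIAKQR", "TAYIAK") == "p"
assert classify_trivial_redundancy("MKTAYIAKQR", "WWWW") is None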
Example #6
def PrintCluster(cluster,
                 cluster_id,
                 lengths,
                 peptide_sequences=None,
                 regex_preferred=None):
    """print a cluster.

    Take the longest sequence as representative. If regex_preferred is given,
    prefer the longest sequence whose identifier matches it.
    """

    if regex_preferred:
        rx = re.compile(regex_preferred)
    else:
        rx = None

    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = 0
        if c in lengths:
            l = lengths[c]

        if l > max_al:
            max_al = l
            rep_a = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    if max_pl > 0:
        max_l = max_pl
        rep = rep_p
    else:
        max_l = max_al
        rep = rep_a

    for mem in cluster:
        l = 0
        if mem in lengths:
            l = lengths[mem]
        if peptide_sequences:
            map_rep2mem = alignlib_lite.makeAlignmentVector()

            if rep == mem and rep in lengths:
                alignlib_lite.addDiagonal2Alignment(
                    map_rep2mem, 1, lengths[rep], 0)
            elif mem in peptide_sequences and \
                    rep in peptide_sequences:
                alignator = alignlib_lite.makeAlignatorDPFull(
                    alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align(map_rep2mem,
                                alignlib_lite.makeSequence(
                                    peptide_sequences[rep]),
                                alignlib_lite.makeSequence(peptide_sequences[mem]))

            f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem)
            print string.join(map(str, (rep, mem, l, f)), "\t")

        else:
            print string.join(map(str, (rep, mem, l)), "\t")

    sys.stdout.flush()

    return cluster_id
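
The representative choice at the top of the function can be read as: take the longest member whose identifier matches regex_preferred, and fall back to the longest member overall if none match. A self-contained sketch with invented identifiers and lengths:

import re

def pick_representative(cluster, lengths, regex_preferred=None):
    rx = re.compile(regex_preferred) if regex_preferred else None
    rep_a, max_al = None, 0       # longest member overall
    rep_p, max_pl = None, 0       # longest member matching the preferred pattern
    for c in cluster:
        l = lengths.get(c, 0)
        if l > max_al:
            rep_a, max_al = c, l
        if rx and rx.search(c) and l > max_pl:
            rep_p, max_pl = c, l
    return rep_p if max_pl > 0 else rep_a

lengths = {"ENSG1": 300, "SCAFF9": 450, "ENSG2": 200}
assert pick_representative(lengths.keys(), lengths) == "SCAFF9"
assert pick_representative(lengths.keys(), lengths, "^ENSG") == "ENSG1"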
Example #7
def FilterConflicts(old_predictions, new_predictions, removed_predictions,
                    min_overlap, peptide_sequences):
    """remove conflicts.

    Remove overlapping entries between different queries.

    Only remove those sequences that are alignable.

    If they are alignable, take the sequence with the highest score and highest
    coverage (take both if score and coverage are not correlated).
    """
    ##########################################################################
    # sort predictions by genomic region
    if isinstance(old_predictions, PredictionFile.PredictionFile):
        old_predictions.sort(
            ('mSbjctToken', 'mSbjctStrand', 'mSbjctGenomeFrom', 'mSbjctGenomeTo'))
    else:
        old_predictions.sort(lambda x, y: cmp((x.mSbjctToken, x.mSbjctStrand, x.mSbjctGenomeFrom, x.mSbjctGenomeTo),
                                              (y.mSbjctToken, y.mSbjctStrand, y.mSbjctGenomeFrom, y.mSbjctGenomeTo)))

    ##########################################################################
    # filter predictions and resolve conflicts based on genomic overlap
    # deleted segments are put in a temporary storage space.
    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    result = alignlib_lite.makeAlignmentVector()
    alignments = {}
    noverlaps = 0
    nredundants = 0

    nnew = 0
    last_prediction = None

    for this_prediction in old_predictions:
        try:
            (this_query_peptide, this_query_status, this_query_gene,
             this_query_transcript) = \
                re.split("\s+", this_prediction.mQueryToken)
        except ValueError:
            this_query_gene = None

        if not last_prediction:
            last_prediction = this_prediction
            last_query_gene = this_query_gene
            continue

        overlap = min(last_prediction.mSbjctGenomeTo,
                      this_prediction.mSbjctGenomeTo) -\
            max(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)
        union = max(last_prediction.mSbjctGenomeTo,
                    this_prediction.mSbjctGenomeTo) -\
            min(last_prediction.mSbjctGenomeFrom,
                this_prediction.mSbjctGenomeFrom)

        # resolve overlap between different genes
        if overlap > 0 and \
                (last_query_gene != this_query_gene or
                 last_query_gene is None):

            noverlaps += 1
            relative_overlap = 100 * overlap / union

            # Start conflict resolution, if overlap is above threshold.
            # Keep higher scoring segment.
            #
            # Check if queries are homologous.
            if relative_overlap >= param_max_percent_overlap:

                if peptide_sequences:
                    if last_prediction.mQueryToken < this_prediction.mQueryToken:
                        key = "%s-%s" % (last_prediction.mQueryToken,
                                         this_prediction.mQueryToken)
                    else:
                        key = "%s-%s" % (this_prediction.mQueryToken,
                                         last_prediction.mQueryToken)

                    if not alignments.has_key(key):
                        result.clear()
                        alignator.align(result,
                                        alignlib_lite.makeSequence(
                                            peptide_sequences[this_prediction.mQueryToken]),
                                        alignlib_lite.makeSequence(peptide_sequences[last_prediction.mQueryToken]))
                        alignments[key] = result.getScore()
                        if result.getScore() >= param_min_score_overlap:
                            nredundants += 1

                    if alignments[key] >= param_min_score_overlap:
                        is_overlap = 1
                    else:
                        is_overlap = 0
                else:
                    is_overlap = 1
            else:
                is_overlap = 0
        else:
            is_overlap = 0

        if is_overlap:
            # take best prediction. If difference is very small, set
            # difference to 0 (difference does not matter). In this case,
            # the first prediction is taken.
            d1 = last_prediction.mQueryCoverage - \
                this_prediction.mQueryCoverage
            if float(abs(d1)) / float(last_prediction.mQueryCoverage) < param_conflicts_min_difference:
                d1 = 0
            d2 = last_prediction.score - this_prediction.score
            if float(abs(d2)) / float(this_prediction.score) < param_conflicts_min_difference:
                d2 = 0
            if d1 >= 0 and d2 >= 0:
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (last_prediction.mPredictionId,
                                                                                          last_prediction.mQueryToken,
                                                                                          last_prediction.mSbjctGenomeFrom,
                                                                                          overlap, relative_overlap,
                                                                                          str(this_prediction))
                if param_benchmarks:
                    if CheckBenchmark(this_prediction, last_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (overlap, relative_overlap,
                                                                               str(last_prediction))

                removed_predictions.append(this_prediction)
                continue
            elif d1 <= 0 and d2 <= 0:
                if param_loglevel >= 2:
                    print "# CONFLICT: kept %i(%s-%i), overlap=%i(%5.2f), removed: %s" % (this_prediction.mPredictionId,
                                                                                          this_prediction.mQueryToken,
                                                                                          this_prediction.mSbjctGenomeFrom,
                                                                                          overlap, relative_overlap,
                                                                                          str(last_prediction))
                if param_benchmarks:
                    if CheckBenchmark(last_prediction, this_prediction):
                        print "# BENCHMARK KEPT with overlap=%i(%5.2f): %s" % (overlap, relative_overlap,
                                                                               str(this_prediction))
                removed_predictions.append(last_prediction)
                last_prediction = this_prediction
                last_query_gene = this_query_gene
                continue
            else:
                if param_loglevel >= 2:
                    print "# CONFLICT: non-correlated score/coverage. Keeping both %i(%s-%i) (%5.2f/%i/%i) and %i(%s-%i) (%5.2f/%i/%i)" % \
                          (this_prediction.mPredictionId,
                           this_prediction.mQueryToken, this_prediction.mSbjctGenomeFrom,
                           this_prediction.score, this_prediction.mQueryCoverage,
                           this_prediction.mPercentIdentity,
                           last_prediction.mPredictionId,
                           last_prediction.mQueryToken, last_prediction.mSbjctGenomeFrom,
                           last_prediction.score, last_prediction.mQueryCoverage,
                           last_prediction.mPercentIdentity)

        new_predictions.append(last_prediction)
        nnew += 1
        last_query_gene = this_query_gene
        last_prediction = this_prediction

    new_predictions.append(last_prediction)
    nnew += 1

    if param_loglevel >= 1:
        print "# calculated %i alignments for %i potential conflicts (%i above threshold)" % \
              (len(alignments), noverlaps, nredundants)

    return nnew
Example #8
def GetOrthologTranscripts(transcripts1, peptides1, cds1, transcripts2,
                           peptides2, cds2):
    """sort out ortholog relationships between
    transcripts of orthologous genes.

    Orthologs have:
        the same number of exons
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal best hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first would result in a suboptimal match, because one transcript
    is longer than the other, while matching up 0-0 and 2-1 would be better.

    Objective function: this is the maximal matching/assignment problem. Use a greedy
    implementation instead: assign as much as possible according to descending weights.
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = map(lambda x: alignlib_lite.makeSequence(peptides1[x[0]]),
                transcripts1)
    seqs2 = map(lambda x: alignlib_lite.makeSequence(peptides2[x[0]]),
                transcripts2)

    if param_loglevel >= 4:
        print "# building sequence 1"
    for i in range(len(seqs1)):
        if not cds1.has_key(transcripts1[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# building sequence 2"

    for i in range(len(seqs2)):
        if not cds2.has_key(transcripts2[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# all-vs-all alignment"

    # do all versus all alignment
    alis1 = []
    alis2 = []
    for i in range(len(seqs1)):
        alis1.append([])
    for i in range(len(seqs2)):
        alis2.append([])

    if param_loglevel >= 3:

        print "#################################"

        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print "#", str(cd)
        print "# versus"
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print "#", str(cd)
        sys.stdout.flush()

    weights = {}
    for i in range(len(seqs1)):
        prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[
            i]

        for j in range(len(seqs2)):
            prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[
                j]
            map_a2b = alignlib_lite.makeAlignmentVector()

            m = seqs1[i].getLength() * seqs2[j].getLength()

            if param_loglevel >= 3:
                print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\
                      (i, j, prediction_id1, seqs1[
                       i].getLength(), prediction_id2, seqs2[j].getLength())
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (
                        seqs1[i].getLength(), seqs2[j].getLength())
                    sys.stdout.flush()

                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \
                seqs1[i].getLength()
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \
                seqs2[j].getLength()

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(
                cds1[prediction_id1], (map_a2b.getRowFrom() - 1) * 3,
                (map_a2b.getRowTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(
                cds2[prediction_id2], (map_a2b.getColFrom() - 1) * 3,
                (map_a2b.getColTo()) * 3 + 1,
                full=False,
                min_overlap=param_min_alignment_exon_overlap,
                min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon
            def MyMap(a, x):
                while x <= a.getRowTo():
                    c = a.mapRowToCol(x)
                    if c:
                        return c
                    x += 1
                else:
                    return 0

            mapped_boundaries = map(
                lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:])
            mapped_boundaries.sort()
            reference_boundaries = map(lambda x: x.mPeptideFrom / 3 + 1,
                                       c2[1:])
            reference_boundaries.sort()

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries,
                param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries,
                param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if (len(c1) == 1 and len(c2) == 1):
                is_ok = 1
            else:
                # allow for missed boundaries, if param_boundaries_allow_missed
                # > 0
                if min_nmissed == 0:
                    is_ok = 1
                else:
                    if param_boundaries_allow_missed and \
                            len(mapped_boundaries) >= param_boundaries_allow_missed and \
                            min_nmissed <= param_boundaries_max_missed:
                        is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            if abs(len(c1) - len(c2)) != 0:
                if param_missing_max_missing:
                    if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or
                        (min(len(c1), len(c2)) < param_missing_min_present)):
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \
                      "boundaries_ok=", is_ok, \
                      "nexons_ok=", is_ok_exons, \
                      "missed_c2r=", nmissed_cmp2ref, \
                      "missed_r2c=", nmissed_ref2cmp, \
                      "min_cov=", cc, \
                      "mapped=", mapped_boundaries, \
                      "reference=", reference_boundaries

                print "#", string.join(
                    map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                              map_a2b.getNumGaps(), coverage_a, coverage_b)),
                    "\t")
                sys.stdout.flush()

            # dump out pairs
            for method in param_write_pairs:
                if method == "all":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             sbjct_token1, sbjct_strand1, sbjct_from1,
                             sbjct_to1, seqs1[i].getLength(), sbjct_token2,
                             sbjct_strand2, sbjct_from2, sbjct_to2,
                             seqs2[j].getLength(), map_a2b.getRowFrom(),
                             map_a2b.getRowTo(), row_ali, map_a2b.getColFrom(),
                             map_a2b.getColTo(), col_ali, map_a2b.getNumGaps(),
                             coverage_a, coverage_b, nmissed_cmp2ref,
                             mapped_boundaries, nmissed_ref2cmp,
                             reference_boundaries, i, j, len(c1), len(c2), cc,
                             is_ok, is_ok_exons, is_ok_coverage)), "\t")
                elif method == "alignment":
                    print string.join(
                        map(str,
                            ("pair", method, prediction_id1, prediction_id2,
                             map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                             map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                             map_a2b.getNumGaps(), coverage_a, coverage_b)),
                        "\t")
                elif method == "location":
                    print string.join(
                        map(str, ("pair", method, prediction_id1,
                                  prediction_id2, sbjct_token1, sbjct_strand1,
                                  sbjct_from1, sbjct_to1, seqs1[i].getLength(),
                                  sbjct_token2, sbjct_strand2, sbjct_from2,
                                  sbjct_to2, seqs2[j].getLength())), "\t")
            if not is_ok_exons:
                if param_loglevel >= 4:
                    print "# rejected %i and %i: too many exons difference." % (
                        i, j)
                continue

            if param_check_exon_boundaries:
                if not is_ok:
                    continue

            if cc < param_min_coverage:
                continue

            if not weights.has_key(cc):
                weights[cc] = []

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))

            weights[cc].append((i, j, map_a2b))

    # sort out alignments
    ww = weights.keys()
    ww.sort()
    ww.reverse()

    pairs = []
    assigned1 = {}
    assigned2 = {}

    if param_loglevel >= 3:
        print "# alis1=", alis1
        print "# alis2=", alis2
        print "# --------------------------------------"

    for w in ww:
        for i, j, map_a2b in weights[w]:
            if not assigned1.has_key(i) and not assigned2.has_key(j):
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1[i] = 1
                assigned2[j] = 1
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
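
The greedy assignment described in the docstring (and implemented in the final loop over ww above) can be sketched on its own. greedy_pairs and the edge list below are invented for illustration; ties are broken here by input order, whereas the function above iterates weights.keys() in whatever order the dictionary yields.

def greedy_pairs(weighted_edges):
    # weighted_edges: (weight, i, j) triples; i indexes transcripts1, j transcripts2
    pairs, used1, used2 = [], set(), set()
    for weight, i, j in sorted(weighted_edges, key=lambda e: e[0], reverse=True):
        if i in used1 or j in used2:
            continue
        pairs.append((i, j, weight))
        used1.add(i)
        used2.add(j)
    return pairs

# the docstring's example: taking 1-0 first would block the better 0-0 / 2-1 pairing
edges = [(100, 0, 0), (94, 1, 0), (100, 2, 1)]
assert greedy_pairs(edges) == [(0, 0, 100), (2, 1, 100)]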
Example #9
    elif param_mode == "transcripts":

        transcript_ids1 = {}
        transcript_ids2 = {}
        for x, y in pairs:
            transcript_ids1[x] = 1
            transcript_ids2[y] = 1

        transcripts1, transcripts2, cds1, cds2 = ReadTranscriptsAndCds(
            transcript_ids1, transcript_ids2)

        if param_loglevel >= 1:
            print "# reading has finished."
            sys.stdout.flush()

        alignator = alignlib_lite.makeAlignatorDPFull(
            alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

        for q1, q2 in pairs:

            ninput += 1

            if param_loglevel >= 1:
                print "# processing %s and %s" % (q1, q2)

            if q1 in transcripts1 and q2 in transcripts2:
                map_a2b = alignlib_lite.makeAlignmentVector()

                alignator.align(map_a2b,
                                alignlib_lite.makeSequence(peptides1[q1]),
                                alignlib_lite.makeSequence(peptides2[q2]))
Example #10
	nintrons, 
	nsplits, 
	nstopcodons, 
	pidentity, 
	psimilarity, 
	sequence, 
	sbjct_genome_from, 
	sbjct_genome_to, 
	map_query2genome
    FROM %s AS p 
    WHERE p.sbjct_token = '%s' AND
    p.sbjct_strand = '%s' AND 
    OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement % (param_tablename_predictions_target,
                                reference.mSbjctToken, reference.mSbjctStrand,
                                reference.mSbjctGenomeFrom, reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
                                                0,
Example #11
def PrintCluster(cluster,
                 cluster_id,
                 lengths,
                 peptide_sequences=None,
                 regex_preferred=None):
    """print a cluster.

    Take the longest sequence as representative. If regex_preferred is given,
    prefer the longest sequence whose identifier matches it.
    """

    if regex_preferred:
        rx = re.compile(regex_preferred)
    else:
        rx = None

    max_al = 0
    max_pl = 0
    rep_a = None
    rep_p = None
    for c in cluster:
        l = 0
        if c in lengths: l = lengths[c]

        if l > max_al:
            max_al = l
            rep_a = c

        if rx and rx.search(c) and l > max_pl:
            max_pl = l
            rep_p = c

    if max_pl > 0:
        max_l = max_pl
        rep = rep_p
    else:
        max_l = max_al
        rep = rep_a

    for mem in cluster:
        l = 0
        if mem in lengths: l = lengths[mem]
        if peptide_sequences:
            map_rep2mem = alignlib_lite.makeAlignmentVector()

            if rep == mem and rep in lengths:
                alignlib_lite.addDiagonal2Alignment(map_rep2mem, 1,
                                                    lengths[rep], 0)
            elif mem in peptide_sequences and \
                     rep in peptide_sequences:
                alignator = alignlib_lite.makeAlignatorDPFull(
                    alignlib_lite.ALIGNMENT_LOCAL, -10.0, -1.0)
                alignator.align(
                    map_rep2mem,
                    alignlib_lite.makeSequence(peptide_sequences[rep]),
                    alignlib_lite.makeSequence(peptide_sequences[mem]))

            f = alignlib_lite.AlignmentFormatEmissions(map_rep2mem)
            print string.join(map(str, (rep, mem, l, f)), "\t")

        else:
            print string.join(map(str, (rep, mem, l)), "\t")

    sys.stdout.flush()

    return cluster_id
Example #12
def GetOrthologTranscripts(transcripts1, peptides1, cds1,
                           transcripts2, peptides2, cds2):
    """sort out ortholog relationships between
    transcripts of orthologous genes.

    Orthologs have:
        the same number of exons
        compatible intron/exon boundaries

    For the remaining transcript pairs, take reciprocal best hits.

    I see the following:
    0: 0(100%), 1: 0(94%), 2: 0,1(100%)
    0: 0(100%), 1: 0,1,2(100%)

    Selecting 1-0 first would result in a suboptimal match, because one transcript
    is longer than the other, while matching up 0-0 and 2-1 would be better.

    Objective function: this is the maximal matching/assignment problem. Use a greedy
    implementation instead: assign as much as possible according to descending weights.
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

    # for long sequence: use dot alignment with tuple size of three
    dottor = alignlib_lite.makeAlignatorTuples(3)
    alignator_dots = alignlib_lite.makeAlignatorDotsSquared(
        param_gop, param_gep, dottor)

    seqs1 = map(lambda x: alignlib_lite.makeSequence(
        peptides1[x[0]]), transcripts1)
    seqs2 = map(lambda x: alignlib_lite.makeSequence(
        peptides2[x[0]]), transcripts2)

    if param_loglevel >= 4:
        print "# building sequence 1"
    for i in range(len(seqs1)):
        if not cds1.has_key(transcripts1[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# building sequence 2"

    for i in range(len(seqs2)):
        if not cds2.has_key(transcripts2[i][0]):
            if param_loglevel >= 4:
                print "# %s not found" % transcripts1[i][0]

    if param_loglevel >= 4:
        print "# all-vs-all alignment"

    # do all versus all alignment
    alis1 = []
    alis2 = []
    for i in range(len(seqs1)):
        alis1.append([])
    for i in range(len(seqs2)):
        alis2.append([])

    if param_loglevel >= 3:

        print "#################################"

        for i in range(len(seqs1)):
            for cd in cds1[transcripts1[i][0]]:
                print "#", str(cd)
        print "# versus"
        for i in range(len(seqs2)):
            for cd in cds2[transcripts2[i][0]]:
                print "#", str(cd)
        sys.stdout.flush()

    weights = {}
    for i in range(len(seqs1)):
        prediction_id1, sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1 = transcripts1[
            i]

        for j in range(len(seqs2)):
            prediction_id2, sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2 = transcripts2[
                j]
            map_a2b = alignlib_lite.makeAlignmentVector()

            m = seqs1[i].getLength() * seqs2[j].getLength()

            if param_loglevel >= 3:
                print "# Starting alignment of pair (%i,%i) of lengths %s:%i and %s:%i" %\
                      (i, j, prediction_id1, seqs1[
                       i].getLength(), prediction_id2, seqs2[j].getLength())
                sys.stdout.flush()

            if m > param_max_matrix_size:
                # switch to tuple alignment if sequences are too large
                if param_loglevel >= 2:
                    print "# WARNING: sequences are of length %i and %i: switching to dot alignment." % (seqs1[i].getLength(), seqs2[j].getLength())
                    sys.stdout.flush()

                alignator_dots.align(map_a2b, seqs1[i], seqs2[j])
            else:
                alignator.align(map_a2b, seqs1[i], seqs2[j])

            coverage_a = 100.0 * \
                (map_a2b.getRowTo() - map_a2b.getRowFrom() + 1) / \
                seqs1[i].getLength()
            coverage_b = 100.0 * \
                (map_a2b.getColTo() - map_a2b.getColFrom() + 1) / \
                seqs2[j].getLength()

            # get copy of cds, but only those overlapping with alignment
            c1 = Exons.GetExonsRange(cds1[prediction_id1],
                                     (map_a2b.getRowFrom() - 1) * 3,
                                     (map_a2b.getRowTo()) * 3 + 1,
                                     full=False,
                                     min_overlap=param_min_alignment_exon_overlap,
                                     min_exon_size=param_min_exon_size)
            c2 = Exons.GetExonsRange(cds2[prediction_id2],
                                     (map_a2b.getColFrom() - 1) * 3,
                                     (map_a2b.getColTo()) * 3 + 1,
                                     full=False,
                                     min_overlap=param_min_alignment_exon_overlap,
                                     min_exon_size=param_min_exon_size)

            # check exon boundaries, look at starts, skip first exon
            def MyMap(a, x):
                while x <= a.getRowTo():
                    c = a.mapRowToCol(x)
                    if c:
                        return c
                    x += 1
                else:
                    return 0

            mapped_boundaries = map(
                lambda x: MyMap(map_a2b, x.mPeptideFrom / 3 + 1), c1[1:])
            mapped_boundaries.sort()
            reference_boundaries = map(
                lambda x: x.mPeptideFrom / 3 + 1, c2[1:])
            reference_boundaries.sort()

            nmissed_cmp2ref = Exons.CountMissedBoundaries(
                mapped_boundaries, reference_boundaries, param_boundaries_max_slippage)
            nmissed_ref2cmp = Exons.CountMissedBoundaries(
                reference_boundaries, mapped_boundaries, param_boundaries_max_slippage)

            min_nmissed = min(nmissed_cmp2ref, nmissed_ref2cmp)

            # set is_ok for the whole thing
            # no intron: is ok
            is_ok = 0
            if (len(c1) == 1 and len(c2) == 1):
                is_ok = 1
            else:
                # allow for missed boundaries, if param_boundaries_allow_missed
                # > 0
                if min_nmissed == 0:
                    is_ok = 1
                else:
                    if param_boundaries_allow_missed and \
                            len(mapped_boundaries) >= param_boundaries_allow_missed and \
                            min_nmissed <= param_boundaries_max_missed:
                        is_ok = 1

            cc = min(coverage_a, coverage_b)
            if cc >= param_min_coverage:
                is_ok_coverage = 1
            else:
                is_ok_coverage = 0

            # check for missing introns
            is_ok_exons = 1
            if abs(len(c1) - len(c2)) != 0:
                if param_missing_max_missing:
                    if ((abs(len(c1) - len(c2)) > param_missing_max_missing) or
                            (min(len(c1), len(c2)) < param_missing_min_present)):
                        is_ok_exons = 0
                else:
                    is_ok_exons = 0

            if param_loglevel >= 3:
                print "# i=", i, "li=", len(c1), "j=", j, "lj=", len(c2), \
                      "boundaries_ok=", is_ok, \
                      "nexons_ok=", is_ok_exons, \
                      "missed_c2r=", nmissed_cmp2ref, \
                      "missed_r2c=", nmissed_ref2cmp, \
                      "min_cov=", cc, \
                      "mapped=", mapped_boundaries, \
                      "reference=", reference_boundaries

                print "#", string.join(map(str, (alignlib_lite.AlignmentFormatEmissions(map_a2b),
                                                 map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t")
                sys.stdout.flush()

            # dump out pairs
            for method in param_write_pairs:
                if method == "all":
                    print string.join(map(str, (
                        "pair", method,
                        prediction_id1,
                        prediction_id2,
                        sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[
                            i].getLength(),
                        sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[
                            j].getLength(),
                        map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                        map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                        map_a2b.getNumGaps(), coverage_a, coverage_b,
                        nmissed_cmp2ref, mapped_boundaries,
                        nmissed_ref2cmp, reference_boundaries,
                        i, j, len(c1), len(c2), cc, is_ok, is_ok_exons, is_ok_coverage)), "\t")
                elif method == "alignment":
                    print string.join(map(str, (
                        "pair", method,
                        prediction_id1, prediction_id2,
                        map_a2b.getRowFrom(), map_a2b.getRowTo(), row_ali,
                        map_a2b.getColFrom(), map_a2b.getColTo(), col_ali,
                        map_a2b.getNumGaps(), coverage_a, coverage_b)), "\t")
                elif method == "location":
                    print string.join(map(str, (
                        "pair", method,
                        prediction_id1,
                        prediction_id2,
                        sbjct_token1, sbjct_strand1, sbjct_from1, sbjct_to1, seqs1[
                            i].getLength(),
                        sbjct_token2, sbjct_strand2, sbjct_from2, sbjct_to2, seqs2[j].getLength())), "\t")
            if not is_ok_exons:
                if param_loglevel >= 4:
                    print "# rejected %i and %i: too many exons difference." % (i, j)
                continue

            if param_check_exon_boundaries:
                if not is_ok:
                    continue

            if cc < param_min_coverage:
                continue

            if not weights.has_key(cc):
                weights[cc] = []

            alis1[i].append((coverage_a, j))
            alis2[j].append((coverage_b, i))

            weights[cc].append((i, j, map_a2b))

    # sort out alignments
    ww = weights.keys()
    ww.sort()
    ww.reverse()

    pairs = []
    assigned1 = {}
    assigned2 = {}

    if param_loglevel >= 3:
        print "# alis1=", alis1
        print "# alis2=", alis2
        print "# --------------------------------------"

    for w in ww:
        for i, j, map_a2b in weights[w]:
            if not assigned1.has_key(i) and not assigned2.has_key(j):
                pairs.append((transcripts1[i], transcripts2[j], w, map_a2b))
                assigned1[i] = 1
                assigned2[j] = 1
        if len(assigned1) == len(transcripts1):
            break
        if len(assigned2) == len(transcripts2):
            break

    return pairs
    elif param_mode == "transcripts":

        transcript_ids1 = {}
        transcript_ids2 = {}
        for x, y in pairs:
            transcript_ids1[x] = 1
            transcript_ids2[y] = 1

        transcripts1, transcripts2, cds1, cds2 = ReadTranscriptsAndCds(
            transcript_ids1, transcript_ids2)

        if param_loglevel >= 1:
            print "# reading has finished."
            sys.stdout.flush()

        alignator = alignlib_lite.makeAlignatorDPFull(
            alignlib_lite.ALIGNMENT_LOCAL, -10.0, -2.0)

        for q1, q2 in pairs:

            ninput += 1

            if param_loglevel >= 1:
                print "# processing %s and %s" % (q1, q2)

            if q1 in transcripts1 and q2 in transcripts2:
                map_a2b = alignlib_lite.makeAlignmentVector()

                alignator.align(map_a2b,
                                alignlib_lite.makeSequence(peptides1[q1]),
                                alignlib_lite.makeSequence(peptides2[q2]))
Example #14
def ProcessRegion(predictions, region_id, region,
                  peptide_sequences=None,
                  filter_queries={}):
    """process a set of matches to a region.

    Resolve the region according to homology.
    """

    if options.loglevel >= 3:
        options.stdlog.write(
            "###################################################################\n")
        options.stdlog.write(
            "# resolving %i predictions in region %s\n" % (len(predictions), str(region)))
        sys.stdout.flush()

    predictions.sort(lambda x, y: cmp(x.score, y.score))
    predictions.reverse()

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep)
    result = alignlib_lite.makeAlignmentVector()

    cluster = []

    map_sequence2cluster = range(0, len(predictions))
    edges = []

    noutput, nskipped = 0, 0

    if peptide_sequences:
        for x in range(len(predictions)):
            if options.loglevel >= 5:
                options.stdlog.write("# filtering from %i with prediction %i: %s\n" % (
                    x, predictions[x].mPredictionId, predictions[x].mQueryToken))
                sys.stdout.flush()

            if map_sequence2cluster[x] != x:
                continue

            region_id += 1
            edges = []

            if predictions[x].mQueryToken not in filter_queries:
                edges.append(predictions[x])
            else:
                nskipped += 1

            for y in range(x + 1, len(predictions)):

                if map_sequence2cluster[y] != y:
                    continue

                if predictions[x].mQueryToken < predictions[y].mQueryToken:
                    key = "%s-%s" % (predictions[x].mQueryToken,
                                     predictions[y].mQueryToken)
                else:
                    key = "%s-%s" % (predictions[y].mQueryToken,
                                     predictions[x].mQueryToken)

                # check if predictions are overlapping on the genomic sequence
                if min(predictions[x].mSbjctGenomeTo,   predictions[y].mSbjctGenomeTo) - \
                   max(predictions[x].mSbjctGenomeFrom, predictions[y].mSbjctGenomeFrom) < 0:
                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment of predictions %i and %i: no overlap on genomic sequence, thus skipped\n" %
                                             (predictions[x].mPredictionId,
                                              predictions[y].mPredictionId))
                        sys.stdout.flush()
                    continue

                if not global_alignments.has_key(key):

                    seq1 = peptide_sequences[predictions[x].mQueryToken]
                    seq2 = peptide_sequences[predictions[y].mQueryToken]
                    result.clear()
                    s1 = alignlib_lite.makeSequence(seq1)
                    s2 = alignlib_lite.makeSequence(seq2)
                    alignator.align(result, s1, s2)

                    c1 = 100 * \
                        (result.getRowTo() - result.getRowFrom()) / len(seq1)
                    c2 = 100 * \
                        (result.getColTo() - result.getColFrom()) / len(seq2)
                    min_cov = min(c1, c2)
                    max_cov = max(c1, c2)

                    identity = alignlib_lite.calculatePercentIdentity(
                        result, s1, s2) * 100

                    # check if predictions overlap and they are homologous
                    if result.getScore() >= options.overlap_min_score and \
                       max_cov >= options.overlap_max_coverage and \
                       min_cov >= options.overlap_min_coverage and \
                       identity >= options.overlap_min_identity:
                        global_alignments[key] = True
                    else:
                        global_alignments[key] = False

                    if options.loglevel >= 4:
                        options.stdlog.write("# alignment=%s score=%i pid=%5.2f c1=%i c2=%i min_cov=%i max_cov=%i homolog=%s\n" %
                                             (key,
                                              result.getScore(),
                                              identity,
                                              c1, c2, min_cov, max_cov,
                                              global_alignments[key]))
                        sys.stdout.flush()

                if global_alignments[key]:
                    map_sequence2cluster[y] = x
                    if predictions[y].mQueryToken not in filter_queries:
                        edges.append(predictions[y])
                    else:
                        nskipped += 1

            noutput += PrintEdges(region_id, region, edges)

    return region_id, noutput, nskipped
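
# A toy sketch of the clustering idea in ProcessRegion(): predictions are
# visited best-score first, and every later, still-unassigned item that passes
# the homology test against the current one is folded into its cluster through
# a representative map. The is_homologous callback stands in for the alignment
# test above; all names here are illustrative.
def cluster_by_representative(items, is_homologous):
    representative = list(range(len(items)))  # each item starts as its own cluster
    clusters = []
    for x in range(len(items)):
        if representative[x] != x:
            continue                          # already absorbed by an earlier cluster
        members = [items[x]]
        for y in range(x + 1, len(items)):
            if representative[y] != y:
                continue
            if is_homologous(items[x], items[y]):
                representative[y] = x
                members.append(items[y])
        clusters.append(members)
    return clusters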
Example #15
0
	nintrons, 
	nsplits, 
	nstopcodons, 
	pidentity, 
	psimilarity, 
	sequence, 
	sbjct_genome_from, 
	sbjct_genome_to, 
	map_query2genome
    FROM %s AS p 
    WHERE p.sbjct_token = '%s' AND
    p.sbjct_strand = '%s' AND 
    OVERLAP( %i, %i, p.sbjct_genome_from, sbjct_genome_to) > 0 
    """

    alignator = alignlib_lite.makeAlignatorDPFull(
        alignlib_lite.ALIGNMENT_LOCAL, param_gop, param_gep)
    map_reference2target = alignlib_lite.makeAlignmentVector()
    assignment_id = 0

    for line in cr.fetchall():

        reference = PredictionParser.PredictionParserEntry()
        reference.FillFromTable(line)

        ct = dbhandle.cursor()
        ct.execute(statement %
                   (param_tablename_predictions_target, reference.mSbjctToken,
                    reference.mSbjctStrand, reference.mSbjctGenomeFrom,
                    reference.mSbjctGenomeTo))

        reference_exons = Exons.Alignment2Exons(reference.mMapPeptide2Genome,
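
# For reference, a plain-Python analogue of the range test that the OVERLAP()
# call in the SQL statement above appears to perform (and that Example #14
# computes inline with min()/max()): the value is positive only when the two
# genomic intervals share at least one position.
def interval_overlap(from1, to1, from2, to2):
    return min(to1, to2) - max(from1, from2)

# interval_overlap(100, 200, 150, 250) ->  50  (overlapping)
# interval_overlap(100, 200, 300, 400) -> -100 (disjoint)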
Example #16
0
def EliminateRedundantEntries( rep, 
                               data,
                               eliminated_predictions,
                               options, 
                               peptides,
                               extended_peptides,
                               filter_quality = None,
                               this_quality = None ):
    """eliminate redundant entries in a set."""
    
    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL, options.gop, options.gep )
    result = alignlib_lite.makeAlignmentVector()
    
    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = ( entry.transcript_id,
                                                       entry.mQueryCoverage,
                                                       entry.mPid,
                                                       entry.mQuality )

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s\n" % (mem_id, mem_quality))
            
        if mem_id in eliminated_predictions: continue

        if mem_extended_seq == rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append( (mem_id, "i") )

        elif mem_extended_seq in rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append( (mem_id, "p") )

        else:
            if mem_quality != this_quality or \
                   mem_quality in options.quality_exclude_same:
          
                seq1 = alignlib_lite.makeSequence( str(rep_seq) )
                seq2 = alignlib_lite.makeSequence( str(mem_seq) )            

                alignator.align( result, seq1, seq2 )

                if options.loglevel >= 5:
                    options.stdlog.write( "# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit( result, seq1, seq2 ) )
                
                pidentity = 100 * alignlib_lite.calculatePercentIdentity( result, seq1, seq2 )
                
                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write( "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %\
                                              ( mem_id, mem_quality, pidentity, rep_coverage, mem_coverage ) )
                    
                if pidentity >= options.min_identity:

                    keep = False
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                       rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    elif num_gaps >= options.max_gaps and \
                         mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    elif mem_coverage >= rep_coverage - options.safety_coverage and \
                             100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write( "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                              (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
                    else:
                        eliminated_predictions[mem_id] = rep_id                
                        eliminated.append( (mem_id, "h") )
                        
                elif pidentity >= options.min_identity_non_genes and \
                         this_quality in options.quality_genes and \
                         mem_quality not in options.quality_genes:
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                       rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write( "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                              (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
                    else:
                        eliminated_predictions[mem_id] = rep_id                
                        eliminated.append( (mem_id, "l") )

    return eliminated
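
# A distilled, stand-alone sketch of the identity/coverage trade-off used for
# the "h" eliminations above (ignoring the extra gap- and member-coverage
# branches). The default thresholds are illustrative, not the script's options.
def is_redundant(pidentity, rep_coverage, rep_pid, mem_coverage, mem_pid,
                 min_identity=95.0, safety_coverage=10, safety_pide=10):
    if pidentity < min_identity:
        return False
    # do not eliminate a member that is clearly better covered or more
    # identical to the query than its representative
    if rep_coverage < mem_coverage - safety_coverage or \
       rep_pid < mem_pid - safety_pide:
        return False
    return True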