Esempio n. 1
0
def merge_pacbporfs(
    pacbporfD,
    pacbporfA,
    queryOrfSetObj,
    sbjctOrfSetObj,
    allow_query_projecting=True,
    allow_sbjct_projecting=True,
    allow_query_mapping=True,
    allow_sbjct_mapping=True,
    allow_projecting=True,
    allow_mapping=True,
    verbose=False,
):
    """
    Merge 2 PacbPORF objects with an interface into a gene structure

    Which merging strategies are tried depends on whether the query Orfs
    and/or the sbjct Orfs of both PacbPORFs are identical:
        - both differ          : aligned introns in query and sbjct
        - only query differs   : intron(s) in query; mapping as fallback
        - only sbjct differs   : intron(s) in sbjct; mapping as fallback
        - both identical       : inframe introns in query and/or sbjct

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  queryOrfSetObj: object
    @param queryOrfSetObj: passed on to the tinyexon merging functions
                           that operate on the query

    @type  sbjctOrfSetObj: object
    @param sbjctOrfSetObj: passed on to the tinyexon merging functions
                           that operate on the sbjct

    @type  allow_query_projecting: Boolean
    @param allow_query_projecting: currently not consulted by this function

    @type  allow_sbjct_projecting: Boolean
    @param allow_sbjct_projecting: currently not consulted by this function

    @type  allow_query_mapping: Boolean
    @param allow_query_mapping: allow the mapping strategies on the query

    @type  allow_sbjct_mapping: Boolean
    @param allow_sbjct_mapping: allow the mapping strategies on the sbjct

    @type  allow_projecting: Boolean
    @param allow_projecting: when False, overrules both allow_xxx_projecting
                             arguments to False

    @type  allow_mapping: Boolean
    @param allow_mapping: when False, overrules both allow_xxx_mapping
                          arguments to False

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (currently no messages are printed)

    @rtype:  dict
    @return: { 'query': [ introns ], 'sbjct': [ introns ] }
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit/create **kwargs dictionary for some forced attributes
    kwargs = {}
    _update_kwargs(kwargs, KWARGS_SPLICESITES)

    # the generic allow_xxx switches overrule the query/sbjct specific ones
    if not allow_projecting:
        allow_query_projecting = False
        allow_sbjct_projecting = False
    if not allow_mapping:
        allow_query_mapping = False
        allow_sbjct_mapping = False

    # check if Orf objects of PacbPORFS are identical
    queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id
    sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id

    # return data structure of introns
    introns = {"query": [], "sbjct": []}

    # Scan Orfs for splice sites.
    # This has probably been performed before, but when not done,
    # cached donor & acceptor sites lists seems to be empty -> no introns
    pacbporfD.orfQ.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=kwargs["min_donor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_donor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_donor_pssm_score"],
    )
    pacbporfD.orfS.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=kwargs["min_donor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_donor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_donor_pssm_score"],
    )
    pacbporfA.orfQ.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=kwargs["min_acceptor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_acceptor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_acceptor_pssm_score"],
    )
    pacbporfA.orfS.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=kwargs["min_acceptor_pssm_score"],
        allow_non_canonical=kwargs["allow_non_canonical_acceptor"],
        non_canonical_min_pssm_score=kwargs["non_canonical_min_acceptor_pssm_score"],
    )

    if not queryOrfsIdentical and not sbjctOrfsIdentical:
        # ------------------------------------------------------------------
        # both query and sbjct Orfs differ -> introns in query AND sbjct
        # ------------------------------------------------------------------
        introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)

        if _gap_ratios_allow_complex_merging(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)
            introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
            introns4 = merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj)
            introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(pacbporfD, pacbporfA, queryOrfSetObj)
            introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns7 = merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns8 = merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []
            introns4 = []
            introns5 = []
            introns6 = []
            introns7 = []
            introns8 = []

        introns9 = merge_pacbporfs_with_conserved_acceptor_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns9 = _filter_aligned_introns_on_pssm_entropy_combination(introns9)

        introns10 = merge_pacbporfs_with_conserved_donor_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns10 = _filter_aligned_introns_on_pssm_entropy_combination(introns10)

        # store introns obtained by most simplest case projecting/mapping
        introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1]))
        introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1]))

        # From here on, a result of a more complex strategy is only stored
        # when NONE of its introns is already known (all-or-nothing).
        _store_unseen_intron_groups(
            introns,
            [((intrQ,), (intrS,)) for (intrQ, intrS, cigpacbp) in introns2],
        )
        _store_unseen_intron_groups(
            introns,
            [((intrQ,), (intrS,)) for (intrQ, intrS) in introns3],
        )
        # introns4, introns5 and introns6 share the same tuple layout:
        # ( intrQ, intrS, pacbporf, intrQ2, intrS2 ). In introns5/introns6
        # any of the introns can be None; those are skipped upon storage.
        # NOTE: introns5 used to be ignored because its storage loop
        # iterated (by mistake) a second time over introns4.
        for tinyexon_merges in (introns4, introns5, introns6):
            _store_unseen_intron_groups(
                introns,
                [
                    ((intrQ, intrQ2), (intrS, intrS2))
                    for (intrQ, intrS, pacbporf, intrQ2, intrS2) in tinyexon_merges
                ],
            )
        _store_unseen_intron_groups(
            introns,
            [
                ((intrQ,), (intrS, intrS2))
                for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7
            ],
        )
        _store_unseen_intron_groups(
            introns,
            [
                ((intrQ, intrQ2), (intrS,))
                for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8
            ],
        )

        # conserved acceptor / conserved donor introns: query and sbjct
        # intron are judged independently of each other
        _store_unseen_intron_pairs(introns, introns9)
        _store_unseen_intron_pairs(introns, introns10)

        # finally, do the bridging thingy
        introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)

        # only store introns from introns0 that are NOT encountered already
        keysQ = [_intron_site_key(intron) for intron in introns["query"]]
        for intrQ in introns0:
            if intrQ.coords() not in keysQ:
                introns["query"].append(intrQ)

    elif not queryOrfsIdentical:
        # ------------------------------------------------------------------
        # only the query Orfs differ -> intron(s) in the query
        # ------------------------------------------------------------------
        seqerror = merge_pacbporf_with_sequenceerror_in_query(pacbporfD, pacbporfA)
        introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA)

        if _gap_ratios_allow_complex_merging(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(pacbporfD, pacbporfA, queryOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_query(pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store sequencerror if it exists
        if seqerror:
            introns["query"].append(seqerror)

        # store introns obtained by most simplest case projecting/mapping
        introns["query"].extend([prj.projected_introns[0] for prj in introns1])

        # only store tinyexon results of which no intron is known yet
        _store_unseen_intron_groups(
            introns,
            [((intr1, intr2), ()) for (intr1, intr2, exon) in introns2],
        )
        _store_unseen_intron_groups(
            introns,
            [
                ((intr1, intr2, intr3), ())
                for (intr1, intr2, intr3, exon1, exon2) in introns3
            ],
        )

        if not introns["query"] and allow_sbjct_mapping and allow_query_mapping:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)

            # potential stopless 3n intron in SBJCT
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)

            if _gap_ratios_allow_complex_merging(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # only store query introns that are NOT encountered already
            _append_introns_with_new_coords(
                introns["query"], [intrQ for (intrQ, intrS) in introns1]
            )
            _append_introns_with_new_coords(
                introns["query"], [intrQ for (intrQ, intrS, cigpacbp) in introns2]
            )
            _append_introns_with_new_coords(
                introns["query"], [intrQ for (intrQ, intrS) in introns3]
            )
            _append_introns_with_new_coords(introns["query"], introns0)

            # only store sbjct introns that are NOT encountered already.
            # NOTE: these used to be appended to introns['query'] while the
            # encountered coords were taken from introns['sbjct']; sbjct
            # introns belong in (and are compared against) introns['sbjct'].
            _append_introns_with_new_coords(
                introns["sbjct"], [intrS for (intrQ, intrS) in introns1]
            )
            _append_introns_with_new_coords(
                introns["sbjct"], [intrS for (intrQ, intrS, cigpacbp) in introns2]
            )
            _append_introns_with_new_coords(
                introns["sbjct"], [intrS for (intrQ, intrS) in introns3]
            )

        elif not introns["query"]:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)
            # only store introns from that are NOT encountered already
            _append_introns_with_new_coords(introns["query"], introns0)
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif not sbjctOrfsIdentical:
        # ------------------------------------------------------------------
        # only the sbjct Orfs differ -> intron(s) in the sbjct
        # ------------------------------------------------------------------
        introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA)

        if _gap_ratios_allow_complex_merging(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(pacbporfD, pacbporfA, sbjctOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store introns obtained by most simplest case projecting/mapping
        introns["sbjct"].extend([prj.projected_introns[0] for prj in introns1])

        # only store tinyexon results of which no intron is known yet
        _store_unseen_intron_groups(
            introns,
            [((), (intr1, intr2)) for (intr1, intr2, exon) in introns2],
        )
        _store_unseen_intron_groups(
            introns,
            [
                ((), (intr1, intr2, intr3))
                for (intr1, intr2, intr3, exon1, exon2) in introns3
            ],
        )

        if not introns["sbjct"] and allow_sbjct_mapping and allow_query_mapping:
            # potential stopless 3n intron in QUERY
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)

            if _gap_ratios_allow_complex_merging(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # store introns
            introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1]))
            introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1]))
            introns["query"].extend([intrQ for (intrQ, intrS, cigpacbp) in introns2])
            introns["query"].extend([intrQ for (intrQ, intrS) in introns3])
            introns["sbjct"].extend([intrS for (intrQ, intrS, cigpacbp) in introns2])
            introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3])
        else:
            # projecting introns yielded results; do not try mapping
            pass

    else:
        # ------------------------------------------------------------------
        # query Orfs identical AND sbjct Orfs identical -> inframe introns
        # ------------------------------------------------------------------
        if allow_query_mapping:
            introns1 = merge_pacbporfs_by_inframe_intron_in_query(pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns1 = []

        if allow_sbjct_mapping:
            introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns2 = []

        if allow_sbjct_mapping and allow_query_mapping:
            introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
            # apply stopless3n intron filtering
            introns3 = _filter_aligned_stopless_3n_introns(introns3)
        else:
            # no mapping (unigene or continious alignment provided)
            introns3 = []

        introns["query"].extend([prj.projected_introns[0] for prj in introns1])
        introns["sbjct"].extend([prj.projected_introns[0] for prj in introns2])
        introns["query"].extend([intrQ for (intrQ, intrS) in introns3])
        introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3])

    # Filter for stopless3n introns
    introns["query"] = _filter_stopless_3n_introns(introns["query"])
    introns["sbjct"] = _filter_stopless_3n_introns(introns["sbjct"])

    # return dict with lists of introns
    return introns


def _gap_ratios_allow_complex_merging(pacbporfD, pacbporfA):
    """Return True when both gap ratio scores are below the HIGH threshold"""
    return (
        pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
        and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
    )


def _intron_site_key(intron):
    """Return the ( donor.pos, acceptor.pos ) tuple that identifies an intron"""
    return (intron.donor.pos, intron.acceptor.pos)


def _store_unseen_intron_groups(introns, groups):
    """
    Store groups of introns of which not a single intron is stored already

    @type  introns: dict
    @param introns: { 'query': [ introns ], 'sbjct': [ introns ] }; updated
                    in place

    @type  groups: list
    @param groups: list of ( query_introns, sbjct_introns ) tuples; a None
                   in any of both tuples is ignored (and never stored)

    @attention: stored introns are listed once, BEFORE the groups are
                walked; groups are not compared to each other
    """
    sides = ("query", "sbjct")
    seen = {}
    for side in sides:
        seen[side] = [_intron_site_key(intron) for intron in introns[side]]

    for group in groups:
        members = {}
        for (side, candidates) in zip(sides, group):
            members[side] = [intron for intron in candidates if intron is not None]

        # all-or-nothing: a single known intron rejects the whole group
        is_unseen = True
        for side in sides:
            for intron in members[side]:
                if _intron_site_key(intron) in seen[side]:
                    is_unseen = False
        if not is_unseen:
            continue

        for side in sides:
            introns[side].extend(members[side])


def _store_unseen_intron_pairs(introns, pairs):
    """
    Store query and sbjct intron of ( intrQ, intrS ) pairs independently

    @type  introns: dict
    @param introns: { 'query': [ introns ], 'sbjct': [ introns ] }; updated
                    in place

    @type  pairs: list
    @param pairs: list of ( intrQ, intrS ) tuples

    @attention: stored introns are listed once, BEFORE the pairs are
                walked; pairs are not compared to each other
    """
    keysQ = [_intron_site_key(intron) for intron in introns["query"]]
    keysS = [_intron_site_key(intron) for intron in introns["sbjct"]]
    for (intrQ, intrS) in pairs:
        # do NOT require both introns to be new;
        # allow addition of each of these
        if _intron_site_key(intrQ) not in keysQ:
            introns["query"].append(intrQ)
        if _intron_site_key(intrS) not in keysS:
            introns["sbjct"].append(intrS)


def _append_introns_with_new_coords(stored, candidates):
    """
    Append candidate introns whose coords() are not in the stored list yet

    @type  stored: list
    @param stored: list of introns; updated in place

    @type  candidates: list
    @param candidates: introns to append when their coords() are new;
                       candidates are compared to each other as well
    """
    keys = [intron.coords() for intron in stored]
    for intron in candidates:
        coords = intron.coords()
        if coords not in keys:
            stored.append(intron)
            keys.append(coords)
Esempio n. 2
0
def _find_qq_tinyexons_as_pacbporfs(target,tinyexondata,PCG,min_discovery_count=2):
    """ """
    target_tinyexon_pacbporf_data = {}
    for informant in tinyexondata.keys():
        if informant == target: continue
        thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target,informant))
        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in [ pf.orfQ.id for pf in thepacbporfs ]: continue
            for (prevpos,nextpos) in [ (pos-1,pos) for pos in range(1,len(thepacbporfs)) ]:
                prevPF = thepacbporfs[prevpos]
                nextPF = thepacbporfs[nextpos]
                if prevPF.orfS.id == nextPF.orfS.id:

                    # check if PacbPORFs are positioned more or less okay
                    if prevPF.distance_towards(nextPF) > 20: continue

                    # check if exonQ is positioned ~between these PacbPORFs
                    if exonQ.orf.dnapos2aapos(exonQ.end) < max(prevPF.alignment_protein_range_query())-12:
                        continue
                    if exonQ.orf.dnapos2aapos(exonQ.start) > min(nextPF.alignment_protein_range_query())+12:
                        continue

                    # check if gap can be projected already by a perfect intron
                    introns = merge_pacbporfs_by_intron_in_query(
                                prevPF,nextPF,max_aa_offset=1)
                    # if introns found => continue
                    if introns: continue

                    # orfObj is the orfS of prevPF or nextPF (just take any)
                    orfObj = prevPF.orfS
                    # assign elegiable range of tinyexon match on SBJCT
                    aapos_sbjct_range = range(
                            max(prevPF.alignment_protein_range_sbjct())-12,
                            min(nextPF.alignment_protein_range_sbjct())+12
                            )

                    tinyexonmatches = _find_match_on_orfobj(exonQ,orfObj)
                    for (aaseq,aapos) in tinyexonmatches:
                        # check if the match is obtained in the expected
                        # sbjct AA range; if not, ignore the match
                        if aapos not in aapos_sbjct_range: continue

                        # make pacbporf object
                        pacbpobj = PacbP(input=(
                                exonQ.proteinsequence(), aaseq,
                                exonQ.orf.dnapos2aapos(exonQ.start), aapos ) )
                        pacbporfobj = pacbp2pacbporf(pacbpobj,exonQ.orf,orfObj)
                        pacbporfobj.extend_pacbporf_after_stops()
        
                        # remove included pacbporfs
                        is_suborsuperset = False
                        for accepted_pacbporf in thepacbporfs:
                            if pacbporfobj.issubsetorsuperset(accepted_pacbporf):
                                is_suborsuperset = True
                                break
                        if is_suborsuperset:
                            continue
    

                        # check if 2 (perfect) introns can be projected
                        introns5p = merge_pacbporfs_by_intron_in_query(
                                prevPF,pacbporfobj,
                                max_aa_offset=1,
                                max_intron_nt_length=None)
                                #max_intron_nt_length=140)
                        introns3p = merge_pacbporfs_by_intron_in_query(
                                pacbporfobj,nextPF,
                                max_aa_offset=1,
                                max_intron_nt_length=None)
                                #max_intron_nt_length=140)

                        # continue if not is_confirmed_by_intron_projection
                        if not introns5p or not introns3p: continue
    
                        # check if placeable in PCG/pacbporflist
                        distPrev = prevPF.distance_towards(pacbporfobj)
                        distNext = pacbporfobj.distance_towards(nextPF)
                        ovrlPrev = pacbporfobj.overlap(prevPF)
                        ovrlNext = pacbporfobj.overlap(nextPF)
                        if distPrev and distNext:
                            rejected = False
                        elif not distPrev and ovrlPrev:
                            rejected = False
                        elif not distNext and ovrlNext:
                            rejected = False
                        elif ovrlPrev and ovrlNext:
                            rejected = False
                        else:
                            rejected = True

                        print "OKAY", exonQ.proteinsequence(), aaseq, rejected, informant, (distPrev,distNext,ovrlPrev,ovrlNext)

                        # label pacbporf as found by tinyexon QQ
                        pacbporfobj._tinyexon_label = "QQ"

                        # store to target_tinyexon_pacbporf_data
                        key = (exonQ.proteinsequence(),exonQ.start)
                        _update_tinyexon_pacbporf_dict(
                                target_tinyexon_pacbporf_data,
                                key,pacbporfobj,rejected,informant)


    # cleanup tinyexon protein matches that have been observed to litte
    _remove_dict_elements_with_short_value_list(
            target_tinyexon_pacbporf_data,
            min_value_list_size=min_discovery_count)

    # return target_tinyexon_pacbporf_data
    return target_tinyexon_pacbporf_data
Esempio n. 3
0
def _find_qp_and_pq_tinyexons_as_pacbporfs(target, tinyexondata, PCG,
                                           min_discovery_count=2):
    """
    Find tinyexons of target as PacbPORFs on Orfs of the informants (QP/PQ)

    For every informant (each key of tinyexondata other than target), each
    tinyexon of target whose Orf is not yet a query Orf among the ordered
    PacbPORFs of target and informant is matched on all informant Orfs of
    the PCG with _find_qp_or_pq_match_on_orfobj(). A match is turned into a
    PacbPORF and stored when it is no sub/superset of an existing PacbPORF
    and when merge_pacbporfs_by_intron_in_query() yields at least one result
    with an existing PacbPORF on the same sbjct Orf.

    @type  target: * (key of tinyexondata)
    @param target: organism to find tinyexon PacbPORFs for

    @type  tinyexondata: dictionary(-like)
    @param tinyexondata: organism keys, iterable of tinyexon objects as values

    @type  PCG: object
    @param PCG: must provide get_pacbps_by_organisms() and get_orfs_of_graph()

    @type  min_discovery_count: integer
    @param min_discovery_count: handed over as min_value_list_size to
                                _remove_dict_elements_with_short_value_list()

    @rtype:  dictionary
    @return: data stored by _update_tinyexon_pacbporf_dict(), with keys
             ( tinyexon protein sequence, tinyexon start )
    """
    tinyexon_hits = {}
    for informant in tinyexondata.keys():
        # the target can not act as its own informant
        if informant == target:
            continue
        accepted = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))

        for exonQ in tinyexondata[target]:
            # tinyexons on an Orf that is already a query Orf are ignored
            if exonQ.orf.id in [pf.orfQ.id for pf in accepted]:
                continue

            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                for (aaseq, aapos) in _find_qp_or_pq_match_on_orfobj(
                        exonQ, orfObj):
                    # convert the match into a PacbPORF object
                    pacbp_input = (
                        exonQ.proteinsequence(), aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start), aapos)
                    candidate = pacbp2pacbporf(
                        PacbP(input=pacbp_input), exonQ.orf, orfObj)
                    candidate.extend_pacbporf_after_stops()

                    # a candidate included in (or including) an accepted
                    # PacbPORF is of no use
                    for pf in accepted:
                        if candidate.issubsetorsuperset(pf):
                            is_included = True
                            break
                    else:
                        is_included = False
                    if is_included:
                        continue

                    # the candidate must be confirmed by an intron towards
                    # an accepted PacbPORF on the same sbjct Orf
                    for pf in accepted:
                        if not (pf.orfS.id == candidate.orfS.id):
                            continue
                        # put both PacbPORFs in order of their query start
                        if min(pf.alignment_dna_range_query()) > \
                                min(candidate.alignment_dna_range_query()):
                            first, second = candidate, pf
                        else:
                            first, second = pf, candidate
                        try:
                            introns = merge_pacbporfs_by_intron_in_query(
                                first, second,
                                max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event: TODO: solve in
                            # merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) > 0:
                            break
                    else:
                        # none of the accepted PacbPORFs confirms it
                        continue

                    # rejected as soon as a single accepted PacbPORF is
                    # not positioned compatibly with the candidate
                    compatibility = [
                        pf.is_postioned_compatibly(candidate)
                        for pf in accepted]
                    rejected = False in compatibility

                    # label as discovered by the QP tinyexon search & store
                    candidate._tinyexon_label = "QP"
                    _update_tinyexon_pacbporf_dict(
                        tinyexon_hits,
                        (exonQ.proteinsequence(), exonQ.start),
                        candidate, rejected, informant)

    # drop tinyexon matches that were discovered too few times
    _remove_dict_elements_with_short_value_list(
        tinyexon_hits,
        min_value_list_size=min_discovery_count)

    return tinyexon_hits
Esempio n. 4
0
def _find_qq_tinyexons_as_pacbporfs(target,
                                    tinyexondata,
                                    PCG,
                                    min_discovery_count=2):
    """ """
    target_tinyexon_pacbporf_data = {}
    for informant in tinyexondata.keys():
        if informant == target: continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in [pf.orfQ.id for pf in thepacbporfs]: continue
            for (prevpos, nextpos) in [(pos - 1, pos)
                                       for pos in range(1, len(thepacbporfs))]:
                prevPF = thepacbporfs[prevpos]
                nextPF = thepacbporfs[nextpos]
                if prevPF.orfS.id == nextPF.orfS.id:

                    # check if PacbPORFs are positioned more or less okay
                    if prevPF.distance_towards(nextPF) > 20: continue

                    # check if exonQ is positioned ~between these PacbPORFs
                    if exonQ.orf.dnapos2aapos(exonQ.end) < max(
                            prevPF.alignment_protein_range_query()) - 12:
                        continue
                    if exonQ.orf.dnapos2aapos(exonQ.start) > min(
                            nextPF.alignment_protein_range_query()) + 12:
                        continue

                    # check if gap can be projected already by a perfect intron
                    introns = merge_pacbporfs_by_intron_in_query(
                        prevPF, nextPF, max_aa_offset=1)
                    # if introns found => continue
                    if introns: continue

                    # orfObj is the orfS of prevPF or nextPF (just take any)
                    orfObj = prevPF.orfS
                    # assign elegiable range of tinyexon match on SBJCT
                    aapos_sbjct_range = range(
                        max(prevPF.alignment_protein_range_sbjct()) - 12,
                        min(nextPF.alignment_protein_range_sbjct()) + 12)

                    tinyexonmatches = _find_match_on_orfobj(exonQ, orfObj)
                    for (aaseq, aapos) in tinyexonmatches:
                        # check if the match is obtained in the expected
                        # sbjct AA range; if not, ignore the match
                        if aapos not in aapos_sbjct_range: continue

                        # make pacbporf object
                        pacbpobj = PacbP(
                            input=(exonQ.proteinsequence(), aaseq,
                                   exonQ.orf.dnapos2aapos(exonQ.start), aapos))
                        pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf,
                                                     orfObj)
                        pacbporfobj.extend_pacbporf_after_stops()

                        # remove included pacbporfs
                        is_suborsuperset = False
                        for accepted_pacbporf in thepacbporfs:
                            if pacbporfobj.issubsetorsuperset(
                                    accepted_pacbporf):
                                is_suborsuperset = True
                                break
                        if is_suborsuperset:
                            continue

                        # check if 2 (perfect) introns can be projected
                        introns5p = merge_pacbporfs_by_intron_in_query(
                            prevPF,
                            pacbporfobj,
                            max_aa_offset=1,
                            max_intron_nt_length=None)
                        #max_intron_nt_length=140)
                        introns3p = merge_pacbporfs_by_intron_in_query(
                            pacbporfobj,
                            nextPF,
                            max_aa_offset=1,
                            max_intron_nt_length=None)
                        #max_intron_nt_length=140)

                        # continue if not is_confirmed_by_intron_projection
                        if not introns5p or not introns3p: continue

                        # check if placeable in PCG/pacbporflist
                        distPrev = prevPF.distance_towards(pacbporfobj)
                        distNext = pacbporfobj.distance_towards(nextPF)
                        ovrlPrev = pacbporfobj.overlap(prevPF)
                        ovrlNext = pacbporfobj.overlap(nextPF)
                        if distPrev and distNext:
                            rejected = False
                        elif not distPrev and ovrlPrev:
                            rejected = False
                        elif not distNext and ovrlNext:
                            rejected = False
                        elif ovrlPrev and ovrlNext:
                            rejected = False
                        else:
                            rejected = True

                        print "OKAY", exonQ.proteinsequence(
                        ), aaseq, rejected, informant, (distPrev, distNext,
                                                        ovrlPrev, ovrlNext)

                        # label pacbporf as found by tinyexon QQ
                        pacbporfobj._tinyexon_label = "QQ"

                        # store to target_tinyexon_pacbporf_data
                        key = (exonQ.proteinsequence(), exonQ.start)
                        _update_tinyexon_pacbporf_dict(
                            target_tinyexon_pacbporf_data, key, pacbporfobj,
                            rejected, informant)

    # cleanup tinyexon protein matches that have been observed to litte
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data, min_value_list_size=min_discovery_count)

    # return target_tinyexon_pacbporf_data
    return target_tinyexon_pacbporf_data
Esempio n. 5
0
def _find_qp_and_pq_tinyexons_as_pacbporfs(target,
                                           tinyexondata,
                                           PCG,
                                           min_discovery_count=2):
    """
    Find tinyexons of target as PacbPORFs on Orfs of the informants (QP/PQ)

    Each key of tinyexondata other than target serves as informant. Every
    tinyexon of target whose Orf is not yet a query Orf among the ordered
    PacbPORFs of target and informant is matched on all informant Orfs of
    the PCG with _find_qp_or_pq_match_on_orfobj(). A match is turned into a
    PacbPORF and stored when it is no sub/superset of an existing PacbPORF
    and when merge_pacbporfs_by_intron_in_query() yields at least one result
    with an existing PacbPORF on the same sbjct Orf.

    @type  target: * (key of tinyexondata)
    @param target: organism to find tinyexon PacbPORFs for

    @type  tinyexondata: dictionary(-like)
    @param tinyexondata: organism keys, iterable of tinyexon objects as values

    @type  PCG: object
    @param PCG: must provide get_pacbps_by_organisms() and get_orfs_of_graph()

    @type  min_discovery_count: integer
    @param min_discovery_count: handed over as min_value_list_size to
                                _remove_dict_elements_with_short_value_list()

    @rtype:  dictionary
    @return: data stored by _update_tinyexon_pacbporf_dict(), with keys
             ( tinyexon protein sequence, tinyexon start )
    """
    discovered = {}
    for informant in tinyexondata.keys():
        if informant == target:
            # only other organisms can inform on the target
            continue
        known_pacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))

        for exonQ in tinyexondata[target]:
            # skip tinyexons located on an already used query Orf
            if exonQ.orf.id in [pf.orfQ.id for pf in known_pacbporfs]:
                continue

            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                matches = _find_qp_or_pq_match_on_orfobj(exonQ, orfObj)
                for (aaseq, aapos) in matches:
                    # build a PacbPORF object out of the match
                    query_aaseq = exonQ.proteinsequence()
                    query_aapos = exonQ.orf.dnapos2aapos(exonQ.start)
                    new_pacbporf = pacbp2pacbporf(
                        PacbP(input=(query_aaseq, aaseq, query_aapos, aapos)),
                        exonQ.orf, orfObj)
                    new_pacbporf.extend_pacbporf_after_stops()

                    # skip when it is a sub/superset of a known PacbPORF
                    for known in known_pacbporfs:
                        if new_pacbporf.issubsetorsuperset(known):
                            overlaps_known = True
                            break
                    else:
                        overlaps_known = False
                    if overlaps_known:
                        continue

                    # search for a known PacbPORF on the same sbjct Orf
                    # with which an intron in the query can be obtained
                    for known in known_pacbporfs:
                        if not (known.orfS.id == new_pacbporf.orfS.id):
                            continue
                        pair = (known, new_pacbporf)
                        if min(known.alignment_dna_range_query()) > min(
                                new_pacbporf.alignment_dna_range_query()):
                            # the new PacbPORF starts first on the query
                            pair = (new_pacbporf, known)
                        try:
                            introns = merge_pacbporfs_by_intron_in_query(
                                pair[0],
                                pair[1],
                                max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event: TODO: solve in
                            # merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) >= 1:
                            break
                    else:
                        # not confirmed by any intron: ignore this match
                        continue

                    # a single incompatibly positioned known PacbPORF
                    # suffices to mark the new PacbPORF as rejected
                    positioned_okay = [
                        known.is_postioned_compatibly(new_pacbporf)
                        for known in known_pacbporfs
                    ]
                    rejected = False in positioned_okay

                    # label as discovered by the QP tinyexon search
                    new_pacbporf._tinyexon_label = "QP"

                    # and store it
                    _update_tinyexon_pacbporf_dict(
                        discovered,
                        (exonQ.proteinsequence(), exonQ.start),
                        new_pacbporf, rejected, informant)

    # remove tinyexon matches that were discovered too few times
    _remove_dict_elements_with_short_value_list(
        discovered, min_value_list_size=min_discovery_count)

    return discovered
Esempio n. 6
0
def merge_pacbporfs(pacbporfD,
                    pacbporfA,
                    queryOrfSetObj,
                    sbjctOrfSetObj,
                    allow_query_projecting=True,
                    allow_sbjct_projecting=True,
                    allow_query_mapping=True,
                    allow_sbjct_mapping=True,
                    allow_projecting=True,
                    allow_mapping=True,
                    verbose=False):
    """
    Merge 2 PacbPORF objects with an interface into a gene structure

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intron, intron ), in query and sbjct
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit/create **kwargs dictionary for some forced attributes
    kwargs = {}
    _update_kwargs(kwargs, KWARGS_SPLICESITES)

    # deal with allow_xxx attributes
    if not allow_projecting:
        allow_query_projecting = False
        allow_sbjct_projecting = False
    if not allow_mapping:
        allow_query_mapping = False
        allow_sbjct_mapping = False

    # check if Orf objects of PacbPORFS are identical
    queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id
    sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id

    # return data structure of introns
    introns = {'query': [], 'sbjct': []}

    # Scan Orfs for splice sites.
    # This has probably been performed before, but when not done,
    # cached donor & acceptor sites lists seems to be empty -> no introns
    pacbporfD.orfQ.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=kwargs['min_donor_pssm_score'],
        allow_non_canonical=kwargs['allow_non_canonical_donor'],
        non_canonical_min_pssm_score=kwargs[
            'non_canonical_min_donor_pssm_score'])
    pacbporfD.orfS.scan_orf_for_pssm_splice_sites(
        splicetype="donor",
        min_pssm_score=kwargs['min_donor_pssm_score'],
        allow_non_canonical=kwargs['allow_non_canonical_donor'],
        non_canonical_min_pssm_score=kwargs[
            'non_canonical_min_donor_pssm_score'])
    pacbporfA.orfQ.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=kwargs['min_acceptor_pssm_score'],
        allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
        non_canonical_min_pssm_score=kwargs[
            'non_canonical_min_acceptor_pssm_score'])
    pacbporfA.orfS.scan_orf_for_pssm_splice_sites(
        splicetype="acceptor",
        min_pssm_score=kwargs['min_acceptor_pssm_score'],
        allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
        non_canonical_min_pssm_score=kwargs[
            'non_canonical_min_acceptor_pssm_score'])

    if not queryOrfsIdentical and not sbjctOrfsIdentical:

        introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns1)


        if pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and\
        pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD:
            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)
            introns3 = merge_pacbporfs_with_phase_shift_introns(
                pacbporfD, pacbporfA)
            introns4 = merge_pacbporfs_by_tinyexons(pacbporfD, pacbporfA,
                                                    queryOrfSetObj,
                                                    sbjctOrfSetObj)

            introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)

            introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)

            introns7 = merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)

            introns8 = merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = {}
            introns3 = {}
            introns4 = {}
            introns5 = {}
            introns6 = {}
            introns7 = {}
            introns8 = {}

        introns9 = merge_pacbporfs_with_conserved_acceptor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns9 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns9)

        introns10 = merge_pacbporfs_with_conserved_donor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns10 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns10)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend(Set([intrQ for (intrQ, intrS) in introns1]))
        introns['sbjct'].extend(Set([intrS for (intrQ, intrS) in introns1]))

        # only store introns from intron2 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, intrS, cigpacbp) in introns2:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)

        # only store introns from intron3 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, intrS) in introns3:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)

        # only store introns from intron4 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            k4 = (intrS2.donor.pos, intrS2.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)
                introns['query'].append(intrQ2)
                introns['sbjct'].append(intrS2)

        # only store introns from intron5 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4:
            if intrQ: k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            else: k1 = None
            if intrS: k2 = (intrS.donor.pos, intrS.acceptor.pos)
            else: k2 = None
            if intrQ2: k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            else: k3 = None
            if intrS2: k4 = (intrS2.donor.pos, intrS2.acceptor.pos)
            else: k4 = None
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)
                introns['query'].append(intrQ2)
                introns['sbjct'].append(intrS2)

        # only store introns from intron6 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns6:
            if intrQ: k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            else: k1 = None
            if intrS: k2 = (intrS.donor.pos, intrS.acceptor.pos)
            else: k2 = None
            if intrQ2: k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            else: k3 = None
            if intrS2: k4 = (intrS2.donor.pos, intrS2.acceptor.pos)
            else: k4 = None
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ and k4 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)
                introns['query'].append(intrQ2)
                introns['sbjct'].append(intrS2)

        # remove the 'None' in introns['sbjct'] due to latest addition
        while None in introns['query']:
            introns['query'].remove(None)
        while None in introns['sbjct']:
            introns['sbjct'].remove(None)

        # only store introns from intron7 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            k3 = (intrS2.donor.pos, intrS2.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)
                introns['sbjct'].append(intrS2)

        # only store introns from intron8 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            k3 = (intrQ2.donor.pos, intrQ2.acceptor.pos)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ:
                introns['query'].append(intrQ)
                introns['query'].append(intrQ2)
                introns['sbjct'].append(intrS)

        # only store introns from introns9 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, intrS) in introns9:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 == (2163, 2283):
                print "STRACC", k1, intrQ, k1 not in keysQ
                print "STRACC", k1, intrS, k2 not in keysS
            # do NOT check if any of the introns is present yet;
            # allow addition of each of these
            if k1 not in keysQ:
                introns['query'].append(intrQ)
            if k2 not in keysS:
                introns['sbjct'].append(intrS)

        # only store introns from introns10 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        keysS = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['sbjct']]
        for (intrQ, intrS) in introns10:
            k1 = (intrQ.donor.pos, intrQ.acceptor.pos)
            k2 = (intrS.donor.pos, intrS.acceptor.pos)
            if k1 == (1642, 1858):
                print "STRDON", k1, intrQ, k1 not in keysQ
                print "STRDON", k1, intrS, k2 not in keysS
            # do NOT check if any of the introns is present yet;
            # allow addition of each of these
            if k1 not in keysQ:
                introns['query'].append(intrQ)
            if k2 not in keysS:
                introns['sbjct'].append(intrS)

        # finally, do the bridging thingy
        introns0 = merge_pacbporfs_with_query_intron_bridgeing(
            pacbporfD, pacbporfA)

        # only store introns from introns0 that are NOT encountered already in introns1
        keysQ = [(intron.donor.pos, intron.acceptor.pos)
                 for intron in introns['query']]
        for intrQ in introns0:
            if intrQ.coords() not in keysQ:
                introns['query'].append(intrQ)

        #introns['query'].extend([ intrQ for (intrQ,intrS) in introns1 ] )
        #introns['query'].extend([ intrQ for (intrQ,intrS,cigpacbp) in introns2 ] )
        #introns['query'].extend([ intrQ for (intrQ,intrS) in introns3 ] )
        #introns['query'].extend([ intrQ for (intrQ,a,b,c,d) in introns4 ] )
        #introns['query'].extend([ intrQ for (a,b,c,intrQ,d) in introns4 ] )
        #introns['query'].extend([ intrQ for (intrQ,a,b,c,d) in introns5 ] )
        #introns['query'].extend([ intrQ for (a,b,c,intrQ,d) in introns5 ] )
        #introns['sbjct'].extend([ intrS for (intrQ,intrS) in introns1 ] )
        #introns['sbjct'].extend([ intrS for (intrQ,intrS,cigpacbp) in introns2 ] )
        #introns['sbjct'].extend([ intrS for (intrQ,intrS) in introns3 ] )
        #introns['sbjct'].extend([ intrS for (a,intrS,b,c,d) in introns4 ] )
        #introns['sbjct'].extend([ intrS for (a,b,c,d,intrS) in introns4 ] )
        #introns['sbjct'].extend([ intrS for (a,intrS,b,c,d) in introns5 ] )
        #introns['sbjct'].extend([ intrS for (a,b,c,d,intrS) in introns5 ] )

        # remove the 'None' in introns['sbjct'] due to latest addition
        while None in introns['query']:
            introns['query'].remove(None)
        while None in introns['sbjct']:
            introns['sbjct'].remove(None)

    elif not queryOrfsIdentical:
        seqerror = merge_pacbporf_with_sequenceerror_in_query(
            pacbporfD, pacbporfA)
        introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA)


        if pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and\
        pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD:
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = {}
            introns3 = {}

        # store sequencerror if it exists
        if seqerror: introns['query'].append(seqerror)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend([prj.projected_introns[0] for prj in introns1])

        # only store introns from intron2 that are NOT encountered already in introns1
        keys = [(intron.donor.pos, intron.acceptor.pos)
                for intron in introns['query']]
        for (intr1, intr2, exon) in introns2:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            if k1 not in keys and k2 not in keys:
                introns['query'].append(intr1)
                introns['query'].append(intr2)

        # only store introns from intron2 that are NOT encountered already in introns1
        keys = [(intron.donor.pos, intron.acceptor.pos)
                for intron in introns['query']]
        for (intr1, intr2, intr3, exon1, exon2) in introns3:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            k3 = (intr3.donor.pos, intr3.acceptor.pos)
            if k1 not in keys and k2 not in keys and k3 not in keys:
                introns['query'].append(intr1)
                introns['query'].append(intr2)
                introns['query'].append(intr3)

        if not introns['query'] and allow_sbjct_mapping and allow_query_mapping:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)

            # potential stopless 3n intron in SBJCT
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)

            if pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and\
            pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD:
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                    introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)

            else:
                # do not allow more complex intron merging
                introns3 = {}

            # only store introns that are NOT encountered already
            keys = [intron.coords() for intron in introns['query']]
            for intrQ, intrS in introns1:
                if intrQ.coords() not in keys:
                    introns['query'].append(intrQ)
                    keys = [intron.coords() for intron in introns['query']]
            for (intrQ, intrS, cigpacbp) in introns2:
                if intrQ.coords() not in keys:
                    introns['query'].append(intrQ)
                    keys = [intron.coords() for intron in introns['query']]
            for intrQ, intrS in introns3:
                if intrQ.coords() not in keys:
                    introns['query'].append(intrQ)
                    keys = [intron.coords() for intron in introns['query']]
            for intron in introns0:
                if intron.coords() not in keys:
                    introns['query'].append(intron)
                    keys = [intron.coords() for intron in introns['query']]

            keys = [intron.coords() for intron in introns['sbjct']]
            for intrQ, intrS in introns1:
                if intrS.coords() not in keys:
                    introns['query'].append(intrS)
                    keys = [intron.coords() for intron in introns['sbjct']]
            for (intrQ, intrS, cigpacbp) in introns2:
                if intrS.coords() not in keys:
                    introns['query'].append(intrS)
                    keys = [intron.coords() for intron in introns['sbjct']]
            for intrQ, intrS in introns3:
                if intrS.coords() not in keys:
                    introns['query'].append(intrS)
                    keys = [intron.coords() for intron in introns['sbjct']]

        elif not introns['query']:

            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)
            # only store introns that are NOT encountered already
            keys = [intron.coords() for intron in introns['query']]
            for intron in introns0:
                if intron.coords() not in keys:
                    introns['query'].append(intron)
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif not sbjctOrfsIdentical:
        introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA)

        if pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and\
        pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD:
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = {}
            introns3 = {}

        # store introns obtained by the simplest case of projecting/mapping
        introns['sbjct'].extend([prj.projected_introns[0] for prj in introns1])

        # only store introns from intron2 that are NOT encountered already in introns1
        keys = [(intron.donor.pos, intron.acceptor.pos)
                for intron in introns['sbjct']]
        for (intr1, intr2, exon) in introns2:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            if k1 not in keys and k2 not in keys:
                introns['sbjct'].append(intr1)
                introns['sbjct'].append(intr2)

        # only store introns from introns3 that are NOT already stored in introns['sbjct']
        keys = [(intron.donor.pos, intron.acceptor.pos)
                for intron in introns['sbjct']]
        for (intr1, intr2, intr3, exon1, exon2) in introns3:
            k1 = (intr1.donor.pos, intr1.acceptor.pos)
            k2 = (intr2.donor.pos, intr2.acceptor.pos)
            k3 = (intr3.donor.pos, intr3.acceptor.pos)
            if k1 not in keys and k2 not in keys and k3 not in keys:
                introns['sbjct'].append(intr1)
                introns['sbjct'].append(intr2)
                introns['sbjct'].append(intr3)

        if not introns['sbjct'] and allow_sbjct_mapping and allow_query_mapping:
            # potential stopless 3n intron in QUERY
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)


            if pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and\
            pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD:
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                    introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = {}

            # store introns
            introns['query'].extend(Set([intrQ
                                         for (intrQ, intrS) in introns1]))
            introns['sbjct'].extend(Set([intrS
                                         for (intrQ, intrS) in introns1]))
            introns['query'].extend(
                [intrQ for (intrQ, intrS, cigpacbp) in introns2])
            introns['query'].extend([intrQ for (intrQ, intrS) in introns3])
            introns['sbjct'].extend(
                [intrS for (intrQ, intrS, cigpacbp) in introns2])
            introns['sbjct'].extend([intrS for (intrQ, intrS) in introns3])
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif queryOrfsIdentical and sbjctOrfsIdentical:
        if allow_query_mapping:
            introns1 = merge_pacbporfs_by_inframe_intron_in_query(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continuous alignment provided)
            introns1 = []

        if allow_sbjct_mapping:
            introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continuous alignment provided)
            introns2 = []

        if allow_sbjct_mapping and allow_query_mapping:
            introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns3)
            # apply stopless3n intron filtering
            introns3 = _filter_aligned_stopless_3n_introns(introns3)

        else:
            # no mapping (unigene or continuous alignment provided)
            introns3 = []

        #introns4 = merge_pacbporfs_with_closeby_independant_introns(
        #                pacbporfD,pacbporfA)
        #introns5 = merge_pacbporfs_with_phase_shift_introns(
        #                pacbporfD,pacbporfA)

        introns['query'].extend([prj.projected_introns[0] for prj in introns1])
        introns['sbjct'].extend([prj.projected_introns[0] for prj in introns2])
        introns['query'].extend([intrQ for (intrQ, intrS) in introns3])
        introns['sbjct'].extend([intrS for (intrQ, intrS) in introns3])

    else:
        # none of these cases; allow_projecting or allow_mapping == False!
        pass

    # Filter for stopless3n introns
    introns['query'] = _filter_stopless_3n_introns(introns['query'])
    introns['sbjct'] = _filter_stopless_3n_introns(introns['sbjct'])

    # return list of introns
    return introns