def _intron_position_key(intron):
    """Return the (donor.pos, acceptor.pos) key of an intron; None for a missing intron."""
    if intron:
        return (intron.donor.pos, intron.acceptor.pos)
    return None


def _intron_position_keys(intron_list):
    """Return the (donor.pos, acceptor.pos) keys of all introns in a list."""
    return [(intron.donor.pos, intron.acceptor.pos) for intron in intron_list]


def _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
    """Return True when both PacbPORFs score below PACBPORF_HIGH_GAP_RATIO_THRESHOLD."""
    return (
        pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
        and pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD
    )


def _store_novel_intron_groups(introns, groups):
    """Store groups of introns that belong together, all-or-nothing.

    Each element of `groups` is a (query_introns, sbjct_introns) tuple. A group
    is stored only when none of its introns is present yet in introns['query']
    / introns['sbjct']. A missing intron (None) never blocks a group and is
    never stored. The keys of the already stored introns are taken once, before
    the first group is evaluated.
    """
    keysQ = _intron_position_keys(introns["query"])
    keysS = _intron_position_keys(introns["sbjct"])
    for (query_group, sbjct_group) in groups:
        is_novel = True
        for (group, keys) in ((query_group, keysQ), (sbjct_group, keysS)):
            for intron in group:
                if _intron_position_key(intron) in keys:
                    is_novel = False
        if not is_novel:
            continue
        introns["query"].extend([i for i in query_group if i is not None])
        introns["sbjct"].extend([i for i in sbjct_group if i is not None])


def _store_novel_introns_independently(introns, pairs):
    """Store the query and the sbjct intron of each (intrQ, intrS) pair separately.

    Unlike _store_novel_intron_groups, an already known query intron does not
    prevent the sbjct intron of the same pair from being stored (and vice versa).
    """
    keysQ = _intron_position_keys(introns["query"])
    keysS = _intron_position_keys(introns["sbjct"])
    for (intrQ, intrS) in pairs:
        if (intrQ.donor.pos, intrQ.acceptor.pos) not in keysQ:
            introns["query"].append(intrQ)
        if (intrS.donor.pos, intrS.acceptor.pos) not in keysS:
            introns["sbjct"].append(intrS)


def _store_introns_with_novel_coords(stored, candidates):
    """Append each candidate intron whose coords() is not yet present in `stored`."""
    keys = [intron.coords() for intron in stored]
    for intron in candidates:
        coords = intron.coords()
        if coords not in keys:
            stored.append(intron)
            keys.append(coords)


def merge_pacbporfs(
    pacbporfD,
    pacbporfA,
    queryOrfSetObj,
    sbjctOrfSetObj,
    allow_query_projecting=True,
    allow_sbjct_projecting=True,
    allow_query_mapping=True,
    allow_sbjct_mapping=True,
    allow_projecting=True,
    allow_mapping=True,
    verbose=False
):
    """
    Merge 2 PacbPORF objects with an interface into a gene structure

    Which merging strategies are tried depends on whether the query and/or
    sbjct Orf of both PacbPORFs are identical (same Orf id) or not.

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  queryOrfSetObj: object
    @param queryOrfSetObj: passed on to the tinyexon merging functions (query)

    @type  sbjctOrfSetObj: object
    @param sbjctOrfSetObj: passed on to the tinyexon merging functions (sbjct)

    @type  allow_query_projecting: Boolean
    @param allow_query_projecting: currently not evaluated in this function

    @type  allow_sbjct_projecting: Boolean
    @param allow_sbjct_projecting: currently not evaluated in this function

    @type  allow_query_mapping: Boolean
    @param allow_query_mapping: allow intron mapping in the query

    @type  allow_sbjct_mapping: Boolean
    @param allow_sbjct_mapping: allow intron mapping in the sbjct

    @type  allow_projecting: Boolean
    @param allow_projecting: when False, overrules both allow_xxx_projecting

    @type  allow_mapping: Boolean
    @param allow_mapping: when False, overrules both allow_xxx_mapping

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (currently not evaluated in this function)

    @rtype:  dict
    @return: { 'query': [ intron, ... ], 'sbjct': [ intron, ... ] }
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit/create **kwargs dictionary for some forced attributes
    kwargs = {}
    _update_kwargs(kwargs, KWARGS_SPLICESITES)

    # deal with allow_xxx attributes
    if not allow_projecting:
        allow_query_projecting = False
        allow_sbjct_projecting = False
    if not allow_mapping:
        allow_query_mapping = False
        allow_sbjct_mapping = False

    # check if Orf objects of PacbPORFS are identical
    queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id
    sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id

    # return data structure of introns
    introns = {"query": [], "sbjct": []}

    # Scan Orfs for splice sites.
    # This has probably been performed before, but when not done,
    # cached donor & acceptor sites lists seems to be empty -> no introns
    for orfObj in (pacbporfD.orfQ, pacbporfD.orfS):
        orfObj.scan_orf_for_pssm_splice_sites(
            splicetype="donor",
            min_pssm_score=kwargs["min_donor_pssm_score"],
            allow_non_canonical=kwargs["allow_non_canonical_donor"],
            non_canonical_min_pssm_score=kwargs["non_canonical_min_donor_pssm_score"],
        )
    for orfObj in (pacbporfA.orfQ, pacbporfA.orfS):
        orfObj.scan_orf_for_pssm_splice_sites(
            splicetype="acceptor",
            min_pssm_score=kwargs["min_acceptor_pssm_score"],
            allow_non_canonical=kwargs["allow_non_canonical_acceptor"],
            non_canonical_min_pssm_score=kwargs["non_canonical_min_acceptor_pssm_score"],
        )

    if not queryOrfsIdentical and not sbjctOrfsIdentical:
        ################################################################
        # both query and sbjct Orfs differ: aligned introns expected
        ################################################################
        introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)

        if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)
            introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
            introns4 = merge_pacbporfs_by_tinyexons(
                pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj)
            introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns7 = merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns8 = merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []
            introns4 = []
            introns5 = []
            introns6 = []
            introns7 = []
            introns8 = []

        introns9 = merge_pacbporfs_with_conserved_acceptor_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns9 = _filter_aligned_introns_on_pssm_entropy_combination(introns9)

        introns10 = merge_pacbporfs_with_conserved_donor_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns10 = _filter_aligned_introns_on_pssm_entropy_combination(introns10)

        # store introns obtained by most simplest case projecting/mapping
        introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1]))
        introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1]))

        # From here on, only store introns that are NOT encountered already.
        # Introns that were found together are stored together or not at all.
        _store_novel_intron_groups(introns, [
            ((intrQ,), (intrS,))
            for (intrQ, intrS, cigpacbp) in introns2])
        _store_novel_intron_groups(introns, [
            ((intrQ,), (intrS,))
            for (intrQ, intrS) in introns3])
        _store_novel_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS, intrS2))
            for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4])
        # introns5 and introns6 can contain None at intron positions; the
        # helper function ignores those. NOTE: this step used to iterate
        # over introns4 a second time, by which introns5 was never stored.
        _store_novel_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS, intrS2))
            for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns5])
        _store_novel_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS, intrS2))
            for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns6])
        _store_novel_intron_groups(introns, [
            ((intrQ,), (intrS, intrS2))
            for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7])
        _store_novel_intron_groups(introns, [
            ((intrQ, intrQ2), (intrS,))
            for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8])

        # conserved acceptor / conserved donor introns: do NOT require that
        # both introns are novel; allow addition of each of these separately
        _store_novel_introns_independently(introns, introns9)
        _store_novel_introns_independently(introns, introns10)

        # finally, do the bridging thingy
        introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)
        # NOTE(review): stored introns are keyed on (donor.pos, acceptor.pos)
        # whereas the bridging introns are keyed on coords(); this assumes
        # both yield the same tuple -- TODO confirm
        keysQ = _intron_position_keys(introns["query"])
        for intrQ in introns0:
            if intrQ.coords() not in keysQ:
                introns["query"].append(intrQ)

        # defensive: make sure no 'None' is left in the intron lists
        for key in ("query", "sbjct"):
            while None in introns[key]:
                introns[key].remove(None)

    elif not queryOrfsIdentical:
        ################################################################
        # only the query Orfs differ: intron in query (projecting)
        ################################################################
        seqerror = merge_pacbporf_with_sequenceerror_in_query(pacbporfD, pacbporfA)
        introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA)
        if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store sequencerror if it exists
        if seqerror:
            introns["query"].append(seqerror)

        # store introns obtained by most simplest case projecting/mapping
        introns["query"].extend([prj.projected_introns[0] for prj in introns1])

        # only store tinyexon introns that are NOT encountered already
        _store_novel_intron_groups(introns, [
            ((intr1, intr2), ())
            for (intr1, intr2, exon) in introns2])
        _store_novel_intron_groups(introns, [
            ((intr1, intr2, intr3), ())
            for (intr1, intr2, intr3, exon1, exon2) in introns3])

        if not introns["query"] and allow_sbjct_mapping and allow_query_mapping:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)

            # potential stopless 3n intron in SBJCT
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)

            if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # only store QUERY introns that are NOT encountered already
            _store_introns_with_novel_coords(
                introns["query"], [intrQ for (intrQ, intrS) in introns1])
            _store_introns_with_novel_coords(
                introns["query"], [intrQ for (intrQ, intrS, cigpacbp) in introns2])
            _store_introns_with_novel_coords(
                introns["query"], [intrQ for (intrQ, intrS) in introns3])
            _store_introns_with_novel_coords(introns["query"], introns0)

            # only store SBJCT introns that are NOT encountered already.
            # NOTE: these introns used to be appended to introns['query'],
            # although they were compared to (and belong in) introns['sbjct']
            _store_introns_with_novel_coords(
                introns["sbjct"], [intrS for (intrQ, intrS) in introns1])
            _store_introns_with_novel_coords(
                introns["sbjct"], [intrS for (intrQ, intrS, cigpacbp) in introns2])
            _store_introns_with_novel_coords(
                introns["sbjct"], [intrS for (intrQ, intrS) in introns3])

        elif not introns["query"]:
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(pacbporfD, pacbporfA)
            # only store introns from that are NOT encountered already
            _store_introns_with_novel_coords(introns["query"], introns0)
        else:
            # projecting introns yielded results; do not try mapping
            pass

    elif not sbjctOrfsIdentical:
        ################################################################
        # only the sbjct Orfs differ: intron in sbjct (projecting)
        ################################################################
        introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA)
        if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store introns obtained by most simplest case projecting/mapping
        introns["sbjct"].extend([prj.projected_introns[0] for prj in introns1])

        # only store tinyexon introns that are NOT encountered already
        _store_novel_intron_groups(introns, [
            ((), (intr1, intr2))
            for (intr1, intr2, exon) in introns2])
        _store_novel_intron_groups(introns, [
            ((), (intr1, intr2, intr3))
            for (intr1, intr2, intr3, exon1, exon2) in introns3])

        if not introns["sbjct"] and allow_sbjct_mapping and allow_query_mapping:
            # potential stopless 3n intron in QUERY
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(pacbporfD, pacbporfA)

            if _pacbporfs_have_low_gap_ratio(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # store introns
            introns["query"].extend(Set([intrQ for (intrQ, intrS) in introns1]))
            introns["sbjct"].extend(Set([intrS for (intrQ, intrS) in introns1]))
            introns["query"].extend([intrQ for (intrQ, intrS, cigpacbp) in introns2])
            introns["query"].extend([intrQ for (intrQ, intrS) in introns3])
            introns["sbjct"].extend([intrS for (intrQ, intrS, cigpacbp) in introns2])
            introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3])
        else:
            # projecting introns yielded results; do not try mapping
            pass

    else:
        ################################################################
        # query and sbjct Orfs are both identical: inframe introns only
        ################################################################
        if allow_query_mapping:
            introns1 = merge_pacbporfs_by_inframe_intron_in_query(pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns1 = []

        if allow_sbjct_mapping:
            introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns2 = []

        if allow_sbjct_mapping and allow_query_mapping:
            introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns3 = _filter_aligned_introns_on_pssm_entropy_combination(introns3)
            # apply stopless3n intron filtering
            introns3 = _filter_aligned_stopless_3n_introns(introns3)
        else:
            # no mapping (unigene or continious alignment provided)
            introns3 = []

        introns["query"].extend([prj.projected_introns[0] for prj in introns1])
        introns["sbjct"].extend([prj.projected_introns[0] for prj in introns2])
        introns["query"].extend([intrQ for (intrQ, intrS) in introns3])
        introns["sbjct"].extend([intrS for (intrQ, intrS) in introns3])

    # Filter for stopless3n introns
    introns["query"] = _filter_stopless_3n_introns(introns["query"])
    introns["sbjct"] = _filter_stopless_3n_introns(introns["sbjct"])

    # return dict with lists of introns
    return introns
def _find_qq_tinyexons_as_pacbporfs(target,tinyexondata,PCG,min_discovery_count=2):
    """
    Find tiny exons of the target that fit between 2 consecutive PacbPORFs

    For each informant organism, the ordered PacbPORFs between target and
    informant are scanned for consecutive pairs that share the same sbjct Orf.
    A tiny exon of the target that is positioned roughly between such a pair,
    that has a match on that sbjct Orf and for which an intron can be projected
    on both its 5' and 3' side, is converted into a PacbPORF (label 'QQ').

    NOTE: a second definition of this function appears later in this module;
    that later definition replaces this one at import time.

    @type  target: organism identifier
    @param target: key in tinyexondata of the organism to find tiny exons for

    @type  tinyexondata: dict
    @param tinyexondata: organism identifier (key), tiny exon objects (value)

    @type  PCG: graph object
    @param PCG: object that delivers the PacbPORFs (get_pacbps_by_organisms)

    @type  min_discovery_count: integer
    @param min_discovery_count: passed as min_value_list_size to the cleanup
                                of keys with a too short value list

    @rtype:  dict
    @return: data as stored by _update_tinyexon_pacbporf_dict, with the tuple
             ( tiny exon protein sequence, tiny exon start ) as key
    """
    found = {}
    for informant in tinyexondata.keys():
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target,informant))
        # Orf ids of the query that are represented already by a PacbPORF
        known_query_orf_ids = [ pf.orfQ.id for pf in thepacbporfs ]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in known_query_orf_ids:
                continue

            # walk over all pairs of consecutive PacbPORFs
            for nextpos in range(1,len(thepacbporfs)):
                prevPF = thepacbporfs[nextpos-1]
                nextPF = thepacbporfs[nextpos]
                # both PacbPORFs must have the same sbjct Orf
                if not (prevPF.orfS.id == nextPF.orfS.id):
                    continue
                # check if PacbPORFs are positioned more or less okay
                if prevPF.distance_towards(nextPF) > 20:
                    continue
                # check if exonQ is positioned ~between these PacbPORFs
                exon_aa_end = exonQ.orf.dnapos2aapos(exonQ.end)
                if exon_aa_end < max(prevPF.alignment_protein_range_query())-12:
                    continue
                exon_aa_sta = exonQ.orf.dnapos2aapos(exonQ.start)
                if exon_aa_sta > min(nextPF.alignment_protein_range_query())+12:
                    continue
                # gap can be projected already by a perfect intron => continue
                if merge_pacbporfs_by_intron_in_query(
                        prevPF,nextPF,max_aa_offset=1):
                    continue

                # orfObj is the orfS of prevPF or nextPF (just take any)
                orfObj = prevPF.orfS
                # assign elegiable range of tinyexon match on SBJCT
                sbjct_aa_min = max(prevPF.alignment_protein_range_sbjct())-12
                sbjct_aa_max = min(nextPF.alignment_protein_range_sbjct())+12
                aapos_sbjct_range = range(sbjct_aa_min,sbjct_aa_max)

                for (aaseq,aapos) in _find_match_on_orfobj(exonQ,orfObj):
                    # ignore matches outside of the expected sbjct AA range
                    if aapos not in aapos_sbjct_range:
                        continue

                    # make pacbporf object
                    pacbp_input = (
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos,
                        )
                    pacbporfobj = pacbp2pacbporf(
                        PacbP(input=pacbp_input),exonQ.orf,orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # ignore when sub- or superset of an accepted PacbPORF
                    is_included = False
                    for known_pacbporf in thepacbporfs:
                        if pacbporfobj.issubsetorsuperset(known_pacbporf):
                            is_included = True
                            break
                    if is_included:
                        continue

                    # check if 2 (perfect) introns can be projected
                    introns5p = merge_pacbporfs_by_intron_in_query(
                        prevPF,pacbporfobj,
                        max_aa_offset=1,
                        max_intron_nt_length=None)
                    introns3p = merge_pacbporfs_by_intron_in_query(
                        pacbporfobj,nextPF,
                        max_aa_offset=1,
                        max_intron_nt_length=None)
                    # both sides must be confirmed by intron projection
                    if not introns5p or not introns3p:
                        continue

                    # check if placeable in PCG/pacbporflist
                    distPrev = prevPF.distance_towards(pacbporfobj)
                    distNext = pacbporfobj.distance_towards(nextPF)
                    ovrlPrev = pacbporfobj.overlap(prevPF)
                    ovrlNext = pacbporfobj.overlap(nextPF)
                    is_placeable = (
                        (distPrev and distNext) or
                        (not distPrev and ovrlPrev) or
                        (not distNext and ovrlNext) or
                        (ovrlPrev and ovrlNext)
                        )
                    rejected = not is_placeable

                    # status message; single space separated line on STDOUT
                    print(" ".join([ str(item) for item in (
                        "OKAY", exonQ.proteinsequence(), aaseq, rejected,
                        informant, (distPrev,distNext,ovrlPrev,ovrlNext) ) ]))

                    # label pacbporf as found by tinyexon QQ
                    pacbporfobj._tinyexon_label = "QQ"

                    # store to the tinyexon pacbporf data dictionary
                    key = (exonQ.proteinsequence(),exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        found,key,pacbporfobj,rejected,informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        found,min_value_list_size=min_discovery_count)

    # return the tinyexon pacbporf data dictionary
    return found
def _find_qp_and_pq_tinyexons_as_pacbporfs(target,tinyexondata,PCG,min_discovery_count=2):
    """
    Find tiny exons of the target that match on any Orf of an informant

    For each informant organism, each tiny exon of the target is matched on
    all Orfs of that informant in the PCG. A match is converted into a PacbPORF
    (label 'QP') when it is not a sub- or superset of an accepted PacbPORF and
    when an intron can be projected between this PacbPORF and an accepted
    PacbPORF with the same sbjct Orf.

    NOTE: a second definition of this function appears later in this module;
    that later definition replaces this one at import time.

    @type  target: organism identifier
    @param target: key in tinyexondata of the organism to find tiny exons for

    @type  tinyexondata: dict
    @param tinyexondata: organism identifier (key), tiny exon objects (value)

    @type  PCG: graph object
    @param PCG: object that delivers the PacbPORFs (get_pacbps_by_organisms)
                and the informant's Orfs (get_orfs_of_graph)

    @type  min_discovery_count: integer
    @param min_discovery_count: passed as min_value_list_size to the cleanup
                                of keys with a too short value list

    @rtype:  dict
    @return: data as stored by _update_tinyexon_pacbporf_dict, with the tuple
             ( tiny exon protein sequence, tiny exon start ) as key
    """
    found = {}
    for informant in tinyexondata.keys():
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target,informant))
        # Orf ids of the query that are represented already by a PacbPORF
        known_query_orf_ids = [ pf.orfQ.id for pf in thepacbporfs ]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in known_query_orf_ids:
                continue

            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                for (aaseq,aapos) in _find_qp_or_pq_match_on_orfobj(exonQ,orfObj):
                    # make pacbporf object
                    pacbp_input = (
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos,
                        )
                    pacbporfobj = pacbp2pacbporf(
                        PacbP(input=pacbp_input),exonQ.orf,orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # ignore when sub- or superset of an accepted PacbPORF
                    is_included = False
                    for known_pacbporf in thepacbporfs:
                        if pacbporfobj.issubsetorsuperset(known_pacbporf):
                            is_included = True
                            break
                    if is_included:
                        continue

                    # check if a (perfect) intron can be projected
                    is_confirmed = False
                    for known_pacbporf in thepacbporfs:
                        if not (known_pacbporf.orfS.id == pacbporfobj.orfS.id):
                            continue
                        # the PacbPORF that starts first on the query
                        # delivers the donor, the other one the acceptor
                        known_sta = min(known_pacbporf.alignment_dna_range_query())
                        novel_sta = min(pacbporfobj.alignment_dna_range_query())
                        if known_sta > novel_sta:
                            (donorPF,acceptorPF) = (pacbporfobj,known_pacbporf)
                        else:
                            (donorPF,acceptorPF) = (known_pacbporf,pacbporfobj)
                        try:
                            introns = merge_pacbporfs_by_intron_in_query(
                                donorPF,acceptorPF,
                                max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event
                            # TODO: solve in merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) >= 1:
                            is_confirmed = True
                            break

                    # continue if not confirmed by intron projection
                    if not is_confirmed:
                        continue

                    # check if placeable in PCG/pacbporflist; rejected as
                    # soon as a single accepted PacbPORF is incompatible
                    compatibility = [
                        pf.is_postioned_compatibly(pacbporfobj)
                        for pf in thepacbporfs ]
                    rejected = False in compatibility

                    # label pacbporf as found by tinyexon QP
                    pacbporfobj._tinyexon_label = "QP"

                    # store to the tinyexon pacbporf data dictionary
                    key = (exonQ.proteinsequence(),exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        found,key,pacbporfobj,rejected,informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        found,min_value_list_size=min_discovery_count)

    # return the tinyexon pacbporf data dictionary
    return found
def _find_qq_tinyexons_as_pacbporfs(target, tinyexondata, PCG, min_discovery_count=2):
    """
    Find tiny exons of the target that fit between 2 consecutive PacbPORFs

    For each informant organism, the ordered PacbPORFs between target and
    informant are scanned for consecutive pairs that share the same sbjct Orf.
    A tiny exon of the target that is positioned roughly between such a pair,
    that has a match on that sbjct Orf and for which an intron can be projected
    on both its 5' and 3' side, is converted into a PacbPORF (label 'QQ').

    NOTE: an earlier definition of this function precedes this one in this
    module; this later definition is the one in effect after import.

    @type  target: organism identifier
    @param target: key in tinyexondata of the organism to find tiny exons for

    @type  tinyexondata: dict
    @param tinyexondata: organism identifier (key), tiny exon objects (value)

    @type  PCG: graph object
    @param PCG: object that delivers the PacbPORFs (get_pacbps_by_organisms)

    @type  min_discovery_count: integer
    @param min_discovery_count: passed as min_value_list_size to the cleanup
                                of keys with a too short value list

    @rtype:  dict
    @return: data as stored by _update_tinyexon_pacbporf_dict, with the tuple
             ( tiny exon protein sequence, tiny exon start ) as key
    """
    tinyexon_pacbporfs = {}
    for informant in tinyexondata.keys():
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # query Orf ids for which a PacbPORF is available already
        covered_orf_ids = [pf.orfQ.id for pf in thepacbporfs]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in covered_orf_ids:
                continue

            # evaluate each pair of neighbouring PacbPORFs
            for pos in range(1, len(thepacbporfs)):
                prevPF = thepacbporfs[pos - 1]
                nextPF = thepacbporfs[pos]
                # a shared sbjct Orf is required
                if not (prevPF.orfS.id == nextPF.orfS.id):
                    continue
                # check if PacbPORFs are positioned more or less okay
                if prevPF.distance_towards(nextPF) > 20:
                    continue
                # check if exonQ is positioned ~between these PacbPORFs
                aa_end = exonQ.orf.dnapos2aapos(exonQ.end)
                if aa_end < max(prevPF.alignment_protein_range_query()) - 12:
                    continue
                aa_start = exonQ.orf.dnapos2aapos(exonQ.start)
                if aa_start > min(nextPF.alignment_protein_range_query()) + 12:
                    continue
                # when a perfect intron bridges the gap already => continue
                if merge_pacbporfs_by_intron_in_query(
                        prevPF, nextPF, max_aa_offset=1):
                    continue

                # orfObj is the orfS of prevPF or nextPF (just take any)
                orfObj = prevPF.orfS
                # assign elegiable range of tinyexon match on SBJCT
                range_sta = max(prevPF.alignment_protein_range_sbjct()) - 12
                range_end = min(nextPF.alignment_protein_range_sbjct()) + 12
                aapos_sbjct_range = range(range_sta, range_end)

                for (aaseq, aapos) in _find_match_on_orfobj(exonQ, orfObj):
                    # match must be in the expected sbjct AA range
                    if aapos not in aapos_sbjct_range:
                        continue

                    # make pacbporf object
                    pacbpobj = PacbP(input=(
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos,
                    ))
                    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # remove included pacbporfs
                    is_redundant = False
                    for accepted in thepacbporfs:
                        if pacbporfobj.issubsetorsuperset(accepted):
                            is_redundant = True
                            break
                    if is_redundant:
                        continue

                    # check if 2 (perfect) introns can be projected
                    introns5p = merge_pacbporfs_by_intron_in_query(
                        prevPF, pacbporfobj,
                        max_aa_offset=1,
                        max_intron_nt_length=None)
                    introns3p = merge_pacbporfs_by_intron_in_query(
                        pacbporfobj, nextPF,
                        max_aa_offset=1,
                        max_intron_nt_length=None)
                    # intron projection must succeed on both sides
                    if not introns5p or not introns3p:
                        continue

                    # check if placeable in PCG/pacbporflist
                    distPrev = prevPF.distance_towards(pacbporfobj)
                    distNext = pacbporfobj.distance_towards(nextPF)
                    ovrlPrev = pacbporfobj.overlap(prevPF)
                    ovrlNext = pacbporfobj.overlap(nextPF)
                    placeable = (
                        (distPrev and distNext)
                        or (not distPrev and ovrlPrev)
                        or (not distNext and ovrlNext)
                        or (ovrlPrev and ovrlNext)
                    )
                    rejected = not placeable

                    # status message; single space separated line on STDOUT
                    print(" ".join([str(item) for item in (
                        "OKAY", exonQ.proteinsequence(), aaseq, rejected,
                        informant, (distPrev, distNext, ovrlPrev, ovrlNext))]))

                    # label pacbporf as found by tinyexon QQ
                    pacbporfobj._tinyexon_label = "QQ"

                    # store to the tinyexon pacbporf data dictionary
                    key = (exonQ.proteinsequence(), exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        tinyexon_pacbporfs, key, pacbporfobj, rejected,
                        informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        tinyexon_pacbporfs, min_value_list_size=min_discovery_count)

    # return the tinyexon pacbporf data dictionary
    return tinyexon_pacbporfs
def _find_qp_and_pq_tinyexons_as_pacbporfs(target, tinyexondata, PCG, min_discovery_count=2):
    """
    Find tiny query exons by QP / PQ matches on all informant Orfs

    For each informant, every tiny exon of the target is matched (through
    _find_qp_or_pq_match_on_orfobj) on each Orf of that informant. A match is
    converted into a PacbPORF and stored when an intron can be projected
    between it and an already accepted PacbPORF on the same sbjct Orf.

    @type  target: key of tinyexondata
    @param target: identifier of the target (query) organism

    @type  tinyexondata: dict-like
    @param tinyexondata: organism identifier -> iterable of tiny exon objects

    @type  PCG: object providing get_pacbps_by_organisms(), get_orfs_of_graph()
    @param PCG: graph that delivers the PacbPORFs and the informant Orfs

    @type  min_discovery_count: integer
    @param min_discovery_count: handed as min_value_list_size to
                                _remove_dict_elements_with_short_value_list

    @rtype:  dict
    @return: dict filled by _update_tinyexon_pacbporf_dict, with
             ( exon protein sequence, exon start ) as keys
    """
    found = {}
    for informant in tinyexondata.keys():
        if informant == target:
            continue
        accepted = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # ids of query Orfs that already carry a PacbPORF for this informant
        known_query_orf_ids = [pf.orfQ.id for pf in accepted]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in known_query_orf_ids:
                continue

            for sbjct_orf in PCG.get_orfs_of_graph(organism=informant):
                matches = _find_qp_or_pq_match_on_orfobj(exonQ, sbjct_orf)
                for (aaseq, aapos) in matches:
                    # make pacbporf object
                    candidate = pacbp2pacbporf(
                        PacbP(input=(exonQ.proteinsequence(), aaseq,
                                     exonQ.orf.dnapos2aapos(exonQ.start),
                                     aapos)),
                        exonQ.orf, sbjct_orf)
                    candidate.extend_pacbporf_after_stops()

                    # skip candidates that are included in (or include) an
                    # already accepted PacbPORF
                    if any(candidate.issubsetorsuperset(pf) for pf in accepted):
                        continue

                    # a (perfect) intron must be projectable towards one of
                    # the accepted PacbPORFs on the same sbjct Orf
                    confirmed = False
                    for pf in accepted:
                        if pf.orfS.id != candidate.orfS.id:
                            continue
                        # the PacbPORF that starts first on the query is
                        # offered as the 5' side of the intron
                        if (min(pf.alignment_dna_range_query()) >
                                min(candidate.alignment_dna_range_query())):
                            side5p, side3p = candidate, pf
                        else:
                            side5p, side3p = pf, candidate
                        try:
                            # no upper bound on intron length
                            # (140 nt was an alternative)
                            introns = merge_pacbporfs_by_intron_in_query(
                                side5p, side3p, max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event: TODO: solve in
                            # merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) >= 1:
                            confirmed = True
                            break
                    if not confirmed:
                        continue

                    # check if placeable in PCG/pacbporflist: rejected as
                    # soon as a single accepted PacbPORF is incompatible
                    rejected = False in [
                        pf.is_postioned_compatibly(candidate)
                        for pf in accepted]

                    # label pacbporf as found by tinyexon QP
                    candidate._tinyexon_label = "QP"

                    # store to the return dictionary
                    _update_tinyexon_pacbporf_dict(
                        found, (exonQ.proteinsequence(), exonQ.start),
                        candidate, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        found, min_value_list_size=min_discovery_count)

    return found
def _intron_position_key(intron):
    """ Return the ( donor position, acceptor position ) tuple of an intron """
    return (intron.donor.pos, intron.acceptor.pos)


def _intron_position_keys(intronlist):
    """ Return the position keys of all introns in intronlist, in list order """
    return [_intron_position_key(intron) for intron in intronlist]


def _optional_intron_position_key(intron):
    """ Return the position key of an intron, or None for a missing (falsy) one """
    if intron:
        return _intron_position_key(intron)
    return None


def _extend_with_unseen_introns(intronlist, candidates):
    """ Append to intronlist each candidate whose coords() is not yet in it """
    keys = [intron.coords() for intron in intronlist]
    for intron in candidates:
        coords = intron.coords()
        if coords not in keys:
            intronlist.append(intron)
            keys.append(coords)


def _gap_ratios_below_threshold(pacbporfD, pacbporfA):
    """ True when both PacbPORFs score below PACBPORF_HIGH_GAP_RATIO_THRESHOLD """
    return (pacbporfD.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD and
            pacbporfA.gap_ratio_score() < PACBPORF_HIGH_GAP_RATIO_THRESHOLD)


def merge_pacbporfs(pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj,
                    allow_query_projecting=True,
                    allow_sbjct_projecting=True,
                    allow_query_mapping=True,
                    allow_sbjct_mapping=True,
                    allow_projecting=True,
                    allow_mapping=True,
                    verbose=False):
    """
    Merge 2 PacbPORF objects with an interface into a gene structure

    Which merging strategies are tried depends on whether the query Orf
    and/or the sbjct Orf differs between pacbporfD and pacbporfA.

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  queryOrfSetObj: object
    @param queryOrfSetObj: handed to the tinyexon / equal-length-exon merging
                           functions that search the query

    @type  sbjctOrfSetObj: object
    @param sbjctOrfSetObj: handed to the tinyexon / equal-length-exon merging
                           functions that search the sbjct

    @type  allow_query_projecting: Boolean
    @param allow_query_projecting: not consulted by this function itself

    @type  allow_sbjct_projecting: Boolean
    @param allow_sbjct_projecting: not consulted by this function itself

    @type  allow_query_mapping: Boolean
    @param allow_query_mapping: gates the mapping attempts (see code)

    @type  allow_sbjct_mapping: Boolean
    @param allow_sbjct_mapping: gates the mapping attempts (see code)

    @type  allow_projecting: Boolean
    @param allow_projecting: False overrules both allow_xxx_projecting

    @type  allow_mapping: Boolean
    @param allow_mapping: False overrules both allow_xxx_mapping

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT
                    (currently nothing is printed by this function)

    @rtype:  dict
    @return: { 'query': [ intron, .. ], 'sbjct': [ intron, .. ] }, both lists
             passed through _filter_stopless_3n_introns
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit/create **kwargs dictionary for some forced attributes
    kwargs = {}
    _update_kwargs(kwargs, KWARGS_SPLICESITES)

    # deal with allow_xxx attributes
    if not allow_projecting:
        allow_query_projecting = False
        allow_sbjct_projecting = False
    if not allow_mapping:
        allow_query_mapping = False
        allow_sbjct_mapping = False

    # check if Orf objects of PacbPORFS are identical
    queryOrfsIdentical = pacbporfD.orfQ.id == pacbporfA.orfQ.id
    sbjctOrfsIdentical = pacbporfD.orfS.id == pacbporfA.orfS.id

    # return data structure of introns
    introns = {'query': [], 'sbjct': []}

    # Scan Orfs for splice sites.
    # This has probably been performed before, but when not done,
    # cached donor & acceptor sites lists seems to be empty -> no introns
    for (orfObj, splicetype) in ((pacbporfD.orfQ, "donor"),
                                 (pacbporfD.orfS, "donor"),
                                 (pacbporfA.orfQ, "acceptor"),
                                 (pacbporfA.orfS, "acceptor")):
        orfObj.scan_orf_for_pssm_splice_sites(
            splicetype=splicetype,
            min_pssm_score=kwargs['min_%s_pssm_score' % splicetype],
            allow_non_canonical=kwargs['allow_non_canonical_%s' % splicetype],
            non_canonical_min_pssm_score=kwargs[
                'non_canonical_min_%s_pssm_score' % splicetype])

    if not queryOrfsIdentical and not sbjctOrfsIdentical:
        ################################################################
        # query Orf AND sbjct Orf differ between pacbporfD and pacbporfA
        ################################################################
        introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns1)

        if _gap_ratios_below_threshold(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)
            introns3 = merge_pacbporfs_with_phase_shift_introns(
                pacbporfD, pacbporfA)
            introns4 = merge_pacbporfs_by_tinyexons(
                pacbporfD, pacbporfA, queryOrfSetObj, sbjctOrfSetObj)
            introns5 = merge_pacbporfs_by_query_tinyexon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns6 = merge_pacbporfs_by_sbjct_tinyexon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns7 = merge_pacbporfs_by_sbjct_equal_length_exon_and_query_intron(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns8 = merge_pacbporfs_by_query_equal_length_exon_and_sbjct_intron(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []
            introns4 = []
            introns5 = []
            introns6 = []
            introns7 = []
            introns8 = []

        introns9 = merge_pacbporfs_with_conserved_acceptor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns9 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns9)

        introns10 = merge_pacbporfs_with_conserved_donor_introns(
            pacbporfD, pacbporfA)
        # filter for **best** candidates based on PSSM/entropy combination
        introns10 = _filter_aligned_introns_on_pssm_entropy_combination(
            introns10)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend(Set([intrQ for (intrQ, intrS) in introns1]))
        introns['sbjct'].extend(Set([intrS for (intrQ, intrS) in introns1]))

        # In all steps below, a solution is only stored when its introns are
        # NOT encountered already. The keys are refreshed before each step,
        # not while a step is running.

        # introns2: closeby independant introns
        keysQ = _intron_position_keys(introns['query'])
        keysS = _intron_position_keys(introns['sbjct'])
        for (intrQ, intrS, cigpacbp) in introns2:
            k1 = _intron_position_key(intrQ)
            k2 = _intron_position_key(intrS)
            if k1 not in keysQ and k2 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)

        # introns3: phase shift introns
        keysQ = _intron_position_keys(introns['query'])
        keysS = _intron_position_keys(introns['sbjct'])
        for (intrQ, intrS) in introns3:
            k1 = _intron_position_key(intrQ)
            k2 = _intron_position_key(intrS)
            if k1 not in keysQ and k2 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)

        # introns4: tinyexons in both query and sbjct
        keysQ = _intron_position_keys(introns['query'])
        keysS = _intron_position_keys(introns['sbjct'])
        for (intrQ, intrS, pacbporf, intrQ2, intrS2) in introns4:
            k1 = _intron_position_key(intrQ)
            k2 = _intron_position_key(intrS)
            k3 = _intron_position_key(intrQ2)
            k4 = _intron_position_key(intrS2)
            if (k1 not in keysQ and k2 not in keysS and
                    k3 not in keysQ and k4 not in keysS):
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)
                introns['query'].append(intrQ2)
                introns['sbjct'].append(intrS2)

        # introns5: tinyexon in query & intron in sbjct
        # introns6: tinyexon in sbjct & intron in query
        # In these solutions an intron can be missing; a missing intron
        # yields key None (never encountered) and is not stored itself.
        for solutions in (introns5, introns6):
            keysQ = _intron_position_keys(introns['query'])
            keysS = _intron_position_keys(introns['sbjct'])
            for (intrQ, intrS, pacbporf, intrQ2, intrS2) in solutions:
                k1 = _optional_intron_position_key(intrQ)
                k2 = _optional_intron_position_key(intrS)
                k3 = _optional_intron_position_key(intrQ2)
                k4 = _optional_intron_position_key(intrS2)
                if (k1 not in keysQ and k2 not in keysS and
                        k3 not in keysQ and k4 not in keysS):
                    for (key, intron) in (('query', intrQ), ('sbjct', intrS),
                                          ('query', intrQ2), ('sbjct', intrS2)):
                        if intron is not None:
                            introns[key].append(intron)

        # introns7: equal length exon in sbjct & intron in query
        keysQ = _intron_position_keys(introns['query'])
        keysS = _intron_position_keys(introns['sbjct'])
        for (intrS, pacbporf1, intrQ, pacbporf2, intrS2) in introns7:
            k1 = _intron_position_key(intrQ)
            k2 = _intron_position_key(intrS)
            k3 = _intron_position_key(intrS2)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysS:
                introns['query'].append(intrQ)
                introns['sbjct'].append(intrS)
                introns['sbjct'].append(intrS2)

        # introns8: equal length exon in query & intron in sbjct
        keysQ = _intron_position_keys(introns['query'])
        keysS = _intron_position_keys(introns['sbjct'])
        for (intrQ, pacbporf1, intrS, pacbporf2, intrQ2) in introns8:
            k1 = _intron_position_key(intrQ)
            k2 = _intron_position_key(intrS)
            k3 = _intron_position_key(intrQ2)
            if k1 not in keysQ and k2 not in keysS and k3 not in keysQ:
                introns['query'].append(intrQ)
                introns['query'].append(intrQ2)
                introns['sbjct'].append(intrS)

        # introns9: conserved acceptor; introns10: conserved donor
        # do NOT require that both introns of a pair are new;
        # allow addition of each of these independently
        for solutions in (introns9, introns10):
            keysQ = _intron_position_keys(introns['query'])
            keysS = _intron_position_keys(introns['sbjct'])
            for (intrQ, intrS) in solutions:
                k1 = _intron_position_key(intrQ)
                k2 = _intron_position_key(intrS)
                if k1 not in keysQ:
                    introns['query'].append(intrQ)
                if k2 not in keysS:
                    introns['sbjct'].append(intrS)

        # finally, do the bridging thingy
        introns0 = merge_pacbporfs_with_query_intron_bridgeing(
            pacbporfD, pacbporfA)
        # NOTE(review): coords() is compared to ( donor.pos, acceptor.pos )
        # keys here; presumably these are the same tuples -- confirm
        keysQ = _intron_position_keys(introns['query'])
        for intrQ in introns0:
            if intrQ.coords() not in keysQ:
                introns['query'].append(intrQ)

    elif not queryOrfsIdentical:
        ################################################################
        # only the query Orf differs: project intron(s) in the query
        ################################################################
        seqerror = merge_pacbporf_with_sequenceerror_in_query(
            pacbporfD, pacbporfA)
        introns1 = merge_pacbporfs_by_intron_in_query(pacbporfD, pacbporfA)

        if _gap_ratios_below_threshold(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_query(
                pacbporfD, pacbporfA, queryOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store sequencerror if it exists
        if seqerror:
            introns['query'].append(seqerror)

        # store introns obtained by most simplest case projecting/mapping
        introns['query'].extend([prj.projected_introns[0] for prj in introns1])

        # only store introns2 that are NOT encountered already
        keys = _intron_position_keys(introns['query'])
        for (intr1, intr2, exon) in introns2:
            k1 = _intron_position_key(intr1)
            k2 = _intron_position_key(intr2)
            if k1 not in keys and k2 not in keys:
                introns['query'].append(intr1)
                introns['query'].append(intr2)

        # only store introns3 that are NOT encountered already
        keys = _intron_position_keys(introns['query'])
        for (intr1, intr2, intr3, exon1, exon2) in introns3:
            k1 = _intron_position_key(intr1)
            k2 = _intron_position_key(intr2)
            k3 = _intron_position_key(intr3)
            if k1 not in keys and k2 not in keys and k3 not in keys:
                introns['query'].append(intr1)
                introns['query'].append(intr2)
                introns['query'].append(intr3)

        if not introns['query'] and allow_sbjct_mapping and allow_query_mapping:
            # projecting failed; try mapping.
            # just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)

            # potential stopless 3n intron in SBJCT
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)

            if _gap_ratios_below_threshold(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                    introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # only store introns that are NOT encountered already;
            # query introns go to 'query', sbjct introns go to 'sbjct'
            candidatesQ = [intrQ for (intrQ, intrS) in introns1]
            candidatesQ.extend([intrQ for (intrQ, intrS, cigpacbp) in introns2])
            candidatesQ.extend([intrQ for (intrQ, intrS) in introns3])
            candidatesQ.extend(introns0)
            _extend_with_unseen_introns(introns['query'], candidatesQ)

            candidatesS = [intrS for (intrQ, intrS) in introns1]
            candidatesS.extend([intrS for (intrQ, intrS, cigpacbp) in introns2])
            candidatesS.extend([intrS for (intrQ, intrS) in introns3])
            _extend_with_unseen_introns(introns['sbjct'], candidatesS)

        elif not introns['query']:
            # mapping not allowed; just bridge Orfs by **best** intron(s).
            introns0 = merge_pacbporfs_with_query_intron_bridgeing(
                pacbporfD, pacbporfA)
            _extend_with_unseen_introns(introns['query'], introns0)

        # else: projecting introns yielded results; do not try mapping

    elif not sbjctOrfsIdentical:
        ################################################################
        # only the sbjct Orf differs: project intron(s) in the sbjct
        ################################################################
        introns1 = merge_pacbporfs_by_intron_in_sbjct(pacbporfD, pacbporfA)

        if _gap_ratios_below_threshold(pacbporfD, pacbporfA):
            introns2 = merge_pacbporfs_by_intron_tinyexon_intron_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
            introns3 = merge_pacbporfs_by_two_tinyexons_in_sbjct(
                pacbporfD, pacbporfA, sbjctOrfSetObj)
        else:
            # do not allow more complex intron merging
            introns2 = []
            introns3 = []

        # store introns obtained by most simplest case projecting/mapping
        introns['sbjct'].extend([prj.projected_introns[0] for prj in introns1])

        # only store introns2 that are NOT encountered already
        keys = _intron_position_keys(introns['sbjct'])
        for (intr1, intr2, exon) in introns2:
            k1 = _intron_position_key(intr1)
            k2 = _intron_position_key(intr2)
            if k1 not in keys and k2 not in keys:
                introns['sbjct'].append(intr1)
                introns['sbjct'].append(intr2)

        # only store introns3 that are NOT encountered already
        keys = _intron_position_keys(introns['sbjct'])
        for (intr1, intr2, intr3, exon1, exon2) in introns3:
            k1 = _intron_position_key(intr1)
            k2 = _intron_position_key(intr2)
            k3 = _intron_position_key(intr3)
            if k1 not in keys and k2 not in keys and k3 not in keys:
                introns['sbjct'].append(intr1)
                introns['sbjct'].append(intr2)
                introns['sbjct'].append(intr3)

        if not introns['sbjct'] and allow_sbjct_mapping and allow_query_mapping:
            # projecting failed; try mapping.
            # potential stopless 3n intron in QUERY
            introns1 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns1 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns1)
            # apply stopless3n intron filtering
            introns1 = _filter_aligned_stopless_3n_introns(introns1)

            introns2 = merge_pacbporfs_with_closeby_independant_introns(
                pacbporfD, pacbporfA)

            if _gap_ratios_below_threshold(pacbporfD, pacbporfA):
                introns3 = merge_pacbporfs_with_phase_shift_introns(
                    pacbporfD, pacbporfA)
                # filter for **best** candidates based on PSSM/entropy combination
                introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                    introns3)
                # apply stopless3n intron filtering
                introns3 = _filter_aligned_stopless_3n_introns(introns3)
            else:
                # do not allow more complex intron merging
                introns3 = []

            # store introns
            introns['query'].extend(Set([intrQ for (intrQ, intrS) in introns1]))
            introns['sbjct'].extend(Set([intrS for (intrQ, intrS) in introns1]))
            introns['query'].extend(
                [intrQ for (intrQ, intrS, cigpacbp) in introns2])
            introns['query'].extend([intrQ for (intrQ, intrS) in introns3])
            introns['sbjct'].extend(
                [intrS for (intrQ, intrS, cigpacbp) in introns2])
            introns['sbjct'].extend([intrS for (intrQ, intrS) in introns3])

        # else: projecting introns yielded results; do not try mapping

    else:
        ################################################################
        # query Orf AND sbjct Orf are identical: inframe introns only
        ################################################################
        if allow_query_mapping:
            introns1 = merge_pacbporfs_by_inframe_intron_in_query(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns1 = []

        if allow_sbjct_mapping:
            introns2 = merge_pacbporfs_by_inframe_intron_in_sbjct(
                pacbporfD, pacbporfA)
        else:
            # no mapping (unigene or continious alignment provided)
            introns2 = []

        if allow_sbjct_mapping and allow_query_mapping:
            introns3 = merge_pacbporfs_with_introns(pacbporfD, pacbporfA)
            # filter for **best** candidates based on PSSM/entropy combination
            introns3 = _filter_aligned_introns_on_pssm_entropy_combination(
                introns3)
            # apply stopless3n intron filtering
            introns3 = _filter_aligned_stopless_3n_introns(introns3)
        else:
            # no mapping (unigene or continious alignment provided)
            introns3 = []

        introns['query'].extend([prj.projected_introns[0] for prj in introns1])
        introns['sbjct'].extend([prj.projected_introns[0] for prj in introns2])
        introns['query'].extend([intrQ for (intrQ, intrS) in introns3])
        introns['sbjct'].extend([intrS for (intrQ, intrS) in introns3])

    # Filter for stopless3n introns
    introns['query'] = _filter_stopless_3n_introns(introns['query'])
    introns['sbjct'] = _filter_stopless_3n_introns(introns['sbjct'])

    # return dict with lists of introns
    return introns