def PCG2similarityarray(PCG, organism, aalength, omit_unigenes=True):
    """
    Sum the similarity arrays of `organism` versus every other organism.

    Arguments:
      PCG            graph object; must offer organism_set() and
                     get_pacbps_by_organisms(organismA, organismB)
      organism       identifier of the organism that is used as "query"
      aalength       length handed to zeros() and to
                     pacbporflist2similarityarray()
      omit_unigenes  when True, skip an informant whose first ordered
                     PacbPORF has an orfS that carries the attribute named
                     by ORF_IS_UNIGENE_LABEL

    Returns the array made by zeros(aalength) after in-place addition of
    pacbporflist2similarityarray(pacbporfs, "query", aalength) for each
    informant that was not skipped.
    """
    summed_similarity = zeros(aalength)
    for informant in PCG.organism_set():
        if organism == informant:
            # never compare the organism with itself
            continue
        ordered_pacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(organism, informant))
        if ordered_pacbporfs and omit_unigenes:
            # only the first PacbPORF is inspected for the unigene label
            if hasattr(ordered_pacbporfs[0].orfS, ORF_IS_UNIGENE_LABEL):
                continue
        # an informant without any PacbPORFs is still handed on (empty list)
        summed_similarity += pacbporflist2similarityarray(
            ordered_pacbporfs, "query", aalength)
    return summed_similarity
def PCG2codingarray(PCG, organism, aalength, omit_unigenes=True):
    """
    Sum the coding (presence) arrays of `organism` versus all other organisms.

    Arguments:
      PCG            graph object; must offer organism_set() and
                     get_pacbps_by_organisms(organismA, organismB)
      organism       identifier of the organism that is used as "query"
      aalength       length handed to zeros() and to
                     pacbporflist2codingarray()
      omit_unigenes  when True, skip an informant whose first ordered
                     PacbPORF has an orfS that carries the attribute named
                     by ORF_IS_UNIGENE_LABEL

    Returns the array made by zeros(aalength) after in-place addition of
    pacbporflist2codingarray(pacbporfs, "query", aalength) for each
    informant that was not skipped.
    """
    summed_presence = zeros(aalength)
    for informant in PCG.organism_set():
        if organism == informant:
            # never compare the organism with itself
            continue
        ordered_pacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(organism, informant))
        if ordered_pacbporfs and omit_unigenes:
            # only the first PacbPORF is inspected for the unigene label
            if hasattr(ordered_pacbporfs[0].orfS, ORF_IS_UNIGENE_LABEL):
                continue
        # an informant without any PacbPORFs is still handed on (empty list)
        summed_presence += pacbporflist2codingarray(
            ordered_pacbporfs, "query", aalength)
    return summed_presence
def _convert_tinyexon_proteinmatches_to_pacbporfs(target, protmatches,
        tinyexondata, PCG, min_discovery_count=2):
    """
    Convert matching tiny exons of target and informants into 'PP' PacbPORFs.

    Arguments:
      target               key of the target organism in `tinyexondata`
      protmatches          mapping; only tiny exons whose proteinsequence()
                           is a key of this mapping are considered
      tinyexondata         mapping organism -> iterable of tiny exon objects
      PCG                  graph object offering get_pacbps_by_organisms()
      min_discovery_count  passed as min_value_list_size to
                           _remove_dict_elements_with_short_value_list()

    Returns a dict keyed by (query protein sequence, query exon start). It is
    filled through _update_tinyexon_pacbporf_dict() and pruned afterwards by
    _remove_dict_elements_with_short_value_list().

    Raises KeyError when `tinyexondata` holds informants but no `target` key.
    """
    target_tinyexon_pacbporf_data = {}

    # fish the protein matches from the tinyexons and convert to PacbPORFs
    for informant in tinyexondata:
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # query Orf ids that are already aligned to this informant; this is
        # invariant for all query exons, so build it only once
        aligned_query_orf_ids = [pf.orfQ.id for pf in thepacbporfs]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in aligned_query_orf_ids:
                continue
            query_protseq = exonQ.proteinsequence()
            if query_protseq not in protmatches:
                continue

            for exonS in tinyexondata[informant]:
                # presumably the informant exons are ordered on increasing
                # length, which makes it safe to stop here -- TODO confirm
                if exonS.length > exonQ.length:
                    break
                if exonS.proteinsequence() not in protmatches:
                    continue
                # omit non-identical exons
                if not _are_tinyexons_similar(exonQ, exonS):
                    continue

                # if here: similar exons. make PacbPORF
                pacbporf = exononorfs2pacbporf(
                    exonQ, exonS, matrix=TINYEXON_MATRIX)
                if not pacbporf:
                    continue

                # not placeable as soon as a single accepted PacbPORF
                # reports an incompatible position
                rejected = False in [
                    pf.is_postioned_compatibly(pacbporf)
                    for pf in thepacbporfs]

                # label pacbporf as found by tinyexon PP
                pacbporf._tinyexon_label = "PP"

                # store to target_tinyexon_pacbporf_data
                key = (query_protseq, exonQ.start)
                _update_tinyexon_pacbporf_dict(
                    target_tinyexon_pacbporf_data,
                    key, pacbporf, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data,
        min_value_list_size=min_discovery_count)

    return target_tinyexon_pacbporf_data
def _has_pp_tinyexonpacbporf_perfect_introns(tinyexonPF,target,informant,PCG): """ """ # check if a (perfect) introns can be mapped is_confirmed_with_introns = False if tinyexonPF._tinyexon_label != 'PP': return False # get ordered PacbPORFS for this informant thepacbporfs = order_pacbporf_list(PCG.get_pacbps_by_organisms(target,informant)) for pos in range(1,len(thepacbporfs)): prevPF,nextPF = thepacbporfs[pos-1],thepacbporfs[pos] if prevPF.distance_towards(tinyexonPF) > 0 and\ tinyexonPF.distance_towards(nextPF) > 0: intronsPREV = merge_pacbporfs_with_introns( prevPF,tinyexonPF,max_aa_offset=0, max_intron_nt_length=None) intronsNEXT = merge_pacbporfs_with_introns( tinyexonPF,nextPF,max_aa_offset=0, max_intron_nt_length=None) if len(intronsPREV) >= 1 and len(intronsNEXT) >= 1: perfect_prev_intron = False perfect_next_intron = False for intronQ,intronS in intronsPREV: intronQ.assign_bp_and_ppts() intronS.assign_bp_and_ppts() if intronQ.branchpoint and intronS.branchpoint: perfect_prev_intron = True break for intronQ,intronS in intronsNEXT: intronQ.assign_bp_and_ppts() intronS.assign_bp_and_ppts() if intronQ.branchpoint and intronS.branchpoint: perfect_next_intron = True break # check if both intron options have a perfect candidate if perfect_prev_intron and perfect_next_intron: is_confirmed_with_introns = True # break out break # return is_confirmed_with_introns status return is_confirmed_with_introns
def _has_pp_tinyexonpacbporf_perfect_introns(tinyexonPF, target, informant, PCG): """ """ # check if a (perfect) introns can be mapped is_confirmed_with_introns = False if tinyexonPF._tinyexon_label != 'PP': return False # get ordered PacbPORFS for this informant thepacbporfs = order_pacbporf_list( PCG.get_pacbps_by_organisms(target, informant)) for pos in range(1, len(thepacbporfs)): prevPF, nextPF = thepacbporfs[pos - 1], thepacbporfs[pos] if prevPF.distance_towards(tinyexonPF) > 0 and\ tinyexonPF.distance_towards(nextPF) > 0: intronsPREV = merge_pacbporfs_with_introns( prevPF, tinyexonPF, max_aa_offset=0, max_intron_nt_length=None) intronsNEXT = merge_pacbporfs_with_introns( tinyexonPF, nextPF, max_aa_offset=0, max_intron_nt_length=None) if len(intronsPREV) >= 1 and len(intronsNEXT) >= 1: perfect_prev_intron = False perfect_next_intron = False for intronQ, intronS in intronsPREV: intronQ.assign_bp_and_ppts() intronS.assign_bp_and_ppts() if intronQ.branchpoint and intronS.branchpoint: perfect_prev_intron = True break for intronQ, intronS in intronsNEXT: intronQ.assign_bp_and_ppts() intronS.assign_bp_and_ppts() if intronQ.branchpoint and intronS.branchpoint: perfect_next_intron = True break # check if both intron options have a perfect candidate if perfect_prev_intron and perfect_next_intron: is_confirmed_with_introns = True # break out break # return is_confirmed_with_introns status return is_confirmed_with_introns
def _convert_tinyexon_proteinmatches_to_pacbporfs(target, protmatches,
        tinyexondata, PCG, min_discovery_count=2):
    """
    Convert matching tiny exons of target and informants into 'PP' PacbPORFs.

    Arguments:
      target               key of the target organism in `tinyexondata`
      protmatches          mapping; only tiny exons whose proteinsequence()
                           is a key of this mapping are considered
      tinyexondata         mapping organism -> iterable of tiny exon objects
      PCG                  graph object offering get_pacbps_by_organisms()
      min_discovery_count  passed as min_value_list_size to
                           _remove_dict_elements_with_short_value_list()

    Returns a dict keyed by (query protein sequence, query exon start). It is
    filled through _update_tinyexon_pacbporf_dict() and pruned afterwards by
    _remove_dict_elements_with_short_value_list().

    Raises KeyError when `tinyexondata` holds informants but no `target` key.
    """
    target_tinyexon_pacbporf_data = {}

    # fish the protein matches from the tinyexons and convert to PacbPORFs
    for informant in tinyexondata:
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # query Orf ids that are already aligned to this informant; this is
        # invariant for all query exons, so build it only once
        aligned_query_orf_ids = [pf.orfQ.id for pf in thepacbporfs]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in aligned_query_orf_ids:
                continue
            query_protseq = exonQ.proteinsequence()
            if query_protseq not in protmatches:
                continue

            for exonS in tinyexondata[informant]:
                # presumably the informant exons are ordered on increasing
                # length, which makes it safe to stop here -- TODO confirm
                if exonS.length > exonQ.length:
                    break
                if exonS.proteinsequence() not in protmatches:
                    continue
                # omit non-identical exons
                if not _are_tinyexons_similar(exonQ, exonS):
                    continue

                # if here: similar exons. make PacbPORF
                pacbporf = exononorfs2pacbporf(
                    exonQ, exonS, matrix=TINYEXON_MATRIX)
                if not pacbporf:
                    continue

                # not placeable as soon as a single accepted PacbPORF
                # reports an incompatible position
                rejected = False in [
                    pf.is_postioned_compatibly(pacbporf)
                    for pf in thepacbporfs]

                # label pacbporf as found by tinyexon PP
                pacbporf._tinyexon_label = "PP"

                # store to target_tinyexon_pacbporf_data
                key = (query_protseq, exonQ.start)
                _update_tinyexon_pacbporf_dict(
                    target_tinyexon_pacbporf_data,
                    key, pacbporf, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data,
        min_value_list_size=min_discovery_count)

    return target_tinyexon_pacbporf_data
def update_PCG_with_signalpexons(signalpexonseqs, PCG, OPTIONS,
        min_pacbporf_identityscore=0.20, verbose=True):
    """
    Add ClustalW-aligned SignalP exons as PacbPORFs to the PCG.

    Arguments:
      signalpexonseqs             mapping organism -> list of SignalP exons
      PCG                         graph object; must offer organism_set()
                                  and get_pacbps_by_organisms()
      OPTIONS                     object with a `target` attribute
      min_pacbporf_identityscore  alignments with a lower identityscore
                                  are ignored
      verbose                     print the alignments and the stored
                                  PacbPORFs to stdout

    For each SignalP exon of the target and each informant still present in
    the PCG, every informant SignalP exon that is placeable in front of the
    ordered PacbPORFs is aligned with clustalw(). Of the resulting
    PacbPORFs, the first one after ordering on 'bits' (reversed=True) is
    stored with pacbporf2PCG(..., source='SignalP-ClustalW').

    Returns True when at least one PacbPORF was stored, otherwise False
    (also when the target has no entry in `signalpexonseqs`).
    """
    target = OPTIONS.target
    if target not in signalpexonseqs:
        return False

    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[target]:
        for informant, infSPlist in signalpexonseqs.items():
            if informant == target:
                continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set():
                continue

            # get ordered pacbporfs from the PCG; this must be redone in
            # each iteration because pacbporf2PCG() below extends the PCG
            thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target, informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue

            # list to store signalp exon PacbPORFs into
            signalpexon_pacbp_list = []

            for informantSPexon in infSPlist:
                coords = [
                    targetSPexon.protein_start(),
                    targetSPexon.protein_end(),
                    informantSPexon.protein_start(),
                    informantSPexon.protein_end(),
                ]

                # prior to making ClustalW-PacbP, check PacbPCOORD
                # placeability into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                    targetSPexon.proteinsequence(),
                    informantSPexon.proteinsequence(),
                    targetSPexon.protein_start(),
                    informantSPexon.protein_start(),
                ))
                if False in [pacbpCoordsObj.is_positioned_compatibly(pacbporf)
                             for pacbporf in thepacbporfs]:
                    # *NOT* placeable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3:
                    # WAY TOO FAR in front of the current gene structure
                    # parts; treat as not placeable
                    continue
                if dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS
                    continue

                # perform ClustalW alignment on the SP exons
                (alignedseqs, alignment) = clustalw(seqs={
                    target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence()})

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                    alignment=(alignedseqs[target],
                               alignment,
                               alignedseqs[informant]),
                    coords=coords)

                # is there any alignment constructed?
                if not pacbp:
                    continue
                # ignore (very) poor identityscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore:
                    continue

                # if here make extended pacbpORF and store it
                signalpexonPacbpORF = pacbp2pacbporf(
                    pacbp, targetSPexon.orf, informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                signalpexon_pacbp_list.append(signalpexonPacbpORF)

                if verbose:
                    # single-argument print() calls give identical output
                    # under the print statement and the print function
                    print("%s %s" % (alignedseqs[target], target))
                    print(alignment)
                    print("%s %s" % (alignedseqs[informant], informant))
                    print("%s %s %s DISTANCE:: %s" % (
                        pacbp,
                        (target, targetSPexon.orf.id),
                        (informant, informantSPexon.orf.id),
                        dist))
                    pacbp.print_protein()
                    print("")

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                    signalpexon_pacbp_list, order_by='bits', reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf, target, informant, PCG,
                             source='SignalP-ClustalW')
                is_any_pacbporf_added = True
                if verbose:
                    print("SignalP Exon added to PCG: %s %s" % (
                        signalp_pacbporf, informant))

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
def _find_qq_tinyexons_as_pacbporfs(target, tinyexondata, PCG,
        min_discovery_count=2, verbose=True):
    """
    Find target tiny exons that fit between two PacbPORFs on one sbjct Orf.

    Arguments:
      target               key of the target organism in `tinyexondata`
      tinyexondata         mapping organism -> iterable of tiny exon objects
      PCG                  graph object offering get_pacbps_by_organisms()
      min_discovery_count  passed as min_value_list_size to
                           _remove_dict_elements_with_short_value_list()
      verbose              print an "OKAY ..." line to stdout for each
                           PacbPORF that is stored (default True, which is
                           the historical behaviour)

    For each pair of consecutive ordered PacbPORFs that share their sbjct
    Orf, matches of the tiny exon on that Orf are turned into PacbPORFs
    labelled 'QQ' when introns can be projected on both sides.

    Returns a dict keyed by (query protein sequence, query exon start). It is
    filled through _update_tinyexon_pacbporf_dict() and pruned afterwards by
    _remove_dict_elements_with_short_value_list().

    Raises KeyError when `tinyexondata` holds informants but no `target` key.
    """
    target_tinyexon_pacbporf_data = {}

    for informant in tinyexondata:
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # query Orf ids that are already aligned to this informant; this is
        # invariant for all query exons, so build it only once
        aligned_query_orf_ids = [pf.orfQ.id for pf in thepacbporfs]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in aligned_query_orf_ids:
                continue

            # walk over all pairs of neighbouring PacbPORFs
            for prevPF, nextPF in zip(thepacbporfs, thepacbporfs[1:]):
                # both PacbPORFs must be on the same sbjct Orf
                if prevPF.orfS.id != nextPF.orfS.id:
                    continue
                # check if PacbPORFs are positioned more or less okay
                if prevPF.distance_towards(nextPF) > 20:
                    continue
                # check if exonQ is positioned ~between these PacbPORFs
                if exonQ.orf.dnapos2aapos(exonQ.end) <\
                        max(prevPF.alignment_protein_range_query()) - 12:
                    continue
                if exonQ.orf.dnapos2aapos(exonQ.start) >\
                        min(nextPF.alignment_protein_range_query()) + 12:
                    continue

                # check if gap can be projected already by a perfect intron
                introns = merge_pacbporfs_by_intron_in_query(
                    prevPF, nextPF, max_aa_offset=1)
                if introns:
                    continue

                # orfObj is the orfS of prevPF or nextPF (just take any)
                orfObj = prevPF.orfS

                # assign eligible range of tinyexon match on SBJCT
                aapos_sbjct_range = range(
                    max(prevPF.alignment_protein_range_sbjct()) - 12,
                    min(nextPF.alignment_protein_range_sbjct()) + 12)

                for (aaseq, aapos) in _find_match_on_orfobj(exonQ, orfObj):
                    # check if the match is obtained in the expected
                    # sbjct AA range; if not, ignore the match
                    if aapos not in aapos_sbjct_range:
                        continue

                    # make pacbporf object
                    pacbpobj = PacbP(input=(
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos))
                    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # ignore PacbPORFs that are included in / include an
                    # already accepted PacbPORF
                    if any(pacbporfobj.issubsetorsuperset(accepted_pacbporf)
                           for accepted_pacbporf in thepacbporfs):
                        continue

                    # check if 2 (perfect) introns can be projected
                    introns5p = merge_pacbporfs_by_intron_in_query(
                        prevPF, pacbporfobj,
                        max_aa_offset=1, max_intron_nt_length=None)
                    introns3p = merge_pacbporfs_by_intron_in_query(
                        pacbporfobj, nextPF,
                        max_aa_offset=1, max_intron_nt_length=None)
                    if not introns5p or not introns3p:
                        continue

                    # check if placeable in PCG/pacbporflist
                    distPrev = prevPF.distance_towards(pacbporfobj)
                    distNext = pacbporfobj.distance_towards(nextPF)
                    ovrlPrev = pacbporfobj.overlap(prevPF)
                    ovrlNext = pacbporfobj.overlap(nextPF)
                    if distPrev and distNext:
                        rejected = False
                    elif not distPrev and ovrlPrev:
                        rejected = False
                    elif not distNext and ovrlNext:
                        rejected = False
                    elif ovrlPrev and ovrlNext:
                        rejected = False
                    else:
                        rejected = True

                    if verbose:
                        # single-argument print() gives identical output
                        # under the print statement and the print function
                        print("OKAY %s %s %s %s %s" % (
                            exonQ.proteinsequence(), aaseq, rejected,
                            informant,
                            (distPrev, distNext, ovrlPrev, ovrlNext)))

                    # label pacbporf as found by tinyexon QQ
                    pacbporfobj._tinyexon_label = "QQ"

                    # store to target_tinyexon_pacbporf_data
                    key = (exonQ.proteinsequence(), exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        target_tinyexon_pacbporf_data,
                        key, pacbporfobj, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data,
        min_value_list_size=min_discovery_count)

    return target_tinyexon_pacbporf_data
def _find_qp_and_pq_tinyexons_as_pacbporfs(target, tinyexondata, PCG,
        min_discovery_count=2):
    """
    Find target tiny exons that match on any informant Orf of the graph.

    Arguments:
      target               key of the target organism in `tinyexondata`
      tinyexondata         mapping organism -> iterable of tiny exon objects
      PCG                  graph object offering get_pacbps_by_organisms()
                           and get_orfs_of_graph()
      min_discovery_count  passed as min_value_list_size to
                           _remove_dict_elements_with_short_value_list()

    Each match reported by _find_qp_or_pq_match_on_orfobj() is turned into
    a PacbPORF labelled 'QP'. It is kept only when an intron can be
    projected towards an accepted PacbPORF on the same sbjct Orf.

    Returns a dict keyed by (query protein sequence, query exon start). It is
    filled through _update_tinyexon_pacbporf_dict() and pruned afterwards by
    _remove_dict_elements_with_short_value_list().

    Raises KeyError when `tinyexondata` holds informants but no `target` key.
    """
    def _project_introns(first_pacbporf, second_pacbporf):
        """ Project introns between 2 PacbPORFs; [] when the merger fails """
        try:
            return merge_pacbporfs_by_intron_in_query(
                first_pacbporf, second_pacbporf,
                max_aa_offset=0, max_intron_nt_length=None)
        except IndexError:
            # unexpected event
            # TODO: solve in merge_pacbporfs_by_intron_in_query
            return []

    target_tinyexon_pacbporf_data = {}

    for informant in tinyexondata:
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # query Orf ids that are already aligned to this informant; this is
        # invariant for all query exons, so build it only once
        aligned_query_orf_ids = [pf.orfQ.id for pf in thepacbporfs]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in aligned_query_orf_ids:
                continue

            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                tinyexonmatches = _find_qp_or_pq_match_on_orfobj(exonQ, orfObj)
                for (aaseq, aapos) in tinyexonmatches:
                    # make pacbporf object
                    pacbpobj = PacbP(input=(
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos))
                    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # ignore PacbPORFs that are included in / include an
                    # already accepted PacbPORF
                    if any(pacbporfobj.issubsetorsuperset(accepted_pacbporf)
                           for accepted_pacbporf in thepacbporfs):
                        continue

                    # check if a (perfect) intron can be projected
                    is_confirmed_by_intron_projection = False
                    for accepted_pacbporf in thepacbporfs:
                        # only PacbPORFs on the same sbjct Orf are tried
                        if accepted_pacbporf.orfS.id != pacbporfobj.orfS.id:
                            continue
                        # the PacbPORF that starts first on the query DNA
                        # must be offered as the first argument
                        if min(accepted_pacbporf.alignment_dna_range_query()) >\
                                min(pacbporfobj.alignment_dna_range_query()):
                            introns = _project_introns(
                                pacbporfobj, accepted_pacbporf)
                        else:
                            introns = _project_introns(
                                accepted_pacbporf, pacbporfobj)
                        if len(introns) >= 1:
                            is_confirmed_by_intron_projection = True
                            break

                    if not is_confirmed_by_intron_projection:
                        continue

                    # not placeable as soon as a single accepted PacbPORF
                    # reports an incompatible position
                    rejected = False in [
                        pf.is_postioned_compatibly(pacbporfobj)
                        for pf in thepacbporfs]

                    # label pacbporf as found by tinyexon QP
                    pacbporfobj._tinyexon_label = "QP"

                    # store to target_tinyexon_pacbporf_data
                    key = (exonQ.proteinsequence(), exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        target_tinyexon_pacbporf_data,
                        key, pacbporfobj, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data,
        min_value_list_size=min_discovery_count)

    return target_tinyexon_pacbporf_data
def _find_qq_tinyexons_as_pacbporfs(target, tinyexondata, PCG,
        min_discovery_count=2, verbose=True):
    """
    Find target tiny exons that fit between two PacbPORFs on one sbjct Orf.

    Arguments:
      target               key of the target organism in `tinyexondata`
      tinyexondata         mapping organism -> iterable of tiny exon objects
      PCG                  graph object offering get_pacbps_by_organisms()
      min_discovery_count  passed as min_value_list_size to
                           _remove_dict_elements_with_short_value_list()
      verbose              print an "OKAY ..." line to stdout for each
                           PacbPORF that is stored (default True, which is
                           the historical behaviour)

    For each pair of consecutive ordered PacbPORFs that share their sbjct
    Orf, matches of the tiny exon on that Orf are turned into PacbPORFs
    labelled 'QQ' when introns can be projected on both sides.

    Returns a dict keyed by (query protein sequence, query exon start). It is
    filled through _update_tinyexon_pacbporf_dict() and pruned afterwards by
    _remove_dict_elements_with_short_value_list().

    Raises KeyError when `tinyexondata` holds informants but no `target` key.
    """
    target_tinyexon_pacbporf_data = {}

    for informant in tinyexondata:
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # query Orf ids that are already aligned to this informant; this is
        # invariant for all query exons, so build it only once
        aligned_query_orf_ids = [pf.orfQ.id for pf in thepacbporfs]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in aligned_query_orf_ids:
                continue

            # walk over all pairs of neighbouring PacbPORFs
            for prevPF, nextPF in zip(thepacbporfs, thepacbporfs[1:]):
                # both PacbPORFs must be on the same sbjct Orf
                if prevPF.orfS.id != nextPF.orfS.id:
                    continue
                # check if PacbPORFs are positioned more or less okay
                if prevPF.distance_towards(nextPF) > 20:
                    continue
                # check if exonQ is positioned ~between these PacbPORFs
                if exonQ.orf.dnapos2aapos(exonQ.end) <\
                        max(prevPF.alignment_protein_range_query()) - 12:
                    continue
                if exonQ.orf.dnapos2aapos(exonQ.start) >\
                        min(nextPF.alignment_protein_range_query()) + 12:
                    continue

                # check if gap can be projected already by a perfect intron
                introns = merge_pacbporfs_by_intron_in_query(
                    prevPF, nextPF, max_aa_offset=1)
                if introns:
                    continue

                # orfObj is the orfS of prevPF or nextPF (just take any)
                orfObj = prevPF.orfS

                # assign eligible range of tinyexon match on SBJCT
                aapos_sbjct_range = range(
                    max(prevPF.alignment_protein_range_sbjct()) - 12,
                    min(nextPF.alignment_protein_range_sbjct()) + 12)

                for (aaseq, aapos) in _find_match_on_orfobj(exonQ, orfObj):
                    # check if the match is obtained in the expected
                    # sbjct AA range; if not, ignore the match
                    if aapos not in aapos_sbjct_range:
                        continue

                    # make pacbporf object
                    pacbpobj = PacbP(input=(
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos))
                    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # ignore PacbPORFs that are included in / include an
                    # already accepted PacbPORF
                    if any(pacbporfobj.issubsetorsuperset(accepted_pacbporf)
                           for accepted_pacbporf in thepacbporfs):
                        continue

                    # check if 2 (perfect) introns can be projected
                    introns5p = merge_pacbporfs_by_intron_in_query(
                        prevPF, pacbporfobj,
                        max_aa_offset=1, max_intron_nt_length=None)
                    introns3p = merge_pacbporfs_by_intron_in_query(
                        pacbporfobj, nextPF,
                        max_aa_offset=1, max_intron_nt_length=None)
                    if not introns5p or not introns3p:
                        continue

                    # check if placeable in PCG/pacbporflist
                    distPrev = prevPF.distance_towards(pacbporfobj)
                    distNext = pacbporfobj.distance_towards(nextPF)
                    ovrlPrev = pacbporfobj.overlap(prevPF)
                    ovrlNext = pacbporfobj.overlap(nextPF)
                    if distPrev and distNext:
                        rejected = False
                    elif not distPrev and ovrlPrev:
                        rejected = False
                    elif not distNext and ovrlNext:
                        rejected = False
                    elif ovrlPrev and ovrlNext:
                        rejected = False
                    else:
                        rejected = True

                    if verbose:
                        # single-argument print() gives identical output
                        # under the print statement and the print function
                        print("OKAY %s %s %s %s %s" % (
                            exonQ.proteinsequence(), aaseq, rejected,
                            informant,
                            (distPrev, distNext, ovrlPrev, ovrlNext)))

                    # label pacbporf as found by tinyexon QQ
                    pacbporfobj._tinyexon_label = "QQ"

                    # store to target_tinyexon_pacbporf_data
                    key = (exonQ.proteinsequence(), exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        target_tinyexon_pacbporf_data,
                        key, pacbporfobj, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data,
        min_value_list_size=min_discovery_count)

    return target_tinyexon_pacbporf_data
def _find_qp_and_pq_tinyexons_as_pacbporfs(target, tinyexondata, PCG,
        min_discovery_count=2):
    """
    Find target tiny exons that match on any informant Orf of the graph.

    Arguments:
      target               key of the target organism in `tinyexondata`
      tinyexondata         mapping organism -> iterable of tiny exon objects
      PCG                  graph object offering get_pacbps_by_organisms()
                           and get_orfs_of_graph()
      min_discovery_count  passed as min_value_list_size to
                           _remove_dict_elements_with_short_value_list()

    Each match reported by _find_qp_or_pq_match_on_orfobj() is turned into
    a PacbPORF labelled 'QP'. It is kept only when an intron can be
    projected towards an accepted PacbPORF on the same sbjct Orf.

    Returns a dict keyed by (query protein sequence, query exon start). It is
    filled through _update_tinyexon_pacbporf_dict() and pruned afterwards by
    _remove_dict_elements_with_short_value_list().

    Raises KeyError when `tinyexondata` holds informants but no `target` key.
    """
    def _project_introns(first_pacbporf, second_pacbporf):
        """ Project introns between 2 PacbPORFs; [] when the merger fails """
        try:
            return merge_pacbporfs_by_intron_in_query(
                first_pacbporf, second_pacbporf,
                max_aa_offset=0, max_intron_nt_length=None)
        except IndexError:
            # unexpected event
            # TODO: solve in merge_pacbporfs_by_intron_in_query
            return []

    target_tinyexon_pacbporf_data = {}

    for informant in tinyexondata:
        if informant == target:
            continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # query Orf ids that are already aligned to this informant; this is
        # invariant for all query exons, so build it only once
        aligned_query_orf_ids = [pf.orfQ.id for pf in thepacbporfs]

        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in aligned_query_orf_ids:
                continue

            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                tinyexonmatches = _find_qp_or_pq_match_on_orfobj(exonQ, orfObj)
                for (aaseq, aapos) in tinyexonmatches:
                    # make pacbporf object
                    pacbpobj = PacbP(input=(
                        exonQ.proteinsequence(),
                        aaseq,
                        exonQ.orf.dnapos2aapos(exonQ.start),
                        aapos))
                    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, orfObj)
                    pacbporfobj.extend_pacbporf_after_stops()

                    # ignore PacbPORFs that are included in / include an
                    # already accepted PacbPORF
                    if any(pacbporfobj.issubsetorsuperset(accepted_pacbporf)
                           for accepted_pacbporf in thepacbporfs):
                        continue

                    # check if a (perfect) intron can be projected
                    is_confirmed_by_intron_projection = False
                    for accepted_pacbporf in thepacbporfs:
                        # only PacbPORFs on the same sbjct Orf are tried
                        if accepted_pacbporf.orfS.id != pacbporfobj.orfS.id:
                            continue
                        # the PacbPORF that starts first on the query DNA
                        # must be offered as the first argument
                        if min(accepted_pacbporf.alignment_dna_range_query()) >\
                                min(pacbporfobj.alignment_dna_range_query()):
                            introns = _project_introns(
                                pacbporfobj, accepted_pacbporf)
                        else:
                            introns = _project_introns(
                                accepted_pacbporf, pacbporfobj)
                        if len(introns) >= 1:
                            is_confirmed_by_intron_projection = True
                            break

                    if not is_confirmed_by_intron_projection:
                        continue

                    # not placeable as soon as a single accepted PacbPORF
                    # reports an incompatible position
                    rejected = False in [
                        pf.is_postioned_compatibly(pacbporfobj)
                        for pf in thepacbporfs]

                    # label pacbporf as found by tinyexon QP
                    pacbporfobj._tinyexon_label = "QP"

                    # store to target_tinyexon_pacbporf_data
                    key = (exonQ.proteinsequence(), exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        target_tinyexon_pacbporf_data,
                        key, pacbporfobj, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data,
        min_value_list_size=min_discovery_count)

    return target_tinyexon_pacbporf_data
def update_PCG_with_signalpexons(signalpexonseqs, PCG, OPTIONS,
        min_pacbporf_identityscore=0.20, verbose=True):
    """
    Add ClustalW-aligned SignalP exons as PacbPORFs to the PCG.

    Arguments:
      signalpexonseqs             mapping organism -> list of SignalP exons
      PCG                         graph object; must offer organism_set()
                                  and get_pacbps_by_organisms()
      OPTIONS                     object with a `target` attribute
      min_pacbporf_identityscore  alignments with a lower identityscore
                                  are ignored
      verbose                     print the alignments and the stored
                                  PacbPORFs to stdout

    For each SignalP exon of the target and each informant still present in
    the PCG, every informant SignalP exon that is placeable in front of the
    ordered PacbPORFs is aligned with clustalw(). Of the resulting
    PacbPORFs, the first one after ordering on 'bits' (reversed=True) is
    stored with pacbporf2PCG(..., source='SignalP-ClustalW').

    Returns True when at least one PacbPORF was stored, otherwise False
    (also when the target has no entry in `signalpexonseqs`).
    """
    target = OPTIONS.target
    if target not in signalpexonseqs:
        return False

    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[target]:
        for informant, infSPlist in signalpexonseqs.items():
            if informant == target:
                continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set():
                continue

            # get ordered pacbporfs from the PCG; this must be redone in
            # each iteration because pacbporf2PCG() below extends the PCG
            thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target, informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue

            # list to store signalp exon PacbPORFs into
            signalpexon_pacbp_list = []

            for informantSPexon in infSPlist:
                coords = [
                    targetSPexon.protein_start(),
                    targetSPexon.protein_end(),
                    informantSPexon.protein_start(),
                    informantSPexon.protein_end(),
                ]

                # prior to making ClustalW-PacbP, check PacbPCOORD
                # placeability into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                    targetSPexon.proteinsequence(),
                    informantSPexon.proteinsequence(),
                    targetSPexon.protein_start(),
                    informantSPexon.protein_start(),
                ))
                if False in [pacbpCoordsObj.is_positioned_compatibly(pacbporf)
                             for pacbporf in thepacbporfs]:
                    # *NOT* placeable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3:
                    # WAY TOO FAR in front of the current gene structure
                    # parts; treat as not placeable
                    continue
                if dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS
                    continue

                # perform ClustalW alignment on the SP exons
                (alignedseqs, alignment) = clustalw(seqs={
                    target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence()})

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                    alignment=(alignedseqs[target],
                               alignment,
                               alignedseqs[informant]),
                    coords=coords)

                # is there any alignment constructed?
                if not pacbp:
                    continue
                # ignore (very) poor identityscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore:
                    continue

                # if here make extended pacbpORF and store it
                signalpexonPacbpORF = pacbp2pacbporf(
                    pacbp, targetSPexon.orf, informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                signalpexon_pacbp_list.append(signalpexonPacbpORF)

                if verbose:
                    # single-argument print() calls give identical output
                    # under the print statement and the print function
                    print("%s %s" % (alignedseqs[target], target))
                    print(alignment)
                    print("%s %s" % (alignedseqs[informant], informant))
                    print("%s %s %s DISTANCE:: %s" % (
                        pacbp,
                        (target, targetSPexon.orf.id),
                        (informant, informantSPexon.orf.id),
                        dist))
                    pacbp.print_protein()
                    print("")

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                    signalpexon_pacbp_list, order_by='bits', reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf, target, informant, PCG,
                             source='SignalP-ClustalW')
                is_any_pacbporf_added = True
                if verbose:
                    print("SignalP Exon added to PCG: %s %s" % (
                        signalp_pacbporf, informant))

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added