def exononorfs2pacbporf(exonQ, exonS, matrix=None):
    """
    Create a PacbPORF object from 2 ExonOnOrf objects

    @type  exonQ: ExonOnOrf object
    @param exonQ: ExonOnOrf object (query)

    @type  exonS: ExonOnOrf object
    @param exonS: ExonOnOrf object (sbjct)

    @attention: exonQ and exonS proteinsequence() MUST be identical in length!

    @rtype:  pacb.PacbPORF instance
    @return: pacb.PacbPORF instance
    """
    # prepare input data
    query = exonQ.proteinsequence()
    sbjct = exonS.proteinsequence()
    qaas = exonQ.orf.dnapos2aapos(exonQ.start)
    saas = exonS.orf.dnapos2aapos(exonS.start)
    # make pacbp and then pacbporf object
    if matrix:
        pacbpobj = pacb.PacbP(input=(query, sbjct, qaas, saas), MATRIX=matrix)
    else:
        pacbpobj = pacb.PacbP(input=(query, sbjct, qaas, saas))
    if pacbpobj.length == 0:
        return None
    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, exonS.orf)
    pacbporfobj.extend_pacbporf_after_stops()
    return pacbporfobj
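# Hedged usage sketch (not part of the original module). Assuming two
# ExonOnOrf objects `exonQ` and `exonS` whose proteinsequence() strings
# have identical length, obtained elsewhere from the query and sbjct Orfs:
#
#   pacbporf = exononorfs2pacbporf(exonQ, exonS, matrix=None)
#   if pacbporf:
#       print pacbporf.bitscore, pacbporf.length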
def pacbp_from_clustalw(alignment=(), coords=()):
    """
    Create a PacbP object from a ClustalW alignment

    @type  alignment: tuple (of 3 strings)
    @param alignment: (query,match,sbjct) in !ALIGNED! format
                      (identical string lengths with gaps)

    @type  coords: tuple (of 4 integers)
    @param coords: (qaastart,qaaend,saastart,saaend) integers, AA-coords!

    @rtype:  pacb.PacbP instance
    @return: pacb.PacbP instance
    """
    query, match, sbjct = alignment
    qaastart, qaaend, saastart, saaend = coords
    # trim leading and trailing gaps in the alignment
    while query and query[0] == '-':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        saastart += 1
    while sbjct and sbjct[0] == '-':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        qaastart += 1
    while query and query[-1] == '-':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        saaend -= 1
    while sbjct and sbjct[-1] == '-':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        qaaend -= 1
    # trim non-aligned characters as well!
    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        qaastart += 1
        saastart += 1
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        qaaend -= 1
        saaend -= 1
    if query and sbjct:
        # make a PacbP of this alignment
        pacbpinput = (query, sbjct, qaastart, saastart)
        pacbp = pacb.PacbP(input=pacbpinput)
        pacbp.source = 'clustalw'
    else:
        # no properly clustalw-alignable sequences -> no PacbP
        pacbp = None
    # return the created pacbp
    return pacbp
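# Hedged usage sketch (illustrative only; the aligned strings and AA
# coordinates below are made-up). The three alignment strings must have
# identical lengths; coords are (qaastart, qaaend, saastart, saaend):
#
#   aligned = ("MST-ELK", "MS  E+K", "MSAAEIK")
#   pacbp = pacbp_from_clustalw(alignment=aligned, coords=(10, 16, 42, 49))
#   if pacbp:
#       print pacbp.source   # 'clustalw'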
def pacbporf2pacbp(pacbporf):
    """
    Convert a PacbPORF object (backwards) to a PacbP object

    @type  pacbporf: pacb.PacbPORF instance
    @param pacbporf: pacb.PacbPORF instance

    @rtype:  pacb.PacbP instance
    @return: pacb.PacbP instance
    """
    if str(pacbporf.__class__) == 'pacb.PacbP':
        # object is already a PacbP, not a PacbPORF
        return pacbporf
    elif str(pacbporf.__class__) == 'pacb.pacbp.PacbP':
        # object is already a PacbP, not a PacbPORF
        return pacbporf
    else:
        # object is a PacbPORF or PacbPDNA; convert to PacbP
        staPos = pacbporf._get_original_alignment_pos_start()
        endPos = pacbporf._get_original_alignment_pos_end()
        query = pacbporf.query[staPos.position:endPos.position + 1]
        sbjct = pacbporf.sbjct[staPos.position:endPos.position + 1]
        pacbpinput = (query, sbjct, staPos.query_pos, staPos.sbjct_pos)
        pacbpobj = pacb.PacbP(input=pacbpinput, MATRIX=pacbporf.MATRIX)
        # update required attributes
        pacbpobj.source = pacbporf.source
        pacbpobj._IS_DELETE_PROTECTED = pacbporf._IS_DELETE_PROTECTED
        pacbpobj._IS_EDIT_PROTECTED = pacbporf._IS_EDIT_PROTECTED
        # return the object
        return pacbpobj
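# Hedged usage sketch: converting back and forth between PacbP and
# PacbPORF (the pacbp/orfQ/orfS objects are assumptions, not fixtures
# of this module):
#
#   pacbporf = pacbp2pacbporf(pacbp, orfQ, orfS)
#   pacbp_again = pacbporf2pacbp(pacbporf)
#   # a PacbP passed in is returned unchanged:
#   assert pacbporf2pacbp(pacbp_again) is pacbp_again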
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of its Orfs) compared to this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """
    # get eligible lists of Orfs
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are eligible...
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # remap the identifiers of the orf objects i.o.t. ...
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ####################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [orf.id for orf in orfs[org].orfs],
            print [str(orf) for orf in orfs[org].orfs]
        ####################################################################

    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id, orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0:
                continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">", "").split(" ")[0].replace("_", "")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)

                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ, node2=nodeS)[0]
                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose:
                        print "NEW:", pacbporf
                    ############################################################

                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        ############################################################
        if verbose:
            print "org_set_size() PCG < CBG"
        ############################################################
        # not all organisms covered -> no frameshifted CBG possible
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() - 1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)

    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList:
            print fscbg
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
        return None
    elif cbgList and not cbgList[0].node_set().symmetric_difference(cbg.node_set()):
        # best CBG is not frameshifted, but the CBG itself
        return None
    else:
        # score the difference between the frameshifted and current CBG
        score_cbg = cbg.total_weight() * cbg.omsr_identityscore()
        score_fscbg = cbgList[0].total_weight() * cbgList[0].omsr_identityscore()

        # check overlap between the frameshifted and current CBG
        a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg)

        ########################################################################
        if verbose:
            print "CBG", cbg
            cbg.printmultiplealignment()
            for fscbg in cbgList:
                print "fsCBG:", fscbg
                fscbg.printmultiplealignment()
        ########################################################################

        if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((1, 0, 0), (0, 0, 1)):
            # CBG and frameshifted CBG do not share a single AA overlap...
            # This does not represent a frameshifted CBG as we searched for
            return False
        elif score_fscbg > score_cbg:
            # return the highest scoring, frameshifted CBG
            return cbgList[0]
        else:
            # no, still not convinced that this is a frameshifted CBG
            return False
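# Hedged usage sketch (assumed calling context): given a CodingBlockGraph
# `cbg` and the `input` dict of Orf sets used throughout this package,
# a frameshift check could look like:
#
#   result = get_frameshifted_cbg(cbg, input, verbose=False)
#   if result:                    # a CodingBlockGraph -> frameshifted variant
#       cbg = result
#   # None  -> no frameshifted CBG could be constructed
#   # False -> a candidate was found but rejected (no overlap / lower score)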
def _blastorfset2blastdb(geneQ, geneS, blastdbfname, input, crossdata,
                         GSgraph, blastoptions=None, elegiable_orfsQ=[],
                         elegiable_orfsS_ids=[], logging=False):
    """ """
    hitcnt = 0
    for orfQ in elegiable_orfsQ:
        # check if a protein sequence is present in the Orf object;
        # in obscure cases of unigenes, no protein sequence is present!
        if not orfQ.protein_sequence:
            continue

        # make unique node identifier and blast header
        nodeQ = (geneQ, orfQ.id)
        header = "%s_orf_%s" % (geneQ, orfQ.id)

        # do the blastp!
        blastrec = blastall_seq2db(
            header, orfQ.protein_sequence, dbname=blastdbfname,
            extra_blastp_params=blastoptions.extra_blastp_params)

        # check if blast failed (then, blastrec == False)
        if not blastrec:
            continue

        # check if there are any hits/hsps!
        if len(blastrec.alignments) == 0:
            # no hits; continue
            continue

        for alignment in blastrec.alignments:
            # get back the orf pointer from the SBJCT title and create nodeS
            _parts = alignment.title.split("_")
            geneS = "_".join(_parts[0:-2]).replace('>', '')
            _orfpointerS = int(_parts[-1])
            nodeS = (geneS, _orfpointerS)

            # ignore hit if the nodeS orf id does not occur in the
            # NON-empty list elegiable_orfsS_ids
            if elegiable_orfsS_ids and _orfpointerS not in elegiable_orfsS_ids:
                continue

            # get the Orf object of this sbjct sequence
            orfS = input[geneS]['orfs'].get_orf_by_id(_orfpointerS)

            # loop over the HSPs
            for hsp in alignment.hsps:
                # If hits are really tiny (happens in case of BLOSUM45 matrix),
                # discard them directly before precious time is lost...
                if len(hsp.query) <= blastoptions.BLASTP_DIRECTLY_IGNORE_TINY_HITS:
                    continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY
                hsp.query_end = hsp.query_end + orfQ.protein_startPY
                hsp.sbjct_end = hsp.sbjct_end + orfS.protein_startPY

                # VERY exceptional case: HSP starts or ends with a gap.
                # I expect this is an error in Blastp ....
                strip_exterior_gaps(hsp)

                if hsp.query.find(" ") > 0:
                    # VERY exceptional case: erroneously NCBI-parsed HSP:
                    # Score 8 (7 bits), expectation 1.7e+01, alignment length 41
                    # Query: 1622 STHTYDAC TRCI----PFVDTGHKHENPTEALLDSTA 1654
                    #                       TR    P +      H  +  P+++ S++
                    # Sbjct:  489 TEHIYLHT TRSTWPPKPPTNASHANTKPSKSHHRSSS 525
                    if len(hsp.query.split(" ")[-1]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[-1]
                        hsp.sbjct = hsp.sbjct.split(" ")[-1]
                    elif len(hsp.query.split(" ")[0]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[0]
                        hsp.sbjct = hsp.sbjct.split(" ")[0]
                    elif len(hsp.query) == len(hsp.match):
                        # spaces in both query/match/sbjct
                        while hsp.query.find(" ") > 0:
                            pos = hsp.query.find(" ")
                            if hsp.sbjct[pos] == " " and hsp.match[pos] == " ":
                                hsp.query = hsp.query[0:pos] + hsp.query[pos + 1:]
                                hsp.match = hsp.match[0:pos] + hsp.match[pos + 1:]
                                hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos + 1:]
                            else:
                                # HSP is not repairable -> quit trying
                                break
                    elif len(hsp.query) == len(hsp.sbjct):
                        while hsp.query.find(" ") > 0:
                            pos = hsp.query.find(" ")
                            hsp.query = hsp.query[0:pos] + hsp.query[pos + 1:]
                            hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos + 1:]
                        # re-creation of the alignment match string is done
                        # upon creation of the PacbP object
                    else:
                        pass

                # VERY exceptional case: HSP starts or ends with a gap.
                # I expect this is an error in Blastp ....
                strip_exterior_gaps(hsp)

                # make a PacbP of this hsp
                try:
                    pacbp = pacb.PacbP(blastp_hsp=hsp, MATRIX=blastoptions.MATRIX)
                except:
                    # VERY exceptional miscellaneous cases: erroneously NCBI-parsed HSP
                    print hsp
                    print "'%s' X" % hsp.query, len(hsp.query), hsp.query_start, hsp.query_end
                    print "'%s' X" % hsp.match, len(hsp.match)
                    print "'%s' X" % hsp.sbjct, len(hsp.sbjct), hsp.sbjct_start, hsp.sbjct_end
                    pacbp = pacb.PacbP(blastp_hsp=hsp, MATRIX=blastoptions.MATRIX)

                # if logging is requested, print this pacbp to STDOUT
                if logging:
                    print ">>> Q", nodeQ, orfQ.tcode_symbolic(),\
                          "S", nodeS, orfS.tcode_symbolic(),\
                          pacbp, blastoptions.MATRIX.name, hsp.expect, hsp.bits
                    print ">>>", blastoptions.extra_blastp_params
                    if pacbp.length > 100:
                        print pacbp.query[0:40] + '.' * 7 +\
                              str(pacbp.length - 80) + '.' * 7 + pacbp.query[-40:]
                        print pacbp.match[0:40] + '.' * 7 +\
                              str(pacbp.length - 80) + '.' * 7 + pacbp.match[-40:]
                        print pacbp.sbjct[0:40] + '.' * 7 +\
                              str(pacbp.length - 80) + '.' * 7 + pacbp.sbjct[-40:]
                    else:
                        print pacbp.query
                        print pacbp.match
                        print pacbp.sbjct

                # blastoptions.BLASTP_HSP_MINIMAL_LENGTH represents the minimal
                # length of the aligned part. Too short pacbp's are abandoned
                if pacbp.length < blastoptions.BLASTP_HSP_MINIMAL_LENGTH:
                    if pacbp.identityscore == float(pacbp.length):
                        # escape for 100% identical tiny pacbps
                        pass
                    elif pacbp.identity + pacbp.similarity == pacbp.length:
                        # escape for 100% similar tiny pacbps
                        pass
                    else:
                        # pacbp is too small. Discard!
                        if logging:
                            print "too small..."
                        continue

                # check if the pacbp is not conflicting with the current GSG graph;
                # if so, ignore it now because it will not yield a proper edge
                # in an (accepted) CBG!
                if GSgraph and len(GSgraph) and\
                GSgraph.is_pacbp_conflicting_with_genestructure(
                        pacbp, orgQ=geneQ, orgS=geneS):
                    ###print "GSGconflict!", nodeQ, nodeS,
                    ###      GSgraph.is_pacbp_conflicting_with_genestructure(
                    ###          pacbp, orgQ=geneQ, orgS=geneS), len(pacbp)
                    continue

                # here we have a potentially accepted pacbp.
                # make a/the unique key of this pacbp
                key = (pacbp.bits, pacbp.length, orfQ.id, _orfpointerS)

                # check for evalue criterion
                if (blastoptions.BLASTP_HSP_MAXIMAL_EXPECT or
                    blastoptions.BLASTP_HSP_MAXIMAL_EXPECT == 0.0) and\
                hsp.expect > blastoptions.BLASTP_HSP_MAXIMAL_EXPECT:
                    # pacbp is long enough but has a too high evalue
                    crossdata[(geneQ, geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging:
                        print "too low bitscore or expect"
                    continue

                # check for bitscore criterion
                if (blastoptions.BLASTP_HSP_MINIMAL_BITS or
                    blastoptions.BLASTP_HSP_MINIMAL_BITS == 0) and\
                pacbp.bits < blastoptions.BLASTP_HSP_MINIMAL_BITS:
                    # pacbp is long enough but has a too low bitscore
                    crossdata[(geneQ, geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging:
                        print "too low bitscore or expect"
                    continue

                # !!Hurray!! an accepted pacbp. Store it to the
                # 'accepted_pacbs' dict of crossdata
                crossdata[(geneQ, geneS)]['accepted_pacbs'][key] = pacbp
                hitcnt += 1
                if logging:
                    print "ACCEPTED"

        # done -> check next orf!

    # return the filled crossdata structure
    return crossdata, hitcnt
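# Hedged usage sketch (parameter values are assumptions): a typical call
# fills crossdata[(geneQ, geneS)]['accepted_pacbs'] for one query gene
# against a preformatted blast database of sbjct Orfs:
#
#   crossdata, hitcnt = _blastorfset2blastdb(
#       geneQ, geneS, blastdbfname, input, crossdata, GSgraph,
#       blastoptions=blastoptions,
#       elegiable_orfsQ=input[geneQ]['orfs'].orfs,
#       elegiable_orfsS_ids=[],        # empty list == no sbjct filtering
#       logging=False)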
def find_intermediary_codingblockgraph_with_tinyexon(graphL, graphR, input={},
    similaritymatrix=None, min_bitscore_ratio=0.3):
    """ """
    tinyexon_crossdata = {}
    tinyexons_seen = 0
    for org in graphL.organism_set():
        theOrfL = graphL.get_orfs_of_graph(organism=org)[0]
        theOrfR = graphR.get_orfs_of_graph(organism=org)[0]

        # continue if identical orfs
        # TODO: maybe check as well for spanning ranges?
        # TODO: in theory, a tinyorf can exist on this orf as well...
        if theOrfL.id == theOrfR.id:
            continue

        msrL = graphL.minimal_spanning_range(organism=org)
        msrR = graphR.minimal_spanning_range(organism=org)

        # check for eligible donors on orfL and acceptors on orfR
        if org in graphL._splicedonorgraph.organism_set() and\
        org in graphR._spliceacceptorgraph.organism_set():
            eligable_donors = graphL._splicedonorgraph.get_organism_objects(org)
            eligable_acceptors = graphR._spliceacceptorgraph.get_organism_objects(org)
            orflist = input[org]['orfs'].orfs
            # search for tinyexons
            tinyexonlist = bridge_two_pacbporfs_by_tinyexon(
                theOrfL, theOrfR,
                preceding_donor_sites=eligable_donors,
                subsequent_acceptor_sites=eligable_acceptors,
                orflist=orflist)
            doubletinyexons = bridge_two_pacbporfs_by_two_tinyexons(
                theOrfL, theOrfR,
                preceding_donor_sites=eligable_donors,
                subsequent_acceptor_sites=eligable_acceptors,
                orflist=orflist)
        else:
            # no donors and acceptors on both orfs!
            return []

        # Order the tinyexons with respect to which orf they are located on.
        # For now, IGNORE tinyexons on both the left and the right orf itself!!
        orf2tinyexons = {}
        for tinyexon in tinyexonlist:
            if tinyexon.orf.id in [theOrfL.id, theOrfR.id]:
                continue
            if orf2tinyexons.has_key(tinyexon.orf.id):
                orf2tinyexons[tinyexon.orf.id].append(tinyexon)
            else:
                orf2tinyexons[tinyexon.orf.id] = [tinyexon]

        # loop over the unique orfids on which tinyexons are predicted
        for orfid, telist in orf2tinyexons.iteritems():
            # loop over all other organisms (except the organism itself)
            for otherorg in graphL.organism_set():
                if otherorg == org:
                    continue
                orgkey = [org, otherorg]
                orgkey.sort()
                _orgkey_reversed = False
                if orgkey != [org, otherorg]:
                    _orgkey_reversed = True
                orgkey = tuple(orgkey)
                if not tinyexon_crossdata.has_key(orgkey):
                    tinyexon_crossdata[orgkey] = {'accepted_pacbs': {}}
                orfL = graphL.get_orfs_of_graph(organism=otherorg)[0]
                orfR = graphR.get_orfs_of_graph(organism=otherorg)[0]

                # main lists for all similarities on this orfid
                similaritiesL = []
                similaritiesR = []
                for tinyexon in telist:
                    # get protein query sequence from the tinyorf
                    query_dna = tinyexon.orf.inputgenomicsequence[
                        tinyexon.acceptor.pos:tinyexon.donor.pos]
                    query = dna2proteinbyframe(query_dna,
                                               (3 - tinyexon.acceptor.phase) % 3)
                    query_aa_pos = tinyexon.acceptor.pos / 3
                    _similaritiesL = similaritymatrix.scansbjct(
                        query, orfL.protein_sequence,
                        min_bitscore_ratio=min_bitscore_ratio)
                    if orfL.id == orfR.id:
                        _similaritiesR = []
                    else:
                        _similaritiesR = similaritymatrix.scansbjct(
                            query, orfR.protein_sequence,
                            min_bitscore_ratio=min_bitscore_ratio)

                    # Append to all similarities on this orfid; append the
                    # tinyexon itself too in order to trace a similarity back
                    # to a specific tinyexon. This is needed because there can
                    # be >1 tinyexon on the same orf...
                    _similaritiesL = [(_data, tinyexon) for _data in _similaritiesL]
                    _similaritiesR = [(_data, tinyexon) for _data in _similaritiesR]
                    similaritiesL.extend(_similaritiesL)
                    similaritiesR.extend(_similaritiesR)

                # Re-order the similarities because they can contain data from
                # 2 tinyexons (on the same orf). Ordering is performed on
                # ``ratio * bitscore``; this - kind of - evalue calculation
                # enables a preference for longer matches
                similaritiesL = _order_similarities(similaritiesL)
                similaritiesR = _order_similarities(similaritiesR)

                # Now make pacbporfs of only the BEST tinyexon and its
                # similarity on another organism
                TAKE_BEST_SIMILARITIES = 2
                for ((ratio, sbjct_pos, q_seq, match, s_seq, bitscore),
                     tinyexon) in similaritiesL[0:TAKE_BEST_SIMILARITIES]:
                    sbjct_aa_pos = sbjct_pos + orfL.protein_startPY
                    query_aa_pos = tinyexon.acceptor.pos / 3
                    if _orgkey_reversed:
                        ###print s_seq, "'%s'" % match, ratio, orfL.id
                        pacbpkey = (bitscore, len(query), orfL.id, tinyexon.orf.id)
                        pacbp = pacb.PacbP(input=(s_seq, q_seq, sbjct_aa_pos, query_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp, orfL, tinyexon.orf)
                    else:
                        ###print q_seq, "'%s'" % match, ratio, orfL.id
                        pacbpkey = (bitscore, len(query), tinyexon.orf.id, orfL.id)
                        pacbp = pacb.PacbP(input=(q_seq, s_seq, query_aa_pos, sbjct_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp, tinyexon.orf, orfL)
                    tinyexons_seen += 1
                    pacbporf.extend_pacbporf_after_stops()
                    tinyexon_crossdata[orgkey]['accepted_pacbs'][pacbpkey] = pacbporf

                for ((ratio, sbjct_pos, q_seq, match, s_seq, bitscore),
                     tinyexon) in similaritiesR[0:TAKE_BEST_SIMILARITIES]:
                    sbjct_aa_pos = sbjct_pos + orfR.protein_startPY
                    query_aa_pos = tinyexon.acceptor.pos / 3
                    if _orgkey_reversed:
                        pacbpkey = (bitscore, len(query), orfR.id, tinyexon.orf.id)
                        pacbp = pacb.PacbP(input=(s_seq, q_seq, sbjct_aa_pos, query_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp, orfR, tinyexon.orf)
                    else:
                        pacbpkey = (bitscore, len(query), tinyexon.orf.id, orfR.id)
                        pacbp = pacb.PacbP(input=(q_seq, s_seq, query_aa_pos, sbjct_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp, tinyexon.orf, orfR)
                    tinyexons_seen += 1
                    pacbporf.extend_pacbporf_after_stops()
                    tinyexon_crossdata[orgkey]['accepted_pacbs'][pacbpkey] = pacbporf

    if not tinyexons_seen:
        return []
    else:
        # add the nodes/edges from the input graphs as well
        for ((a, b, c, d), n1, n2) in graphL.pacbps.keys():
            orgkey = (n1[0], n2[0])
            pacbpdna = graphL.pacbps[((a, b, c, d), n1, n2)]
            if not tinyexon_crossdata.has_key(orgkey):
                tinyexon_crossdata[orgkey] = {'accepted_pacbs': {}}
            tinyexon_crossdata[orgkey]['accepted_pacbs'][(a, b, c, d)] = pacbpdna
        for ((a, b, c, d), n1, n2) in graphR.pacbps.keys():
            orgkey = (n1[0], n2[0])
            pacbpdna = graphR.pacbps[((a, b, c, d), n1, n2)]
            if not tinyexon_crossdata.has_key(orgkey):
                tinyexon_crossdata[orgkey] = {'accepted_pacbs': {}}
            tinyexon_crossdata[orgkey]['accepted_pacbs'][(a, b, c, d)] = pacbpdna

    # make graph, remove too weakly connected nodes and split in complete graphs
    tinyexonsg = create_pacbpcollectiongraph_from_crossdata(tinyexon_crossdata)
    tinyexonsg.remove_low_connectivity_nodes(min_connectivity=2)
    splitted_tinyexongraphs = tinyexonsg.find_fully_connected_subgraphs(
        edges=4, max_missing_edges=0)

    # now remove the graphs that are graphL and graphR ;-)
    graphLnodes = graphL.get_nodes()
    graphLnodes.sort()
    graphRnodes = graphR.get_nodes()
    graphRnodes.sort()
    for pos in range(0, len(splitted_tinyexongraphs)):
        tegNodes = splitted_tinyexongraphs[pos].get_nodes()
        tegNodes.sort()
        if tegNodes == graphLnodes:
            splitted_tinyexongraphs.pop(pos)
            break
    for pos in range(0, len(splitted_tinyexongraphs)):
        tegNodes = splitted_tinyexongraphs[pos].get_nodes()
        tegNodes.sort()
        if tegNodes == graphRnodes:
            splitted_tinyexongraphs.pop(pos)
            break

    # make ListOfCodingBlockGraphs
    cbgList = ListOfCodingBlockGraphs(splitted_tinyexongraphs,
                                      input=input,
                                      crossdata=tinyexon_crossdata)

    # do all that is needed to create K(s) CBGs of these
    cbgList.harvest_pacbps_from_crossdata()
    cbgList.split_codingblock_on_alternatives_in_pacbps_dict(
        filter_for_msr=True,
        filter_for_omsr=True)

    # remove non-compatible CBGs
    cbgList.remove_incompatible_cbgs(
        minimal_node_count=len(input),
        minimal_edge_count=len(tinyexon_crossdata),
        filter_for_msr=True,
        filter_for_omsr=True)

    # get list of accepted TinyExonCbgs
    accepted_tegs = cbgList.codingblockgraphs

    # and update weights by minimal spanning region
    for teg in accepted_tegs:
        teg.update_edge_weights_by_minimal_spanning_range()

    # and check if they can be placed IN BETWEEN graphL and graphR
    # TODO: some prints
    final_graphs_with_tinyexons = []
    for teg in accepted_tegs:
        test_codingblock_order, rejected_graphs =\
            make_consensus_genestructure_from_compatible_pacb_graphs(
                [graphL, graphR, teg], None)
        print "checking hypo TEG:", teg.get_ordered_nodes(),\
              "of", len(accepted_tegs),\
              "len of join", len(test_codingblock_order)
        #empty_input = {}
        #for org in teg.organism_set(): empty_input[org] = None
        #tmpGSG = GenestructureOfCodingBlockGraphs(empty_input)
        #tmpGSG.add_codingblocks([graphL,graphR,teg])
        #print "tinyexon tmp check:", len(test_codingblock_order), len(tmpGSG), teg.get_ordered_nodes()
        wt_after = teg.total_weight()
        if len(test_codingblock_order) == 3:
            teg_nodes = teg.get_nodes()
            teg_nodes.sort()
            middle = test_codingblock_order[1]
            middle_nodes = middle.get_nodes()
            middle_nodes.sort()
            if middle_nodes == teg_nodes:
                # yahoo, this one is 100% okay!
                final_graphs_with_tinyexons.append(teg)

    if len(final_graphs_with_tinyexons) == 1:
        print final_graphs_with_tinyexons[0].get_nodes()
        return final_graphs_with_tinyexons
    elif len(final_graphs_with_tinyexons) > 1:
        print "### WARNING!!!! more than 1 tinyexon graph is found."
        print "### WARNING!!!! however, only a single one is returned."
        print "### WARNING!!!! returning >1 can cause errors..."
        return [final_graphs_with_tinyexons[0]]
    else:
        return []
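# Hedged usage sketch (assumed calling context): graphL and graphR are two
# neighbouring CodingBlockGraphs between which a tiny exon is suspected;
# similaritymatrix is a protein similarity matrix object offering
# scansbjct(). At most one tinyexon-supported CBG is returned:
#
#   tegs = find_intermediary_codingblockgraph_with_tinyexon(
#       graphL, graphR, input=input,
#       similaritymatrix=similaritymatrix, min_bitscore_ratio=0.3)
#   if tegs:
#       intermediate_cbg = tegs[0]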
def blastanalysescbgjunction(gsg, prevCBG, nextCBG,
    omit_cbg_orfs=False,
    omit_non_cbg_orfs=False,
    extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
    omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
    verbose=False):
    """ """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input, prevCBG, nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG, or that DO
    # belong to prevCBG and nextCBG, depending on the omit_* settings
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################
    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa:
        return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t. ...
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose:
        print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():
        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ:
                continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if the key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header):
                continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0:
                continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_")
                if _orgS != orgS:
                    continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ############################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ############################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose:
        print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol.
    # In dpcpacbpcol the actual PacbPORFs are stored & kept, whereas
    # pacbpcol itself is split into CBGs (which function does not yet (!?)
    # take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ############################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)

    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ############################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ############################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
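# Hedged usage sketch (assumed calling context): prevCBG and nextCBG are
# neighbouring CBGs in the GenestructureOfCodingBlockGraphs `gsg`; the
# returned list contains candidate CBGs for the junction, best first:
#
#   candidates = blastanalysescbgjunction(gsg, prevCBG, nextCBG,
#                                         omit_cbg_orfs=False,
#                                         omit_non_cbg_orfs=False,
#                                         verbose=False)
#   for newcbg in candidates:
#       print newcbg.total_weight(), newcbg.get_ordered_nodes()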
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in the requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0, 1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength() / 2) * 3
    orfs = get_reverse_strand_orfsets(cbg, frame, min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t. ...
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ####################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs]
        ####################################################################

    revpacbps = {}
    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id, orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0:
                continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">", ""))

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength() / 2:
                    continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                ############################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ############################################################
                nodeQ = (orgQ, orfQ.protein_startPY)
                nodeS = (orgS, orfS.protein_startPY)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() - 1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)

    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomplementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
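# Hedged usage sketch: scanning all three reverse-strand frames of a CBG
# and keeping the first frame that yields a reverse-complement CBG:
#
#   revcbg = None
#   for frame in [0, 1, 2]:
#       revcbg = get_reverse_cbg(cbg, frame, verbose=False)
#       if revcbg:
#           break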