def get_cexpander_string_of_cbg(cbg,verbose=False): """ Run cexpander and get the cexpander binary string for this CBG @attention: recommended not to use, use more detailed cexpanderanalyses() @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes """ # (0) create (~unique) basefname basefname = "cbg_"+ "".join([ "%s%s" % (node[0],node[1]) for node in cbg.get_ordered_nodes() ]) fname_fasta = basefname+".fa" fname_allvsall = basefname+".allvsall" fname_aligned = basefname+".aligned" fname_cexpander = basefname+".cexpander" # (1) write multi fasta of OMSR sequences omsrseqs= cbg.getomsrproteinsequences() writeMultiFasta( omsrseqs , fname_fasta ) strongestN = cbg.strongest_connected_node() strongestO = cbg.organism_by_node(strongestN) # create complete .fa -> cexpanderstring command command = """ python %s %s %s; %s -i %s %s > %s; %s %s ">%s" > %s; cat %s | grep "\$start_values" -A 1000 | grep "\$end_values" -m 1 -B 1000 | sed 's/^.*_values$//' | sed '/^$/d' | tr -d "\\t" | tr -d "\\n" """ % ( EXECUTABLE_CEXPANDER_ALLVSALL, fname_fasta, fname_allvsall, EXECUTABLE_CEXPANDER_CBALIGNP, fname_allvsall, "-y", fname_aligned, EXECUTABLE_CEXPANDER_CEXPANDER, fname_aligned, strongestO, fname_cexpander, fname_cexpander ) ci,co,ce = osPopen3(command) ci.close() # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well! cexpanderstring = co.readlines()[-1] co.close() error = ce.read() ce.close() # (6) cleanup files osSystem("rm -f %s.*" % basefname ) ############################################################ if verbose: linesize=100 print cbg for offset in range(0,len(cexpanderstring),linesize): print omsrseqs[strongestO][offset:offset+linesize] print cexpanderstring[offset:offset+linesize] ############################################################ # (7) return the cexpander string return ( cexpanderstring, strongestO )
def get_cexpander_string_of_cbg(cbg, verbose=False): """ Run cexpander and get the cexpander binary string for this CBG @attention: recommended not to use, use more detailed cexpanderanalyses() @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes """ # (0) create (~unique) basefname basefname = "cbg_" + "".join( ["%s%s" % (node[0], node[1]) for node in cbg.get_ordered_nodes()]) fname_fasta = basefname + ".fa" fname_allvsall = basefname + ".allvsall" fname_aligned = basefname + ".aligned" fname_cexpander = basefname + ".cexpander" # (1) write multi fasta of OMSR sequences omsrseqs = cbg.getomsrproteinsequences() writeMultiFasta(omsrseqs, fname_fasta) strongestN = cbg.strongest_connected_node() strongestO = cbg.organism_by_node(strongestN) # create complete .fa -> cexpanderstring command command = """ python %s %s %s; %s -i %s %s > %s; %s %s ">%s" > %s; cat %s | grep "\$start_values" -A 1000 | grep "\$end_values" -m 1 -B 1000 | sed 's/^.*_values$//' | sed '/^$/d' | tr -d "\\t" | tr -d "\\n" """ % (EXECUTABLE_CEXPANDER_ALLVSALL, fname_fasta, fname_allvsall, EXECUTABLE_CEXPANDER_CBALIGNP, fname_allvsall, "-y", fname_aligned, EXECUTABLE_CEXPANDER_CEXPANDER, fname_aligned, strongestO, fname_cexpander, fname_cexpander) ci, co, ce = osPopen3(command) ci.close() # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well! cexpanderstring = co.readlines()[-1] co.close() error = ce.read() ce.close() # (6) cleanup files osSystem("rm -f %s.*" % basefname) ############################################################ if verbose: linesize = 100 print cbg for offset in range(0, len(cexpanderstring), linesize): print omsrseqs[strongestO][offset:offset + linesize] print cexpanderstring[offset:offset + linesize] ############################################################ # (7) return the cexpander string return (cexpanderstring, strongestO)
def get_frameshifted_cbg(cbg, input, verbose=True): """ Get a CBG with frameshifts (in some of if Orfs) compared to this CBG @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph to check for frameshifts @type input: dict @param input: input <dict data structure> with lists of Orfs @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes @rtype: CodingBlockGraph or None @return: CodingBlockGraph (when existing) or None """ # get elegiable lists of Orfs orfs = _get_elegiable_frameshift_orfsets(cbg, input) # check how many Orfs are elgiable... if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count(): # no frameshift possible here... return None # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps for org in orfs.keys(): # REMAP fastaheaders as ids to retrieve the Orfs after blast.. for orf in orfs[org].orfs: orf.fastaheader = str(orf.id) fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode()) writeMultiFasta(orfs[org].tofastadict(), fname) multifastas[org] = fname ######################################################################## if verbose: print "ORFS:", org, len(orfs[org].orfs), print [orf.id for orf in orfs[org].orfs], print [str(orf) for orf in orfs[org].orfs]
def get_reverse_cbg(cbg,frame,verbose=False): """ Get the ReversecomplementCodingBlockGraph in requested frame of this CBG @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph to reversecomplement @type frame: integer @param frame: 0,1 or 2 @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes @rtype: ReversecomplementCodingBlockGraph or None @return: ReversecomplementCodingBlockGraph (when existing) or None """ min_orf_length = (cbg.omsrlength()/2)*3 orfs = get_reverse_strand_orfsets(cbg,frame,min_orf_length=min_orf_length) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps for org in orfs.keys(): fname = "%s_reversecbg_%s.mfa" % (org,cbg.barcode()) writeMultiFasta(orfs[org].tofastadict(),fname) multifastas[org] = fname ######################################################################## if verbose: print "ORFS:", org, len(orfs[org].orfs), print [len(o.protein_sequence) for o in orfs[org].orfs ] ######################################################################## revpacbps = {} for orgQ,orgS in cbg.pairwisecrosscombinations_organism(): # create blastdb if it does not exist yet if not blastdbs.has_key(orgS): formatdb(fname=multifastas[orgS]) blastdbs[orgS] = multifastas[orgS] revpacbporfs = {} for orfQ in orfs[orgQ].orfs: # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence, dbname="./"+blastdbs[orgS]) if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # obtain coordinates from sbjct orf identifier orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">","")) # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # skip if hsp is very short if len(hsp.query) < cbg.omsrlength()/2: continue # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = 
hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp),orfQ,orfS) ################################################################ if verbose: print pacbporf, orgQ,orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ###pacbporf.print_protein_and_dna() ################################################################ nodeQ = ( orgQ, orfQ.protein_startPY ) nodeS = ( orgS, orfS.protein_startPY ) uqkey = pacbporf.construct_unique_key(nodeQ,nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([ fname+".*" for fname in blastdbs.values()]) if not pacbpcol.organism_set_size() == cbg.organism_set_size(): # no CBG on the reverse strand return None # ``deepcopy`` PacbPcollection dpcpacbpcol.add_nodes( pacbpcol.get_nodes() ) for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore,length,orfQid,orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore) ############################################################################ if verbose: print pacbpcol, "bitscores:", print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ] ############################################################################ # do some transformations on the pacbpcol pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1) splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=cbg.node_count()-1 , max_missing_edges=0 ) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={}) cbgList.remove_all_but_complete_cbgs() 
cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_list_by_attribute(order_by='total_weight',reversed=True) ############################################################################ if verbose: for revcbg in cbgList: print "revCBG:", revcbg ############################################################################ if not cbgList: # no CBG on the reverse strand return None else: # return the highest scoring CBG as a ReversecomlementCodingBlockGraph return CodingBlockGraph2ReversecomlementCodingBlockGraph( cbgList.codingblockgraphs[0])
def get_frameshifted_cbg(cbg, input, verbose=True): """ Get a CBG with frameshifts (in some of if Orfs) compared to this CBG @type cbg: CodingBlockGraph @param cbg: CodingBlockGraph to check for frameshifts @type input: dict @param input: input <dict data structure> with lists of Orfs @type verbose: Boolean @param verbose: print intermediate info to STDOUT for debugging purposes @rtype: CodingBlockGraph or None @return: CodingBlockGraph (when existing) or None """ # get elegiable lists of Orfs orfs = _get_elegiable_frameshift_orfsets(cbg, input) # check how many Orfs are elgiable... if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count(): # no frameshift possible here... return None # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps for org in orfs.keys(): # REMAP fastaheaders as ids to retrieve the Orfs after blast.. for orf in orfs[org].orfs: orf.fastaheader = str(orf.id) fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode()) writeMultiFasta(orfs[org].tofastadict(), fname) multifastas[org] = fname ######################################################################## if verbose: print "ORFS:", org, len(orfs[org].orfs), print[orf.id for orf in orfs[org].orfs], print[str(orf) for orf in orfs[org].orfs] ######################################################################## for orgQ, orgS in cbg.pairwisecrosscombinations_organism(): # create blastdb if it does not exist yet if not blastdbs.has_key(orgS): formatdb(fname=multifastas[orgS]) blastdbs[orgS] = multifastas[orgS] for orfQ in orfs[orgQ].orfs: # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id, orfQ.protein_sequence, dbname="./" + blastdbs[orgS]) if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # obtain coordinates from sbjct orf identifier orfid = alignment.title.replace(">", "").split(" ")[0].replace( "_", "") orfS = 
orfs[orgS].get_orf_by_id(int(orfid)) nodeQ = (orgQ, orfQ.id) nodeS = (orgS, orfS.id) if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes(): pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ, node2=nodeS)[0] else: # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp), orfQ, orfS) ############################################################ if verbose: print "NEW:", pacbporf ############################################################ uqkey = pacbporf.construct_unique_key(nodeQ, nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([fname + ".*" for fname in blastdbs.values()]) if not pacbpcol.organism_set_size() == cbg.organism_set_size(): ############################################################ if verbose: print "org_set_size() PCG < CBG" ############################################################ # no CBG on the reverse strand return None # ``deepcopy`` PacbPcollection dpcpacbpcol.add_nodes(pacbpcol.get_nodes()) for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore, length, orfQid, orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore) ############################################################################ if verbose: print pacbpcol, "bitscores:", print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()] ############################################################################ # do some transformations on the pacbpcol 
pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() - 1) splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=cbg.node_count() - 1, max_missing_edges=0) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={}) cbgList.remove_all_but_cbgs() cbgList.remove_cbgs_with_lt_nodes(cbg.node_count()) cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_graphlist_by_total_weight_and_identity() ############################################################################ if verbose: print "FScbgs (%s)" % len(cbgList) for fscbg in cbgList: print fscbg ############################################################################ if not cbgList: # no (better) frameshifted CBG return None elif cbgList and not cbgList[0].node_set().symmetric_difference( cbg.node_set()): # best CBG is not frameshifted, but CBG itself return None else: # score the difference between the frameshifted and current CBG score_cbg = cbg.total_weight() * cbg.omsr_identityscore() score_fscbg = cbgList[0].total_weight( ) * cbgList[0].omsr_identityscore() # check overlap between the frameshifted and current CBG a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg) ######################################################################## if verbose: print "CBG", cbg cbg.printmultiplealignment() for fscbg in cbgList: print "fsCBG:", fscbg fscbg.printmultiplealignment() ######################################################################## if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((0, 0, 1), (1, 0, 0)): # CBG and frameshifted CBG do not share a single AA overlap... 
# This does not represent a frameshifted CBG as we searched for return False elif score_fscbg > score_cbg: # return the highest scoring, frameshifted CBG return cbgList[0] else: # no, still not convinced that this is a frameshifted CBG return False
def _create_hmm_profile(cbg, area="OMSR", prevcbg=None, nextcbg=None, strip_nonaligned_residues=False, verbose=False, **kwargs): """ """ # area must be one of # OMSR MINSR MAXSR # LEFTSPRDIF RIGTHSPRDIF # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF # RIGTHORFEND # update to default value if not kwargs.has_key('sprdif_min_aa_length'): kwargs['sprdif_min_aa_length'] = 20 if area == "OMSR": if cbg.has_overall_minimal_spanning_range(): coords = cbg.overall_minimal_spanning_range() else: return None, {} elif area == "MINSR": if cbg.has_minimal_spanning_range(): coords = cbg.minimal_spanning_range() else: return None, {} elif area == "MAXSR": if cbg.has_maximal_spanning_range(): coords = cbg.maximal_spanning_range() else: return None, {} elif area == "LEFTSPRDIF": if cbg.has_left_spanningrange_difference(**kwargs): coords = cbg.left_spanningrange_difference(**kwargs) else: return None, {} elif area == "RIGTHSPRDIF": if cbg.has_rigth_spanningrange_difference(**kwargs): coords = cbg.rigth_spanningrange_difference(**kwargs) else: return None, {} elif area == "OMSRANDLEFTSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_left_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.left_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords, verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node, coordrange in coords.iteritems(): coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1)) elif area == "OMSRANDRIGTHSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_rigth_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.rigth_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords, 
verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node, coordrange in coords.iteritems(): coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1)) elif area == "RIGTHORFEND": # area in between MAXSR and orfend if not cbg.has_maximal_spanning_range(): return None, {} # get coords & obtain Orf ends coords = cbg.maximal_spanning_range() nodes = coords.keys() for node in nodes: organism = cbg.organism_by_node(node) theorf = cbg.get_orfs_of_graph(organism=organism)[0] coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY) # remove zero-length ranges if len(coords[node]) == 0: del (coords[node]) else: raise "WHAT ELSE!?" ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # decrease coord range by prevcbg if applicable if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg: omsr = prevcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection(prevcbg.organism_set()): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodePrev = prevcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodePrev): continue sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])]) end = max(coords[nodeCbg]) + 1 coords[nodeCbg] = Set(range(sta, end)) if not coords[nodeCbg]: del (coords[nodeCbg]) # decrease coord range by nextcbg if applicable if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg: omsr = nextcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection(nextcbg.organism_set()): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodeNext = 
nextcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodeNext): continue sta = min(coords[nodeCbg]) end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1]) coords[nodeCbg] = Set(range(sta, end)) if not coords[nodeCbg]: del (coords[nodeCbg]) # check if coords still present if not coords: return None, {} ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # do/redo _remove_short_sprdif_contributors id required if area in [ "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF", "OMSRANDRIGTHSPRDIF", "RIGTHORFEND" ]: coords = _remove_short_sprdif_contributors(coords) ############################################################################ if verbose: print area, sum([(max(v) - min(v)) for k, v in coords.iteritems()]), len(coords) ############################################################################ # check if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # check sprdif_min_aa_length if applicable if area in [ "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF", "OMSRANDLEFTSPRDIF" ]: maxlength = max([len(vlist) for vlist in coords.values()]) if maxlength < kwargs['sprdif_min_aa_length']: return None, {} # if here, obtain sequences and build HMM search profile # get fasta sequences and fastaseqs = cbg._get_sequences_by_coords(coords) # rewrite dict (node) keys to string keys fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords) # remove empty sequence strings from fastaseqs dict empty_seq_keys = [] for k, seq in fastaseqs.iteritems(): if seq == "" or len(seq) == 1: empty_seq_keys.append(k) for k in empty_seq_keys: del (coords[k]) del (fastaseqs[k]) # check (again) if at least 2 sequences/nodes are remaining if len(coords) <= 
1: return None, {} # rewrite coords to (min,max) tuple coords = dict([(key, [min(vlist), max(vlist) + 1]) for key, vlist in coords.iteritems()]) # perform clustalw multiple alignment (alignedseqs, alignment) = clustalw(seqs=fastaseqs) # strip exterior gaps in case of OMSR/MINSR area if area in ["OMSR", "MINSR"]: alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords)) # strip poorly conserved residues in case of RIGTHORFEND if area in ["RIGTHORFEND"]: alignedseqs, alignment, coords = strip_poorly_supported_tails( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20) # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID! if strip_nonaligned_residues: alignedseqs, alignment, coords = strip_overall_nonaligned_residues( deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords)) # check if alignment was completely consumed or not if not alignment or len(alignment) <= 1: return None, {} ############################################################################ if verbose: print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None for node, algseq in alignedseqs.iteritems(): print algseq, node, coords[node] print alignment ############################################################################ # make unique filename for hmm profile file fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag() # write multiple alignment input file writeMultiFasta(alignedseqs, fname_hmm_profile) # make hmmbuild file of the multiplealignment fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile) # remove hmm profile multiple alignment file osRemove(fname_hmm_profile) # return HMM serach profile filename return fname_hmmbuild_file, coords
def cexpanderanalyses(cbg,min_cols=0,projected_on=":::",
    output='binary',cbgregion='omsr',verbose=False):
    """
    Run cexpander and get the CexpanderOutput object of this CBG

    For a CBG with 2 nodes or less cexpander is not run; a CexpanderOutput
    with an all-"1" binarystring is faked instead.

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph

    @type  min_cols: DEPRECATED integer
    @param min_cols: DEPRECATED default 0, only change when expert user!

    @type  projected_on: DEPRECATED string
    @param projected_on: DEPRECATED default ':::', only change when expert
                         user! The value is validated here but it is not
                         handed over to runcexpander()

    @type  output: string
    @param output: one of 'binary', 'float' (default 'binary')

    @type  cbgregion: string
    @param cbgregion: one of 'omsr', 'maxsr', 'omsr2orfend' (default 'omsr');
                      any other value is treated as 'omsr'

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype  cxpdrOutput: CexpanderOutput
    @return cxpdrOutput: CexpanderOutput object of this CBG

    @raise OrganismNotPresentInGraph: projected_on is neither ':::', empty
                                      nor an organism of this CBG
    """
    # (~unique) fasta file name, composed of the ordered nodes
    node_tags = [ "%s%s" % (node[0],node[1]) for node in cbg.get_ordered_nodes() ]
    fname_fasta = "cbg_" + "".join(node_tags) + ".fa"

    if not cbg.node_count() > 2:
        # weird case of CBG with only 2 nodes;
        # can happen when ``weakest organism`` is removed
        # from the GSG based on GTG analyses
        # Fake cexpander output here
        first_pacbporf = cbg.pacbps.values()[0]
        aligned_length = first_pacbporf._original_alignment_pos_end -\
                first_pacbporf._original_alignment_pos_start
        ones = "1" * aligned_length
        fakeOutput = CexpanderOutput()
        fakeOutput.binarystring = ones
        fakeOutput.header = cbg.organism_by_node(cbg.get_ordered_nodes()[0])
        fakeOutput.positions = len(ones)
        fakeOutput.score = len(ones)
        return fakeOutput

    # select & write the protein sequences of the requested region
    if cbgregion == 'maxsr':
        seqs = cbg.getmaxsrproteinsequences()
    elif cbgregion == 'omsr2orfend':
        seqs = cbg.getomsr2orfendproteinsequences()
        # append dummy W amino acid that mimics the
        # alignment of STOP codons
        for key in seqs.keys():
            seqs[key] = seqs[key]+"W"
    else:
        # omsr or non-existing keyword...
        seqs = cbg.getomsrproteinsequences()
    writeMultiFasta( seqs , fname_fasta )

    # validate (deprecated) projected_on
    if projected_on != ":::":
        if not projected_on:
            projected_on = cbg.organism_by_node(
                    cbg.strongest_connected_node() )
        elif projected_on not in cbg.organism_set():
            raise OrganismNotPresentInGraph

    # get cxpdrOutput object; file-cleanup is taken care for
    cxpdrOutput = runcexpander(fname_fasta,
            cbalignp_commandline = " -y", output=output)

    if cbgregion == 'omsr2orfend':
        # undo the hard-added 'W' residue in every transferblock
        for idx, block in enumerate(cxpdrOutput._transferblocks):
            block.positions -= 1
            if block.binarystring[-1] == "1":
                block.score -= 1
            block.binarystring = block.binarystring[0:-1]
            block.ratio = block._binarystring2matchratio(block.binarystring)
            if idx == 0:
                cxpdrOutput.set_transferblock(block.header)
        # ... and in every sequence
        for key in cxpdrOutput.sequences.keys():
            cxpdrOutput.sequences[key] = cxpdrOutput.sequences[key][0:-1]

    # (2) return the output object
    return cxpdrOutput
def blastanalysescbgjunction( gsg, prevCBG, nextCBG, omit_cbg_orfs=False, omit_non_cbg_orfs=False, extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS, omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK, verbose=False): """ """ ############################################################ if verbose: stw = StopWatch('blastanalysescbgjunction') stw.start() ############################################################ orfs = {} if not omit_cbg_orfs: # gather Orfs from prevCBG and nextCBG for org, orflist, in prevCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf for org, orflist, in nextCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org, orf.id)] = orf ############################################################ if verbose: print stw.lap(), "orfs (1):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # create masked fasta database in a dict fastadbmfa = parseFasta( create_hmmdb_for_neighbouring_cbgs( gsg.input, prevCBG, nextCBG, omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction, ).split("\n")) ############################################################ if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa) ############################################################ # remove ORFs that do not belong to prevCBG and nextCBG, # or that DO belong to prevCBG and nextCBG, or neither fastaheaders = fastadbmfa.keys() for header in fastaheaders: org, orfid = header.split("_orf_") orfid = int(orfid) node = (org, orfid) # check for the omit_non_cbg_orfs criterion add_orf = False if omit_non_cbg_orfs: if node not in orfs: del (fastadbmfa[header]) else: add_orf = True # check for the omit_cbg_orfs criterion if omit_cbg_orfs and node in orfs: del (fastadbmfa[header]) if add_orf: # get this Orf and add to orfs orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid) 
############################################################ if verbose: print stw.lap(), "fasta db (2):", len(fastadbmfa) print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys()) ############################################################ ############################################################ if verbose: print stw.lap(), "orfs (2):", len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # no query/sbjct range left at all if not fastadbmfa: return [] # check if all organisms are still covered orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()]) if orgSet.symmetric_difference(gsg.organism_set()): return [] # create !single! fasta database fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa" writeMultiFasta(fastadbmfa, fastadbname) formatdb(fname=fastadbname) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps ############################################################ if verbose: print stw.lap(), "blastp starting" ############################################################ for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism(): for nodeQ, orfQ in orfs.iteritems(): # only blast the (masked) Orfs of orgQ if prevCBG.organism_by_node(nodeQ) != orgQ: continue # get the masked protein sequence of this orfObj header = orgQ + "_orf_" + str(orfQ.id) # check if key exists in fastadbmfa. In a case where # an Orf is masked out completely, it is absent here! 
if not fastadbmfa.has_key(header): continue protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)] # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname, extra_blastp_params=extra_blastp_params) # omit empty blast records if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # get sbjct Org and Orf identifiers _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_") if _orgS != orgS: continue nodeS = (_orgS, int(_orfSid)) orfS = orfs[nodeS] # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp), orfQ, orfS) ################################################################ if verbose: print pacbporf, orgQ, orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ################################################################ # create nodes; ( Organism Identifier, Orf Identifier ) nodeQ = (orgQ, orfQ.id) nodeS = (orgS, orfS.id) uqkey = pacbporf.construct_unique_key(nodeQ, nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! 
dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf ############################################################ if verbose: print stw.lap(), "blastp done" ############################################################ # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([fname + ".*" for fname in blastdbs.values()]) # check if all Organism/Gene identifiers are covered in PacbPs if not pacbpcol.organism_set_size() == gsg.organism_set_size(): return [] # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol # In dpcpacbpcol the actual PacbPORFs are stores & kept, # whereas pacbpcol itself is splitted in CBGs (which # function does not yet (!?) take the actual pacbps into account) dpcpacbpcol.add_nodes(pacbpcol.get_nodes()) for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore, length, orfQid, orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore) ################################################################ if verbose: print pacbpcol print "PCG bitscores:", print[p.bitscore for p in dpcpacbpcol.pacbps.values()] print "PCG nodes:", dpcpacbpcol.get_ordered_nodes() ################################################################ #### do some transformations on the pacbpcol ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1) ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs( #### edges=gsg.node_count()-1 , max_missing_edges=0 ) ##### convert to list of CBGs and do some transformations ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={}) ####cbgList.remove_all_but_complete_cbgs() ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) ####cbgList.remove_cbgs_without_omsr() ####cbgList.update_edge_weights_by_minimal_spanning_range() ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True) min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2]) 
pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity) max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3 splittedCBGs = pacbpcol.find_fully_connected_subgraphs( edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges) # convert to list of CBGs and do some transformations cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={}) cbgList.remove_all_but_cbgs() cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol) cbgList.make_pacbps_for_missing_edges() cbgList.remove_all_but_complete_cbgs() cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT) cbgList.remove_cbgs_without_omsr() cbgList.update_edge_weights_by_minimal_spanning_range() cbgList.order_list_by_attribute(order_by='total_weight', reversed=True) # and create_cache() for these CBGs for cbg in cbgList: cbg.create_cache() #################################################################### if verbose: print stw.lap(), "CBGs created", len(cbgList) for newcbg in cbgList: print "new:", newcbg #################################################################### # return list with CBGs return cbgList.codingblockgraphs
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    Orfs on the reverse strand are blasted organism-against-organism, the
    resulting PacbPORFs are gathered in a PacbpCollectionGraph, which is
    split into fully connected CodingBlockGraphs. The highest scoring one
    (ordered on total_weight) is returned.

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None

    @attention: temporary files ('<org>_reversecbg_<barcode>.mfa' and the
                blast database files made of them) are written to the
                current working directory; they are cleaned up again, also
                when blasting or PacbP creation raises an exception
    """
    # half the OMSR length is the lower limit for accepted HSPs;
    # the minimal Orf length asked for is three times that value
    half_omsr_length = cbg.omsrlength() / 2
    orfs = get_reverse_strand_orfsets(cbg, frame,
            min_orf_length=half_omsr_length * 3)

    multifastas = {}    # organism -> multi fasta file name
    blastdbs = {}       # organism -> blast database (file) name
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    try:
        # write a multi fasta file of the reverse strand Orfs per organism
        for org in orfs.keys():
            fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
            writeMultiFasta(orfs[org].tofastadict(), fname)
            multifastas[org] = fname
            ####################################################################
            if verbose:
                print("ORFS: %s %s %s" % (org, len(orfs[org].orfs),
                    [len(o.protein_sequence) for o in orfs[org].orfs]))
            ####################################################################

        for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
            # create blastdb if it does not exist yet
            if orgS not in blastdbs:
                formatdb(fname=multifastas[orgS])
                blastdbs[orgS] = multifastas[orgS]

            for orfQ in orfs[orgQ].orfs:
                # run blast_seqs2db; no alignments -> nothing to iterate
                blastrec = blastall_seq2db(orfQ.id, orfQ.protein_sequence,
                        dbname="./" + blastdbs[orgS])

                for alignment in blastrec.alignments:
                    # obtain coordinates from sbjct orf identifier
                    orfS = orfs[orgS].get_orf_by_id(
                            alignment.title.replace(">", ""))
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]
                    # skip if hsp is very short
                    if len(hsp.query) < half_omsr_length:
                        continue

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                            pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose:
                        print("%s %s %s %s" % (pacbporf, orgQ, orgS, orfQ))
                        print(pacbporf.query)
                        print(pacbporf.match)
                        print(pacbporf.sbjct)
                    ############################################################

                    # nodes are (Organism Identifier, Orf protein start)
                    nodeQ = (orgQ, orfQ.protein_startPY)
                    nodeS = (orgS, orfS.protein_startPY)
                    uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                    if nodeQ not in pacbpcol.get_nodes():
                        pacbpcol.add_node(nodeQ)
                    if nodeS not in pacbpcol.get_nodes():
                        pacbpcol.add_node(nodeS)
                    pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                    # store to dpcpacbpcol -> pacbpcol is broken in
                    # pieces lateron!
                    dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf
    finally:
        # file cleanup; in a finally clause, so no files are left behind
        # when one of the steps above fails
        _file_cleanup(multifastas.values())
        _file_cleanup(["formatdb.log"])
        _file_cleanup([dbname + ".*" for dbname in blastdbs.values()])

    if pacbpcol.organism_set_size() != cbg.organism_set_size():
        # not all organisms are covered -> no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection: same nodes, edges weighted by the
    # bitscore that is the first element of the unique key
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print("%s bitscores: %s" % (pacbpcol,
            [pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]))
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(
            min_connectivity=cbg.node_count() - 1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=cbg.node_count() - 1, max_missing_edges=0)

    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print("revCBG: %s" % (revcbg,))
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None

    # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
    return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None, strip_nonaligned_residues=False, verbose=False,**kwargs): """ """ # area must be one of # OMSR MINSR MAXSR # LEFTSPRDIF RIGTHSPRDIF # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF # RIGTHORFEND # update to default value if not kwargs.has_key('sprdif_min_aa_length'): kwargs['sprdif_min_aa_length'] = 20 if area == "OMSR": if cbg.has_overall_minimal_spanning_range(): coords = cbg.overall_minimal_spanning_range() else: return None, {} elif area == "MINSR": if cbg.has_minimal_spanning_range(): coords = cbg.minimal_spanning_range() else: return None, {} elif area == "MAXSR": if cbg.has_maximal_spanning_range(): coords = cbg.maximal_spanning_range() else: return None, {} elif area == "LEFTSPRDIF": if cbg.has_left_spanningrange_difference(**kwargs): coords = cbg.left_spanningrange_difference(**kwargs) else: return None, {} elif area == "RIGTHSPRDIF": if cbg.has_rigth_spanningrange_difference(**kwargs): coords = cbg.rigth_spanningrange_difference(**kwargs) else: return None, {} elif area == "OMSRANDLEFTSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_left_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.left_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = _remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) ) elif area == "OMSRANDRIGTHSPRDIF": kwargs['sprdif_min_aa_length'] = 20 if not cbg.has_overall_minimal_spanning_range() or\ not cbg.has_rigth_spanningrange_difference(**kwargs): return None, {} # if here, start preparing coords coords = cbg.rigth_spanningrange_difference(**kwargs) # remove short contributors to left SPRDIF coords = 
_remove_short_sprdif_contributors(coords,verbose=verbose) # increase coord range by OMSR area omsr = cbg.overall_minimal_spanning_range() for node,coordrange in coords.iteritems(): coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) ) elif area == "RIGTHORFEND": # area in between MAXSR and orfend if not cbg.has_maximal_spanning_range(): return None, {} # get coords & obtain Orf ends coords = cbg.maximal_spanning_range() nodes = coords.keys() for node in nodes: organism = cbg.organism_by_node(node) theorf = cbg.get_orfs_of_graph(organism=organism)[0] coords[node] = range(max(coords[node])+1,theorf.protein_endPY) # remove zero-length ranges if len(coords[node]) == 0: del(coords[node]) else: raise "WHAT ELSE!?" ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # decrease coord range by prevcbg if applicable if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg: omsr = prevcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( prevcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] nodePrev = prevcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodePrev): continue sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] ) end = max(coords[nodeCbg])+1 coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # decrease coord range by nextcbg if applicable if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg: omsr = nextcbg.overall_minimal_spanning_range() for org in cbg.organism_set().intersection( nextcbg.organism_set() ): # omsr/coords have Node keys -> translate to Organism keys nodeCbg = cbg.get_organism_nodes(org)[0] 
nodeNext = nextcbg.get_organism_nodes(org)[0] # check if node not deleted earlier in coords dict if not coords.has_key(nodeCbg): continue if not omsr.has_key(nodeNext): continue sta = min(coords[nodeCbg]) end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] ) coords[nodeCbg] = Set(range(sta,end)) if not coords[nodeCbg]: del( coords[nodeCbg] ) # check if coords still present if not coords: return None, {} ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # do/redo _remove_short_sprdif_contributors id required if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF", "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]: coords = _remove_short_sprdif_contributors(coords) ############################################################################ if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords) ############################################################################ # check if at least 2 sequences/nodes are remaining if len(coords) <= 1: return None, {} # check sprdif_min_aa_length if applicable if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF", "OMSRANDLEFTSPRDIF"]: maxlength = max([ len(vlist) for vlist in coords.values() ]) if maxlength < kwargs['sprdif_min_aa_length']: return None, {} # if here, obtain sequences and build HMM search profile # get fasta sequences and fastaseqs = cbg._get_sequences_by_coords(coords) # rewrite dict (node) keys to string keys fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords) # remove empty sequence strings from fastaseqs dict empty_seq_keys = [] for k,seq in fastaseqs.iteritems(): if seq == "" or len(seq) == 1: empty_seq_keys.append(k) for k in empty_seq_keys: del(coords[k]) del(fastaseqs[k]) # check (again) if at least 2 sequences/nodes are remaining if len(coords) <= 1: 
return None, {} # rewrite coords to (min,max) tuple coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ]) # perform clustalw multiple alignment (alignedseqs,alignment) = clustalw( seqs= fastaseqs ) # strip exterior gaps in case of OMSR/MINSR area if area in ["OMSR","MINSR"]: alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # strip poorly conserved residues in case of RIGTHORFEND if area in ["RIGTHORFEND"]: alignedseqs,alignment,coords = strip_poorly_supported_tails( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 ) # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID! if strip_nonaligned_residues: alignedseqs,alignment,coords = strip_overall_nonaligned_residues( deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) ) # check if alignment was completely consumed or not if not alignment or len(alignment) <= 1: return None, {} ############################################################################ if verbose: print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None for node,algseq in alignedseqs.iteritems(): print algseq, node, coords[node] print alignment ############################################################################ # make unique filename for hmm profile file fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag() # write multiple alignment input file writeMultiFasta(alignedseqs,fname_hmm_profile) # make hmmbuild file of the multiplealignment fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile ) # remove hmm profile multiple alignment file osRemove(fname_hmm_profile) # return HMM serach profile filename return fname_hmmbuild_file, coords
def cexpanderanalyses(cbg, min_cols=0, projected_on=":::", output='binary',
    cbgregion='omsr', verbose=False):
    """
    Run cexpander and get the CexpanderOutput object of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph

    @type  min_cols: DEPRECATED integer
    @param min_cols: DEPRECATED default 0, only change when expert user!
                     (not used in this function)

    @type  projected_on: DEPRECATED string
    @param projected_on: DEPRECATED default ':::', only change when expert
                     user! The value is validated against the organisms of
                     the cbg, but not passed on to runcexpander()

    @type  output: string
    @param output: one of 'binary', 'float' (default 'binary')

    @type  cbgregion: string
    @param cbgregion: one of 'omsr', 'maxsr', 'omsr2orfend' (default 'omsr');
                      any other value is treated as 'omsr'

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes
                    (not used in this function)

    @rtype  cxpdrOutput: CexpanderOutput
    @return cxpdrOutput: CexpanderOutput object of this CBG

    @raise OrganismNotPresentInGraph: when projected_on is neither ':::',
                    empty, nor an organism of the cbg; raised before any
                    file is written
    """
    if cbg.node_count() <= 2:
        # weird case of CBG with only 2 nodes;
        # can happen when ``weakest organism`` is removed
        # from the GSG based on GTG analyses
        # Fake cexpander output here: an all-match string with the
        # length of the original alignment of (the first) PacbPORF
        pacbporf = list(cbg.pacbps.values())[0]
        bstring = "1" * ( pacbporf._original_alignment_pos_end -
                          pacbporf._original_alignment_pos_start )
        cxpdrOutput = CexpanderOutput()
        cxpdrOutput.binarystring = bstring
        cxpdrOutput.header = cbg.organism_by_node(cbg.get_ordered_nodes()[0])
        cxpdrOutput.positions = len(bstring)
        cxpdrOutput.score = len(bstring)
        return cxpdrOutput

    # validate projected_on *before* the fasta file is written, so no
    # file is left behind when OrganismNotPresentInGraph is raised
    if projected_on == ":::":
        pass
    elif not projected_on:
        # fall back on the organism of the strongest connected node
        strongestN = cbg.strongest_connected_node()
        projected_on = cbg.organism_by_node(strongestN)
    elif projected_on in cbg.organism_set():
        pass
    else:
        raise OrganismNotPresentInGraph

    # create (~unique) fasta file name out of the nodes of the cbg
    nodestrings = [ "%s%s" % (node[0], node[1])
                    for node in cbg.get_ordered_nodes() ]
    fname_fasta = "cbg_" + "".join(nodestrings) + ".fa"

    # get the protein sequences of the requested region
    if cbgregion == 'maxsr':
        fastaseqs = cbg.getmaxsrproteinsequences()
    elif cbgregion == 'omsr2orfend':
        fastaseqs = cbg.getomsr2orfendproteinsequences()
        # append dummy W amino acid that mimics the
        # alignment of STOP codons
        for key, seq in fastaseqs.items():
            fastaseqs[key] = seq + "W"
    else:
        # omsr or Non-existing keyword...
        fastaseqs = cbg.getomsrproteinsequences()

    # write multi fasta of the sequences
    writeMultiFasta(fastaseqs, fname_fasta)

    # get cxpdrOutput object; file-cleanup is taken care for
    cxpdrOutput = runcexpander(fname_fasta,
            cbalignp_commandline=" -y", output=output)

    # correct hard-added 'W' residue in cexpander OMSR2ORFEND
    if cbgregion == 'omsr2orfend':
        for pos, trf in enumerate(cxpdrOutput._transferblocks):
            # the dummy residue is the last position of each transferblock
            trf.positions -= 1
            if trf.binarystring[-1] == "1":
                trf.score -= 1
            trf.binarystring = trf.binarystring[0:-1]
            trf.ratio = trf._binarystring2matchratio(trf.binarystring)
            if pos == 0:
                # (re)set the first, corrected, transferblock
                cxpdrOutput.set_transferblock(trf.header)
        for key, seq in cxpdrOutput.sequences.items():
            cxpdrOutput.sequences[key] = seq[0:-1]

    # return the output object
    return cxpdrOutput
def blastanalysescbgjunction(gsg,prevCBG,nextCBG, omit_cbg_orfs = False, omit_non_cbg_orfs = False, extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS, omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK, verbose=False): """ """ ############################################################ if verbose: stw = StopWatch('blastanalysescbgjunction') stw.start() ############################################################ orfs = {} if not omit_cbg_orfs: # gather Orfs from prevCBG and nextCBG for org,orflist, in prevCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org,orf.id)] = orf for org,orflist, in nextCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org,orf.id)] = orf ############################################################ if verbose: print stw.lap(), "orfs (1):",len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # create masked fasta database in a dict fastadbmfa = parseFasta( create_hmmdb_for_neighbouring_cbgs( gsg.input,prevCBG,nextCBG, omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction, ).split("\n") ) ############################################################ if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa) ############################################################ # remove ORFs that do not belong to prevCBG and nextCBG, # or that DO belong to prevCBG and nextCBG, or neither fastaheaders = fastadbmfa.keys() for header in fastaheaders: org,orfid = header.split("_orf_") orfid = int(orfid) node = (org,orfid) # check for the omit_non_cbg_orfs criterion add_orf = False if omit_non_cbg_orfs: if node not in orfs: del(fastadbmfa[header]) else: add_orf = True # check for the omit_cbg_orfs criterion if omit_cbg_orfs and node in orfs: del(fastadbmfa[header]) if add_orf: # get this Orf and add to orfs orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid) 
############################################################ if verbose: print stw.lap(), "fasta db (2):",len(fastadbmfa) print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys()) ############################################################ ############################################################ if verbose: print stw.lap(), "orfs (2):",len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # no query/sbjct range left at all if not fastadbmfa: return [] # check if all organisms are still covered orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()]) if orgSet.symmetric_difference(gsg.organism_set()): return [] # create !single! fasta database fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa" writeMultiFasta(fastadbmfa,fastadbname) formatdb(fname=fastadbname) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps ############################################################ if verbose: print stw.lap(), "blastp starting" ############################################################ for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism(): for nodeQ,orfQ in orfs.iteritems(): # only blast the (masked) Orfs of orgQ if prevCBG.organism_by_node(nodeQ) != orgQ: continue # get the masked protein sequence of this orfObj header = orgQ+"_orf_"+str(orfQ.id) # check if key exists in fastadbmfa. In a case where # an Orf is masked out completely, it is absent here! 
if not fastadbmfa.has_key(header): continue protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)] # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname, extra_blastp_params=extra_blastp_params) # omit empty blast records if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # get sbjct Org and Orf identifiers _orgS,_orfSid = alignment.title.replace(">","").split("_orf_") if _orgS != orgS: continue nodeS = (_orgS,int(_orfSid)) orfS = orfs[nodeS] # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp),orfQ,orfS) ################################################################ if verbose: print pacbporf, orgQ,orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ################################################################ # create nodes; ( Organism Identifier, Orf Identifier ) nodeQ = ( orgQ, orfQ.id ) nodeS = ( orgS, orfS.id ) uqkey = pacbporf.construct_unique_key(nodeQ,nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! 
dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf ############################################################ if verbose: print stw.lap(), "blastp done" ############################################################ # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([ fname+".*" for fname in blastdbs.values()]) # check if all Organism/Gene identifiers are covered in PacbPs if not pacbpcol.organism_set_size() == gsg.organism_set_size(): return [] # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol # In dpcpacbpcol the actual PacbPORFs are stores & kept, # whereas pacbpcol itself is splitted in CBGs (which # function does not yet (!?) take the actual pacbps into account) dpcpacbpcol.add_nodes( pacbpcol.get_nodes() ) for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore,length,orfQid,orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore) ################################################################ if verbose: print pacbpcol print "PCG bitscores:", print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]