def multiplealignment(self):
    """
    Return a ClustalW alignment of this graph's sequences as FASTA text.

    The sequences and coordinate lists come from
    ``self.get_maxsr_proteinsequences_and_coords()``. Both are re-keyed from
    Node to Organism identifier, aligned with ``clustalw`` and trimmed with
    ``strip_alignment_for_exterior_gaps``.

    Returns:
        A single string holding one ``>ORGANISM_orf_X`` header line plus one
        aligned sequence line per organism, joined by newlines. ``X`` is
        element ``[1]`` of ``self.node_by_organism(organism)``
        (presumably the Orf identifier -- TODO confirm).
    """
    node_seqs, node_coords = self.get_maxsr_proteinsequences_and_coords()

    # the alignment helpers work on Organism keys, not on Node keys;
    # a coordinate list is reduced to its [start, end) boundaries
    coords = {}
    for node, positions in node_coords.iteritems():
        coords[self.organism_by_node(node)] = [min(positions),
                                               max(positions) + 1]
    seqs = {}
    for node, sequence in node_seqs.iteritems():
        seqs[self.organism_by_node(node)] = sequence

    # align sequences with ClustalW
    alignedseqs, alignment = clustalw(seqs=seqs)

    # trim alignment for leading & trailing gaps
    alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
        alignedseqs, alignment, coords)

    # build the multi-line fasta string, one record per organism
    records = []
    for organism, alignedseq in alignedseqs.iteritems():
        orf_tag = self.node_by_organism(organism)[1]
        records.append(">%s_orf_%s\n%s" % (organism, orf_tag, alignedseq))
    return "\n".join(records)
def multiplealignment(self):
    """
    Build a multiple alignment of this graph's sequences in FASTA format.

    Sequences and coordinates are obtained from
    ``self.get_maxsr_proteinsequences_and_coords()``, re-keyed from Node to
    Organism identifier, aligned by ``clustalw`` and stripped of exterior
    gaps by ``strip_alignment_for_exterior_gaps``.

    Returns:
        One newline-joined string of ``>ORGANISM_orf_X`` / aligned-sequence
        records, where ``X`` is ``self.node_by_organism(organism)[1]``
        (presumably the Orf identifier -- TODO confirm).
    """
    raw_seqs, raw_coords = self.get_maxsr_proteinsequences_and_coords()

    # rewrite Node keys to Organism identifiers; coordinates collapse to
    # a [start, end) pair
    coords = dict(
        (self.organism_by_node(node), [min(vlist), max(vlist) + 1])
        for node, vlist in raw_coords.iteritems()
    )
    seqs = dict(
        (self.organism_by_node(node), seq)
        for node, seq in raw_seqs.iteritems()
    )

    # ClustalW alignment, followed by removal of leading & trailing gaps
    alignedseqs, alignment = clustalw(seqs=seqs)
    trimmed = strip_alignment_for_exterior_gaps(alignedseqs, alignment, coords)
    alignedseqs, alignment, coords = trimmed

    # single string of multilined fasta
    fasta_template = ">%s_orf_%s\n%s"
    return "\n".join(
        fasta_template % (org, self.node_by_organism(org)[1], algseq)
        for org, algseq in alignedseqs.iteritems()
    )
def _create_hmm_profile(cbg, area="OMSR", prevcbg=None, nextcbg=None,
                        strip_nonaligned_residues=False, verbose=False,
                        **kwargs):
    """
    Build an hmmbuild profile file for one area of ``cbg``.

    The coordinate ranges of the requested area are taken from ``cbg``,
    optionally clipped against the overall minimal spanning range (OMSR) of
    ``prevcbg`` / ``nextcbg``, converted to sequences, aligned with ClustalW
    and fed to ``hmmbuild_protein``.

    Args:
        cbg: graph object providing the spanning-range accessors used below.
        area: one of "OMSR", "MINSR", "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF",
            "OMSRANDLEFTSPRDIF", "OMSRANDRIGTHSPRDIF" or "RIGTHORFEND"
            (the area between the maximal spanning range and the Orf end).
        prevcbg: optional graph; for areas "MAXSR", "LEFTSPRDIF" and
            "OMSRANDLEFTSPRDIF" ranges are clipped to start after the end
            of its OMSR.
        nextcbg: optional graph; for areas "MAXSR", "RIGTHSPRDIF" and
            "OMSRANDRIGTHSPRDIF" ranges are clipped to end before the start
            of its OMSR.
        strip_nonaligned_residues: when true, the alignment is additionally
            passed through ``strip_overall_nonaligned_residues`` (the
            original author flags this as very rigid).
        verbose: print diagnostics to stdout.
        **kwargs: forwarded to the spanning-range-difference accessors.
            ``sprdif_min_aa_length`` defaults to 20 and is forced to 20 for
            the two "OMSRAND..." areas.

    Returns:
        ``(fname_hmmbuild_file, coords)`` on success, where ``coords`` maps
        the string keys produced by ``_rename_dict_keys_to_strings`` to
        ``[start, end]`` lists (as left by the strip helpers), or
        ``(None, {})`` when no usable profile could be made.

    Raises:
        ValueError: if ``area`` is not one of the supported names.
    """
    def _report(current):
        # diagnostic line: area, summed span of all ranges, number of ranges
        spans = [max(v) - min(v) for v in current.values()]
        print("%s %s %s" % (area, sum(spans), len(current)))

    # update to default value
    kwargs.setdefault('sprdif_min_aa_length', 20)

    if area == "OMSR":
        if not cbg.has_overall_minimal_spanning_range():
            return None, {}
        coords = cbg.overall_minimal_spanning_range()
    elif area == "MINSR":
        if not cbg.has_minimal_spanning_range():
            return None, {}
        coords = cbg.minimal_spanning_range()
    elif area == "MAXSR":
        if not cbg.has_maximal_spanning_range():
            return None, {}
        coords = cbg.maximal_spanning_range()
    elif area == "LEFTSPRDIF":
        if not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        coords = cbg.left_spanningrange_difference(**kwargs)
    elif area == "RIGTHSPRDIF":
        if not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        coords = cbg.rigth_spanningrange_difference(**kwargs)
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or \
                not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area; iterate over a snapshot because
        # the dict is updated inside the loop
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in list(coords.items()):
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or \
                not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to rigth SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area (snapshot: dict updated in loop)
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in list(coords.items()):
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range():
            return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        for node in list(coords.keys()):
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            tail = range(max(coords[node]) + 1, theorf.protein_endPY)
            if len(tail) == 0:
                # remove zero-length ranges
                del coords[node]
            else:
                coords[node] = tail
    else:
        # a string cannot be raised as an exception; report the bad
        # argument with a proper exception type instead
        raise ValueError("unsupported area for HMM profile: %r" % (area,))

    if verbose:
        _report(coords)

    # decrease coord range by prevcbg if applicable
    if area in ("MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF") and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate Organism to Node
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # skip nodes deleted earlier from coords or absent in the OMSR
            if nodeCbg not in coords or nodePrev not in omsr:
                continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]:
                del coords[nodeCbg]

    # decrease coord range by nextcbg if applicable
    if area in ("MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF") and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate Organism to Node
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # skip nodes deleted earlier from coords or absent in the OMSR
            if nodeCbg not in coords or nodeNext not in omsr:
                continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]:
                del coords[nodeCbg]

    # check if coords still present
    if not coords:
        return None, {}

    if verbose:
        _report(coords)

    # do/redo _remove_short_sprdif_contributors if required
    if area in ("MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF",
                "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"):
        coords = _remove_short_sprdif_contributors(coords)

    if verbose:
        _report(coords)

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1:
        return None, {}

    # check sprdif_min_aa_length if applicable
    if area in ("RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
                "OMSRANDLEFTSPRDIF"):
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty and single-residue sequences from both dicts
    empty_seq_keys = [k for k, seq in fastaseqs.items()
                      if seq == "" or len(seq) == 1]
    for k in empty_seq_keys:
        del coords[k]
        del fastaseqs[k]

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1:
        return None, {}

    # rewrite coords to [min, max + 1] lists
    coords = dict((key, [min(vlist), max(vlist) + 1])
                  for key, vlist in coords.items())

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ("OMSR", "MINSR"):
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area == "RIGTHORFEND":
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords),
            0.20)

    # strip_overall_nonaligned_residues if requested: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # check if alignment was completely consumed or not
    if not alignment or len(alignment) <= 1:
        return None, {}

    if verbose:
        print("## HMM clustalw input profile: %s %s %s" % (
            prevcbg is not None, area, nextcbg is not None))
        for node, algseq in alignedseqs.items():
            print("%s %s %s" % (algseq, node, coords[node]))
        print("%s" % (alignment,))

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    try:
        # make hmmbuild file of the multiplealignment
        fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)
    finally:
        # always remove the temporary alignment file, also when hmmbuild fails
        osRemove(fname_hmm_profile)

    # return HMM search profile filename and the coordinates it covers
    return fname_hmmbuild_file, coords
def hmmhit2pacbp(queryorf, queryorg, querycoords, sbjctorf, sbjctorg, hmmhit,
                 verbose=False):
    """
    Convert a single HMM search hit into a PacbPORF object.

    Args:
        queryorf: Orf the HMM profile's query sequence originates from.
        queryorg: Organism identifier of the query; first element of the
            returned query Node.
        querycoords: sequence whose element ``[0]`` is the start coordinate
            of the profile's query sequence on ``queryorf``.
        sbjctorf: Orf on which the hit was found.
        sbjctorg: Organism identifier of the sbjct; first element of the
            returned sbjct Node.
        hmmhit: 10-tuple ``(sbjct_header, sbjct_start, sbjct_end,
            query_start, query_end, query, match, sbjct, score, expect)``.
        verbose: print diagnostics to stdout.

    Returns:
        ``((pacbpkey, queryNode, sbjctNode), pacbporf)`` on success, or
        ``(False, None)`` when no PacbPORF could be created.
    """
    # header, score and expect are part of the hit record but unused here
    (_sbjct_header, sbjct_start, sbjct_end, query_start, query_end,
     query, match, sbjct, _score, _expect) = hmmhit

    # trim hmmhit for unmatched characters on both sides
    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        sbjct_start += 1
        query_start += 1
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        sbjct_end -= 1
        query_end -= 1

    # get node and AA coordinates of this sbjct hit;
    # correct for -1 offset in start coordinate!!
    sbjct_aa_start = sbjct_start - 1 + sbjctorf.protein_startPY
    sbjct_aa_end = sbjct_end + sbjctorf.protein_startPY
    sbjctNode = (sbjctorg, sbjctorf.id)
    query = query.replace(".", "-").upper()
    sbjct = sbjct.replace(".", "-").upper()

    if verbose:
        print("hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % (
            sbjctorg, sbjctorf.id))
        print("hmmhit2pacbp Q '%s'" % (query,))
        print("hmmhit2pacbp m '%s'" % (match,))
        print("hmmhit2pacbp S '%s'" % (sbjct,))
        print("hmmQ: %s %s %s gaps: %s %s" % (
            query, query_start, query_end, query.count('-'), len(query)))
        print("hmmM: %s" % (match,))
        print("hmmS: %s %s %s %s len: %s %s" % (
            sbjct, sbjctNode, sbjct_aa_start, sbjct_aa_end,
            sbjct_aa_end - sbjct_aa_start, len(sbjct)))

    # get Node and sequence of the query; strings are immutable, so plain
    # assignment gives an independent working copy
    queryNode = (queryorg, queryorf.id)
    queryseq = query

    # calculate query sequence position on queryorf
    query_aa_start = querycoords[0] + query_start - 1
    query_aa_end = query_aa_start + len(queryseq) - queryseq.count('-')

    if verbose:
        print("hmmq: %s %s %s %s len: %s %s" % (
            queryseq, queryNode, query_aa_start, query_aa_end,
            query_aa_end - query_aa_start, len(queryseq)))

    # working copies of the sbjct data, trimmed for exterior gaps below
    sbjctseq = sbjct
    sbjctaastart = sbjct_aa_start
    sbjctaaend = sbjct_aa_end

    # a column with a gap in one sequence holds a residue in the other one,
    # so dropping it shifts the coordinate of that other sequence
    while queryseq and queryseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        sbjctaastart += 1
    while sbjctseq and sbjctseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        query_aa_start += 1
    while queryseq and queryseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        sbjctaaend -= 1
    while sbjctseq and sbjctseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        query_aa_end -= 1

    # Code added in December 2010. Since inwpCBGs are implemented, HMM
    # profiles are built from clustalw alignments which have loosely aligned
    # tails (SPRDIF sequences). The HMM result file does not state where in
    # the constructed HMM a hit starts, although the exact aa-coordinates of
    # the sequences given to ClustalW are known. Therefore the exact start
    # position of the HMM match on the queryorf has to be verified here,
    # and recovered by a ClustalW alignment when it does not fit.
    if queryseq.replace("-", "") != queryorf.getaas(query_aa_start,
                                                    query_aa_end):
        # obtain (search) query sequence, replace gaps by X symbol
        searchqueryseq = queryseq.upper().replace("-", "X")
        # count length of the query sequence; here IGNORE THE GAPS!!
        seqlen = len(queryseq.upper().replace("-", ""))
        # make fasta sequence dictionary
        seqdict = {
            'query_hmm': searchqueryseq,
            'query_orf': queryorf.protein_sequence,
        }
        # make coords dictionary for remapping
        coords = {
            'query_hmm': [0, seqlen],
            'query_orf': [queryorf.protein_startPY, queryorf.protein_endPY],
        }

        # perform clustalw multiple alignment
        (alignedseqs, alignment) = clustalw(seqs=seqdict)
        # strip exterior gaps
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

        hmm_aligned = alignedseqs['query_hmm']
        if "-" in hmm_aligned:
            # in (very) exceptional cases, gaps can be introduced in the
            # clustalw alignment in the HMM seq. This normally does not
            # occur! Fix this here by placing gaps in sbjctseq too.
            sbjct_chars = list(sbjctseq)
            for pos, residue in enumerate(hmm_aligned):
                if residue == "-":
                    sbjct_chars.insert(pos, "-")
            sbjctseq = "".join(sbjct_chars)

        recovered = len(alignedseqs['query_orf']) == len(sbjctseq)

        if verbose:
            print("\tFALSE:: %s [ WITH GAPS,SBJCT ]" % (sbjctseq,))
            print("\tFALSE:: %s [ WITH GAPS ]" % (queryseq,))
            for k, algseq in alignedseqs.items():
                print("\tFALSE:: %s %s %s %s" % (
                    algseq, k, coords[k], len(algseq)))
            print("\tFALSE:: %s SBJCT %s" % (sbjctseq, len(sbjctseq)))
            print("\tFALSE:: %s ALMNT %s" % (alignment, len(alignment)))
            print("\tSOLVED: %s" % (recovered,))

        if not recovered:
            # Still not identical lengths: ClustalW recovery of the HMM hit
            # failed. For now: omit this hit.
            # TODO: resolve this case!!
            # Observed with --filewithloci
            # examples/bilal/CFU_830450.bothss.csv (profile area MAXSR,
            # sbjct NP1064101[anid] orf 1): the stripped alignment held 70
            # columns (query_hmm mapped to [0, 33], one residue short of the
            # 34-residue hit) whereas the gapped sbjct held 71 characters.
            return False, None

        # update query sequence & coordinates
        queryseq = alignedseqs['query_orf']
        query_aa_start = coords['query_orf'][0]
        query_aa_end = coords['query_orf'][1]

    if not (queryseq and sbjctseq):
        # Pacbp creation failed!
        return False, None

    if len(queryseq) != len(sbjctseq):
        # this will result in an exception to be raised:
        # pacb.exceptions.InproperlyAppliedArgument
        # print data here about what went wrong, then
        # just let the error be raised
        print("%s %s %s %s" % (
            queryseq, len(queryseq), sbjctseq, len(sbjctseq)))
        print("%s" % (hmmhit,))
        print("Q: %s %s %s len: %s" % (
            query_aa_start, query_aa_end,
            query_aa_end - query_aa_start, len(queryseq)))
        print("S: %s %s %s len: %s" % (
            sbjctaastart, sbjctaaend,
            sbjctaaend - sbjctaastart, len(sbjctseq)))

    pacbpinput = (queryseq, sbjctseq, query_aa_start, sbjctaastart)
    pacbp = PacbP(input=pacbpinput)
    # remove consistent internal gaps caused by HMM profile search
    pacbp.strip_consistent_internal_gaps()
    pacbp.source = 'hmmsearch'
    pacbporf = PacbPORF(pacbp, queryorf, sbjctorf)
    pacbporf.strip_unmatched_ends()
    if pacbporf.length == 0:
        # Pacbp creation failed!
        return False, None

    pacbporf.extend_pacbporf_after_stops()
    pacbpkey = pacbporf.construct_unique_key(queryNode, sbjctNode)
    # return unique key and pacbporf
    return (pacbpkey, queryNode, sbjctNode), pacbporf
def _create_hmm_profile(cbg, area="OMSR", prevcbg=None, nextcbg=None,
                        strip_nonaligned_residues=False, verbose=False,
                        **kwargs):
    """
    Build an hmmbuild profile file for one area of ``cbg``.

    The coordinate ranges of the requested area are taken from ``cbg``,
    optionally clipped against the overall minimal spanning range (OMSR) of
    ``prevcbg`` / ``nextcbg``, converted to sequences, aligned with ClustalW
    and fed to ``hmmbuild_protein``.

    Args:
        cbg: graph object providing the spanning-range accessors used below.
        area: one of "OMSR", "MINSR", "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF",
            "OMSRANDLEFTSPRDIF", "OMSRANDRIGTHSPRDIF" or "RIGTHORFEND"
            (the area between the maximal spanning range and the Orf end).
        prevcbg: optional graph; for areas "MAXSR", "LEFTSPRDIF" and
            "OMSRANDLEFTSPRDIF" ranges are clipped to start after the end
            of its OMSR.
        nextcbg: optional graph; for areas "MAXSR", "RIGTHSPRDIF" and
            "OMSRANDRIGTHSPRDIF" ranges are clipped to end before the start
            of its OMSR.
        strip_nonaligned_residues: when true, the alignment is additionally
            passed through ``strip_overall_nonaligned_residues`` (the
            original author flags this as very rigid).
        verbose: print diagnostics to stdout.
        **kwargs: forwarded to the spanning-range-difference accessors.
            ``sprdif_min_aa_length`` defaults to 20 and is forced to 20 for
            the two "OMSRAND..." areas.

    Returns:
        ``(fname_hmmbuild_file, coords)`` on success, where ``coords`` maps
        the string keys produced by ``_rename_dict_keys_to_strings`` to
        ``[start, end]`` lists (as left by the strip helpers), or
        ``(None, {})`` when no usable profile could be made.

    Raises:
        ValueError: if ``area`` is not one of the supported names.
    """
    def _report(current):
        # diagnostic line: area, summed span of all ranges, number of ranges
        spans = [max(v) - min(v) for v in current.values()]
        print("%s %s %s" % (area, sum(spans), len(current)))

    # update to default value
    kwargs.setdefault('sprdif_min_aa_length', 20)

    if area == "OMSR":
        if not cbg.has_overall_minimal_spanning_range():
            return None, {}
        coords = cbg.overall_minimal_spanning_range()
    elif area == "MINSR":
        if not cbg.has_minimal_spanning_range():
            return None, {}
        coords = cbg.minimal_spanning_range()
    elif area == "MAXSR":
        if not cbg.has_maximal_spanning_range():
            return None, {}
        coords = cbg.maximal_spanning_range()
    elif area == "LEFTSPRDIF":
        if not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        coords = cbg.left_spanningrange_difference(**kwargs)
    elif area == "RIGTHSPRDIF":
        if not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        coords = cbg.rigth_spanningrange_difference(**kwargs)
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or \
                not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area; iterate over a snapshot because
        # the dict is updated inside the loop
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in list(coords.items()):
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or \
                not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to rigth SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area (snapshot: dict updated in loop)
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in list(coords.items()):
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range():
            return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        for node in list(coords.keys()):
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            tail = range(max(coords[node]) + 1, theorf.protein_endPY)
            if len(tail) == 0:
                # remove zero-length ranges
                del coords[node]
            else:
                coords[node] = tail
    else:
        # a string cannot be raised as an exception; report the bad
        # argument with a proper exception type instead
        raise ValueError("unsupported area for HMM profile: %r" % (area,))

    if verbose:
        _report(coords)

    # decrease coord range by prevcbg if applicable
    if area in ("MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF") and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate Organism to Node
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # skip nodes deleted earlier from coords or absent in the OMSR
            if nodeCbg not in coords or nodePrev not in omsr:
                continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]:
                del coords[nodeCbg]

    # decrease coord range by nextcbg if applicable
    if area in ("MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF") and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate Organism to Node
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # skip nodes deleted earlier from coords or absent in the OMSR
            if nodeCbg not in coords or nodeNext not in omsr:
                continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]:
                del coords[nodeCbg]

    # check if coords still present
    if not coords:
        return None, {}

    if verbose:
        _report(coords)

    # do/redo _remove_short_sprdif_contributors if required
    if area in ("MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF",
                "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"):
        coords = _remove_short_sprdif_contributors(coords)

    if verbose:
        _report(coords)

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1:
        return None, {}

    # check sprdif_min_aa_length if applicable
    if area in ("RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
                "OMSRANDLEFTSPRDIF"):
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty and single-residue sequences from both dicts
    empty_seq_keys = [k for k, seq in fastaseqs.items()
                      if seq == "" or len(seq) == 1]
    for k in empty_seq_keys:
        del coords[k]
        del fastaseqs[k]

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1:
        return None, {}

    # rewrite coords to [min, max + 1] lists
    coords = dict((key, [min(vlist), max(vlist) + 1])
                  for key, vlist in coords.items())

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ("OMSR", "MINSR"):
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area == "RIGTHORFEND":
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords),
            0.20)

    # strip_overall_nonaligned_residues if requested: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # check if alignment was completely consumed or not
    if not alignment or len(alignment) <= 1:
        return None, {}

    if verbose:
        print("## HMM clustalw input profile: %s %s %s" % (
            prevcbg is not None, area, nextcbg is not None))
        for node, algseq in alignedseqs.items():
            print("%s %s %s" % (algseq, node, coords[node]))
        print("%s" % (alignment,))

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    try:
        # make hmmbuild file of the multiplealignment
        fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)
    finally:
        # always remove the temporary alignment file, also when hmmbuild fails
        osRemove(fname_hmm_profile)

    # return HMM search profile filename and the coordinates it covers
    return fname_hmmbuild_file, coords
def hmmhit2pacbp(queryorf, queryorg, querycoords, sbjctorf, sbjctorg, hmmhit,
                 verbose=False):
    """
    Convert a single HMM search hit into a PacbPORF object.

    Args:
        queryorf: Orf the HMM profile's query sequence originates from.
        queryorg: Organism identifier of the query; first element of the
            returned query Node.
        querycoords: sequence whose element ``[0]`` is the start coordinate
            of the profile's query sequence on ``queryorf``.
        sbjctorf: Orf on which the hit was found.
        sbjctorg: Organism identifier of the sbjct; first element of the
            returned sbjct Node.
        hmmhit: 10-tuple ``(sbjct_header, sbjct_start, sbjct_end,
            query_start, query_end, query, match, sbjct, score, expect)``.
        verbose: print diagnostics to stdout.

    Returns:
        ``((pacbpkey, queryNode, sbjctNode), pacbporf)`` on success, or
        ``(False, None)`` when no PacbPORF could be created.
    """
    # header, score and expect are part of the hit record but unused here
    (_sbjct_header, sbjct_start, sbjct_end, query_start, query_end,
     query, match, sbjct, _score, _expect) = hmmhit

    # trim hmmhit for unmatched characters on both sides
    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        sbjct_start += 1
        query_start += 1
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        sbjct_end -= 1
        query_end -= 1

    # get node and AA coordinates of this sbjct hit;
    # correct for -1 offset in start coordinate!!
    sbjct_aa_start = sbjct_start - 1 + sbjctorf.protein_startPY
    sbjct_aa_end = sbjct_end + sbjctorf.protein_startPY
    sbjctNode = (sbjctorg, sbjctorf.id)
    query = query.replace(".", "-").upper()
    sbjct = sbjct.replace(".", "-").upper()

    if verbose:
        print("hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % (
            sbjctorg, sbjctorf.id))
        print("hmmhit2pacbp Q '%s'" % (query,))
        print("hmmhit2pacbp m '%s'" % (match,))
        print("hmmhit2pacbp S '%s'" % (sbjct,))
        print("hmmQ: %s %s %s gaps: %s %s" % (
            query, query_start, query_end, query.count('-'), len(query)))
        print("hmmM: %s" % (match,))
        print("hmmS: %s %s %s %s len: %s %s" % (
            sbjct, sbjctNode, sbjct_aa_start, sbjct_aa_end,
            sbjct_aa_end - sbjct_aa_start, len(sbjct)))

    # get Node and sequence of the query; strings are immutable, so plain
    # assignment gives an independent working copy
    queryNode = (queryorg, queryorf.id)
    queryseq = query

    # calculate query sequence position on queryorf
    query_aa_start = querycoords[0] + query_start - 1
    query_aa_end = query_aa_start + len(queryseq) - queryseq.count('-')

    if verbose:
        print("hmmq: %s %s %s %s len: %s %s" % (
            queryseq, queryNode, query_aa_start, query_aa_end,
            query_aa_end - query_aa_start, len(queryseq)))

    # working copies of the sbjct data, trimmed for exterior gaps below
    sbjctseq = sbjct
    sbjctaastart = sbjct_aa_start
    sbjctaaend = sbjct_aa_end

    # a column with a gap in one sequence holds a residue in the other one,
    # so dropping it shifts the coordinate of that other sequence
    while queryseq and queryseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        sbjctaastart += 1
    while sbjctseq and sbjctseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        query_aa_start += 1
    while queryseq and queryseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        sbjctaaend -= 1
    while sbjctseq and sbjctseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        query_aa_end -= 1

    # Code added in December 2010. Since inwpCBGs are implemented, HMM
    # profiles are built from clustalw alignments which have loosely aligned
    # tails (SPRDIF sequences). The HMM result file does not state where in
    # the constructed HMM a hit starts, although the exact aa-coordinates of
    # the sequences given to ClustalW are known. Therefore the exact start
    # position of the HMM match on the queryorf has to be verified here,
    # and recovered by a ClustalW alignment when it does not fit.
    if queryseq.replace("-", "") != queryorf.getaas(query_aa_start,
                                                    query_aa_end):
        # obtain (search) query sequence, replace gaps by X symbol
        searchqueryseq = queryseq.upper().replace("-", "X")
        # count length of the query sequence; here IGNORE THE GAPS!!
        seqlen = len(queryseq.upper().replace("-", ""))
        # make fasta sequence dictionary
        seqdict = {
            'query_hmm': searchqueryseq,
            'query_orf': queryorf.protein_sequence,
        }
        # make coords dictionary for remapping
        coords = {
            'query_hmm': [0, seqlen],
            'query_orf': [queryorf.protein_startPY, queryorf.protein_endPY],
        }

        # perform clustalw multiple alignment
        (alignedseqs, alignment) = clustalw(seqs=seqdict)
        # strip exterior gaps
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

        hmm_aligned = alignedseqs['query_hmm']
        if "-" in hmm_aligned:
            # in (very) exceptional cases, gaps can be introduced in the
            # clustalw alignment in the HMM seq. This normally does not
            # occur! Fix this here by placing gaps in sbjctseq too.
            sbjct_chars = list(sbjctseq)
            for pos, residue in enumerate(hmm_aligned):
                if residue == "-":
                    sbjct_chars.insert(pos, "-")
            sbjctseq = "".join(sbjct_chars)

        recovered = len(alignedseqs['query_orf']) == len(sbjctseq)

        if verbose:
            print("\tFALSE:: %s [ WITH GAPS,SBJCT ]" % (sbjctseq,))
            print("\tFALSE:: %s [ WITH GAPS ]" % (queryseq,))
            for k, algseq in alignedseqs.items():
                print("\tFALSE:: %s %s %s %s" % (
                    algseq, k, coords[k], len(algseq)))
            print("\tFALSE:: %s SBJCT %s" % (sbjctseq, len(sbjctseq)))
            print("\tFALSE:: %s ALMNT %s" % (alignment, len(alignment)))
            print("\tSOLVED: %s" % (recovered,))

        if not recovered:
            # Still not identical lengths: ClustalW recovery of the HMM hit
            # failed. For now: omit this hit.
            # TODO: resolve this case!!
            # Observed with --filewithloci
            # examples/bilal/CFU_830450.bothss.csv (profile area MAXSR,
            # sbjct NP1064101[anid] orf 1): the stripped alignment held 70
            # columns (query_hmm mapped to [0, 33], one residue short of the
            # 34-residue hit) whereas the gapped sbjct held 71 characters.
            return False, None

        # update query sequence & coordinates
        queryseq = alignedseqs['query_orf']
        query_aa_start = coords['query_orf'][0]
        query_aa_end = coords['query_orf'][1]

    if not (queryseq and sbjctseq):
        # Pacbp creation failed!
        return False, None

    if len(queryseq) != len(sbjctseq):
        # this will result in an exception to be raised:
        # pacb.exceptions.InproperlyAppliedArgument
        # print data here about what went wrong, then
        # just let the error be raised
        print("%s %s %s %s" % (
            queryseq, len(queryseq), sbjctseq, len(sbjctseq)))
        print("%s" % (hmmhit,))
        print("Q: %s %s %s len: %s" % (
            query_aa_start, query_aa_end,
            query_aa_end - query_aa_start, len(queryseq)))
        print("S: %s %s %s len: %s" % (
            sbjctaastart, sbjctaaend,
            sbjctaaend - sbjctaastart, len(sbjctseq)))

    pacbpinput = (queryseq, sbjctseq, query_aa_start, sbjctaastart)
    pacbp = PacbP(input=pacbpinput)
    # remove consistent internal gaps caused by HMM profile search
    pacbp.strip_consistent_internal_gaps()
    pacbp.source = 'hmmsearch'
    pacbporf = PacbPORF(pacbp, queryorf, sbjctorf)
    pacbporf.strip_unmatched_ends()
    if pacbporf.length == 0:
        # Pacbp creation failed!
        return False, None

    pacbporf.extend_pacbporf_after_stops()
    pacbpkey = pacbporf.construct_unique_key(queryNode, sbjctNode)
    # return unique key and pacbporf
    return (pacbpkey, queryNode, sbjctNode), pacbporf