def _parsefilehandle(self,filehandle):
    """
    Parse getorf output file(handle) to Orf objects in OrfSet

    @type  filehandle: filehandle
    @param filehandle: filehandle to opened getorf output file
    """
    # read every line from the handle, parse as (multi-)fasta,
    # then convert the resulting dict into Orf objects
    fasta_lines = filehandle.readlines()
    self._dict_to_orfs(parseFasta(fasta_lines))
def _parsefilehandle(self, filehandle):
    """
    Parse getorf output file(handle) to Orf objects in OrfSet

    @type  filehandle: filehandle
    @param filehandle: filehandle to opened getorf output file
    """
    # parse the fasta content of the handle into a dict first,
    # then store its sequences as Orf objects on this OrfSet
    parsed_seqs = parseFasta(filehandle.readlines())
    self._dict_to_orfs(parsed_seqs)
def getorfs(self, strand="+"):
    """
    Run & Parse EMBOSS getorf from self.sequence

    @attention: implemented for ORFs from STOP to STOP

    @type  strand: string
    @param strand: strand of the DNA sequence (default '+')
    """
    # input validation of the requested strand symbol
    IsProperStrand(strand)
    self.strand = strand
    # run EMBOSS getorf on the sequence, then parse its raw
    # fasta output (split into lines) into Orf objects
    raw_output = getorf(sequence=self.sequence)
    self._dict_to_orfs(parseFasta(raw_output.split("\n")))
def getorfs(self,strand="+"):
    """
    Run & Parse EMBOSS getorf from self.sequence

    @attention: implemented for ORFs from STOP to STOP

    @type  strand: string
    @param strand: strand of the DNA sequence (default '+')
    """
    # validate the strand argument before storing it
    IsProperStrand(strand)
    self.strand = strand
    # getorf returns its output as one string; split it into
    # lines so parseFasta can digest it, then build the Orfs
    output_lines = getorf(sequence=self.sequence).split("\n")
    self._dict_to_orfs(parseFasta(output_lines))
def _create_hmm_db(organism, inputdict, cbg, prev, next, orf_must_have_start=False, max_intron_nt_length=200, verbose=False): """ Create fasta ORF database for a organism in a CBG and its viscinity @type organism: * (presumably string) @param organism: Organism identifier recognizable in <input data structure> @type inputdict: dict @param inputdict: <input data structure> @type cbg: CodingBlockGraph or related object @param cbg: CodingBlockGraph upstream/5p of the cbg that must be completed @type prev: CodingBlockGraph or related object (or None) @param prev: CodingBlockGraph upstream/5p of cbg that must be completed @type next: CodingBlockGraph or related object (or None) @param next: CodingBlockGraph downstream/3p of cbg that must be completed @attention: `prev` and `next` CodingBlockGraphs reduce the search space of ORFs to scan with the HMM profile. This Speeds up and improves the quality of results. @type orf_must_have_start: Boolean @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs @type max_intron_nt_length: integer @param max_intron_nt_length: positive maximum intron length to take into acount when selecting suitable ORFs @type verbose: Boolean @param verbose: report debugging-report on STDOUT (True) or be quiet (False) """ # fullpath filename of result hmm multi fasta database fname_hmm_db_mfa = None if not cbg: return fname_hmm_db_mfa # (1) try to limit searchspace by prev and next CBG prevNode, nextNode = None, None prevMin, nextMax = None, None maskcoords = [] # (1a) check if (informant) organism is in the prev CBG AND if this CBG # has an OMSR -> not per se the case! if prev and organism in prev.organism_set() and\ prev.has_overall_minimal_spanning_range(): prevNode = prev.node_by_organism(organism) try: omsr = prev.overall_minimal_spanning_range(organism=organism) prevMin = (max(omsr) + 1) * 3 maskcoords.append((0, max(omsr))) except KeyError: # hmmm.... block has an OMSR, but not for this organism!??!!? 
pass # (1b) check if (informant) organism is in the next CBG AND if this CBG # has an OMSR -> not per se the case! if next and organism in next.organism_set() and\ next.has_overall_minimal_spanning_range(): nextNode = next.node_by_organism(organism) try: omsr = next.overall_minimal_spanning_range(organism=organism) nextMax = min(omsr) * 3 aaseqlen = len(inputdict[organism]['genomeseq']) / 3 maskcoords.append((min(omsr), aaseqlen)) except KeyError: # hmmm.... block has an OMSR, but not for this organism!??!!? pass # (1c) limit search space if only prev or next was specified if not prev and next and nextMax: prevMin = nextMax - max_intron_nt_length if not next and prev and prevMin: nextMax = prevMin + max_intron_nt_length # (2a) get elegiable sets of orfs from prev and next if not orf_must_have_start: elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs( min_orf_end=prevMin, max_orf_start=nextMax) else: # ORFs *must* have starts => searching for a TSS exon/CBG elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs( min_orf_end=prevMin, max_orf_start=nextMax, has_starts=True) # (2b) check orf count; can be zero in case of a very tiny region to check if not elegiable_orfs: return fname_hmm_db_mfa # (3) write masked orfs to fasta database multi line string db_fasta = inputdict[organism]['orfs'].tomaskedfasta( coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism) if orf_must_have_start: if len(db_fasta.strip()) == 0: # no UNmasked suitable ORFs remaining! 
# This is recognized lateron in this function pass else: # mask out all AAs before the first start lines = db_fasta.split("\n") for linenr in range(0, len(lines)): line = lines[linenr] if line[0] != ">": mpos = line.find("M") if mpos > 0: line = "X" * mpos + line[mpos:] lines[linenr] = line # recreate db_fasta string db_fasta = "\n".join(lines) ############################################################################ if verbose: if len(elegiable_orfs) > 10: orfidlist = len(elegiable_orfs) else: orfidlist = [orf.id for orf in elegiable_orfs] print "hmm-elegibable orfs:", organism, orfidlist, "/", print len(inputdict[organism]['orfs'].orfs), "prevMin:", prevMin, if prev: print prev.has_overall_minimal_spanning_range(), else: print None, print "nextMax:", nextMax, if next: print next.has_overall_minimal_spanning_range() else: print None ############################################################################ # (4) make unique filename for hmm database file fname_base = get_random_string_tag() fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism) # (5) write masked orfs to fasta database fh = open(fname_hmm_db_mfa, 'w') fh.write(db_fasta) fh.close() # (6) make shure that there where orfs written to file; # in case very little orfs are selected and all are masked -> no files! seqs_in_db = parseFasta(open(fname_hmm_db_mfa).readlines()) if not seqs_in_db: # delete this (empty) file osRemove(fname_hmm_db_mfa) return None # (7) return hmm search database filename return fname_hmm_db_mfa
return ntcoord # end of function _obtain_unigene_gff_nt_end_coord # get fasta by gff, remove headers, concatenate into single DNA seq and make 3-frame translation command = """%s %s %s | grep -v "^>" | tr -d " \n" | %s -filter -frame %s """ % ( PYTHON_PATH, EXECUTABLE_GFF2FASTA, fastafile, EXECUTABLE_TRANSEQ, transeq_frame) if python_version <= 2.5: # Python2.4 os.popen syntax ci, co = os.popen2(command) ci.write("\n".join(unigenegffexons)) ci.close() frametrans = parseFasta(co.readlines()) co.close() else: # Python2.6 subprocess.Popen syntax from subprocess import Popen, PIPE p = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True) (child_stdin, child_stdout) = (p.stdin, p.stdout) child_stdin.write("\n".join(unigenegffexons)) child_stdin.close() frametrans = parseFasta(child_stdout.readlines()) # command to get DNA sequence of unigene exon(s) command = """%s %s %s | grep -v "^>" """ % ( PYTHON_PATH, EXECUTABLE_GFF2FASTA, fastafile,
def blastanalysescbgjunction(
        gsg, prevCBG, nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    Blast the (masked) ORFs around the junction of two neighbouring CBGs
    and assemble the resulting HSPs into new CodingBlockGraphs.

    @type  gsg: * -- presumably a GenestructureOfCodingBlockGraphs; it must
                provide .input, .organism_set(), .organism_set_size(),
                .node_count() and .EXACT_SG_NODE_COUNT (confirm against caller)
    @param gsg: genestructure the prevCBG/nextCBG junction belongs to

    @type  prevCBG: CodingBlockGraph
    @param prevCBG: CBG on the 5p side of the junction

    @type  nextCBG: CodingBlockGraph
    @param nextCBG: CBG on the 3p side of the junction

    @type  omit_cbg_orfs: Boolean
    @param omit_cbg_orfs: exclude the ORFs that belong to prevCBG/nextCBG

    @type  omit_non_cbg_orfs: Boolean
    @param omit_non_cbg_orfs: exclude ORFs that do NOT belong to the CBGs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet

    @rtype:  list
    @return: list of novel CodingBlockGraphs covering the junction (can be [])
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    # orfs: dict of ( organism, orf.id ) -> Orf object
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict (header -> protein sequence)
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input, prevCBG, nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    # (headers have the shape "<organism>_orf_<orfid>")
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################
    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa:
        return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose:
        print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():
        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ:
                continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header):
                continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id, protseq, fastadbname,
                extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0:
                continue
            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">", "").split("_orf_")
                # only keep hits on the sbjct organism of this combination
                if _orgS != orgS:
                    continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions (HSP coords are relative
                # to the ORF's protein sequence)
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes():
                    pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose:
        print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    # NOTE(review): multifastas and blastdbs are never filled above, so
    # only "formatdb.log" is actually removed here — confirm intent
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        # uqkey unpacks to (bitscore, length, orfQid, orfSid)
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    # allow up to 2 missing edges below the fully-connected requirement,
    # but never less than connectivity 1
    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
def _create_hmm_db(organism,inputdict,cbg,prev,next, orf_must_have_start=False,max_intron_nt_length=200, verbose=False): """ Create fasta ORF database for a organism in a CBG and its viscinity @type organism: * (presumably string) @param organism: Organism identifier recognizable in <input data structure> @type inputdict: dict @param inputdict: <input data structure> @type cbg: CodingBlockGraph or related object @param cbg: CodingBlockGraph upstream/5p of the cbg that must be completed @type prev: CodingBlockGraph or related object (or None) @param prev: CodingBlockGraph upstream/5p of cbg that must be completed @type next: CodingBlockGraph or related object (or None) @param next: CodingBlockGraph downstream/3p of cbg that must be completed @attention: `prev` and `next` CodingBlockGraphs reduce the search space of ORFs to scan with the HMM profile. This Speeds up and improves the quality of results. @type orf_must_have_start: Boolean @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs @type max_intron_nt_length: integer @param max_intron_nt_length: positive maximum intron length to take into acount when selecting suitable ORFs @type verbose: Boolean @param verbose: report debugging-report on STDOUT (True) or be quiet (False) """ # fullpath filename of result hmm multi fasta database fname_hmm_db_mfa = None if not cbg: return fname_hmm_db_mfa # (1) try to limit searchspace by prev and next CBG prevNode, nextNode = None, None prevMin, nextMax = None, None maskcoords = [] # (1a) check if (informant) organism is in the prev CBG AND if this CBG # has an OMSR -> not per se the case! if prev and organism in prev.organism_set() and\ prev.has_overall_minimal_spanning_range(): prevNode = prev.node_by_organism(organism) try: omsr = prev.overall_minimal_spanning_range(organism=organism) prevMin = (max(omsr)+1)*3 maskcoords.append( ( 0, max(omsr) ) ) except KeyError: # hmmm.... block has an OMSR, but not for this organism!??!!? 
pass # (1b) check if (informant) organism is in the next CBG AND if this CBG # has an OMSR -> not per se the case! if next and organism in next.organism_set() and\ next.has_overall_minimal_spanning_range(): nextNode = next.node_by_organism(organism) try: omsr = next.overall_minimal_spanning_range(organism=organism) nextMax = min(omsr)*3 aaseqlen = len(inputdict[organism]['genomeseq'])/3 maskcoords.append( ( min(omsr), aaseqlen ) ) except KeyError: # hmmm.... block has an OMSR, but not for this organism!??!!? pass # (1c) limit search space if only prev or next was specified if not prev and next and nextMax: prevMin = nextMax - max_intron_nt_length if not next and prev and prevMin: nextMax = prevMin + max_intron_nt_length # (2a) get elegiable sets of orfs from prev and next if not orf_must_have_start: elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs( min_orf_end = prevMin, max_orf_start = nextMax ) else: # ORFs *must* have starts => searching for a TSS exon/CBG elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs( min_orf_end = prevMin, max_orf_start = nextMax, has_starts=True ) # (2b) check orf count; can be zero in case of a very tiny region to check if not elegiable_orfs: return fname_hmm_db_mfa # (3) write masked orfs to fasta database multi line string db_fasta = inputdict[organism]['orfs'].tomaskedfasta( coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism) if orf_must_have_start: if len(db_fasta.strip()) == 0: # no UNmasked suitable ORFs remaining! 
# This is recognized lateron in this function pass else: # mask out all AAs before the first start lines = db_fasta.split("\n") for linenr in range(0,len(lines)): line = lines[linenr] if line[0] != ">": mpos = line.find("M") if mpos > 0: line = "X"*mpos+line[mpos:] lines[linenr] = line # recreate db_fasta string db_fasta = "\n".join(lines) ############################################################################ if verbose: if len(elegiable_orfs) > 10: orfidlist = len(elegiable_orfs) else: orfidlist = [ orf.id for orf in elegiable_orfs ] print "hmm-elegibable orfs:", organism, orfidlist, "/", print len(inputdict[organism]['orfs'].orfs), "prevMin:", prevMin, if prev: print prev.has_overall_minimal_spanning_range(), else: print None, print "nextMax:", nextMax, if next: print next.has_overall_minimal_spanning_range() else: print None ############################################################################ # (4) make unique filename for hmm database file fname_base = get_random_string_tag() fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base,organism) # (5) write masked orfs to fasta database fh = open(fname_hmm_db_mfa,'w') fh.write( db_fasta ) fh.close() # (6) make shure that there where orfs written to file; # in case very little orfs are selected and all are masked -> no files! seqs_in_db = parseFasta(open(fname_hmm_db_mfa).readlines()) if not seqs_in_db: # delete this (empty) file osRemove( fname_hmm_db_mfa ) return None # (7) return hmm search database filename return fname_hmm_db_mfa
def blastanalysescbgjunction(gsg,prevCBG,nextCBG, omit_cbg_orfs = False, omit_non_cbg_orfs = False, extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS, omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK, verbose=False): """ """ ############################################################ if verbose: stw = StopWatch('blastanalysescbgjunction') stw.start() ############################################################ orfs = {} if not omit_cbg_orfs: # gather Orfs from prevCBG and nextCBG for org,orflist, in prevCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org,orf.id)] = orf for org,orflist, in nextCBG.get_orfs_of_graph().iteritems(): orf = orflist[0] orfs[(org,orf.id)] = orf ############################################################ if verbose: print stw.lap(), "orfs (1):",len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # create masked fasta database in a dict fastadbmfa = parseFasta( create_hmmdb_for_neighbouring_cbgs( gsg.input,prevCBG,nextCBG, omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction, ).split("\n") ) ############################################################ if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa) ############################################################ # remove ORFs that do not belong to prevCBG and nextCBG, # or that DO belong to prevCBG and nextCBG, or neither fastaheaders = fastadbmfa.keys() for header in fastaheaders: org,orfid = header.split("_orf_") orfid = int(orfid) node = (org,orfid) # check for the omit_non_cbg_orfs criterion add_orf = False if omit_non_cbg_orfs: if node not in orfs: del(fastadbmfa[header]) else: add_orf = True # check for the omit_cbg_orfs criterion if omit_cbg_orfs and node in orfs: del(fastadbmfa[header]) if add_orf: # get this Orf and add to orfs orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid) 
############################################################ if verbose: print stw.lap(), "fasta db (2):",len(fastadbmfa) print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys()) ############################################################ ############################################################ if verbose: print stw.lap(), "orfs (2):",len(orfs) print _format_orf_nodes_to_string(orfs.keys()) ############################################################ # no query/sbjct range left at all if not fastadbmfa: return [] # check if all organisms are still covered orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()]) if orgSet.symmetric_difference(gsg.organism_set()): return [] # create !single! fasta database fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa" writeMultiFasta(fastadbmfa,fastadbname) formatdb(fname=fastadbname) # remap the identifiers of the orf objects i.o.t.... multifastas = {} blastdbs = {} pacbpcol = PacbpCollectionGraph() dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps ############################################################ if verbose: print stw.lap(), "blastp starting" ############################################################ for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism(): for nodeQ,orfQ in orfs.iteritems(): # only blast the (masked) Orfs of orgQ if prevCBG.organism_by_node(nodeQ) != orgQ: continue # get the masked protein sequence of this orfObj header = orgQ+"_orf_"+str(orfQ.id) # check if key exists in fastadbmfa. In a case where # an Orf is masked out completely, it is absent here! 
if not fastadbmfa.has_key(header): continue protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)] # run blast_seqs2db blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname, extra_blastp_params=extra_blastp_params) # omit empty blast records if len(blastrec.alignments) == 0: continue for alignment in blastrec.alignments: # get sbjct Org and Orf identifiers _orgS,_orfSid = alignment.title.replace(">","").split("_orf_") if _orgS != orgS: continue nodeS = (_orgS,int(_orfSid)) orfS = orfs[nodeS] # take only the *best* HSP (highest scoring first one) hsp = alignment.hsps[0] # correct to absolute positions hsp.query_start = hsp.query_start + orfQ.protein_startPY hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY # initialize the PacbP pacbporf = pacb.conversion.pacbp2pacbporf( pacb.PacbP(blastp_hsp=hsp),orfQ,orfS) ################################################################ if verbose: print pacbporf, orgQ,orgS, orfQ print pacbporf.query print pacbporf.match print pacbporf.sbjct ################################################################ # create nodes; ( Organism Identifier, Orf Identifier ) nodeQ = ( orgQ, orfQ.id ) nodeS = ( orgS, orfS.id ) uqkey = pacbporf.construct_unique_key(nodeQ,nodeS) if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ) if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS) pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore) # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron! 
dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf ############################################################ if verbose: print stw.lap(), "blastp done" ############################################################ # file cleanup _file_cleanup(multifastas.values()) _file_cleanup(["formatdb.log"]) _file_cleanup([ fname+".*" for fname in blastdbs.values()]) # check if all Organism/Gene identifiers are covered in PacbPs if not pacbpcol.organism_set_size() == gsg.organism_set_size(): return [] # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol # In dpcpacbpcol the actual PacbPORFs are stores & kept, # whereas pacbpcol itself is splitted in CBGs (which # function does not yet (!?) take the actual pacbps into account) dpcpacbpcol.add_nodes( pacbpcol.get_nodes() ) for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys(): (bitscore,length,orfQid,orfSid) = uqkey dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore) ################################################################ if verbose: print pacbpcol print "PCG bitscores:", print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]
# get fasta by gff, remove headers, concatenate into single DNA seq and make 3-frame translation command = """%s %s %s | grep -v "^>" | tr -d " \n" | %s -filter -frame %s """ % ( PYTHON_PATH, EXECUTABLE_GFF2FASTA, fastafile, EXECUTABLE_TRANSEQ, transeq_frame ) if python_version <= 2.5: # Python2.4 os.popen syntax ci,co = os.popen2(command) ci.write("\n".join(unigenegffexons)) ci.close() frametrans = parseFasta( co.readlines() ) co.close() else: # Python2.6 subprocess.Popen syntax from subprocess import Popen, PIPE p = Popen(command,shell=True,stdin=PIPE, stdout=PIPE, close_fds=True) (child_stdin, child_stdout) = (p.stdin, p.stdout) child_stdin.write("\n".join(unigenegffexons)) child_stdin.close() frametrans = parseFasta( child_stdout.readlines() ) # command to get DNA sequence of unigene exon(s) command = """%s %s %s | grep -v "^>" """ % ( PYTHON_PATH, EXECUTABLE_GFF2FASTA,