def hash_gi_in_fasta(fa):
    """Collect the GI numbers found in the record ids of a FASTA file.

    Every record id is matched against ``^gi|<digits>|``.  Records whose id
    does not follow that layout are skipped; previously such a record
    aborted the whole scan with an ``AttributeError`` on the failed match.

    Args:
        fa: path of the FASTA file to scan.

    Returns:
        dict mapping each GI number (int) to ``True``.
    """
    h_rid = {}
    # The context manager closes the file even when the parser raises.
    with open(fa, 'r') as fp:
        for r in seqParse.parse(fp, 'fasta'):
            mObj = re.search(r'^gi\|(\d+)\|', r.id)
            if not mObj:
                continue
            h_rid[int(mObj.group(1))] = True
    return h_rid
def group_print_ti_cat_fa(catIdx, h_tax2cat, nt_ti, outD):
    """Distribute the records of a ti-tagged FASTA file into per-category files.

    One output file ``<outD>/<category>.fa`` is opened for every entry of
    ``catIdx``.  Each record id of ``nt_ti`` must start with ``ti|<digits>|``;
    the taxonomy id is looked up in ``h_tax2cat`` and, when absent, resolved
    through ``pathoUtilsA.search_cat_in_online_taxonomy`` and cached back into
    ``h_tax2cat``.

    Compared with the earlier implementation:
      * all files are closed and the temporary file removed even on error;
      * records whose id carries no ``ti|N|`` prefix are skipped instead of
        raising ``AttributeError``;
      * records whose category has no open output file (category missing
        from ``catIdx`` or unresolved lineage) are skipped instead of
        crashing on ``None.write``.

    Args:
        catIdx: iterable of category tags, e.g.
            ['N','A','B','E','EP','EH','EF','V','U','O','OPT'].
        h_tax2cat: dict taxonomy id (int) -> category tag; updated in place.
        nt_ti: path of the ti-tagged FASTA file to read.
        outD: directory receiving the category files.

    Returns:
        The (updated) ``h_tax2cat`` dict.
    """
    NOT_AVAIL = 'X'
    tmpFn = outD + '/online_tax_cat.tmp'
    ncbiLineage = [
        'Fungi', 'Protozoan', 'Archaea', 'Bacteria', 'Viroids', 'Viruses',
        'other sequences', 'unclassified sequences', 'Eukaryota'
    ]
    h_lin2catIdx = {
        'Archaea': 'A',
        'Bacteria': 'B',
        'Eukaryota': 'E',
        'Viroids': 'V',
        'Viruses': 'V',
        'other sequences': 'O',
        'unclassified sequences': 'U',
        'Fungi': 'EF',
        'Protozoan': 'EP',
        'N': 'N'
    }

    h_cFps = {}
    try:
        for c in catIdx:
            h_cFps[c] = open(outD + '/' + c + '.fa', 'w')

        print('grouping sequence into each cat...')
        with open(nt_ti, 'r') as fp:
            for r in seqParse.parse(fp, 'fasta'):
                mObj = re.search(r'^ti\|(\d+)\|', r.id)
                if not mObj:
                    continue
                ti = int(mObj.group(1))
                cat = h_tax2cat.get(ti, NOT_AVAIL)
                if cat == NOT_AVAIL:
                    # search in phylogeny tree
                    tmp = pathoUtilsA.search_cat_in_online_taxonomy(
                        ti, ncbiLineage, tmpFn)
                    cat = h_lin2catIdx.get(tmp)
                    h_tax2cat[ti] = cat
                fp2 = h_cFps.get(cat)
                if fp2 is None:
                    # No destination file exists for this category.
                    continue
                fp2.write('>%s\n%s\n' % (r.id, r.seq))
    finally:
        for fp2 in h_cFps.values():
            fp2.close()
        if os.path.exists(tmpFn):
            os.remove(tmpFn)

    print('done.')
    return h_tax2cat
def splitCheck(filePath, maxSize):
    """Split a sequence file into parts when it is larger than ``maxSize``.

    The number of parts is ``ceil(fileSize / maxSize)``.  Part ``i`` is named
    ``<base>_<i><ext>`` next to the input file and records are dealt to the
    parts round-robin.  When every part file already exists the split is
    assumed to have been done and nothing is rewritten.

    Part files are now closed even if parsing or writing fails.

    NOTE(review): records of a ``.fq`` input are written with a ``>`` header
    and without a ``+`` separator line, exactly as before -- confirm that
    downstream readers expect this layout.

    Args:
        filePath: path of the FASTA/FASTQ file (``.fq`` selects FASTQ parsing).
        maxSize: size threshold in bytes.

    Returns:
        ``[filePath]`` when no split is needed, otherwise the list of part
        file paths.
    """
    fileSize = os.stat(filePath).st_size
    nSplit = 1
    if fileSize > maxSize:
        nSplit = int(math.ceil(1.0 * fileSize / float(maxSize)))
    if nSplit == 1:
        return [filePath]

    (base, ext) = os.path.splitext(filePath)
    files = [base + '_' + str(i) + ext for i in range(nSplit)]

    # check if we have already done this splitting
    if all(os.path.exists(fiPath) for fiPath in files):
        return files

    fps = []
    try:
        for fiPath in files:
            fps.append(open(fiPath, 'w'))
        with open(filePath, 'r') as fp:
            if ext == '.fq':
                for j, r in enumerate(seqParse.parse(fp, 'fastq')):
                    fps[j % nSplit].write(
                        '>%s %s\n%s\n%s\n'
                        % (r.id, r.description, r.seq, r.qual))
            else:
                for j, r in enumerate(seqParse.parse(fp, 'fasta')):
                    fps[j % nSplit].write(
                        '>%s %s\n%s\n' % (r.id, r.description, r.seq))
    finally:
        for out in fps:
            out.close()
    return files
def register_fa_category(fa, catTag, ncbiNt_ti, h_tax2cat):
    """Tag the taxonomy ids of the sequences listed in ``fa`` with ``catTag``.

    The GI numbers present in ``fa`` are collected with ``hash_gi_in_fasta``.
    ``ncbiNt_ti`` is then scanned; for every record whose id starts with
    ``ti|<ti>|gi|<gi>|`` and whose gi is in that set, ``h_tax2cat[ti]`` is set
    to ``catTag``.

    Records whose id does not follow that layout are skipped (they used to
    raise ``AttributeError``), and the input file is closed even on error.

    Args:
        fa: FASTA file whose record ids start with ``gi|N|``.
        catTag: category tag to assign.
        ncbiNt_ti: ti-tagged FASTA file to scan.
        h_tax2cat: dict taxonomy id (int) -> category tag; updated in place.

    Returns:
        The (updated) ``h_tax2cat`` dict.
    """
    h_rid = hash_gi_in_fasta(fa)
    with open(ncbiNt_ti, 'r') as fp:
        for r in seqParse.parse(fp, 'fasta'):
            mObj = re.search(r'^ti\|(\d+)\|gi\|(\d+)\|.*', r.id)
            if not mObj:
                continue
            if int(mObj.group(2)) in h_rid:
                # ``\d+`` can never produce a negative number, so the old
                # ``ti != -1`` test was always true and has been dropped.
                h_tax2cat[int(mObj.group(1))] = catTag
    return h_tax2cat
def splitCheck(filePath, maxSize):
    """Split a sequence file into parts when it is larger than ``maxSize``.

    The number of parts is ``ceil(fileSize / maxSize)``.  Part ``i`` is named
    ``<base>_<i><ext>`` next to the input file and records are dealt to the
    parts round-robin.  When every part file already exists the split is
    assumed to have been done and nothing is rewritten.

    Part files are now closed even if parsing or writing fails.

    NOTE(review): records of a ``.fq`` input are written with a ``>`` header
    and without a ``+`` separator line, exactly as before -- confirm that
    downstream readers expect this layout.

    Args:
        filePath: path of the FASTA/FASTQ file (``.fq`` selects FASTQ parsing).
        maxSize: size threshold in bytes.

    Returns:
        ``[filePath]`` when no split is needed, otherwise the list of part
        file paths.
    """
    fileSize = os.stat(filePath).st_size
    nSplit = 1
    if fileSize > maxSize:
        nSplit = int(math.ceil(1.0 * fileSize / float(maxSize)))
    if nSplit == 1:
        return [filePath]

    (base, ext) = os.path.splitext(filePath)
    files = [base + '_' + str(i) + ext for i in range(nSplit)]

    # check if we have already done this splitting
    if all(os.path.exists(fiPath) for fiPath in files):
        return files

    fps = []
    try:
        for fiPath in files:
            fps.append(open(fiPath, 'w'))
        with open(filePath, 'r') as fp:
            if ext == '.fq':
                for j, r in enumerate(seqParse.parse(fp, 'fastq')):
                    fps[j % nSplit].write(
                        '>%s %s\n%s\n%s\n'
                        % (r.id, r.description, r.seq, r.qual))
            else:
                for j, r in enumerate(seqParse.parse(fp, 'fasta')):
                    fps[j % nSplit].write(
                        '>%s %s\n%s\n' % (r.id, r.description, r.seq))
    finally:
        for out in fps:
            out.close()
    return files
def test_extractRead(self):
    """Check the read ids parsed from the FASTQ file after ``extractRead``.

    ``bowtie2Wrap.extractRead`` is called with the target alignment file and
    the FASTQ output path; the records then parsed from that path must carry
    exactly the expected ids, in this order.
    """
    wanted_ids = (
        "HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927",
        "HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927:A",
        "HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927:B",
    )
    count_msg = "Extract Read: Expected number of reads Mismatch!"
    n_wanted = len(wanted_ids)

    bowtie2Wrap.extractRead(self.targetAlignFile, self.fastqOutFile)

    n_seen = 0
    with open(self.fastqOutFile, 'r') as fastq_handle:
        for record in seqParse.parse(fastq_handle, 'fastq'):
            # More records than expected: fail before indexing past the end.
            self.assertTrue(n_seen < n_wanted, count_msg)
            self.assertTrue(record.id == wanted_ids[n_seen],
                            "Extract Read: Expected Reads Mismatch!")
            n_seen += 1
    # Fewer records than expected is caught here.
    self.assertTrue(n_seen == n_wanted, count_msg)
def test_extractRead(self):
    """Verify the ids of the reads found in the FASTQ output, in order.

    After ``bowtie2Wrap.extractRead`` has been called with the target
    alignment file and the FASTQ output path, that path is parsed and every
    record id is compared with the expected list; the record count must
    match as well.
    """
    bowtie2Wrap.extractRead(self.targetAlignFile, self.fastqOutFile)

    wanted = ("HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927",
              "HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927:A")
    total = len(wanted)
    position = 0
    with open(self.fastqOutFile, 'r') as handle:
        for read in seqParse.parse(handle, 'fastq'):
            # Guard against surplus reads before indexing into ``wanted``.
            self.assertTrue(
                position < total,
                "Extract Read: Expected number of reads Mismatch!")
            self.assertTrue(read.id == wanted[position],
                            "Extract Read: Expected Reads Mismatch!")
            position += 1
    self.assertTrue(
        position == total,
        "Extract Read: Expected number of reads Mismatch!")
def group_print_ti_cat_fa(catIdx, h_tax2cat, nt_ti, outD):
    """Distribute the records of a ti-tagged FASTA file into per-category files.

    One output file ``<outD>/<category>.fa`` is opened for every entry of
    ``catIdx``.  Each record id of ``nt_ti`` must start with ``ti|<digits>|``;
    the taxonomy id is looked up in ``h_tax2cat`` and, when absent, resolved
    through ``pathoUtilsA.search_cat_in_online_taxonomy`` and cached back into
    ``h_tax2cat``.

    Compared with the earlier implementation:
      * all files are closed and the temporary file removed even on error;
      * records whose id carries no ``ti|N|`` prefix are skipped instead of
        raising ``AttributeError``;
      * records whose category has no open output file (category missing
        from ``catIdx`` or unresolved lineage) are skipped instead of
        crashing on ``None.write``.

    Args:
        catIdx: iterable of category tags, e.g.
            ['N','A','B','E','EP','EH','EF','V','U','O','OPT'].
        h_tax2cat: dict taxonomy id (int) -> category tag; updated in place.
        nt_ti: path of the ti-tagged FASTA file to read.
        outD: directory receiving the category files.

    Returns:
        The (updated) ``h_tax2cat`` dict.
    """
    NOT_AVAIL = 'X'
    tmpFn = outD + '/online_tax_cat.tmp'
    ncbiLineage = ['Fungi', 'Protozoan', 'Archaea', 'Bacteria', 'Viroids',
                   'Viruses', 'other sequences', 'unclassified sequences',
                   'Eukaryota']
    h_lin2catIdx = {'Archaea': 'A', 'Bacteria': 'B', 'Eukaryota': 'E',
                    'Viroids': 'V', 'Viruses': 'V', 'other sequences': 'O',
                    'unclassified sequences': 'U', 'Fungi': 'EF',
                    'Protozoan': 'EP', 'N': 'N'}

    h_cFps = {}
    try:
        for c in catIdx:
            h_cFps[c] = open(outD + '/' + c + '.fa', 'w')

        print('grouping sequence into each cat...')
        with open(nt_ti, 'r') as fp:
            for r in seqParse.parse(fp, 'fasta'):
                mObj = re.search(r'^ti\|(\d+)\|', r.id)
                if not mObj:
                    continue
                ti = int(mObj.group(1))
                cat = h_tax2cat.get(ti, NOT_AVAIL)
                if cat == NOT_AVAIL:
                    # search in phylogeny tree
                    tmp = pathoUtilsA.search_cat_in_online_taxonomy(
                        ti, ncbiLineage, tmpFn)
                    cat = h_lin2catIdx.get(tmp)
                    h_tax2cat[ti] = cat
                fp2 = h_cFps.get(cat)
                if fp2 is None:
                    # No destination file exists for this category.
                    continue
                fp2.write('>%s\n%s\n' % (r.id, r.seq))
    finally:
        for fp2 in h_cFps.values():
            fp2.close()
        if os.path.exists(tmpFn):
            os.remove(tmpFn)

    print('done.')
    return h_tax2cat
def append_ti_into_fasta_hash(nt, gi2taxFn, Ti2sel, enable_descF,
                              enable_onlineF, nt2, noTaxIdFa, invalSelFlag):
    """Write the records of ``nt`` that belong to the selected taxa to ``nt2``.

    When ``nt`` is already ti-tagged (``check_if_nt_has_ti``), the taxonomy id
    is read from the ``ti|N|`` part of each record id.  Otherwise the gi in
    the id is translated through the table returned by ``gi2tax_list`` and,
    for gis missing from it, optionally through ``pathoUtilsA.ncbi_eutil``;
    selected records are written with a ``ti|N|`` prefix.

    Changes relative to the earlier implementation:
      * the function always returns ``(nt2, noTaxIdFa)``; it used to return
        ``None`` after doing the work but the tuple on the cached path;
      * the "``nt2`` already exists" shortcut is taken before the gi->ti table
        is loaded, so a cached result no longer pays for that load;
      * the file for records without taxonomy id is closed even on error;
      * membership in ``Ti2sel`` is tested against a set.

    Args:
        nt: input FASTA file.
        gi2taxFn: gi->taxonomy table file, passed to ``gi2tax_list``.
        Ti2sel: non-empty sequence of taxonomy ids to keep; a first element
            of -2 selects every taxon.
        enable_descF: write the record description instead of its id.
        enable_onlineF: resolve unknown gis through ``ncbi_eutil``.
        nt2: output FASTA file; if it already exists nothing is done.
        noTaxIdFa: output file for records without a valid taxonomy id.
        invalSelFlag: when true, write those records to ``noTaxIdFa``.

    Returns:
        Tuple ``(nt2, noTaxIdFa)``.
    """
    NOT_AVAIL = 0
    NOT_VALID = -1
    GET_ALL_TAX = -2
    TAXONOMY_ID = 1

    # A previous run already produced the selection: reuse it as is.
    if os.path.exists(nt2):
        return (nt2, noTaxIdFa)

    # check if nt has ti tagged already
    tiReadyF = bool(check_if_nt_has_ti(nt))
    if not tiReadyF:
        (maxGi, gi2ti) = gi2tax_list(gi2taxFn)

    get_all_taxF = (Ti2sel[0] == GET_ALL_TAX)
    selected = set(Ti2sel)

    print('selecting some reference genome sequences in [%s]...' % nt)
    fp1 = None
    try:
        if invalSelFlag:
            fp1 = open(noTaxIdFa, 'w')
        with open(nt2, 'w') as fp2:
            with open(nt, 'r') as fp:
                if tiReadyF:
                    for r in seqParse.parse(fp, 'fasta'):
                        mObj = re.search(r'ti\|(\d+)\|', r.id)
                        if not mObj:
                            continue
                        ti = int(mObj.group(1))
                        if get_all_taxF or (ti in selected):
                            if enable_descF and r.description:
                                fp2.write('>%s\n%s\n'
                                          % (r.description, r.seq))
                            else:
                                fp2.write('>%s\n%s\n' % (r.id, r.seq))
                else:
                    for r in seqParse.parse(fp, 'fasta'):
                        mObj = re.search(r'gi\|(\d+)\|\S+\|(\S+)', r.id)
                        if not mObj:
                            continue
                        gi = int(mObj.group(1))
                        if gi > maxGi or gi2ti[gi] == NOT_AVAIL:
                            if enable_onlineF:
                                # second capture: the identifier that
                                # follows the database tag in the record id
                                genbank_id = mObj.group(2)
                                ti = pathoUtilsA.ncbi_eutil(
                                    gi, genbank_id, TAXONOMY_ID)
                            else:
                                ti = NOT_VALID
                        else:
                            ti = gi2ti[gi]
                        # Remember the outcome so a repeated gi is not
                        # resolved again.
                        if gi < maxGi:
                            gi2ti[gi] = ti

                        if ti == NOT_VALID:
                            if invalSelFlag:
                                fp1.write('>ti|-1|%s\n%s\n'
                                          % (r.description, r.seq))
                        elif get_all_taxF or (ti in selected):
                            if enable_descF:
                                fp2.write('>ti|%d|%s\n%s\n'
                                          % (ti, r.description, r.seq))
                            else:
                                fp2.write('>ti|%d|%s\n%s\n'
                                          % (ti, r.id, r.seq))
    finally:
        if fp1 is not None:
            fp1.close()

    print('check %s' % nt2)
    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')
    return (nt2, noTaxIdFa)
def append_ti_into_fasta_mysql(con, nt, Ti2sel, enable_descF, enable_onlineF,
                               nt2, noTaxIdFa, invalSelFlag):
    """Write the records of ``nt`` that belong to the selected taxa to ``nt2``.

    When ``nt`` is already ti-tagged (``check_if_nt_has_ti``), the taxonomy id
    is read from the ``ti|N|`` part of each record id.  Otherwise the gi in
    the id is looked up in table ``giAnnoT`` through ``con`` and, when absent,
    optionally resolved with ``pathoUtilsA.ncbi_eutil``; selected records are
    written as ``ti|<ti>|org|<organism>|...`` with the organism name taken
    from ``dbUtils.findOrganismLineage``.

    Changes relative to the earlier implementation:
      * the identifier handed to ``ncbi_eutil`` is converted to ``int`` only
        when it is numeric; accession strings are passed unchanged, as
        ``append_ti_into_fasta_hash`` does, instead of raising ``ValueError``;
      * the file for records without taxonomy id is closed even on error;
      * membership in ``Ti2sel`` is tested against a set;
      * ``(nt2, noTaxIdFa)`` is returned (previously ``None``), matching the
        tuple ``append_ti_into_fasta_hash`` returns.

    Args:
        con: open database connection (used as a context manager per query).
        nt: input FASTA file.
        Ti2sel: non-empty sequence of taxonomy ids to keep; a first element
            of -2 selects every taxon.
        enable_descF: write the record description instead of its id.
        enable_onlineF: resolve gis missing from the database online.
        nt2: output FASTA file (overwritten).
        noTaxIdFa: output file for records without a valid taxonomy id.
        invalSelFlag: when true, write those records to ``noTaxIdFa``.

    Returns:
        Tuple ``(nt2, noTaxIdFa)``.
    """
    NOT_VALID = -1
    GET_ALL_TAX = -2
    TAXON_ID = 1

    # check if nt has ti tagged already
    tiReadyF = bool(check_if_nt_has_ti(nt))
    get_all_taxF = (Ti2sel[0] == GET_ALL_TAX)
    selected = set(Ti2sel)

    print('selecting some reference genome sequences in [%s]' % nt)
    fp1 = None
    try:
        if invalSelFlag:
            fp1 = open(noTaxIdFa, 'w')
        with open(nt2, 'w') as fp2:
            with open(nt, 'r') as fp:
                for r in seqParse.parse(fp, 'fasta'):
                    if tiReadyF:
                        mObj = re.search(r'ti\|(\d+)\|', r.id)
                        if not mObj:
                            continue
                        ti = int(mObj.group(1))
                        if ti != NOT_VALID and (get_all_taxF
                                                or (ti in selected)):
                            if enable_descF and r.description:
                                fp2.write('>%s\n%s\n'
                                          % (r.description, r.seq))
                            else:
                                fp2.write('>%s\n%s\n' % (r.id, r.seq))
                        continue

                    mObj = re.search(r'gi\|(\d+)\|\S+\|(\S+)', r.id)
                    if not mObj:
                        continue
                    gi = int(mObj.group(1))
                    with con:
                        cur = con.cursor()
                        # gi is an int parsed from digits, so formatting it
                        # into the statement cannot inject SQL.
                        sqlcmd = 'select taxon from giAnnoT where gi=%d' % gi
                        cur.execute(sqlcmd)
                        entr = cur.fetchone()
                        if entr:
                            ti = int(entr[0])
                        elif enable_onlineF:
                            seqId = mObj.group(2)
                            try:
                                seqId = int(seqId)
                            except ValueError:
                                # Not numeric (e.g. an accession): keep the
                                # raw string rather than abort the run.
                                pass
                            ti = pathoUtilsA.ncbi_eutil(gi, seqId, TAXON_ID)
                        else:
                            ti = NOT_VALID

                    if ti == NOT_VALID:
                        if invalSelFlag:
                            fp1.write('>ti|-1|%s\n%s\n'
                                      % (r.description, r.seq))
                    elif get_all_taxF or (ti in selected):
                        organismName, _ = dbUtils.findOrganismLineage(con, ti)
                        organismName = re.sub(r'\s+', '_', organismName)
                        if enable_descF and r.description:
                            fp2.write('>ti|%d|org|%s|%s\n%s\n'
                                      % (ti, organismName, r.description,
                                         r.seq))
                        else:
                            fp2.write('>ti|%d|org|%s|%s\n%s\n'
                                      % (ti, organismName, r.id, r.seq))
    finally:
        if fp1 is not None:
            fp1.close()

    print('check %s' % nt2)
    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')
    return (nt2, noTaxIdFa)
def get_genome_annotation_in_mysql(
        refConsFq, minContigLen, MySqlConf, h_annoT, h_ti_contig):
    """Collect consensus contigs per taxon and, optionally, their annotation.

    For every record of the consensus FASTQ ``refConsFq`` the covered ranges
    returned by ``selectConsensusContigs`` are stored in ``h_ti_contig`` under
    the record's taxonomy id as ``[record id, contig length, contig
    sequence]``.  When MySQL is configured, the sub-entries of the record's
    gi (table ``giDelimT``) are compared with the covered ranges and the
    qualifying ones are appended to ``h_annoT`` together with ``ref_name``
    and ``product`` from ``giAnnoT``.

    Bug fix: the taxonomy id and gi are now determined per record.  Before,
    a record id with no usable ``ti|N|`` part left ``ti`` unset
    (``UnboundLocalError`` on the first record) or still holding the previous
    record's value, so its contigs were filed under the wrong taxon.  Such
    records are now skipped.
    NOTE(review): skipping is the conservative choice -- confirm whether
    gi-only references should instead be filed under a placeholder taxon.

    The FASTQ file and the database connection are released even on error.

    Args:
        refConsFq: path of the consensus FASTQ file.
        minContigLen: minimum contig length, passed to
            ``selectConsensusContigs``.
        MySqlConf: sequence (hostname, port, user, passwd, defaultDb); a
            password equal to 'X' disables the database lookups.
        h_annoT: dict ti -> list of annotation rows; updated in place.
        h_ti_contig: dict ti -> list of contigs; updated in place.

    Returns:
        Tuple ``(h_annoT, h_ti_contig)``.
    """
    START, END = range(2)
    SUBGI, GENE, LOCS_TAG, PROID, STBP, EDBP = range(6)
    NAs = 'X'
    # (hostname,port,user,passwd,defaultDb)=range(5)
    (_, _, _, passwd, _) = range(5)

    con = None
    try:
        if MySqlConf[passwd] != NAs:  # 'X' means: do not use mysql
            con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

        with open(refConsFq, 'r') as fp:
            for r in seqParse.parse(fp, 'fastq'):  # for each covered genome
                # -1 disables checking the sequence complexity of a contig
                covRange = selectConsensusContigs(r, minContigLen, -1)
                if not covRange:
                    continue

                # extract ti and gi
                refName = r.id
                mObj = re.search(
                    r'ti\|(\d+)\|org\|([^|]+)\|gi\|(\d+)\|', r.id)
                if mObj:
                    ti = mObj.group(1)
                    gi = mObj.group(3)
                else:
                    mObj = re.search(r'ti\|(\d+)\|gi\|(\d+)\|', r.id)
                    if mObj and mObj.group(1) != "-1":
                        ti = mObj.group(1)
                        gi = mObj.group(2)
                    else:
                        # No taxonomy id to file the contigs under.
                        continue

                if not h_ti_contig.get(ti, []):
                    h_ti_contig[ti] = []
                for cov in covRange:
                    contigSeq = str(r.seq[cov[0]:cov[1] + 1])
                    cLen = cov[1] - cov[0] + 1
                    h_ti_contig[ti].append([refName, cLen, contigSeq])

                if not con:
                    continue

                mysql_sel_cmd = ('select sub_gi, gene, locus_tag, protein_id,'
                                 ' stbp, edbp from giDelimT where gi = %s'
                                 % gi)
                cur = con.cursor()
                cur.execute(mysql_sel_cmd)
                entr = cur.fetchall()
                for j in (entr or ()):
                    # select which subgi sits within the covered regions
                    aStbp = int(j[STBP])
                    aEdbp = int(j[EDBP])
                    notCoveredA = aEdbp - aStbp + 1
                    # The entry is reported once the accumulated value of
                    # segments_intersect exceeds 100 (presumably the overlap
                    # length in bases -- TODO confirm).
                    minCoveredA2 = notCoveredA - 100
                    reportA = False
                    for cov in covRange:
                        notCoveredA -= pathoUtilsA.segments_intersect(
                            aStbp, aEdbp, cov[START], cov[END])
                        if notCoveredA < minCoveredA2:
                            reportA = True
                            break
                    if not reportA:
                        continue

                    selCmd = ('select ref_name, product from giAnnoT'
                              ' where gi = %s' % j[SUBGI])
                    cur = con.cursor()
                    cur.execute(selCmd)
                    entr2 = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if entr2:
                        ref_name = entr2[0]
                        product = entr2[1]
                    if h_annoT.get(ti, -1) == -1:
                        h_annoT[ti] = []
                    h_annoT[ti].append([j[SUBGI], j[GENE], j[LOCS_TAG],
                                        j[PROID], ref_name, product])
    finally:
        if con:
            dbUtils.mysql_close(con)
    return h_annoT, h_ti_contig