def hash_gi_in_fasta(fa):
    """Collect the GI numbers found in the record ids of a FASTA file.

    Each record of ``fa`` is read through ``seqParse.parse``. When the record
    id starts with ``gi|<digits>|`` the number is registered as a key.

    Args:
        fa: path of the FASTA file to scan.

    Returns:
        dict mapping every GI (int) that was found to ``True``.
    """
    h_rid = {}
    # 'with' releases the handle even when parsing raises.
    with open(fa, 'r') as fp:
        for r in seqParse.parse(fp, 'fasta'):
            mObj = re.search(r'^gi\|(\d+)\|', r.id)
            if not mObj:
                # Id does not start with 'gi|<digits>|': nothing to register
                # (this used to fail with AttributeError on None).
                continue
            h_rid[int(mObj.group(1))] = True
    return h_rid
def group_print_ti_cat_fa(catIdx, h_tax2cat, nt_ti, outD):
    """Distribute the records of a ti-tagged FASTA file into per-category files.

    One file ``<outD>/<cat>.fa`` is opened for every tag in ``catIdx``. Each
    record of ``nt_ti`` whose id starts with ``ti|<digits>|`` is written to the
    file of its category. The category is read from ``h_tax2cat``; a taxon id
    missing there is resolved with
    ``pathoUtilsA.search_cat_in_online_taxonomy`` and the answer is stored
    back into ``h_tax2cat``.

    Records that cannot be placed (id without a ``ti|<digits>|`` prefix, or a
    category with no open output file) are skipped and counted instead of
    aborting the whole run.

    Args:
        catIdx: iterable of category tags; one output file is made per tag.
        h_tax2cat: dict, taxon id (int) -> category tag. Updated in place.
        nt_ti: path of the ti-tagged FASTA file to read.
        outD: directory receiving the ``<cat>.fa`` files.

    Returns:
        ``h_tax2cat`` (the same dict, possibly extended).
    """
    #catIdx=['N','A','B','E','EP','EH','EF','V','U','O','OPT']
    NOT_AVAIL = 'X'
    tmpFn = outD + '/online_tax_cat.tmp'
    ncbiLineage = [
        'Fungi', 'Protozoan', 'Archaea', 'Bacteria', 'Viroids', 'Viruses',
        'other sequences', 'unclassified sequences', 'Eukaryota'
    ]
    h_lin2catIdx = {
        'Archaea': 'A',
        'Bacteria': 'B',
        'Eukaryota': 'E',
        'Viroids': 'V',
        'Viruses': 'V',
        'other sequences': 'O',
        'unclassified sequences': 'U',
        'Fungi': 'EF',
        'Protozoan': 'EP',
        'N': 'N'
    }

    h_cFps = {}
    nSkipped = 0
    try:
        for c in catIdx:
            h_cFps[c] = open(outD + '/' + c + '.fa', 'w')

        print('grouping sequence into each cat...')

        with open(nt_ti, 'r') as fp:
            for r in seqParse.parse(fp, 'fasta'):
                mObj = re.search(r'^ti\|(\d+)\|', r.id)
                if not mObj:
                    nSkipped += 1
                    continue
                ti = int(mObj.group(1))
                cat = h_tax2cat.get(ti, NOT_AVAIL)
                if cat == NOT_AVAIL:
                    # Unknown taxon: ask the online phylogeny lookup and
                    # remember the answer for later records.
                    tmp = pathoUtilsA.search_cat_in_online_taxonomy(
                        ti, ncbiLineage, tmpFn)
                    cat = h_lin2catIdx.get(tmp)
                    h_tax2cat[ti] = cat

                fp2 = h_cFps.get(cat)
                if fp2 is None:
                    # Category is unresolved or was not requested in catIdx.
                    nSkipped += 1
                    continue
                fp2.write('>%s\n%s\n' % (r.id, r.seq))
    finally:
        # Always release the outputs and the scratch file, even on error.
        for fp2 in h_cFps.values():
            fp2.close()
        if os.path.exists(tmpFn):
            os.remove(tmpFn)

    if nSkipped:
        print('skipped %d sequence(s) without a usable category' % nSkipped)
    print('done.')
    return (h_tax2cat)
def splitCheck(filePath, maxSize):
    """Split a sequence file into round-robin pieces when it exceeds ``maxSize``.

    The number of pieces is ``ceil(file size / maxSize)``. Piece ``i`` is named
    ``<base>_<i><ext>`` next to the input. Records are dealt to the pieces in
    turn (record ``j`` goes to piece ``j % nSplit``). When every piece already
    exists on disk nothing is rewritten.

    A file whose extension is ``.fq`` is parsed as FASTQ and written back as
    four-line FASTQ records (``@header``, sequence, ``+``, quality); any other
    file is parsed and written as FASTA.

    Args:
        filePath: path of the FASTA/FASTQ file to check.
        maxSize: size threshold in bytes, compared with ``os.stat().st_size``.

    Returns:
        ``[filePath]`` when no split is needed, otherwise the list of piece
        paths in index order.
    """
    fileSize = os.stat(filePath).st_size
    nSplit = 1
    if fileSize > maxSize:
        nSplit = int(math.ceil(1.0 * fileSize / float(maxSize)))
    if nSplit == 1:
        return [filePath]

    (base, ext) = os.path.splitext(filePath)
    files = [base + '_' + str(i) + ext for i in range(nSplit)]

    # A previous run already produced every piece: reuse them.
    if all(os.path.exists(fiPath) for fiPath in files):
        return files

    isFastq = (ext == '.fq')
    fps = []
    try:
        for fiPath in files:
            fps.append(open(fiPath, 'w'))
        with open(filePath, 'r') as fp:
            records = seqParse.parse(fp, 'fastq' if isFastq else 'fasta')
            for j, r in enumerate(records):
                out = fps[j % nSplit]
                if isFastq:
                    # The previous '>' header without a '+' separator line
                    # produced pieces that were neither FASTQ nor FASTA.
                    out.write('@%s %s\n%s\n+\n%s\n' %
                              (r.id, r.description, r.seq, r.qual))
                else:
                    out.write('>%s %s\n%s\n' % (r.id, r.description, r.seq))
    finally:
        # Close whatever was opened, even if parsing or writing failed.
        for out in fps:
            out.close()
    return files
def register_fa_category(fa, catTag, ncbiNt_ti, h_tax2cat):
    """Assign ``catTag`` to every taxon id whose GI occurs in ``fa``.

    The GIs of ``fa`` are gathered with ``hash_gi_in_fasta``. ``ncbiNt_ti`` is
    then scanned for records whose id looks like ``ti|<ti>|gi|<gi>|...``; for
    each one whose GI was seen in ``fa``, ``h_tax2cat[ti]`` is set to
    ``catTag``. Records with a different id layout are ignored.

    Args:
        fa: path of the FASTA file defining the category (gi-tagged ids).
        catTag: category tag to store.
        ncbiNt_ti: path of the FASTA file whose ids carry both ti and gi.
        h_tax2cat: dict, taxon id (int) -> category tag. Updated in place.

    Returns:
        ``h_tax2cat`` (the same dict).
    """
    h_rid = hash_gi_in_fasta(fa)
    with open(ncbiNt_ti, 'r') as fp:
        for r in seqParse.parse(fp, 'fasta'):
            mObj = re.search(r'^ti\|(\d+)\|gi\|(\d+)\|.*', r.id)
            if not mObj:
                # Previously an AttributeError on None for such ids.
                continue
            if int(mObj.group(2)) in h_rid:
                # '\d+' can never yield -1, so no extra validity test is
                # needed on the taxon id.
                h_tax2cat[int(mObj.group(1))] = catTag
    return h_tax2cat
Example #5
0
def splitCheck(filePath, maxSize):
    """Split a sequence file into round-robin pieces when it exceeds ``maxSize``.

    The number of pieces is ``ceil(file size / maxSize)``. Piece ``i`` is named
    ``<base>_<i><ext>`` next to the input. Records are dealt to the pieces in
    turn (record ``j`` goes to piece ``j % nSplit``). When every piece already
    exists on disk nothing is rewritten.

    A file whose extension is ``.fq`` is parsed as FASTQ and written back as
    four-line FASTQ records (``@header``, sequence, ``+``, quality); any other
    file is parsed and written as FASTA.

    Args:
        filePath: path of the FASTA/FASTQ file to check.
        maxSize: size threshold in bytes, compared with ``os.stat().st_size``.

    Returns:
        ``[filePath]`` when no split is needed, otherwise the list of piece
        paths in index order.
    """
    fileSize = os.stat(filePath).st_size
    nSplit = 1
    if fileSize > maxSize:
        nSplit = int(math.ceil(1.0 * fileSize / float(maxSize)))
    if nSplit == 1:
        return [filePath]

    (base, ext) = os.path.splitext(filePath)
    files = [base + '_' + str(i) + ext for i in range(nSplit)]

    # A previous run already produced every piece: reuse them.
    if all(os.path.exists(fiPath) for fiPath in files):
        return files

    isFastq = (ext == '.fq')
    fps = []
    try:
        for fiPath in files:
            fps.append(open(fiPath, 'w'))
        with open(filePath, 'r') as fp:
            records = seqParse.parse(fp, 'fastq' if isFastq else 'fasta')
            for j, r in enumerate(records):
                out = fps[j % nSplit]
                if isFastq:
                    # The previous '>' header without a '+' separator line
                    # produced pieces that were neither FASTQ nor FASTA.
                    out.write('@%s %s\n%s\n+\n%s\n' %
                              (r.id, r.description, r.seq, r.qual))
                else:
                    out.write('>%s %s\n%s\n' % (r.id, r.description, r.seq))
    finally:
        # Close whatever was opened, even if parsing or writing failed.
        for out in fps:
            out.close()
    return files
Example #6
0
	def test_extractRead(self):
		"""extractRead must write exactly the expected read ids, in order.

		The alignment file is converted with ``bowtie2Wrap.extractRead`` and
		the result is parsed back as FASTQ. ``assertEqual`` is used so that a
		failure shows the actual and expected values, not just "False".
		"""
		bowtie2Wrap.extractRead(self.targetAlignFile, self.fastqOutFile)
		expectedReadId = ["HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927",
			"HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927:A",
			"HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927:B"]
		with open(self.fastqOutFile,'r') as fp:
			readIds = [r.id for r in seqParse.parse(fp,'fastq')]
		self.assertEqual(len(readIds), len(expectedReadId),
			"Extract Read: Expected number of reads Mismatch!")
		self.assertEqual(readIds, expectedReadId,
			"Extract Read: Expected Reads Mismatch!")
 def test_extractRead(self):
     """extractRead must write exactly the expected read ids, in order.

     The alignment file is converted with ``bowtie2Wrap.extractRead`` and
     the result is parsed back as FASTQ. ``assertEqual`` is used so that a
     failure shows the actual and expected values, not just "False".
     """
     bowtie2Wrap.extractRead(self.targetAlignFile, self.fastqOutFile)
     expectedReadId = [
         "HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927",
         "HWI-ST998R:270:H7NJ9ADXX:1:1101:1797:1927:A"
     ]
     with open(self.fastqOutFile, 'r') as fp:
         readIds = [r.id for r in seqParse.parse(fp, 'fastq')]
     self.assertEqual(
         len(readIds), len(expectedReadId),
         "Extract Read: Expected number of reads Mismatch!")
     self.assertEqual(readIds, expectedReadId,
                      "Extract Read: Expected Reads Mismatch!")
Example #8
0
def group_print_ti_cat_fa(catIdx,h_tax2cat,nt_ti,outD):
    """Distribute the records of a ti-tagged FASTA file into per-category files.

    One file ``<outD>/<cat>.fa`` is opened for every tag in ``catIdx``. Each
    record of ``nt_ti`` whose id starts with ``ti|<digits>|`` is written to the
    file of its category. The category is read from ``h_tax2cat``; a taxon id
    missing there is resolved with
    ``pathoUtilsA.search_cat_in_online_taxonomy`` and the answer is stored
    back into ``h_tax2cat``.

    Records that cannot be placed (id without a ``ti|<digits>|`` prefix, or a
    category with no open output file) are skipped and counted instead of
    aborting the whole run.

    Args:
        catIdx: iterable of category tags; one output file is made per tag.
        h_tax2cat: dict, taxon id (int) -> category tag. Updated in place.
        nt_ti: path of the ti-tagged FASTA file to read.
        outD: directory receiving the ``<cat>.fa`` files.

    Returns:
        ``h_tax2cat`` (the same dict, possibly extended).
    """
    #catIdx=['N','A','B','E','EP','EH','EF','V','U','O','OPT']
    NOT_AVAIL = 'X'
    tmpFn = outD + '/online_tax_cat.tmp'
    ncbiLineage = ['Fungi', 'Protozoan', 'Archaea', 'Bacteria', 'Viroids',
                   'Viruses', 'other sequences', 'unclassified sequences',
                   'Eukaryota']
    h_lin2catIdx = {'Archaea': 'A', 'Bacteria': 'B', 'Eukaryota': 'E',
                    'Viroids': 'V', 'Viruses': 'V', 'other sequences': 'O',
                    'unclassified sequences': 'U', 'Fungi': 'EF',
                    'Protozoan': 'EP', 'N': 'N'}

    h_cFps = {}
    nSkipped = 0
    try:
        for c in catIdx:
            h_cFps[c] = open(outD + '/' + c + '.fa', 'w')

        print('grouping sequence into each cat...')

        with open(nt_ti, 'r') as fp:
            for r in seqParse.parse(fp, 'fasta'):
                mObj = re.search(r'^ti\|(\d+)\|', r.id)
                if not mObj:
                    nSkipped += 1
                    continue
                ti = int(mObj.group(1))
                cat = h_tax2cat.get(ti, NOT_AVAIL)
                if cat == NOT_AVAIL:
                    # Unknown taxon: ask the online phylogeny lookup and
                    # remember the answer for later records.
                    tmp = pathoUtilsA.search_cat_in_online_taxonomy(
                        ti, ncbiLineage, tmpFn)
                    cat = h_lin2catIdx.get(tmp)
                    h_tax2cat[ti] = cat

                fp2 = h_cFps.get(cat)
                if fp2 is None:
                    # Category is unresolved or was not requested in catIdx.
                    nSkipped += 1
                    continue
                fp2.write('>%s\n%s\n' % (r.id, r.seq))
    finally:
        # Always release the outputs and the scratch file, even on error.
        for fp2 in h_cFps.values():
            fp2.close()
        if os.path.exists(tmpFn):
            os.remove(tmpFn)

    if nSkipped:
        print('skipped %d sequence(s) without a usable category' % nSkipped)
    print('done.')
    return (h_tax2cat)
def append_ti_into_fasta_hash(nt, gi2taxFn, Ti2sel, enable_descF, enable_onlineF,
        nt2, noTaxIdFa, invalSelFlag):
    """Write the selected records of ``nt`` to ``nt2`` with a taxon id prefix.

    If ``nt2`` already exists nothing is done. Otherwise:

    * when ``check_if_nt_has_ti(nt)`` is true, the taxon id is read from the
      ``ti|<digits>|`` part of each id and selected records are copied as is;
    * otherwise the GI is read from ``gi|<digits>|<x>|<acc>`` and translated
      with the table returned by ``gi2tax_list(gi2taxFn)``. A GI that the
      table cannot resolve is looked up with ``pathoUtilsA.ncbi_eutil`` when
      ``enable_onlineF`` is set, else it is treated as invalid. Selected
      records are written with a ``ti|<ti>|`` prefix.

    A record is selected when ``Ti2sel[0]`` is -2 (take everything) or its
    taxon id is in ``Ti2sel``. Records with an invalid taxon id go to
    ``noTaxIdFa`` (header prefix ``ti|-1|``) when ``invalSelFlag`` is set.

    Args:
        nt: path of the input FASTA file.
        gi2taxFn: path handed to ``gi2tax_list`` (unused if ``nt`` has ti).
        Ti2sel: non-empty sequence of taxon ids to keep, or ``[-2]`` for all.
        enable_descF: use the record description (when not empty) in the
            header; otherwise the record id is used.
        enable_onlineF: allow the online lookup for unresolved GIs.
        nt2: path of the output FASTA file.
        noTaxIdFa: path of the file receiving records without a taxon id.
        invalSelFlag: whether ``noTaxIdFa`` is written at all.

    Returns:
        ``(nt2, noTaxIdFa)``. The normal path now returns the same tuple as
        the early exit (it used to fall through and return ``None``).
    """
    NOT_AVAIL = 0
    NOT_VALID = -1
    GET_ALL_TAX = -2
    TAXONOMY_ID = 1

    # Test this first so an existing output costs nothing: the gi2tax table
    # used to be loaded before this check.
    if os.path.exists(nt2):
        return (nt2, noTaxIdFa)

    #check if nt has ti tagged already
    tiReadyF = bool(check_if_nt_has_ti(nt))

    maxGi = None
    gi2ti = None
    if not tiReadyF:
        (maxGi, gi2ti) = gi2tax_list(gi2taxFn)

    get_all_taxF = (Ti2sel[0] == GET_ALL_TAX)

    print('selecting some reference genome sequences in [%s]...' % nt)

    fp1 = open(noTaxIdFa, 'w') if invalSelFlag else None
    try:
        with open(nt2, 'w') as fp2:
            with open(nt, 'r') as fp:
                for r in seqParse.parse(fp, 'fasta'):
                    if enable_descF and r.description:
                        label = r.description
                    else:
                        label = r.id

                    if tiReadyF:
                        mObj = re.search(r'ti\|(\d+)\|', r.id)
                        if not mObj:
                            continue
                        ti = int(mObj.group(1))
                        if get_all_taxF or (ti in Ti2sel):
                            fp2.write('>%s\n%s\n' % (label, r.seq))
                        continue

                    mObj = re.search(r'gi\|(\d+)\|\S+\|(\S+)', r.id)
                    if not mObj:
                        continue
                    gi = int(mObj.group(1))
                    # gi2ti is read at index maxGi, so it is writable there
                    # too: the old 'gi < maxGi' cache test was off by one.
                    inTable = (gi <= maxGi)
                    if inTable and gi2ti[gi] != NOT_AVAIL:
                        ti = gi2ti[gi]
                    elif enable_onlineF:
                        # second capture: the id field following the database tag
                        genbank_id = mObj.group(2)
                        ti = pathoUtilsA.ncbi_eutil(gi, genbank_id, TAXONOMY_ID)
                    else:
                        ti = NOT_VALID

                    if inTable:
                        # Remember the outcome for later records with this GI.
                        gi2ti[gi] = ti

                    if ti == NOT_VALID:
                        if invalSelFlag:
                            fp1.write('>ti|-1|%s\n%s\n' % (r.description, r.seq))
                    elif get_all_taxF or (ti in Ti2sel):
                        fp2.write('>ti|%d|%s\n%s\n' % (ti, label, r.seq))
    finally:
        # fp1 is not managed by a 'with' block; close it on every path.
        if fp1 is not None:
            fp1.close()

    print('check %s' % nt2)
    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')
    return (nt2, noTaxIdFa)
def append_ti_into_fasta_mysql(con, nt, Ti2sel, enable_descF, enable_onlineF,
        nt2, noTaxIdFa, invalSelFlag):
    """Write the selected records of ``nt`` to ``nt2``, resolving taxa via MySQL.

    * When ``check_if_nt_has_ti(nt)`` is true, the taxon id is read from the
      ``ti|<digits>|`` part of each id and selected records are copied as is.
    * Otherwise the GI is read from ``gi|<digits>|<x>|<acc>`` and its taxon is
      fetched from table ``giAnnoT``. If the table has no row, the taxon is
      looked up with ``pathoUtilsA.ncbi_eutil`` when ``enable_onlineF`` is
      set, else the record is treated as invalid. Selected records are
      written with the header ``ti|<ti>|org|<organism>|<label>``, where the
      organism name comes from ``dbUtils.findOrganismLineage`` with
      whitespace runs replaced by ``_``.

    A record is selected when ``Ti2sel[0]`` is -2 (take everything) or its
    taxon id is in ``Ti2sel``. Records with an invalid taxon id go to
    ``noTaxIdFa`` (header prefix ``ti|-1|``) when ``invalSelFlag`` is set.

    Args:
        con: open database connection (used as a context manager and for
            cursors).
        nt: path of the input FASTA file.
        Ti2sel: non-empty sequence of taxon ids to keep, or ``[-2]`` for all.
        enable_descF: use the record description (when not empty) as the
            header label; otherwise the record id is used.
        enable_onlineF: allow the online lookup for GIs missing in the table.
        nt2: path of the output FASTA file.
        noTaxIdFa: path of the file receiving records without a taxon id.
        invalSelFlag: whether ``noTaxIdFa`` is written at all.

    Returns:
        None.
    """
    NOT_VALID = -1
    GET_ALL_TAX = -2
    TAXON_ID = 1

    #check if nt has ti tagged already
    tiReadyF = bool(check_if_nt_has_ti(nt))
    get_all_taxF = (Ti2sel[0] == GET_ALL_TAX)

    print('selecting some reference genome sequences in [%s]' % nt)

    fp1 = open(noTaxIdFa, 'w') if invalSelFlag else None
    try:
        with open(nt2, 'w') as fp2:
            with open(nt, 'r') as fp:
                for r in seqParse.parse(fp, 'fasta'):
                    if enable_descF and r.description:
                        label = r.description
                    else:
                        label = r.id

                    if tiReadyF:
                        mObj = re.search(r'ti\|(\d+)\|', r.id)
                        if not mObj:
                            continue
                        ti = int(mObj.group(1))
                        if ti != NOT_VALID and (get_all_taxF or (ti in Ti2sel)):
                            fp2.write('>%s\n%s\n' % (label, r.seq))
                        continue

                    mObj = re.search(r'gi\|(\d+)\|\S+\|(\S+)', r.id)
                    if not mObj:
                        continue
                    gi = int(mObj.group(1))

                    with con:
                        cur = con.cursor()
                        # gi is an int parsed from '\d+', so formatting it
                        # into the statement cannot inject SQL.
                        cur.execute('select taxon from giAnnoT where gi=%d' % gi)
                        entr = cur.fetchone()
                        if entr:
                            ti = int(entr[0])
                        elif enable_onlineF:
                            # The second capture is '\S+' text (an accession
                            # style field), not a number: int() on it raised
                            # ValueError. Pass it through as text.
                            seqId = mObj.group(2)
                            ti = pathoUtilsA.ncbi_eutil(gi, seqId, TAXON_ID)
                        else:
                            ti = NOT_VALID

                    if ti == NOT_VALID:
                        if invalSelFlag:
                            fp1.write('>ti|-1|%s\n%s\n' % (r.description, r.seq))
                    elif get_all_taxF or (ti in Ti2sel):
                        organismName, _ = dbUtils.findOrganismLineage(con, ti)
                        organismName = re.sub(r'\s+', '_', organismName)
                        fp2.write('>ti|%d|org|%s|%s\n%s\n' %
                                  (ti, organismName, label, r.seq))
    finally:
        # fp1 is not managed by a 'with' block; close it on every path.
        if fp1 is not None:
            fp1.close()

    print('check %s' % nt2)
    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')
Example #11
0
def get_genome_annotation_in_mysql(\
    refConsFq, minContigLen, MySqlConf, h_annoT, h_ti_contig):
    """Collect covered contigs, and optionally gene annotations, per taxon.

    Every record of the FASTQ file ``refConsFq`` is passed to
    ``selectConsensusContigs``; records without any returned range are
    ignored. For the others:

    * each returned range ``[start, end]`` (inclusive) is appended to
      ``h_ti_contig[ti]`` as ``[record id, length, sequence slice]``;
    * when MySQL is enabled and the id carries a GI, the rows of ``giDelimT``
      for that GI are fetched. A row is reported once the accumulated
      ``pathoUtilsA.segments_intersect`` values against the ranges exceed
      100, and is appended to ``h_annoT[ti]`` as ``[sub_gi, gene, locus_tag,
      protein_id, ref_name, product]`` (``ref_name``/``product`` come from
      ``giAnnoT``, or ``'X'`` when missing).

    The taxon id and GI are taken from the record id, trying
    ``ti|<ti>|org|<name>|gi|<gi>|``, then ``ti|<ti>|gi|<gi>|``, then
    ``gi|<gi>|``.

    Args:
        refConsFq: path of the consensus FASTQ file.
        minContigLen: forwarded to ``selectConsensusContigs``.
        MySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of ``'X'`` disables MySQL.
        h_annoT: dict, taxon id (str) -> list of annotation rows. Updated
            in place.
        h_ti_contig: dict, taxon id (str) -> list of contigs. Updated in
            place.

    Returns:
        ``(h_annoT, h_ti_contig)``.
    """
    START,END = range(2)
    SUBGI,GENE,LOCS_TAG,PROID,STBP,EDBP = range(6)
    NAs = 'X'
    # NOTE(review): key used when the id carries no usable taxon id. The old
    # code left 'ti' unbound (NameError on the first record) or silently
    # reused the previous record's taxon. '-1' mirrors the 'ti|-1|' headers
    # used for untagged sequences -- confirm this is the wanted bucket.
    NO_TI = '-1'
    #(hostname,port,user,passwd,defaultDb)=range(5)
    passwd = 3

    con = None
    if MySqlConf[passwd] != NAs: #otherwise, we do not use mysql
        con = dbUtils.init_mysql_innocentive(MySqlConf,0)

    try:
        with open(refConsFq,'r') as fp:
            for r in seqParse.parse(fp,'fastq'): # for each covered genome
                #disable checking seq complexity of contig
                covRange = selectConsensusContigs(r,minContigLen,-1)
                if not covRange:
                    continue

                #extract ti and gi; reset both for every record
                ti = NO_TI
                gi = None
                refName = r.id
                mObj = re.search(r'ti\|(\d+)\|org\|([^|]+)\|gi\|(\d+)\|',r.id)
                if mObj:
                    ti = mObj.group(1)
                    gi = mObj.group(3)
                else:
                    mObj = re.search(r'ti\|(\d+)\|gi\|(\d+)\|',r.id)
                    if mObj and mObj.group(1) != "-1":
                        ti = mObj.group(1)
                        gi = mObj.group(2)
                    else:
                        mObj = re.search(r'gi\|(\d+)\|',r.id)
                        if mObj:
                            gi = mObj.group(1)

                if not h_ti_contig.get(ti,[]):
                    h_ti_contig[ti] = []
                for seg in covRange:
                    st = seg[START]; ed = seg[END]
                    # ranges are inclusive on both ends
                    h_ti_contig[ti].append([refName, ed-st+1, str(r.seq[st:ed+1])])

                if not con or gi is None:
                    continue

                # gi holds only digits (captured with '\d+').
                mysql_sel_cmd = 'select sub_gi, gene, locus_tag, protein_id, stbp, edbp from giDelimT where gi = %s' % gi
                cur = con.cursor()
                cur.execute(mysql_sel_cmd)
                entr = cur.fetchall()
                if not entr:
                    continue

                for j in entr: #select which subgi sits within the covered genomic regions
                    aStbp = int(j[STBP]); aEdbp = int(j[EDBP])
                    notCoveredA = aEdbp-aStbp+1
                    minCoveredA2 = notCoveredA - 100

                    reportA = False
                    for seg in covRange:
                        notCoveredA -= pathoUtilsA.segments_intersect(aStbp,aEdbp,seg[START],seg[END])
                        if notCoveredA < minCoveredA2:
                            reportA = True
                            break
                    if not reportA:
                        continue

                    selCmd = 'select ref_name, product from giAnnoT where gi = %s' % j[SUBGI]
                    cur2 = con.cursor()
                    cur2.execute(selCmd)
                    entr2 = cur2.fetchone()
                    ref_name = NAs; product = NAs
                    if entr2:
                        ref_name = entr2[0]; product = entr2[1]
                    h_annoT.setdefault(ti,[]).append(
                        [j[SUBGI],j[GENE],j[LOCS_TAG],j[PROID],ref_name,product])
    finally:
        # Release the connection even when parsing or a query fails.
        if con:
            dbUtils.mysql_close(con)
    return h_annoT,h_ti_contig
Example #12
0
def get_genome_annotation_in_mysql(\
    refConsFq, minContigLen, MySqlConf, h_annoT, h_ti_contig):
    """Collect covered contigs, and optionally gene annotations, per taxon.

    Every record of the FASTQ file ``refConsFq`` is passed to
    ``selectConsensusContigs``; records without any returned range are
    ignored. For the others:

    * each returned range ``[start, end]`` (inclusive) is appended to
      ``h_ti_contig[ti]`` as ``[record id, length, sequence slice]``;
    * when MySQL is enabled and the id carries a GI, the rows of ``giDelimT``
      for that GI are fetched. A row is reported once the accumulated
      ``pathoUtilsA.segments_intersect`` values against the ranges exceed
      100, and is appended to ``h_annoT[ti]`` as ``[sub_gi, gene, locus_tag,
      protein_id, ref_name, product]`` (``ref_name``/``product`` come from
      ``giAnnoT``, or ``'X'`` when missing).

    The taxon id and GI are taken from the record id, trying
    ``ti|<ti>|org|<name>|gi|<gi>|``, then ``ti|<ti>|gi|<gi>|``, then
    ``gi|<gi>|``.

    Args:
        refConsFq: path of the consensus FASTQ file.
        minContigLen: forwarded to ``selectConsensusContigs``.
        MySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of ``'X'`` disables MySQL.
        h_annoT: dict, taxon id (str) -> list of annotation rows. Updated
            in place.
        h_ti_contig: dict, taxon id (str) -> list of contigs. Updated in
            place.

    Returns:
        ``(h_annoT, h_ti_contig)``.
    """
    START,END = range(2)
    SUBGI,GENE,LOCS_TAG,PROID,STBP,EDBP = range(6)
    NAs = 'X'
    # NOTE(review): key used when the id carries no usable taxon id. The old
    # code left 'ti' unbound (NameError on the first record) or silently
    # reused the previous record's taxon. '-1' mirrors the 'ti|-1|' headers
    # used for untagged sequences -- confirm this is the wanted bucket.
    NO_TI = '-1'
    #(hostname,port,user,passwd,defaultDb)=range(5)
    passwd = 3

    con = None
    if MySqlConf[passwd] != NAs: #otherwise, we do not use mysql
        con = dbUtils.init_mysql_innocentive(MySqlConf,0)

    try:
        with open(refConsFq,'r') as fp:
            for r in seqParse.parse(fp,'fastq'): # for each covered genome
                #disable checking seq complexity of contig
                covRange = selectConsensusContigs(r,minContigLen,-1)
                if not covRange:
                    continue

                #extract ti and gi; reset both for every record
                ti = NO_TI
                gi = None
                refName = r.id
                mObj = re.search(r'ti\|(\d+)\|org\|([^|]+)\|gi\|(\d+)\|',r.id)
                if mObj:
                    ti = mObj.group(1)
                    gi = mObj.group(3)
                else:
                    mObj = re.search(r'ti\|(\d+)\|gi\|(\d+)\|',r.id)
                    if mObj and mObj.group(1) != "-1":
                        ti = mObj.group(1)
                        gi = mObj.group(2)
                    else:
                        mObj = re.search(r'gi\|(\d+)\|',r.id)
                        if mObj:
                            gi = mObj.group(1)

                if not h_ti_contig.get(ti,[]):
                    h_ti_contig[ti] = []
                for seg in covRange:
                    st = seg[START]; ed = seg[END]
                    # ranges are inclusive on both ends
                    h_ti_contig[ti].append([refName, ed-st+1, str(r.seq[st:ed+1])])

                if not con or gi is None:
                    continue

                # gi holds only digits (captured with '\d+').
                mysql_sel_cmd = 'select sub_gi, gene, locus_tag, protein_id, stbp, edbp from giDelimT where gi = %s' % gi
                cur = con.cursor()
                cur.execute(mysql_sel_cmd)
                entr = cur.fetchall()
                if not entr:
                    continue

                for j in entr: #select which subgi sits within the covered genomic regions
                    aStbp = int(j[STBP]); aEdbp = int(j[EDBP])
                    notCoveredA = aEdbp-aStbp+1
                    minCoveredA2 = notCoveredA - 100

                    reportA = False
                    for seg in covRange:
                        notCoveredA -= pathoUtilsA.segments_intersect(aStbp,aEdbp,seg[START],seg[END])
                        if notCoveredA < minCoveredA2:
                            reportA = True
                            break
                    if not reportA:
                        continue

                    selCmd = 'select ref_name, product from giAnnoT where gi = %s' % j[SUBGI]
                    cur2 = con.cursor()
                    cur2.execute(selCmd)
                    entr2 = cur2.fetchone()
                    ref_name = NAs; product = NAs
                    if entr2:
                        ref_name = entr2[0]; product = entr2[1]
                    h_annoT.setdefault(ti,[]).append(
                        [j[SUBGI],j[GENE],j[LOCS_TAG],j[PROID],ref_name,product])
    finally:
        # Release the connection even when parsing or a query fails.
        if con:
            dbUtils.mysql_close(con)
    return h_annoT,h_ti_contig