def append_ti_into_fasta_app(ncbiNt, taxon_ids, subTaxF, MySqlConf,
                             enable_descF, enable_onlineF, outPrefix, outDir):
    """Produce a fasta file whose headers are tagged with taxonomy ids.

    The work is delegated to ``append_ti_into_fasta_hash`` (dump-file mode)
    or ``append_ti_into_fasta_mysql`` (MySQL mode).  Dump-file mode is
    selected when the password slot of ``MySqlConf`` equals ``'X'``.

    Args:
        ncbiNt: input fasta path, forwarded unchanged to the tagging helper.
        taxon_ids: non-empty sequence of taxonomy ids; a first element of -2
            means "all taxa" and disables sub-taxon expansion.
        subTaxF: when true, ``taxon_ids`` is expanded through
            ``get_allsub_taxons_phylo``.
        MySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb).
        enable_descF: forwarded unchanged to the tagging helper.
        enable_onlineF: forwarded unchanged to the tagging helper.
        outPrefix: file-name prefix of the two output paths.
        outDir: output directory; ``<outDir>/ncbiDB`` is created if missing.

    Returns:
        Tuple ``(valSel, invalSel)``: ``<outDir>/<outPrefix>_ti.fa`` and
        ``<outDir>/<outPrefix>_ti_inval.fa``.
    """
    invalSelFlag = False  # Not generating invalid fasta file by default
    GET_ALL_TAX = -2
    NAs = 'X'
    passwd = 3  # index of the password slot in MySqlConf

    workD = outDir + os.sep + 'ncbiDB'
    if not os.path.exists(workD):
        os.makedirs(workD)

    valSel = outDir + os.sep + outPrefix + '_ti.fa'
    invalSel = outDir + os.sep + outPrefix + '_ti_inval.fa'

    # A password of 'X' means: use the downloaded dump files, not MySQL.
    useHash = MySqlConf[passwd] == NAs
    con = None
    if useHash:
        gi2taxDump = getGi2TaxDump_online(workD)
        nodes_fn = getNodesDump_online(workD)
    else:
        # DON'T REMOVE LAST 0 or CHANGE TO 1
        con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

    try:
        if taxon_ids[0] == GET_ALL_TAX:
            subTaxons = [GET_ALL_TAX]
        else:
            # select all taxons under the given taxonomy lists
            subTaxons = taxon_ids
            if subTaxF:
                # build dic of ti nodes in phylogeny
                if useHash:
                    h_tax_node2par = taxon_rel_in_hash(nodes_fn)
                else:
                    h_tax_node2par = dbUtils.taxon_rel_in_hash_mysql(
                        con, workD)
                subTaxons = get_allsub_taxons_phylo(h_tax_node2par,
                                                    taxon_ids)

        # append taxonomy id to fasta header
        if useHash:
            append_ti_into_fasta_hash(ncbiNt, gi2taxDump, subTaxons,
                                      enable_descF, enable_onlineF, valSel,
                                      invalSel, invalSelFlag)
        else:
            append_ti_into_fasta_mysql(con, ncbiNt, subTaxons, enable_descF,
                                       enable_onlineF, valSel, invalSel,
                                       invalSelFlag)
    finally:
        # Release the connection even when one of the helpers raises.
        if con:
            dbUtils.mysql_close(con)

    return (valSel, invalSel)
Esempio n. 2
0
def append_ti_into_fasta_app(ncbiNt, taxon_ids, exclude_taxon_ids, subTaxF,
                             MySqlConf, enable_descF, enable_onlineF,
                             outPrefix, outDir):
    """Produce a fasta file whose headers are tagged with taxonomy ids.

    The work is delegated to ``append_ti_into_fasta_hash`` (dump-file mode)
    or ``append_ti_into_fasta_mysql`` (MySQL mode).  Dump-file mode is
    selected when the password slot of ``MySqlConf`` equals ``'X'``.

    Args:
        ncbiNt: input fasta path, forwarded unchanged to the tagging helper.
        taxon_ids: non-empty sequence of taxonomy ids; a first element of -2
            means "all taxa" and disables sub-taxon expansion.
        exclude_taxon_ids: forwarded to ``get_allsub_taxons_phylo``; only
            consulted when ``subTaxF`` is true.
        subTaxF: when true, ``taxon_ids`` is expanded through
            ``get_allsub_taxons_phylo``.
        MySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb).
        enable_descF: forwarded unchanged to the tagging helper.
        enable_onlineF: forwarded unchanged to the tagging helper.
        outPrefix: file-name prefix of the two output paths.
        outDir: output directory; ``<outDir>/ncbiDB`` is created if missing.

    Returns:
        Tuple ``(valSel, invalSel)``: ``<outDir>/<outPrefix>_ti.fa`` and
        ``<outDir>/<outPrefix>_ti_inval.fa``.
    """
    invalSelFlag = False  # Not generating invalid fasta file by default
    GET_ALL_TAX = -2
    NAs = 'X'
    passwd = 3  # index of the password slot in MySqlConf

    workD = outDir + os.sep + 'ncbiDB'
    if not os.path.exists(workD):
        os.makedirs(workD)

    valSel = outDir + os.sep + outPrefix + '_ti.fa'
    invalSel = outDir + os.sep + outPrefix + '_ti_inval.fa'

    # A password of 'X' means: use the downloaded dump files, not MySQL.
    useHash = MySqlConf[passwd] == NAs
    con = None
    if useHash:
        gi2taxDump = getGi2TaxDump_online(workD)
        nodes_fn = getNodesDump_online(workD)
    else:
        # DON'T REMOVE LAST 0 or CHANGE TO 1
        con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

    try:
        if taxon_ids[0] == GET_ALL_TAX:
            subTaxons = [GET_ALL_TAX]
        else:
            # select all taxons under the given taxonomy lists
            subTaxons = taxon_ids
            if subTaxF:
                # build dic of ti nodes in phylogeny
                if useHash:
                    h_tax_node2par = taxon_rel_in_hash(nodes_fn)
                else:
                    h_tax_node2par = dbUtils.taxon_rel_in_hash_mysql(
                        con, workD)
                subTaxons = get_allsub_taxons_phylo(h_tax_node2par,
                                                    taxon_ids,
                                                    exclude_taxon_ids)

        # append taxonomy id to fasta header
        if useHash:
            append_ti_into_fasta_hash(ncbiNt, gi2taxDump, subTaxons,
                                      enable_descF, enable_onlineF, valSel,
                                      invalSel, invalSelFlag)
        else:
            append_ti_into_fasta_mysql(con, ncbiNt, subTaxons, enable_descF,
                                       enable_onlineF, valSel, invalSel,
                                       invalSelFlag)
    finally:
        # Release the connection even when one of the helpers raises.
        if con:
            dbUtils.mysql_close(con)

    return (valSel, invalSel)
def simple_genome_annotation(h_gisPerTi, mySqlConf, h_annoT):
    """Append MySQL annotation rows for every gi of every taxon to h_annoT.

    For each gi, all rows of ``giDelimT`` with that gi are read; for each
    row the ``ref_name``/``product`` of its sub_gi is looked up in
    ``giAnnoT`` ('X' for both when no row exists).

    Args:
        h_gisPerTi: mapping of taxon id -> iterable of gi values.
        mySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of 'X' disables MySQL, in which case
            nothing is looked up.
        h_annoT: mapping of taxon id -> list of
            [sub_gi, gene, locus_tag, protein_id, ref_name, product];
            updated in place.

    Returns:
        The same ``h_annoT`` object.
    """
    # SUBGI,GENE,LOCS_TAG,PROID,STBP,EDBP = range(6)
    SUBGI, GENE, LOCS_TAG, PROID = range(4)
    NAs = 'X'
    passwd = 3  # index of the password slot in mySqlConf

    if mySqlConf[passwd] == NAs:  # then, we do not use mysql
        return h_annoT
    con = dbUtils.init_mysql_innocentive(mySqlConf, 0)
    if not con:
        return h_annoT

    # Values are bound by the driver rather than interpolated into the SQL.
    delim_sql = ('select sub_gi, gene, locus_tag, protein_id, stbp, edbp '
                 'from giDelimT where gi = %s')
    anno_sql = 'select ref_name, product from giAnnoT where gi = %s'

    try:
        cur = con.cursor()
        for ti in h_gisPerTi:
            for gi in h_gisPerTi[ti]:
                cur.execute(delim_sql, (gi,))
                # fetchall() materialises the rows, so the cursor can be
                # reused for the per-row lookup below.
                for row in cur.fetchall():
                    cur.execute(anno_sql, (row[SUBGI],))
                    hit = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if hit:
                        ref_name = hit[0]
                        product = hit[1]
                    h_annoT.setdefault(ti, []).append([
                        row[SUBGI], row[GENE], row[LOCS_TAG], row[PROID],
                        ref_name, product
                    ])
    finally:
        # Close the connection even when a query fails.
        dbUtils.mysql_close(con)
    return h_annoT
Esempio n. 4
0
def simple_genome_annotation(h_gisPerTi, mySqlConf, h_annoT):
    """Append MySQL annotation rows for every gi of every taxon to h_annoT.

    For each gi, all rows of ``giDelimT`` with that gi are read; for each
    row the ``ref_name``/``product`` of its sub_gi is looked up in
    ``giAnnoT`` ('X' for both when no row exists).

    Args:
        h_gisPerTi: mapping of taxon id -> iterable of gi values.
        mySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of 'X' disables MySQL, in which case
            nothing is looked up.
        h_annoT: mapping of taxon id -> list of
            [sub_gi, gene, locus_tag, protein_id, ref_name, product];
            updated in place.

    Returns:
        The same ``h_annoT`` object.
    """
    # SUBGI,GENE,LOCS_TAG,PROID,STBP,EDBP = range(6)
    SUBGI, GENE, LOCS_TAG, PROID = range(4)
    NAs = 'X'
    passwd = 3  # index of the password slot in mySqlConf

    if mySqlConf[passwd] == NAs:  # then, we do not use mysql
        return h_annoT
    con = dbUtils.init_mysql_innocentive(mySqlConf, 0)
    if not con:
        return h_annoT

    # Values are bound by the driver rather than interpolated into the SQL.
    delim_sql = ('select sub_gi, gene, locus_tag, protein_id, stbp, edbp '
                 'from giDelimT where gi = %s')
    anno_sql = 'select ref_name, product from giAnnoT where gi = %s'

    try:
        cur = con.cursor()
        for ti in h_gisPerTi:
            for gi in h_gisPerTi[ti]:
                cur.execute(delim_sql, (gi,))
                # fetchall() materialises the rows, so the cursor can be
                # reused for the per-row lookup below.
                for row in cur.fetchall():
                    cur.execute(anno_sql, (row[SUBGI],))
                    hit = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if hit:
                        ref_name = hit[0]
                        product = hit[1]
                    h_annoT.setdefault(ti, []).append([
                        row[SUBGI], row[GENE], row[LOCS_TAG], row[PROID],
                        ref_name, product
                    ])
    finally:
        # Close the connection even when a query fails.
        dbUtils.mysql_close(con)
    return h_annoT
Esempio n. 5
0
def _gb_skip_to_locus(fp):
    """Read *fp* up to and including the first line starting with "LOCUS".

    Returns that line, or an empty string when the end of the file is
    reached first (so a file without any record cannot loop forever).
    """
    while True:
        line = fp.readline()
        if not line:
            return line
        if len(line) > 5 and line[:5] == "LOCUS":
            return line


def _gb_load_csv_table(con, csv_fn, table, index_cmd):
    """Bulk-load the tab-separated *csv_fn* into *table*, then run *index_cmd*."""
    print('loading %s...' % (csv_fn))
    cur = con.cursor()
    cur.execute('load data local infile \'%s\' into table %s fields terminated by \'\\t\'' % (csv_fn, table))
    cur = con.cursor()
    cur.execute(index_cmd)
    print('done.')


def gb2prepare_load_data_file(MySqlConf, tiNtDfn, downloadD):
    """Download NCBI flat files, convert them to CSV and load them into MySQL.

    Steps, in order:
      1. Create ``<downloadD>/gbff`` (or remove old ``*.gbff.gz*``,
         ``*.gpff.gz*`` and ``*.seq.gz*`` files from it).
      2. Fetch GenBank, RefSeq (genomic/rna and protein) and TSA flat files
         through ``pathoUtilsA.wget_download2`` and drop ``*.mstr.gbff.gz``.
      3. Decompress each archive, split it into ``LOCUS`` ... ``//`` records,
         parse each record with ``parse_ncbi_entry`` and append the result to
         three CSV files (annotation, delimiter and taxon tables).  Processed
         archives are moved to ``<downloadD>/gbff/completed_gbff``.
      4. Load the CSV files into ``giAnnoT``, ``giDelimT`` and ``cj_taxonT``
         and create their indexes; ``add_dbsize2taxonT`` is run on the taxon
         CSV before it is loaded.

    Args:
        MySqlConf: connection settings handed to
            ``dbUtils.init_mysql_innocentive``.
        tiNtDfn: forwarded to ``add_dbsize2taxonT``.
        downloadD: base download directory.

    Returns:
        None.

    NOTE(review): shell commands are assembled from ``downloadD``; paths with
    spaces or shell metacharacters are not supported.
    """
    # TODO[to develop to maintain pathoDB , differential updates]----------
    downloadD_gff = downloadD + '/gbff'
    # file-name suffixes of the archives that are processed below
    gbDnExt = ('seq.gz', 'gbff.gz', 'protein.gpff.gz')

    # to clean up downloadD first
    if not os.path.exists(downloadD_gff):
        os.makedirs(downloadD_gff)
    else:
        cmd = 'rm -rf %s/*.gbff.gz*\n' % (downloadD_gff)
        cmd = '%srm -rf %s/*.gpff.gz*\n' % (cmd, downloadD_gff)
        cmd = '%srm -rf %s/*.seq.gz*\n' % (cmd, downloadD_gff)
        os.system(cmd)

    # (ftp directory, file-name pattern) pairs, fetched in this order
    downExt = 'gz'
    ftpSources = (
        # genbank flat format files
        ('ftp://ftp.ncbi.nih.gov/genbank', '*.seq'),
        # refseq flat format files (genomic and rna)
        ('ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete', '*.gbff'),
        # refseq flat format files (protein)
        ('ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete', '*.gpff'),
        # tsa flat format files
        ('ftp://ftp.ncbi.nih.gov/genbank/tsa', '*.gbff'),
    )
    for ftpD, downFbase in ftpSources:
        ftpUrl = ftpD + '/' + downFbase + '.' + downExt
        pathoUtilsA.wget_download2(ftpUrl, downFbase, downExt, downloadD_gff,
                                   'nothing', 'X')
    # except the following format files
    os.system('rm -rf %s/*.mstr.gbff.gz' % downloadD_gff)

    # the following two lines should be consistent with the one defined in
    # parse_ncbi_entry()
    GI, REF_NAME, TAXON_ID, ORGANISM, LINEAGE, PRODUCT, STBP, EDBP, SUB_GI = range(9)
    GI_SUB, STRAND_SUB, STBP_SUB, EDBP_SUB, GENE_SUB, LOCUS_TAG_SUB, PRODUCT_SUB, PROTEIN_ID_SUB = range(8)

    ##################################################################
    # processing genbank flat file and transfer all annotation to csv
    ##################################################################
    ANNO_T, DELIM_T, TAX_T = range(3)

    gi_annoT_fn = downloadD_gff + '/giAnnoT2load.csv'
    delimT_fn = downloadD_gff + '/delimT2load.csv'
    taxT_fn = downloadD_gff + '/tax2load.csv'

    h_taxLookup = {}  # taxon ids already written to the taxon CSV

    fps = [None, None, None]
    try:
        fps[ANNO_T] = open(gi_annoT_fn, 'w')
        fps[DELIM_T] = open(delimT_fn, 'w')
        fps[TAX_T] = open(taxT_fn, 'w')

        print('transferring gene bank report to mysql...')
        gbFlatTmp = '%s/gb2process.tmp' % downloadD_gff
        pkey_anno = 1
        pkey_delim = 1
        pkey_ti = 1

        doneGbD = downloadD_gff + '/completed_gbff'
        if not os.path.exists(doneGbD):
            os.makedirs(doneGbD)

        # Only archives with a known suffix are processed, so only those are
        # counted for the progress display.
        gbFiles = [fn for fn in os.listdir(downloadD_gff)
                   if fn.endswith(gbDnExt)]
        F = len(gbFiles)
        for f, gbFlatFn in enumerate(gbFiles, 1):
            tick = time()

            cmd = 'gunzip -c %s/%s > %s\n' % (downloadD_gff, gbFlatFn, gbFlatTmp)
            cmd = '%smv %s/%s %s/%s\n' % (cmd, downloadD_gff, gbFlatFn, doneGbD, gbFlatFn)
            os.system(cmd)
            print('processing %s[%d/%d]...' % (gbFlatFn, f, F))

            with open(gbFlatTmp, 'r') as fp:
                # skipping header; the first record starts at "LOCUS"
                entry = [_gb_skip_to_locus(fp)]

                # only focus on the section between "LOCUS" ... "//"
                for x in fp:
                    if not x.startswith('//'):
                        entry.append(x)
                        continue

                    # A "//" line ends the current record: parse it and
                    # start collecting a new one.
                    gB = parse_ncbi_entry(entry)
                    entry = []

                    # 0) check if query gi has multiple sub gis
                    has_sub = 1 if len(gB[SUB_GI]) > 0 else 0

                    # 1) update query gi annotation
                    pkey_anno, fps[ANNO_T] = csv_update_anno_gi(
                        fps[ANNO_T], gB[GI], gB[REF_NAME], gB[EDBP],
                        gB[TAXON_ID], gB[PRODUCT], has_sub, pkey_anno)

                    # 2) update sub_gi delimit
                    for s in gB[SUB_GI]:
                        pkey_delim, fps[DELIM_T] = csv_update_delim(
                            fps[DELIM_T], gB[GI], s[GI_SUB], s[GENE_SUB],
                            s[LOCUS_TAG_SUB], s[PROTEIN_ID_SUB],
                            s[STRAND_SUB], s[STBP_SUB], s[EDBP_SUB],
                            pkey_delim)

                    # 3) write each taxon id only once
                    if h_taxLookup.get(gB[TAXON_ID], -1) == -1:
                        pkey_ti, fps[TAX_T] = csv_update_ti(
                            fps[TAX_T], gB[TAXON_ID], gB[ORGANISM],
                            gB[LINEAGE], pkey_ti)
                        h_taxLookup[gB[TAXON_ID]] = 1

            elapsed = time() - tick
            print('elasped time:[%g]' % elapsed)
    finally:
        # The CSV files must be flushed and closed before MySQL reads them.
        for fh in fps:
            if fh is not None:
                fh.close()

    con = dbUtils.init_mysql_innocentive(MySqlConf, 0)
    try:
        with con:
            _gb_load_csv_table(con, gi_annoT_fn, 'giAnnoT',
                               'create unique index idx_gi on giAnnoT (gi)')
            _gb_load_csv_table(con, delimT_fn, 'giDelimT',
                               'create index idx_subgi on giDelimT (gi,stbp,edbp)')

            print('computing database size for each taxon id...')
            add_dbsize2taxonT(tiNtDfn, taxT_fn)
            print('done.')

            _gb_load_csv_table(con, taxT_fn, 'cj_taxonT',
                               'create unique index idx_taxon on cj_taxonT (taxon)')
    finally:
        # Close the connection even when a load or index statement fails.
        dbUtils.mysql_close(con)
    print('done')
Esempio n. 6
0
parser.add_argument('-r',
                    action='store',
                    dest='reset_table',
                    required=False,
                    type=int,
                    default=0,
                    help='set to 1 if you want to reset mysql table')

args = parser.parse_args()

######################
# open mysql connection
######################
# Slot positions inside MySqlConf.
HOST_NAME, MYSQL_PORT, USER, PASSWORD, DEFAULT_DB = range(5)
MySqlConf = [args.hostname, args.port, args.user, args.passwd, '']
con = dbUtils.init_mysql_innocentive(MySqlConf, args.reset_table)

try:
    ######################################
    # create ncbi phylogeny tree into mysql
    ######################################
    nodesDfn = pathoLib.getNodesDump_online(args.downloadD)
    dbUtils.phylo_node2mysql(con, nodesDfn)
finally:
    ####################################
    # close mysql connection (also when the phylogeny load fails)
    ####################################
    dbUtils.mysql_close(con)

####################################$
#create ncbi genbank flat into mysql
Esempio n. 7
0
def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead,
                          h_refScore, h_gisPerTi, h_tiRef, reads,
                          h_readSequence, samFile, mySqlConf):
    """Build the organisms element describing every taxon in h_gisPerTi.

    One ``Organism`` is created per taxon id, filled with its score, genes,
    contigs and reads, and the organisms are ordered by descending
    ``relativeAmount.value`` before ``Organisms.buildElement()`` is called.

    Args:
        h_annoT: taxon id -> list of
            [sub_gi, gene, locus_tag, protein_id, ref_name, product].
        h_ti_contig: taxon id -> list of [ref_name, length, sequence].
        hostTaxon: key of the host in ``h_refScore``; may be empty.
        h_refRead: reference id -> list of read names.
        h_refScore: reference id -> score.
        h_gisPerTi: mapping whose keys are the taxon ids to report.
        h_tiRef: taxon id -> [reference id, organism name]; a taxon missing
            here uses its own id for both.
        reads: all reads; only its length is used.
        h_readSequence: read name -> read sequence.
        samFile: unused.
        mySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of 'X' disables the MySQL lookups.

    Returns:
        The value returned by ``Organisms.buildElement()``.
    """
    NAs = 'X'
    passwd = 3  # index of the password slot in mySqlConf
    useMysql = mySqlConf[passwd] != NAs  # 'X' means: we do not use mysql
    con = None
    organismsObj = Organisms()

    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (KeyError, TypeError):
            # host not scored (or not usable as a key): nothing to subtract
            hostScore = 0
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    # Gene attributes and the position of their value in an h_annoT row.
    geneFields = (('locus_tag', 2), ('protein_id', 3), ('ref_name', 4),
                  ('product', 5))

    if useMysql:
        con = dbUtils.init_mysql_innocentive(mySqlConf, 0)
    try:
        for ti in h_gisPerTi:
            refIdName = h_tiRef.get(ti, [ti, ti])
            refId = refIdName[0]
            score = h_refScore.get(refId, 0)

            organismName = refIdName[1]
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)
            organism = Organism(organismName)
            if useMysql:
                # first three words of the name: genus, species, strain
                words = organismName.split()
                if len(words) > 0:
                    organism.genus = words[0]
                if len(words) > 1:
                    organism.species = words[1]
                if len(words) > 2:
                    organism.strain = words[2]

            organism.relativeAmount = RelativeAmount(score)
            # NOTE(review): a reference without reads yields a count of 1
            # because of the [-1] default - confirm this is intended.
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            if ti in h_annoT:
                genes = []
                for giList in h_annoT[ti]:
                    gene = Gene(giList[1])
                    for attrName, idx in geneFields:
                        value = giList[idx]
                        if value and value != NAs:
                            setattr(gene, attrName, value)
                    genes.append(gene)
                organism.genes = genes

            # add contig information
            contigs = []
            for j, c in enumerate(h_ti_contig.get(ti, [])):
                contig2 = Contig('%s_ctg_%d' % (ti, j))
                # make sure that all string format only available in xml
                contig2.ref_name = c[0]
                contig2.length = str(c[1])
                contig2.contig = c[2]
                contigs.append(contig2)
            organism.contigs = contigs

            # add read information (own name: must not shadow ``reads``)
            orgReads = []
            for readname in h_refRead.get(refId, []):
                read = Read(readname)
                read.readSequence = h_readSequence[readname]
                orgReads.append(read)
            organism.reads = orgReads

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(organismsObj.organisms,
                                        key=lambda x: x.relativeAmount.value,
                                        reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # Close the connection even when building an organism fails.
        if con:
            dbUtils.mysql_close(con)
    return organismsElement
Esempio n. 8
0
def get_genome_annotation_in_mysql(
        refConsFq, minContigLen, MySqlConf, h_annoT, h_ti_contig):
    """Collect consensus contigs and covered gene annotations per taxon.

    Every record that ``seqParse.parse(fp, 'fastq')`` yields from
    ``refConsFq`` is passed to ``selectConsensusContigs``; the resulting
    ranges are stored as contigs under the record's taxon id.  When MySQL is
    enabled, each ``giDelimT`` row of the record's gi whose region overlaps
    the ranges by more than 100 positions in total is added to ``h_annoT``.

    Args:
        refConsFq: path of the consensus file to read.
        minContigLen: forwarded to ``selectConsensusContigs``.
        MySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of 'X' disables the annotation lookup.
        h_annoT: taxon id -> list of
            [sub_gi, gene, locus_tag, protein_id, ref_name, product];
            updated in place.
        h_ti_contig: taxon id -> list of [ref_name, length, sequence];
            updated in place.

    Returns:
        Tuple ``(h_annoT, h_ti_contig)``.
    """
    START, END = range(2)
    SUBGI, GENE, LOCS_TAG, PROID, STBP, EDBP = range(6)
    NAs = 'X'
    passwd = 3  # index of the password slot in MySqlConf

    # Values are bound by the driver rather than interpolated into the SQL.
    delim_sql = ('select sub_gi, gene, locus_tag, protein_id, stbp, edbp '
                 'from giDelimT where gi = %s')
    anno_sql = 'select ref_name, product from giAnnoT where gi = %s'

    con = None
    if MySqlConf[passwd] != NAs:  # 'X' means: we do not use mysql
        con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

    try:
        with open(refConsFq, 'r') as fp:
            for r in seqParse.parse(fp, 'fastq'):  # for each covered genome
                # disable checking seq complexity of contig
                covRange = selectConsensusContigs(r, minContigLen, -1)
                if not covRange:
                    continue

                # extract ti and gi; reset them for every record so that a
                # header without them never inherits the previous values
                refName = r.id
                ti = None
                gi = None
                mObj = re.search(r'ti\|(\d+)\|org\|([^|]+)\|gi\|(\d+)\|', r.id)
                if mObj:
                    ti = mObj.group(1)
                    gi = mObj.group(3)
                else:
                    mObj = re.search(r'ti\|(\d+)\|gi\|(\d+)\|', r.id)
                    if mObj and mObj.group(1) != "-1":
                        ti = mObj.group(1)
                        gi = mObj.group(2)
                    else:
                        mObj = re.search(r'gi\|(\d+)\|', r.id)
                        if mObj:
                            gi = mObj.group(1)
                if ti is None:
                    # No taxon id in the header: there is no key to file the
                    # contigs or annotations under, so skip the record.
                    continue

                contigs = h_ti_contig.setdefault(ti, [])
                for cov in covRange:
                    contigSeq = str(r.seq[cov[0]:cov[1] + 1])
                    cLen = cov[1] - cov[0] + 1
                    contigs.append([refName, cLen, contigSeq])

                if not con:
                    continue

                cur = con.cursor()
                cur.execute(delim_sql, (gi,))
                # select which subgi sits within the covered genomic regions
                for j in cur.fetchall():
                    aStbp = int(j[STBP])
                    aEdbp = int(j[EDBP])

                    notCoveredA = aEdbp - aStbp + 1
                    # report once more than 100 positions are covered
                    minCoveredA2 = notCoveredA - 100

                    reportA = False
                    for cov in covRange:
                        notCoveredA -= pathoUtilsA.segments_intersect(
                            aStbp, aEdbp, cov[START], cov[END])
                        if notCoveredA < minCoveredA2:
                            reportA = True
                            break
                    if not reportA:
                        continue

                    cur = con.cursor()
                    cur.execute(anno_sql, (j[SUBGI],))
                    entr2 = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if entr2:
                        ref_name = entr2[0]
                        product = entr2[1]
                    h_annoT.setdefault(ti, []).append(
                        [j[SUBGI], j[GENE], j[LOCS_TAG], j[PROID],
                         ref_name, product])
    finally:
        # Close the connection even when parsing or a query fails.
        if con:
            dbUtils.mysql_close(con)
    return h_annoT, h_ti_contig
Esempio n. 9
0
def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead, h_refScore,
                          h_gisPerTi, h_tiRef, reads, h_readSequence, samFile,
                          mySqlConf):
    """Build the organisms element describing every taxon in h_gisPerTi.

    One ``Organism`` is created per taxon id, filled with its score, genes,
    contigs and reads, and the organisms are ordered by descending
    ``relativeAmount.value`` before ``Organisms.buildElement()`` is called.

    Args:
        h_annoT: taxon id -> list of
            [sub_gi, gene, locus_tag, protein_id, ref_name, product].
        h_ti_contig: taxon id -> list of [ref_name, length, sequence].
        hostTaxon: key of the host in ``h_refScore``; may be empty.
        h_refRead: reference id -> list of read names.
        h_refScore: reference id -> score.
        h_gisPerTi: mapping whose keys are the taxon ids to report.
        h_tiRef: taxon id -> [reference id, organism name]; a taxon missing
            here uses its own id for both.
        reads: all reads; only its length is used.
        h_readSequence: read name -> read sequence.
        samFile: unused.
        mySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of 'X' disables the MySQL lookups.

    Returns:
        The value returned by ``Organisms.buildElement()``.
    """
    NAs = 'X'
    passwd = 3  # index of the password slot in mySqlConf
    useMysql = mySqlConf[passwd] != NAs  # 'X' means: we do not use mysql
    con = None
    organismsObj = Organisms()

    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (KeyError, TypeError):
            # host not scored (or not usable as a key): nothing to subtract
            hostScore = 0
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    # Gene attributes and the position of their value in an h_annoT row.
    geneFields = (('locus_tag', 2), ('protein_id', 3), ('ref_name', 4),
                  ('product', 5))

    if useMysql:
        con = dbUtils.init_mysql_innocentive(mySqlConf, 0)
    try:
        for ti in h_gisPerTi:
            refIdName = h_tiRef.get(ti, [ti, ti])
            refId = refIdName[0]
            score = h_refScore.get(refId, 0)

            organismName = refIdName[1]
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)
            organism = Organism(organismName)
            if useMysql:
                # first three words of the name: genus, species, strain
                words = organismName.split()
                if len(words) > 0:
                    organism.genus = words[0]
                if len(words) > 1:
                    organism.species = words[1]
                if len(words) > 2:
                    organism.strain = words[2]

            organism.relativeAmount = RelativeAmount(score)
            # NOTE(review): a reference without reads yields a count of 1
            # because of the [-1] default - confirm this is intended.
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            if ti in h_annoT:
                genes = []
                for giList in h_annoT[ti]:
                    gene = Gene(giList[1])
                    for attrName, idx in geneFields:
                        value = giList[idx]
                        if value and value != NAs:
                            setattr(gene, attrName, value)
                    genes.append(gene)
                organism.genes = genes

            # add contig information
            contigs = []
            for j, c in enumerate(h_ti_contig.get(ti, [])):
                contig2 = Contig('%s_ctg_%d' % (ti, j))
                # make sure that all string format only available in xml
                contig2.ref_name = c[0]
                contig2.length = str(c[1])
                contig2.contig = c[2]
                contigs.append(contig2)
            organism.contigs = contigs

            # add read information (own name: must not shadow ``reads``)
            orgReads = []
            for readname in h_refRead.get(refId, []):
                read = Read(readname)
                read.readSequence = h_readSequence[readname]
                orgReads.append(read)
            organism.reads = orgReads

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(organismsObj.organisms,
                                        key=lambda x: x.relativeAmount.value,
                                        reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # Close the connection even when building an organism fails.
        if con:
            dbUtils.mysql_close(con)
    return organismsElement
Esempio n. 10
0
def get_genome_annotation_in_mysql(
        refConsFq, minContigLen, MySqlConf, h_annoT, h_ti_contig):
    """Collect consensus contigs and covered gene annotations per taxon.

    Every record that ``seqParse.parse(fp, 'fastq')`` yields from
    ``refConsFq`` is passed to ``selectConsensusContigs``; the resulting
    ranges are stored as contigs under the record's taxon id.  When MySQL is
    enabled, each ``giDelimT`` row of the record's gi whose region overlaps
    the ranges by more than 100 positions in total is added to ``h_annoT``.

    Args:
        refConsFq: path of the consensus file to read.
        minContigLen: forwarded to ``selectConsensusContigs``.
        MySqlConf: sequence laid out as (hostname, port, user, passwd,
            defaultDb); a passwd of 'X' disables the annotation lookup.
        h_annoT: taxon id -> list of
            [sub_gi, gene, locus_tag, protein_id, ref_name, product];
            updated in place.
        h_ti_contig: taxon id -> list of [ref_name, length, sequence];
            updated in place.

    Returns:
        Tuple ``(h_annoT, h_ti_contig)``.
    """
    START, END = range(2)
    SUBGI, GENE, LOCS_TAG, PROID, STBP, EDBP = range(6)
    NAs = 'X'
    passwd = 3  # index of the password slot in MySqlConf

    # Values are bound by the driver rather than interpolated into the SQL.
    delim_sql = ('select sub_gi, gene, locus_tag, protein_id, stbp, edbp '
                 'from giDelimT where gi = %s')
    anno_sql = 'select ref_name, product from giAnnoT where gi = %s'

    con = None
    if MySqlConf[passwd] != NAs:  # 'X' means: we do not use mysql
        con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

    try:
        with open(refConsFq, 'r') as fp:
            for r in seqParse.parse(fp, 'fastq'):  # for each covered genome
                # disable checking seq complexity of contig
                covRange = selectConsensusContigs(r, minContigLen, -1)
                if not covRange:
                    continue

                # extract ti and gi; reset them for every record so that a
                # header without them never inherits the previous values
                refName = r.id
                ti = None
                gi = None
                mObj = re.search(r'ti\|(\d+)\|org\|([^|]+)\|gi\|(\d+)\|', r.id)
                if mObj:
                    ti = mObj.group(1)
                    gi = mObj.group(3)
                else:
                    mObj = re.search(r'ti\|(\d+)\|gi\|(\d+)\|', r.id)
                    if mObj and mObj.group(1) != "-1":
                        ti = mObj.group(1)
                        gi = mObj.group(2)
                    else:
                        mObj = re.search(r'gi\|(\d+)\|', r.id)
                        if mObj:
                            gi = mObj.group(1)
                if ti is None:
                    # No taxon id in the header: there is no key to file the
                    # contigs or annotations under, so skip the record.
                    continue

                contigs = h_ti_contig.setdefault(ti, [])
                for cov in covRange:
                    contigSeq = str(r.seq[cov[0]:cov[1] + 1])
                    cLen = cov[1] - cov[0] + 1
                    contigs.append([refName, cLen, contigSeq])

                if not con:
                    continue

                cur = con.cursor()
                cur.execute(delim_sql, (gi,))
                # select which subgi sits within the covered genomic regions
                for j in cur.fetchall():
                    aStbp = int(j[STBP])
                    aEdbp = int(j[EDBP])

                    notCoveredA = aEdbp - aStbp + 1
                    # report once more than 100 positions are covered
                    minCoveredA2 = notCoveredA - 100

                    reportA = False
                    for cov in covRange:
                        notCoveredA -= pathoUtilsA.segments_intersect(
                            aStbp, aEdbp, cov[START], cov[END])
                        if notCoveredA < minCoveredA2:
                            reportA = True
                            break
                    if not reportA:
                        continue

                    cur = con.cursor()
                    cur.execute(anno_sql, (j[SUBGI],))
                    entr2 = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if entr2:
                        ref_name = entr2[0]
                        product = entr2[1]
                    h_annoT.setdefault(ti, []).append(
                        [j[SUBGI], j[GENE], j[LOCS_TAG], j[PROID],
                         ref_name, product])
    finally:
        # Close the connection even when parsing or a query fails.
        if con:
            dbUtils.mysql_close(con)
    return h_annoT, h_ti_contig
Esempio n. 11
0
parser.add_argument('-g', action='store', dest='ti_nt', required=True,
                    help='specify a fasta format reference appended with taxonomy id in the front seq header. Refer to app_build_nt_tgt.py to build such fasta file.')
parser.add_argument('-d', action='store', dest='downloadD', required=True,
                    help='specify a temporary download directory')
parser.add_argument('-m', action='store', dest='hostname', required=False,
                    default='localhost',
                    help='specify hostname running mysql')
# The help text used to repeat the hostname description; the default is now
# an int, matching type=int.
parser.add_argument('-P', action='store', dest='port', required=False,
                    type=int, default=3306,
                    help='specify port number of the mysql server')
parser.add_argument('-u', action='store', dest='user', required=False,
                    default='root', help='user name to access mysql')
# NOTE(review): default='X' is never used because the option is required.
parser.add_argument('-p', action='store', dest='passwd', required=True,
                    default='X', help='provide password associate with user')
parser.add_argument('-r', action='store', dest='reset_table', required=False,
                    type=int, default=0,
                    help='set to 1 if you want to reset mysql table')

args = parser.parse_args()

######################
# open mysql connection
######################
# Slot positions inside MySqlConf.
HOST_NAME, MYSQL_PORT, USER, PASSWORD, DEFAULT_DB = range(5)
MySqlConf = [args.hostname, args.port, args.user, args.passwd, '']
con = dbUtils.init_mysql_innocentive(MySqlConf, args.reset_table)

try:
    ######################################
    # create ncbi phylogeny tree into mysql
    ######################################
    nodesDfn = pathoLib.getNodesDump_online(args.downloadD)
    dbUtils.phylo_node2mysql(con, nodesDfn)
finally:
    ####################################
    # close mysql connection (also when the phylogeny load fails)
    ####################################
    dbUtils.mysql_close(con)

####################################$
#create ncbi genbank flat into mysql