def append_ti_into_fasta_app(ncbiNt, taxon_ids, subTaxF, MySqlConf,
                             enable_descF, enable_onlineF, outPrefix, outDir):
    """Append taxonomy ids to the headers of a FASTA file.

    The gi -> taxon lookup goes through MySQL unless the password slot of
    MySqlConf holds the placeholder 'X'; in that case the dumps returned by
    getGi2TaxDump_online() / getNodesDump_online() are used instead.

    Parameters:
        ncbiNt: input FASTA, forwarded to the append_ti_into_fasta_* helper.
        taxon_ids: non-empty sequence of taxonomy ids; a first element of -2
            selects every taxon.
        subTaxF: when true (and not selecting everything), taxon_ids is
            expanded with get_allsub_taxons_phylo().
        MySqlConf: sequence indexed as [hostname, port, user, passwd,
            defaultDb].
        enable_descF, enable_onlineF: forwarded unchanged to the helper.
        outPrefix: file-name prefix of the generated FASTA files.
        outDir: output directory; a 'ncbiDB' work directory is created in it.

    Returns:
        (valSel, invalSel): paths of the annotated FASTA file and of the
        "invalid entries" FASTA file (generation of the latter is disabled).
    """
    GET_ALL_TAX = -2
    NAs = 'X'
    PASSWD = 3  # position of the password inside MySqlConf
    invalSelFlag = False  # Not generating invalid fasta file by default

    workD = outDir + os.sep + 'ncbiDB'
    if not os.path.exists(workD):
        os.makedirs(workD)

    valSel = outDir + os.sep + outPrefix + '_ti.fa'
    invalSel = outDir + os.sep + outPrefix + '_ti_inval.fa'

    # without a password we use the downloaded gi2tax dump instead of mysql
    useHash = MySqlConf[PASSWD] == NAs
    con = None
    gi2taxDump = None
    nodes_fn = None
    if useHash:
        gi2taxDump = getGi2TaxDump_online(workD)
        nodes_fn = getNodesDump_online(workD)
    else:
        # DON'T REMOVE LAST 0 or CHANGE TO 1
        con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

    try:
        if taxon_ids[0] == GET_ALL_TAX:
            subTaxons = [GET_ALL_TAX]
        elif subTaxF:
            # build dic of ti nodes in phylogeny, then select all taxons
            # under the given taxonomy list
            if useHash:
                h_tax_node2par = taxon_rel_in_hash(nodes_fn)
            else:
                h_tax_node2par = dbUtils.taxon_rel_in_hash_mysql(con, workD)
            subTaxons = get_allsub_taxons_phylo(h_tax_node2par, taxon_ids)
        else:
            subTaxons = taxon_ids

        # append taxonomy id to fasta header
        if useHash:
            append_ti_into_fasta_hash(ncbiNt, gi2taxDump, subTaxons,
                                      enable_descF, enable_onlineF,
                                      valSel, invalSel, invalSelFlag)
        else:
            append_ti_into_fasta_mysql(con, ncbiNt, subTaxons,
                                       enable_descF, enable_onlineF,
                                       valSel, invalSel, invalSelFlag)
    finally:
        # release the connection even when one of the helpers raised
        if not useHash:
            dbUtils.mysql_close(con)

    return (valSel, invalSel)
def append_ti_into_fasta_app(ncbiNt, taxon_ids, exclude_taxon_ids, subTaxF,
                             MySqlConf, enable_descF, enable_onlineF,
                             outPrefix, outDir):
    """Append taxonomy ids to the headers of a FASTA file.

    The gi -> taxon lookup goes through MySQL unless the password slot of
    MySqlConf holds the placeholder 'X'; in that case the dumps returned by
    getGi2TaxDump_online() / getNodesDump_online() are used instead.

    Parameters:
        ncbiNt: input FASTA, forwarded to the append_ti_into_fasta_* helper.
        taxon_ids: non-empty sequence of taxonomy ids; a first element of -2
            selects every taxon.
        exclude_taxon_ids: forwarded to get_allsub_taxons_phylo() when the
            taxon list is expanded; unused otherwise.
        subTaxF: when true (and not selecting everything), taxon_ids is
            expanded with get_allsub_taxons_phylo().
        MySqlConf: sequence indexed as [hostname, port, user, passwd,
            defaultDb].
        enable_descF, enable_onlineF: forwarded unchanged to the helper.
        outPrefix: file-name prefix of the generated FASTA files.
        outDir: output directory; a 'ncbiDB' work directory is created in it.

    Returns:
        (valSel, invalSel): paths of the annotated FASTA file and of the
        "invalid entries" FASTA file (generation of the latter is disabled).
    """
    GET_ALL_TAX = -2
    NAs = 'X'
    PASSWD = 3  # position of the password inside MySqlConf
    invalSelFlag = False  # Not generating invalid fasta file by default

    workD = outDir + os.sep + 'ncbiDB'
    if not os.path.exists(workD):
        os.makedirs(workD)

    valSel = outDir + os.sep + outPrefix + '_ti.fa'
    invalSel = outDir + os.sep + outPrefix + '_ti_inval.fa'

    # without a password we use the downloaded gi2tax dump instead of mysql
    useHash = MySqlConf[PASSWD] == NAs
    con = None
    gi2taxDump = None
    nodes_fn = None
    if useHash:
        gi2taxDump = getGi2TaxDump_online(workD)
        nodes_fn = getNodesDump_online(workD)
    else:
        # DON'T REMOVE LAST 0 or CHANGE TO 1
        con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

    try:
        if taxon_ids[0] == GET_ALL_TAX:
            subTaxons = [GET_ALL_TAX]
        elif subTaxF:
            # build dic of ti nodes in phylogeny, then select all taxons
            # under the given taxonomy list
            if useHash:
                h_tax_node2par = taxon_rel_in_hash(nodes_fn)
            else:
                h_tax_node2par = dbUtils.taxon_rel_in_hash_mysql(con, workD)
            subTaxons = get_allsub_taxons_phylo(h_tax_node2par, taxon_ids,
                                                exclude_taxon_ids)
        else:
            subTaxons = taxon_ids

        # append taxonomy id to fasta header
        if useHash:
            append_ti_into_fasta_hash(ncbiNt, gi2taxDump, subTaxons,
                                      enable_descF, enable_onlineF,
                                      valSel, invalSel, invalSelFlag)
        else:
            append_ti_into_fasta_mysql(con, ncbiNt, subTaxons,
                                       enable_descF, enable_onlineF,
                                       valSel, invalSel, invalSelFlag)
    finally:
        # release the connection even when one of the helpers raised
        if not useHash:
            dbUtils.mysql_close(con)

    return (valSel, invalSel)
def simple_genome_annotation(h_gisPerTi, mySqlConf, h_annoT):
    """Collect sub-entry annotations from MySQL for every gi of every taxon.

    For each gi in h_gisPerTi[ti] the rows of giDelimT with that gi are read;
    each row is completed with (ref_name, product) of its sub_gi from giAnnoT
    ('X' when no giAnnoT row exists) and appended to h_annoT[ti] as
    [sub_gi, gene, locus_tag, protein_id, ref_name, product].

    Parameters:
        h_gisPerTi: mapping taxon id -> iterable of gi values.
        mySqlConf: sequence indexed as [hostname, port, user, passwd,
            defaultDb]; a passwd of 'X' disables the MySQL lookup.
        h_annoT: mapping taxon id -> list of annotation rows; updated in
            place.

    Returns:
        h_annoT (the same object that was passed in).
    """
    # column order of the giDelimT select below
    SUBGI, GENE, LOCS_TAG, PROID = range(4)
    NAs = 'X'
    PASSWD = 3  # position of the password inside mySqlConf

    if mySqlConf[PASSWD] == NAs:
        # then, we do not use mysql
        return h_annoT

    con = dbUtils.init_mysql_innocentive(mySqlConf, 0)
    if not con:
        return h_annoT

    try:
        for ti in h_gisPerTi:
            for gi in h_gisPerTi[ti]:
                # NOTE(review): ids are interpolated straight into the SQL
                # text, so they are assumed to be numeric -- confirm.
                cur = con.cursor()
                cur.execute('select sub_gi, gene, locus_tag, protein_id, stbp, edbp from giDelimT where gi = %s' % gi)
                for row in cur.fetchall() or ():
                    cur = con.cursor()
                    cur.execute('select ref_name, product from giAnnoT where gi = %s' % row[SUBGI])
                    anno = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if anno:
                        ref_name = anno[0]
                        product = anno[1]
                    h_annoT.setdefault(ti, []).append([
                        row[SUBGI], row[GENE], row[LOCS_TAG], row[PROID],
                        ref_name, product
                    ])
    finally:
        # close the connection even when a query raised
        dbUtils.mysql_close(con)

    return h_annoT
def simple_genome_annotation(h_gisPerTi, mySqlConf, h_annoT):
    """Collect sub-entry annotations from MySQL for every gi of every taxon.

    For each gi in h_gisPerTi[ti] the rows of giDelimT with that gi are read;
    each row is completed with (ref_name, product) of its sub_gi from giAnnoT
    ('X' when no giAnnoT row exists) and appended to h_annoT[ti] as
    [sub_gi, gene, locus_tag, protein_id, ref_name, product].

    Parameters:
        h_gisPerTi: mapping taxon id -> iterable of gi values.
        mySqlConf: sequence indexed as [hostname, port, user, passwd,
            defaultDb]; a passwd of 'X' disables the MySQL lookup.
        h_annoT: mapping taxon id -> list of annotation rows; updated in
            place.

    Returns:
        h_annoT (the same object that was passed in).
    """
    # column order of the giDelimT select below
    SUBGI, GENE, LOCS_TAG, PROID = range(4)
    NAs = 'X'
    PASSWD = 3  # position of the password inside mySqlConf

    if mySqlConf[PASSWD] == NAs:
        # then, we do not use mysql
        return h_annoT

    con = dbUtils.init_mysql_innocentive(mySqlConf, 0)
    if not con:
        return h_annoT

    try:
        for ti in h_gisPerTi:
            for gi in h_gisPerTi[ti]:
                # NOTE(review): ids are interpolated straight into the SQL
                # text, so they are assumed to be numeric -- confirm.
                cur = con.cursor()
                cur.execute('select sub_gi, gene, locus_tag, protein_id, stbp, edbp from giDelimT where gi = %s' % gi)
                for row in cur.fetchall() or ():
                    cur = con.cursor()
                    cur.execute('select ref_name, product from giAnnoT where gi = %s' % row[SUBGI])
                    anno = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if anno:
                        ref_name = anno[0]
                        product = anno[1]
                    h_annoT.setdefault(ti, []).append([
                        row[SUBGI], row[GENE], row[LOCS_TAG], row[PROID],
                        ref_name, product
                    ])
    finally:
        # close the connection even when a query raised
        dbUtils.mysql_close(con)

    return h_annoT
def _skip_to_first_locus(fp):
    """Consume lines of fp up to and including the first LOCUS line.

    Returns that line, or '' when the file ends before a LOCUS line is seen
    (the caller then has nothing left to iterate over).
    """
    while True:
        line = fp.readline()
        if not line:
            # EOF: readline() would keep returning '' forever
            return line
        if len(line) > 5 and line[:5] == "LOCUS":
            return line


def _load_and_index(con, load_fn, table, index_cmd):
    """Bulk-load the tab-separated file load_fn into table, then run index_cmd."""
    print('loading %s...' % (load_fn))
    cur = con.cursor()
    cur.execute('load data local infile \'%s\' into table %s fields terminated by \'\\t\'' % (load_fn, table))
    cur = con.cursor()
    cur.execute(index_cmd)
    print('done.')


def gb2prepare_load_data_file(MySqlConf, tiNtDfn, downloadD):
    """Download GenBank/RefSeq flat files and load their annotation into MySQL.

    Steps, in order:
      1. create (or clean) <downloadD>/gbff and download the flat files with
         pathoUtilsA.wget_download2();
      2. gunzip every downloaded file in turn, split it into LOCUS ... //
         entries, parse each entry with parse_ncbi_entry() and write three
         tab-separated load files (giAnnoT2load.csv, delimT2load.csv,
         tax2load.csv); processed archives are moved to 'completed_gbff';
      3. call add_dbsize2taxonT(tiNtDfn, <tax2load.csv>) and bulk-load the
         three files into giAnnoT, giDelimT and cj_taxonT, creating one index
         per table.

    Parameters:
        MySqlConf: connection settings handed to
            dbUtils.init_mysql_innocentive().
        tiNtDfn: passed to add_dbsize2taxonT() together with the taxon load
            file.
        downloadD: base download directory.

    Returns:
        None.
    """
    #TODO[to develop to maintain pathoDB , differential updates]----------
    downloadD_gff = downloadD + '/gbff'
    gbDnExt = ('seq.gz', 'gbff.gz', 'protein.gpff.gz')

    #to clean up downloadD first
    if not os.path.exists(downloadD_gff):
        os.makedirs(downloadD_gff)
    else:
        cmd = 'rm -rf %s/*.gbff.gz*\n' % (downloadD_gff)
        cmd = '%srm -rf %s/*.gpff.gz*\n' % (cmd, downloadD_gff)
        cmd = '%srm -rf %s/*.seq.gz*\n' % (cmd, downloadD_gff)
        os.system(cmd)

    # (ftp directory, file-name pattern) of every flat-file collection, in
    # download order: genbank, refseq genomic/rna, refseq protein, tsa
    downExt = 'gz'
    flatFileSources = (
        ('ftp://ftp.ncbi.nih.gov/genbank', '*.seq'),
        ('ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete', '*.gbff'),
        ('ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete', '*.gpff'),
        ('ftp://ftp.ncbi.nih.gov/genbank/tsa', '*.gbff'),
    )
    for ftpD, downFbase in flatFileSources:
        flatFtp = ftpD + '/' + downFbase + '.' + downExt
        pathoUtilsA.wget_download2(flatFtp, downFbase, downExt,
                                   downloadD_gff, 'nothing', 'X')

    #except the following format files
    os.system('rm -rf %s/*.mstr.gbff.gz' % downloadD_gff)

    #the following two lines should be consistent with the one defined in parse_ncbi_entry()
    GI, REF_NAME, TAXON_ID, ORGANISM, LINEAGE, PRODUCT, STBP, EDBP, SUB_GI = range(9)
    GI_SUB, STRAND_SUB, STBP_SUB, EDBP_SUB, GENE_SUB, LOCUS_TAG_SUB, PRODUCT_SUB, PROTEIN_ID_SUB = range(8)

    ##################################################################$
    #processing genbank flat file and transfer all annotation to mysql
    ##################################################################$
    ANNO_T, DELIM_T, TAX_T = range(3)
    gi_annoT_fn = downloadD_gff + '/giAnnoT2load.csv'
    delimT_fn = downloadD_gff + '/delimT2load.csv'
    taxT_fn = downloadD_gff + '/tax2load.csv'
    h_taxLookup = {}  # taxon ids already written to the taxon load file

    print('transferring gene bank report to mysql...')
    gbFlatTmp = '%s/gb2process.tmp' % downloadD_gff
    pkey_anno = 1
    pkey_delim = 1
    pkey_ti = 1
    doneGbD = downloadD_gff + '/completed_gbff'
    if not os.path.exists(doneGbD):
        os.makedirs(doneGbD)

    fps = []
    try:
        # opened in ANNO_T, DELIM_T, TAX_T order so the indices match
        for load_fn in (gi_annoT_fn, delimT_fn, taxT_fn):
            fps.append(open(load_fn, 'w'))

        #count a total # of gz to process
        F = len(os.listdir(downloadD_gff))
        f = 0
        for gbFlatFn in os.listdir(downloadD_gff):
            tick = time()
            if not gbFlatFn.endswith(gbDnExt):
                continue
            cmd = 'gunzip -c %s/%s > %s\n' % (downloadD_gff, gbFlatFn, gbFlatTmp)
            cmd = '%smv %s/%s %s/%s\n' % (cmd, downloadD_gff, gbFlatFn, doneGbD, gbFlatFn)
            os.system(cmd)
            f += 1

            print('processing %s[%d/%d]...' % (gbFlatFn, f, F))
            with open(gbFlatTmp, 'r') as fp:
                #skipping header; only focus on the section between "LOCUS" ... "//"
                entry = [_skip_to_first_locus(fp)]
                for x in fp:
                    if not re.search(r'^//', x):
                        entry.append(x)
                        continue
                    # Every time we get to a //\n line, we read the current
                    # entry and then start collecting a new one.
                    gB = parse_ncbi_entry(entry)
                    entry = []
                    #0) check if query gi has multiple sub gis
                    has_sub = 0
                    if len(gB[SUB_GI]) > 0:
                        has_sub = 1
                    #1) update query gi annotation
                    pkey_anno, fps[ANNO_T] = csv_update_anno_gi(
                        fps[ANNO_T], gB[GI], gB[REF_NAME], gB[EDBP],
                        gB[TAXON_ID], gB[PRODUCT], has_sub, pkey_anno)
                    #2) update sub_gi delimit
                    for s in gB[SUB_GI]:
                        pkey_delim, fps[DELIM_T] = csv_update_delim(
                            fps[DELIM_T], gB[GI], s[GI_SUB], s[GENE_SUB],
                            s[LOCUS_TAG_SUB], s[PROTEIN_ID_SUB],
                            s[STRAND_SUB], s[STBP_SUB], s[EDBP_SUB],
                            pkey_delim)
                    #3) record each taxon id only once
                    if h_taxLookup.get(gB[TAXON_ID], -1) == -1:
                        pkey_ti, fps[TAX_T] = csv_update_ti(
                            fps[TAX_T], gB[TAXON_ID], gB[ORGANISM],
                            gB[LINEAGE], pkey_ti)
                        h_taxLookup[gB[TAXON_ID]] = 1

            tock = time()
            elapsed = tock - tick
            print('elasped time:[%g]' % elapsed)
    finally:
        # flush the load files before mysql reads them, also on failure
        for out in fps:
            out.close()

    con = dbUtils.init_mysql_innocentive(MySqlConf, 0)
    try:
        with con:
            _load_and_index(con, gi_annoT_fn, 'giAnnoT',
                            'create unique index idx_gi on giAnnoT (gi)')
            _load_and_index(con, delimT_fn, 'giDelimT',
                            'create index idx_subgi on giDelimT (gi,stbp,edbp)')

            print('computing database size for each taxon id...')
            add_dbsize2taxonT(tiNtDfn, taxT_fn)
            print('done.')

            _load_and_index(con, taxT_fn, 'cj_taxonT',
                            'create unique index idx_taxon on cj_taxonT (taxon)')
    finally:
        dbUtils.mysql_close(con)
    print('done')
parser.add_argument(
    '-r',
    action='store',
    dest='reset_table',
    required=False,
    type=int,
    default=0,
    help='set to 1 if you want to reset mysql table')
args = parser.parse_args()

#####################$
#open mysql connection
#####################$
# positions of the individual settings inside MySqlConf
HOST_NAME, MYSQL_PORT, USER, PASSWORD, DEFAULT_DB = range(5)
MySqlConf = [args.hostname, args.port, args.user, args.passwd, '']
con = dbUtils.init_mysql_innocentive(MySqlConf, args.reset_table)

#####################################$
#create ncbi phylogeny tree into mysql
#####################################$
# fetch the nodes dump into the download directory, then store it in mysql
nodesDfn = pathoLib.getNodesDump_online(args.downloadD)
dbUtils.phylo_node2mysql(con, nodesDfn)

###################################$
#close mysql connection
###################################$
dbUtils.mysql_close(con)

####################################$
#create ncbi genbank flat into mysql
def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead,
                          h_refScore, h_gisPerTi, h_tiRef, reads,
                          h_readSequence, samFile, mySqlConf):
    """Build the report element listing every organism in h_gisPerTi.

    One Organism is created per taxon id, filled with its score, genes,
    contigs and reads; the organisms are sorted by descending
    relativeAmount.value before Organisms.buildElement() is called.

    Parameters:
        h_annoT: taxon id -> list of annotation rows; items 1..5 of a row are
            used as gene name, locus_tag, protein_id, ref_name and product.
        h_ti_contig: taxon id -> list of [ref_name, length, sequence].
        hostTaxon: key of the host entry in h_refScore; may be empty.
        h_refRead: reference id -> list of read names.
        h_refScore: reference id -> score.
        h_gisPerTi: mapping whose keys are the taxon ids to report.
        h_tiRef: taxon id -> [reference id, organism name]; a missing entry
            falls back to [ti, ti].
        reads: all reads; only its length is used.
        h_readSequence: read name -> read sequence.
        samFile: not used by this function.
        mySqlConf: sequence indexed as [hostname, port, user, passwd,
            defaultDb]; a passwd of 'X' disables the MySQL lookup of organism
            name and lineage.

    Returns:
        The value returned by Organisms.buildElement().
    """
    NAs = 'X'
    PASSWD = 3  # position of the password inside mySqlConf
    # gene attribute stored at each position of an annotation row
    GENE_FIELDS = ((2, 'locus_tag'), (3, 'protein_id'),
                   (4, 'ref_name'), (5, 'product'))

    useMysql = mySqlConf[PASSWD] != NAs

    organismsObj = Organisms()
    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (LookupError, TypeError):
            # host without a score counts as zero reads
            hostScore = 0
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    con = None
    if useMysql:
        con = dbUtils.init_mysql_innocentive(mySqlConf, 0)

    try:
        for ti in h_gisPerTi:
            refIdName = h_tiRef.get(ti, [ti, ti])
            refId = refIdName[0]
            organismName = refIdName[1]
            score = h_refScore.get(refId, 0)
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)

            organism = Organism(organismName)
            if useMysql:
                # first three words of the name: genus, species, strain
                words = organismName.split()
                if len(words) > 0:
                    organism.genus = words[0]
                if len(words) > 1:
                    organism.species = words[1]
                if len(words) > 2:
                    organism.strain = words[2]

            organism.relativeAmount = RelativeAmount(score)
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            genes = []
            for giList in h_annoT.get(ti, []):
                gene = Gene(giList[1])
                for idx, attr in GENE_FIELDS:
                    # empty values and the 'X' placeholder are left unset
                    if giList[idx] and giList[idx] != NAs:
                        setattr(gene, attr, giList[idx])
                genes.append(gene)
            organism.genes = genes

            #add contig information
            contigs = []
            for j, c in enumerate(h_ti_contig.get(ti, [])):
                contig2 = Contig(ti + '_ctg_' + str(j))
                contig2.ref_name = c[0]
                #make sure that all string format only available in xml
                contig2.length = str(c[1])
                contig2.contig = c[2]
                contigs.append(contig2)
            organism.contigs = contigs

            #add read information
            organismReads = []
            for readname in h_refRead.get(refId, []):
                read = Read(readname)
                read.readSequence = h_readSequence[readname]
                organismReads.append(read)
            organism.reads = organismReads

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(
            organismsObj.organisms,
            key=lambda x: x.relativeAmount.value,
            reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # close the connection even when building an organism failed
        if con:
            dbUtils.mysql_close(con)

    return organismsElement
def _parse_ti_gi(ref_id):
    """Return (ti, gi) parsed from a reference id, or (None, None).

    Recognised layouts are 'ti|<ti>|org|<name>|gi|<gi>|' and
    'ti|<ti>|gi|<gi>|'.
    """
    mObj = re.search(r'ti\|(\d+)\|org\|([^|]+)\|gi\|(\d+)\|', ref_id)
    if mObj:
        return mObj.group(1), mObj.group(3)
    mObj = re.search(r'ti\|(\d+)\|gi\|(\d+)\|', ref_id)
    if mObj:
        return mObj.group(1), mObj.group(2)
    return None, None


def get_genome_annotation_in_mysql(refConsFq, minContigLen, MySqlConf,
                                   h_annoT, h_ti_contig):
    """Collect covered contigs per taxon and, optionally, their annotations.

    Every record of the consensus FASTQ is reduced to the ranges returned by
    selectConsensusContigs(); each range is stored in h_ti_contig[ti] as
    [record id, length, sequence]. When MySQL is configured, the giDelimT rows
    of the record's gi are examined and a row is reported in h_annoT[ti] as
    [sub_gi, gene, locus_tag, protein_id, ref_name, product] as soon as the
    accumulated result of pathoUtilsA.segments_intersect() against the covered
    ranges exceeds 100.

    Records whose id carries no taxonomy id are skipped: they cannot be
    attributed to a taxon.

    Parameters:
        refConsFq: path of the consensus FASTQ file.
        minContigLen: forwarded to selectConsensusContigs().
        MySqlConf: sequence indexed as [hostname, port, user, passwd,
            defaultDb]; a passwd of 'X' disables the MySQL lookup.
        h_annoT: taxon id -> list of annotation rows; updated in place.
        h_ti_contig: taxon id -> list of contigs; updated in place.

    Returns:
        (h_annoT, h_ti_contig)
    """
    START, END = range(2)
    # column order of the giDelimT select below
    SUBGI, GENE, LOCS_TAG, PROID, STBP, EDBP = range(6)
    NAs = 'X'
    PASSWD = 3  # position of the password inside MySqlConf
    MIN_COVERED = 100  # a sub-entry is reported once more than this is covered

    con = None
    if MySqlConf[PASSWD] != NAs:
        con = dbUtils.init_mysql_innocentive(MySqlConf, 0)

    try:
        with open(refConsFq, 'r') as fp:
            for r in seqParse.parse(fp, 'fastq'):  # for each covered genome
                #disable checking seq complexity of contig
                covRange = selectConsensusContigs(r, minContigLen, -1)
                if not covRange:
                    continue

                #extract ti and gi
                refName = r.id
                ti, gi = _parse_ti_gi(r.id)
                if ti is None:
                    # never reuse the ids of the previous record
                    continue

                contigs = h_ti_contig.setdefault(ti, [])
                for seg in covRange:
                    contigSeq = str(r.seq[seg[START]:seg[END] + 1])
                    cLen = seg[END] - seg[START] + 1
                    contigs.append([refName, cLen, contigSeq])

                if not con:
                    continue

                cur = con.cursor()
                cur.execute('select sub_gi, gene, locus_tag, protein_id, stbp, edbp from giDelimT where gi = %s' % gi)
                for j in cur.fetchall() or ():
                    #select which subgi sits within the covered genomic regions
                    aStbp = int(j[STBP])
                    aEdbp = int(j[EDBP])
                    covered = 0
                    reportA = False
                    for seg in covRange:
                        covered += pathoUtilsA.segments_intersect(
                            aStbp, aEdbp, seg[START], seg[END])
                        if covered > MIN_COVERED:
                            reportA = True
                            break
                    if not reportA:
                        continue

                    cur = con.cursor()
                    cur.execute('select ref_name, product from giAnnoT where gi = %s' % j[SUBGI])
                    entr2 = cur.fetchone()
                    ref_name = NAs
                    product = NAs
                    if entr2:
                        ref_name = entr2[0]
                        product = entr2[1]
                    h_annoT.setdefault(ti, []).append([
                        j[SUBGI], j[GENE], j[LOCS_TAG], j[PROID],
                        ref_name, product
                    ])
    finally:
        # close the connection even when parsing or a query failed
        if con:
            dbUtils.mysql_close(con)

    return h_annoT, h_ti_contig
def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead,
                          h_refScore, h_gisPerTi, h_tiRef, reads,
                          h_readSequence, samFile, mySqlConf):
    """Build the report element listing every organism in h_gisPerTi.

    One Organism is created per taxon id, filled with its score, genes,
    contigs and reads; the organisms are sorted by descending
    relativeAmount.value before Organisms.buildElement() is called.

    Parameters:
        h_annoT: taxon id -> list of annotation rows; items 1..5 of a row are
            used as gene name, locus_tag, protein_id, ref_name and product.
        h_ti_contig: taxon id -> list of [ref_name, length, sequence].
        hostTaxon: key of the host entry in h_refScore; may be empty.
        h_refRead: reference id -> list of read names.
        h_refScore: reference id -> score.
        h_gisPerTi: mapping whose keys are the taxon ids to report.
        h_tiRef: taxon id -> [reference id, organism name]; a missing entry
            falls back to [ti, ti].
        reads: all reads; only its length is used.
        h_readSequence: read name -> read sequence.
        samFile: not used by this function.
        mySqlConf: sequence indexed as [hostname, port, user, passwd,
            defaultDb]; a passwd of 'X' disables the MySQL lookup of organism
            name and lineage.

    Returns:
        The value returned by Organisms.buildElement().
    """
    NAs = 'X'
    PASSWD = 3  # position of the password inside mySqlConf
    # gene attribute stored at each position of an annotation row
    GENE_FIELDS = ((2, 'locus_tag'), (3, 'protein_id'),
                   (4, 'ref_name'), (5, 'product'))

    useMysql = mySqlConf[PASSWD] != NAs

    organismsObj = Organisms()
    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (LookupError, TypeError):
            # host without a score counts as zero reads
            hostScore = 0
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    con = None
    if useMysql:
        con = dbUtils.init_mysql_innocentive(mySqlConf, 0)

    try:
        for ti in h_gisPerTi:
            refIdName = h_tiRef.get(ti, [ti, ti])
            refId = refIdName[0]
            organismName = refIdName[1]
            score = h_refScore.get(refId, 0)
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)

            organism = Organism(organismName)
            if useMysql:
                # first three words of the name: genus, species, strain
                words = organismName.split()
                if len(words) > 0:
                    organism.genus = words[0]
                if len(words) > 1:
                    organism.species = words[1]
                if len(words) > 2:
                    organism.strain = words[2]

            organism.relativeAmount = RelativeAmount(score)
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            genes = []
            for giList in h_annoT.get(ti, []):
                gene = Gene(giList[1])
                for idx, attr in GENE_FIELDS:
                    # empty values and the 'X' placeholder are left unset
                    if giList[idx] and giList[idx] != NAs:
                        setattr(gene, attr, giList[idx])
                genes.append(gene)
            organism.genes = genes

            #add contig information
            contigs = []
            for j, c in enumerate(h_ti_contig.get(ti, [])):
                contig2 = Contig(ti + '_ctg_' + str(j))
                contig2.ref_name = c[0]
                #make sure that all string format only available in xml
                contig2.length = str(c[1])
                contig2.contig = c[2]
                contigs.append(contig2)
            organism.contigs = contigs

            #add read information
            organismReads = []
            for readname in h_refRead.get(refId, []):
                read = Read(readname)
                read.readSequence = h_readSequence[readname]
                organismReads.append(read)
            organism.reads = organismReads

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(
            organismsObj.organisms,
            key=lambda x: x.relativeAmount.value,
            reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # close the connection even when building an organism failed
        if con:
            dbUtils.mysql_close(con)

    return organismsElement
parser.add_argument('-g', action='store', dest='ti_nt', required=True,
    help='specify a fasta format reference appended with taxonomy id in the front seq header. Refer to app_build_nt_tgt.py to build such fasta file.')
parser.add_argument('-d', action='store', dest='downloadD', required=True,
    help='specify a temporary download directory')
parser.add_argument('-m', action='store', dest='hostname', required=False,
    default='localhost', help='specify hostname running mysql')
# the default is an int, matching type=int; the help text now describes the
# port instead of repeating the hostname description
parser.add_argument('-P', action='store', dest='port', required=False,
    type=int, default=3306, help='specify port number used by mysql')
parser.add_argument('-u', action='store', dest='user', required=False,
    default='root', help='user name to access mysql')
# NOTE: the option is required, so the default 'X' is never applied
parser.add_argument('-p', action='store', dest='passwd', required=True,
    default='X', help='provide password associate with user')
parser.add_argument('-r', action='store', dest='reset_table', required=False,
    type=int, default=0,
    help='set to 1 if you want to reset mysql table')
args = parser.parse_args()

#####################$
#open mysql connection
#####################$
# positions of the individual settings inside MySqlConf
HOST_NAME, MYSQL_PORT, USER, PASSWORD, DEFAULT_DB = range(5)
MySqlConf = [args.hostname, args.port, args.user, args.passwd, '']
con = dbUtils.init_mysql_innocentive(MySqlConf, args.reset_table)

#####################################$
#create ncbi phylogeny tree into mysql
#####################################$
nodesDfn = pathoLib.getNodesDump_online(args.downloadD)
dbUtils.phylo_node2mysql(con, nodesDfn)

###################################$
#close mysql connection
###################################$
dbUtils.mysql_close(con)

####################################$
#create ncbi genbank flat into mysql