def parse_diamond_blastx(input, db, good_reads):
    """Summarise DIAMOND blastx tabular hits into gene and function counts.

    Every hit line that passes the identity, alignment-length and e-value
    cut-offs (columns 3, 4 and 11, compared against 80, 25 and 1e-5) adds one
    to the count of the gene named in column 2.  The per-gene counts are
    written to ``<input>.function.genes.abundance``; they are then summed per
    function id (``db.func[gene]``) and written to
    ``<input>.function.abundance``.

    Args:
        input: Path of the whitespace-separated hit table.  Also used as the
            prefix of both output files.
        db: Object exposing ``len`` (gene -> length), ``func`` (gene ->
            function id) and ``funcdb`` (function id -> string holding at
            least two whitespace-separated tokens).
        good_reads: Not used by this function; kept so existing callers keep
            working.

    Returns:
        A list with one row per function id:
        ``[id, count, count, count, funcdb token 0, funcdb token 1]`` where
        the count is repeated three times as a string.

    Raises:
        IndexError, ValueError: if a non-blank line has fewer than 11 columns
            or a non-numeric value in a filtered column.
        KeyError: if a counted gene is missing from ``db.len`` / ``db.func``
            or a function id is missing from ``db.funcdb``.
    """
    root.flog('processing blastx from DIAMOND')
    min_identity = 80
    min_length = 25
    max_evalue = 1e-5

    gene_counts = {}
    with open(input) as hits:
        for line in hits:
            fields = line.split()
            if not fields:
                # A blank line carries no hit; skip it instead of failing on
                # the column lookups below.
                continue
            if (float(fields[2]) >= min_identity
                    and float(fields[3]) >= min_length
                    and float(fields[10]) <= max_evalue):
                gene_counts[fields[1]] = gene_counts.get(fields[1], 0) + 1

    with open(input + '.function.genes.abundance', 'w') as fo:
        for gene in gene_counts:
            # NOTE(review): the count and the bracketed length are written
            # back to back with no separator (e.g. "5[1200]").  Kept
            # byte-for-byte because readers of this file are not visible
            # here -- confirm whether a tab was intended.
            fo.write(gene + '\t' + str(gene_counts[gene])
                     + str([db.len[gene]]) + '\n')

    # Total number of counted reads per function id.
    func_counts = {}
    for gene in gene_counts:
        func_id = db.func[gene]
        func_counts[func_id] = func_counts.get(func_id, 0) + gene_counts[gene]

    ABN = []
    with open(input + '.function.abundance', 'w') as fo:
        for func_id in func_counts:
            count = str(func_counts[func_id])
            tokens = db.funcdb[func_id].split()
            ABN.append([func_id, count, count, count, tokens[0], tokens[1]])
            fo.write("\t".join([func_id, count, tokens[1]]) + "\n")

    root.flog('taxonomy abundance: done')
    return ABN
def parse_blast(filename, taxo, lens, taxodb, analysis, dbname, silvafile, good_reads, pip):
    """Quantify genes and categories from a tab-separated hit table (best hit per read).

    For every read (column 1) only the hit with the largest value in the last
    column is kept; on ties the first one seen wins.  A best hit is counted
    for its gene (column 2) when columns 3, 4 and 11 pass the identity,
    alignment-length and e-value cut-offs.  Per-gene and per-category tables
    are written next to ``filename``:

    * ``<filename>.<analysis>.genes.abundance.rpkm`` and
      ``<filename>.<analysis>.abundance.rpkm`` always;
    * ``<filename>.<analysis>.genes.abundance.16s`` and
      ``<filename>.<analysis>.abundance.16s`` only when ``analysis`` is
      ``"function"``.

    NOTE(review): the source as seen contains a second ``parse_blast`` with a
    different signature (no ``pip``) further down; if both live in the same
    module the later definition replaces this one -- confirm which is used.

    Args:
        filename: Path of the hit table; also the prefix of the output files.
        taxo: Mapping gene -> iterable of category ids (strings).
        lens: Mapping gene -> gene length (anything ``float()`` accepts).
        taxodb: Mapping category id -> string with at least two
            whitespace-separated tokens.
        analysis: Label used in the output file names; ``"function"`` also
            enables the 16s-normalised tables.
        dbname: Not used by this function.
        silvafile: Path of a whitespace-separated file whose third column is
            summed for the 16s normalisation (read only for ``"function"``).
        good_reads: Read total used as the RPKM denominator.
        pip: ``'matches'`` selects an identity cut-off of 80, anything else 60.

    Returns:
        For ``analysis == "function"`` a dict ``{'rpkm': rows, '16s': rows}``,
        otherwise just the RPKM rows.  Each row is
        ``[category, n_genes, n_reads, normalised value, taxodb token 0,
        taxodb token 1]`` with the three numbers as strings.
    """
    max_evalue = 1e-5
    min_length = 25
    min_identity = 80 if pip == 'matches' else 60
    prefix = filename + "." + analysis

    def _accumulate(table, key, n_reads, value):
        # Per-category running totals: [reads, summed value, number of genes].
        entry = table.get(key)
        if entry is None:
            table[key] = [n_reads, value, 1]
        else:
            entry[0] += n_reads
            entry[1] += value
            entry[2] += 1

    def _write_categories(path, table):
        # Write one line per category and return the same rows as lists.
        rows = []
        with open(path, 'w') as outf:
            for key in table:
                reads, value, n_genes = table[key]
                tokens = taxodb[key].split()
                row = [key, str(n_genes), str(reads), str(value),
                       tokens[0], tokens[1]]
                # The trailing "\n" element leaves a tab before the newline;
                # kept so the file format does not change.
                outf.write("\t".join(row + ["\n"]))
                rows.append(row)
        return rows

    # Best hit: keep, per read, the line with the highest last-column score.
    best_hit = {}
    with open(filename, 'r') as hits:
        for raw in hits:
            if not raw.strip():
                # Blank lines would otherwise be stored as a one-field "hit"
                # and break the column checks below.
                continue
            fields = raw.replace("\n", "").split("\t")
            read = fields[0]
            try:
                is_better = float(fields[-1]) > float(best_hit[read][-1])
            except (KeyError, ValueError):
                # First hit for this read, or a score that cannot be parsed:
                # the current line is stored, as before.
                is_better = True
            if is_better:
                best_hit[read] = fields

    # Reads per gene, counting only best hits that pass the cut-offs.
    gene_reads = {}
    for fields in best_hit.values():
        if (float(fields[2]) >= min_identity
                and float(fields[3]) >= min_length
                and float(fields[10]) <= max_evalue):
            gene_reads[fields[1]] = gene_reads.get(fields[1], 0) + 1

    # RPKM normalisation.
    rpkm_table = {}
    with open(prefix + ".genes.abundance.rpkm", 'w') as outf:
        root.flog('mode: Normalization by RPKM')
        for gene in gene_reads:
            n_reads = gene_reads[gene]
            rpkm = (n_reads * 1000000000) / (good_reads * float(lens[gene]))
            outf.write("\t".join([gene, ",".join(taxo[gene]), str(n_reads),
                                  str(rpkm), str(lens[gene]), "\n"]))
            for category in taxo[gene]:
                _accumulate(rpkm_table, category, n_reads, rpkm)
    OUT1 = _write_categories(prefix + ".abundance.rpkm", rpkm_table)

    if analysis != "function":
        return OUT1

    # Normalisation by the summed third column of the silva file.
    read_length = 100
    ref_length = 1432
    table_16s = {}
    with open(prefix + ".genes.abundance.16s", 'w') as outf:
        root.flog('mode: Normalization by 16s rRNA abundance')
        # Starts at 1, so an empty silva file still gives a non-zero
        # denominator.
        n16s = 1
        with open(silvafile) as silva:
            for line in silva:
                n16s += float(line.split()[2])
        for gene in gene_reads:
            n_reads = gene_reads[gene]
            value = (((n_reads * read_length) / float(lens[gene]))
                     / (n16s * read_length / float(ref_length)))
            outf.write("\t".join([gene, ",".join(taxo[gene]), str(n_reads),
                                  str(value), str(lens[gene]), "\n"]))
            for category in taxo[gene]:
                _accumulate(table_16s, category, n_reads, value)
    OUT2 = _write_categories(prefix + ".abundance.16s", table_16s)

    return {'rpkm': OUT1, '16s': OUT2}
def parse_sam(input, db, good_reads):
    """Count perfectly matching best hits per gene from a SAM-style file.

    Processing happens in three stages:

    1. For every read name (column 1) the line with the smallest ``NM:i:``
       value is kept.  Only lines with at least 10 tab-separated columns and
       more than 25 matched bases (sum of the ``<n>M`` runs in column 6) are
       considered.
    2. A kept line counts for its reference (column 3) only when its
       ``NM:i:`` value is 0.
    3. Per-gene counts and ``compute_rpkm`` values are written to
       ``<input>.taxonomy.abundance.genes.rpkm`` and aggregated per
       ``db.taxo[gene][0]`` into ``<input>.taxonomy.abundance.rpkm``.

    NOTE(review): the source as seen contains a second ``parse_sam`` further
    down; if both live in the same module the later definition replaces this
    one -- confirm which is used.

    Args:
        input: Path of the alignment file; also the prefix of both outputs.
        db: Object exposing ``len`` (gene -> length), ``taxo`` (gene ->
            sequence whose first item is the category id) and ``taxodb``
            (category id -> string with at least two whitespace-separated
            tokens).
        good_reads: Passed through to ``compute_rpkm``.

    Returns:
        A list with one row per category:
        ``[category, n_genes, n_reads, summed rpkm, taxodb token 0,
        taxodb token 1]`` with the three numbers as strings.
    """

    def _edit_distance(text):
        # Integer that follows the first "NM:i:" tag on the line.
        return int(text.split("NM:i:")[1].split()[0])

    # Stage 1: best (lowest NM) line per read.
    best_line = {}
    with open(input) as sam:
        for line in sam:
            values = line.replace("\n", "").split("\t")
            if len(values) < 10:
                # The original touched columns 6 and 10 and skipped the line
                # when either was missing.
                continue
            try:
                matched_bp = sum(int(run.strip('M'))
                                 for run in re.findall(r'(\d*M)', values[5]))
            except ValueError:
                # An "M" without a leading number cannot be summed.
                continue
            if matched_bp <= 25:
                continue
            read = values[0]
            try:
                is_better = _edit_distance(line) < _edit_distance(best_line[read])
            except (KeyError, IndexError, ValueError):
                # First line for this read, or a missing / unparsable NM tag:
                # the current line is stored, as before.
                is_better = True
            if is_better:
                best_line[read] = line

    # Stage 2: every read appears once in best_line, so each perfect match
    # adds exactly one read to its gene.
    gene_reads = {}
    for line in best_line.values():
        try:
            distance = _edit_distance(line)
        except (IndexError, ValueError):
            continue
        if distance > 0:
            continue
        gene = line.split("\t")[2]
        gene_reads[gene] = gene_reads.get(gene, 0) + 1

    # Stage 3: per-gene table and per-category totals
    # [reads, summed rpkm, number of genes].
    taxo_id_ab = {}
    with open(input + '.taxonomy.abundance.genes.rpkm', 'w') as fo:
        for gene in gene_reads:
            n_reads = gene_reads[gene]
            rpkm = compute_rpkm(n_reads, good_reads, db.len[gene])
            category = db.taxo[gene][0]
            fo.write("\t".join([gene, str(category), str(n_reads),
                                str(rpkm), str(db.len[gene])]) + "\n")
            entry = taxo_id_ab.get(category)
            if entry is None:
                taxo_id_ab[category] = [n_reads, rpkm, 1]
            else:
                entry[0] += n_reads
                entry[1] += rpkm
                entry[2] += 1

    ABN = []
    with open(input + '.taxonomy.abundance.rpkm', 'w') as fo:
        for category in taxo_id_ab:
            reads, rpkm_sum, n_genes = taxo_id_ab[category]
            tokens = db.taxodb[category].split()
            row = [category, str(n_genes), str(reads), str(rpkm_sum),
                   tokens[0], tokens[1]]
            ABN.append(row)
            fo.write("\t".join(row) + "\n")

    root.flog('taxonomy abundance: done')
    return ABN
def process(projectid, sampleid, db, protocol, reads1, reads2, good_reads):
    """Run the read-matching pipeline for one sample against one database.

    The function stores the read paths on the sample row, then runs up to
    three branches depending on ``db``:

    * ``db.name == "abcdefghij"``: run the 'MetaPhlAnR' program and store a
      taxonomy result.
    * ``db.taxo != "none"``: align with bowtie2, quantify with ``parse_sam``,
      store a taxonomy result and return ``'success'``.
    * ``db.func != "none"``: merge the paired reads, convert to FASTA, run
      DIAMOND blastx, quantify with ``pb`` and store a function result.

    NOTE(review): the nesting below was reconstructed from a source whose
    whitespace had been collapsed.  In particular, confirm whether the final
    ``update_status(... "Done")`` calls, the two ``rm`` commands and the
    ``return 'success'`` statements belong at branch level as shown.

    NOTE(review): ``p`` is used in the shell commands but is not defined in
    this function -- presumably a module-level thread count; confirm.

    Args:
        projectid: Project id used in the SQL statements and result paths.
        sampleid: Sample id used in the SQL statements and status updates.
        db: Database descriptor; the attributes read here are ``name``,
            ``id``, ``taxo``, ``func``, ``bowtie``, ``diamond``, ``len`` and
            ``funcdb``.
        protocol: Label forwarded to ``update_status`` and the result objects.
        reads1: Path stored in the ``reads1`` column of the sample row.
        reads2: Path stored in the ``reads2`` column of the sample row.
        good_reads: Forwarded to the quantification functions.

    Returns:
        ``'success'`` from the taxonomy or function branch; ``None`` when
        neither branch runs (given the nesting shown here).
    """
    x = sql.SQL(root.filedb())
    xpath = x.project(projectid)[0][4]
    # NOTE(review): the SQL statements are assembled by string concatenation;
    # if any of these values can come from users this is open to SQL
    # injection.  The API of x.exe is not visible here, so it is left as is.
    # `val` is assigned but never read in this function.
    val = x.exe('update samples set reads1="' + reads1 + '" where project_id="' + projectid + '" and sample_id="' + sampleid + '"')
    val = x.exe('update samples set reads2="' + reads2 + '" where project_id="' + projectid + '" and sample_id="' + sampleid + '"')
    samples = x.exe('select * from samples where project_id="' + projectid + '" and sample_id="' + sampleid + '"')
    sample = samples[0]
    sample = root.samples(sample, xpath)
    root.mkdir(sample.matchesDir)
    # `rdir` is assigned but never read in this function.
    rdir = root.__ROOTPRO__ + "/" + projectid + "/READS/"
    if db.name == "abcdefghij":
        update_status(x, sampleid, db.id, protocol, "Processing")
        metaphlan = root.program('MetaPhlAnR', sample, db)
        # root.isdir is used throughout as an "output already exists" guard
        # (presumably a path-existence check -- confirm).
        if not root.isdir(metaphlan.out):
            metaphlan.run()
        G = txp.metaphlan_taxonomy_tree(metaphlan.out)
        abn = root.SampleResults(sample, G, protocol, db.name, "taxonomy", metaphlan.out)
        abn.start()
        update_status(x, sampleid, db.id, protocol, "Done")
    if not db.taxo == "none":
        # Run bowtie2 on the paired-end reads.  Note that this status update
        # passes the literal 'matches' where the others pass `protocol`.
        update_status(x, sampleid, db.id, 'matches', "Screening")
        cmd = " ".join([
            root.__ROOTEXEDIR__ + 'bowtie2',
            '--very-fast-local -p ' + p + ' --no-unal --no-hd --no-sq -x',
            db.bowtie, '-1', sample.reads1, '-2', sample.reads2, '-S',
            sample.matchesDir + '/alignment.' + db.id + '.matches >>',
            root.log, '2>&1'
        ])
        if not root.isdir(sample.matchesDir + '/alignment.' + db.id + '.matches'):
            os.system(cmd)
        # Process the SAM output to get genes and the number of reads per gene.
        update_status(x, sampleid, db.id, protocol, "Quantification")
        if not root.isdir(sample.matchesDir + '/alignment.' + db.id + '.matches.taxonomy.abundance.results.sqlite3.db'):
            abundance = parse_sam(
                sample.matchesDir + '/alignment.' + db.id + '.matches', db, good_reads)
            G = txp.taxonomy_tree(
                abundance, sample.matchesDir + '/alignment.'
                + db.id + '.matches', protocol, "taxonomy", db.id)
            abn = root.SampleResults(sample, G, protocol, db.id, "taxonomy", sample.matchesDir + '/alignment.' + db.id + '.matches')
            # Store the data in the SQL table.
            abn.start()
        update_status(x, sampleid, db.id, protocol, "Done")
        # Returning here means the function branch below is skipped whenever
        # the taxonomy branch runs.
        return 'success'
    if not db.func == "none":
        fileso = root.result_files(projectid, "function", protocol, sampleid, db.name)
        # Merge the paired ends.
        update_status(x, sampleid, db.id, protocol, "Merge")
        cmd = " ".join([
            'python',
            root.__ROOTEXEDIR__ + "pairend_join.py -s -p " + p + " -m 8 -o ",
            sample.matchesDir + '/merged.reads.fastq', sample.reads1, sample.reads2
        ])
        root.flog(cmd)
        if not root.isdir(sample.matchesDir + '/merged.reads.fastq'):
            os.system(cmd)
        # Convert the merged FASTQ to FASTA.
        cmd = ' '.join([
            root.__ROOTEXEDIR__ + '/seqtk seq -a',
            sample.matchesDir + '/merged.reads.fastq >',
            sample.matchesDir + '/merged.reads.fasta'
        ])
        if not root.isdir(sample.matchesDir + '/merged.reads.fasta'):
            os.system(cmd)
        # BlastX with DIAMOND.
        update_status(x, sampleid, db.id, protocol, "Screening")
        # NOTE(review): unlike every other path in this function, `dout` is
        # built without a '/' after matchesDir -- confirm that matchesDir
        # ends with a separator, otherwise this points outside the directory
        # checked by the sqlite3.db guard below.
        dout = sample.matchesDir + 'alignment.' + db.id
        din = sample.matchesDir + '/merged.reads.fasta'
        cmd = ' '.join([
            root.__ROOTEXEDIR__ + '/diamond blastx --id 60 -p ' + p + ' -k 1 -e 1e-5 -d',
            db.diamond, '-a', dout + '.pre', '-q', din, '>>', root.log, "2>&1"
        ])
        # NOTE(review): the guard tests dout + '.daa' while the command is
        # given '-a' dout + '.pre' and the next step reads dout + '.pre.daa'
        # -- confirm the guard path is the intended one.
        if not root.isdir(dout + '.daa'):
            os.system(cmd)
        cmd = ' '.join([
            root.__ROOTEXEDIR__ + '/diamond view -a', dout + '.pre.daa', '-o',
            dout + '.matches -f tab', ">>", root.log, "2>&1"
        ])
        if not root.isdir(dout + '.matches'):
            os.system(cmd)
        # Parse the DIAMOND output.
        update_status(x, sampleid, db.id, protocol, "Quantification")
        if not root.isdir(sample.matchesDir + '/alignment.'
                          + db.id + '.matches.function.abundance.results.sqlite3.db'):
            # `pb` is not defined in the visible source; it is called with
            # eight positional arguments.
            abundance = pb(dout + '.matches', db.func, db.len, db.funcdb, "function", db.name, fileso.GGenes + ".rpkm", good_reads)
            # Disabled alternative kept from the original:
            # abundance=pdx(dout+'.matches', db, good_reads)
            abn = root.SampleResults(sample, 'none', protocol, db.name, "function", dout + '.matches')
            abn.createFuncDb(abundance)
        update_status(x, sampleid, db.id, protocol, "Done")
        # Remove the intermediate merged reads created by this branch.
        os.system('rm ' + sample.matchesDir + '/merged.reads.fastq >> ' + root.log + " 2>&1")
        os.system('rm ' + sample.matchesDir + '/merged.reads.fasta >> ' + root.log + " 2>&1")
        return 'success'
def parse_blast(filename, taxo, lens, taxodb, analysis, dbname, silvafile, good_reads):
    """Quantify genes and categories from a whitespace-separated hit table.

    Every line whose columns 3, 4 and 11 pass the identity (>= 90),
    alignment-length (>= 25) and e-value (<= 1e-10) cut-offs adds one read to
    the gene named in column 2.  Per-gene and per-category tables are written
    next to ``filename``:

    * ``<filename>.<analysis>.genes.abundance.rpkm`` and
      ``<filename>.<analysis>.abundance.rpkm`` always;
    * ``<filename>.<analysis>.genes.abundance.16s`` and
      ``<filename>.<analysis>.abundance.16s`` only when ``analysis`` is
      ``"function"``.

    NOTE(review): the source as seen contains an earlier ``parse_blast`` with
    an extra ``pip`` parameter; if both live in the same module this later
    definition is the one in effect -- confirm.

    Args:
        filename: Path of the hit table; also the prefix of the output files.
        taxo: Mapping gene -> category id (a string, written as-is).
        lens: Mapping gene -> gene length (anything ``float()`` accepts).
        taxodb: Mapping category id -> string with at least two
            whitespace-separated tokens.
        analysis: Label used in the output file names; ``"function"`` also
            enables the 16s-normalised tables.
        dbname: Not used by this function.
        silvafile: Path of a whitespace-separated file whose third column is
            summed for the 16s normalisation (read only for ``"function"``).
        good_reads: Read total used as the RPKM denominator.

    Returns:
        For ``analysis == "function"`` a dict ``{'rpkm': rows, '16s': rows}``,
        otherwise just the RPKM rows.  Each row is
        ``[category, n_genes, n_reads, normalised value, taxodb token 0,
        taxodb token 1]`` with the three numbers as strings.

    Raises:
        ZeroDivisionError: in the 16s step when the summed third column of
            ``silvafile`` is 0 and at least one gene was counted.
        IndexError, ValueError: if a non-blank hit line has fewer than 11
            columns or a non-numeric value in a filtered column.
    """
    max_evalue = 1e-10
    min_identity = 90
    min_length = 25
    base = filename + "." + analysis

    def _bump(totals, category, n_reads, value):
        # Per-category running totals: [reads, summed value, number of genes].
        if category in totals:
            slot = totals[category]
            slot[0] += n_reads
            slot[1] += value
            slot[2] += 1
        else:
            totals[category] = [n_reads, value, 1]

    def _dump(path, totals):
        # Write one line per category and return the same rows as lists.
        rows = []
        with open(path, 'w') as outf:
            for category in totals:
                reads, value, n_genes = totals[category]
                tokens = taxodb[category].split()
                item = [category, str(n_genes), str(reads), str(value),
                        tokens[0], tokens[1]]
                # The trailing "\n" element leaves a tab before the newline;
                # kept so the file format does not change.
                outf.write("\t".join(item + ["\n"]))
                rows.append(item)
        return rows

    # Reads per gene.
    gene_reads = {}
    with open(filename, 'r') as hits:
        for raw in hits:
            cols = raw.split()
            if not cols:
                # A blank line carries no hit; skip it instead of failing on
                # the column lookups below.
                continue
            if (float(cols[2]) >= min_identity
                    and float(cols[3]) >= min_length
                    and float(cols[10]) <= max_evalue):
                gene_reads[cols[1]] = gene_reads.get(cols[1], 0) + 1

    # RPKM normalisation.
    rpkm_totals = {}
    with open(base + ".genes.abundance.rpkm", 'w') as outf:
        root.flog('mode: Normalization by RPKM')
        for gene in gene_reads:
            n_reads = gene_reads[gene]
            rpkm = (n_reads * 1000000000) / (good_reads * float(lens[gene]))
            outf.write("\t".join([gene, taxo[gene], str(n_reads), str(rpkm),
                                  str(lens[gene]), "\n"]))
            _bump(rpkm_totals, taxo[gene], n_reads, rpkm)
    OUT1 = _dump(base + ".abundance.rpkm", rpkm_totals)

    if analysis != "function":
        return OUT1

    # Normalisation by the summed third column of the silva file.
    read_length = 100
    ref_length = 1432
    totals_16s = {}
    with open(base + ".genes.abundance.16s", 'w') as outf:
        root.flog('mode: Normalization by 16s rRNA abundance')
        n16s = 0
        with open(silvafile) as silva:
            for line in silva:
                n16s += float(line.split()[2])
        for gene in gene_reads:
            n_reads = gene_reads[gene]
            value = (((n_reads * read_length) / float(lens[gene]))
                     / (n16s * read_length / float(ref_length)))
            outf.write("\t".join([gene, taxo[gene], str(n_reads), str(value),
                                  str(lens[gene]), "\n"]))
            _bump(totals_16s, taxo[gene], n_reads, value)
    OUT2 = _dump(base + ".abundance.16s", totals_16s)

    return {'rpkm': OUT1, '16s': OUT2}
def parse_sam(input, db, good_reads):
    """Count mismatch-free alignments per gene from a SAM-style file.

    A line is counted for the reference in column 3 when

    * it carries an ``XM:i:`` tag whose integer value is below 1, and
    * its second column is one of the accepted flag strings
      ('83', '99', '73', '137', '89', '153').

    All other lines, including ones that cannot be parsed, are skipped.
    Per-gene counts and ``compute_rpkm`` values are written to
    ``<input>.taxonomy.abundance.genes.rpkm`` and aggregated per
    ``db.taxo[gene]`` into ``<input>.taxonomy.abundance.rpkm``.

    NOTE(review): the source as seen contains an earlier ``parse_sam`` based
    on a best-hit approach; if both live in the same module this later
    definition is the one in effect -- confirm.

    Args:
        input: Path of the alignment file; also the prefix of both outputs.
        db: Object exposing ``len`` (gene -> length), ``taxo`` (gene ->
            category id) and ``taxodb`` (category id -> string with at least
            two whitespace-separated tokens).
        good_reads: Passed through to ``compute_rpkm``.

    Returns:
        A list with one row per category:
        ``[category, n_genes, n_reads, summed rpkm, taxodb token 0,
        taxodb token 1]`` with the three numbers as strings.
    """
    root.flog('processing sam file')
    # Accepted values of the second column and the weight each one adds.
    # NOTE(review): presumably SAM FLAG values selecting one mate per mapped
    # pair -- confirm against the aligner settings.
    counted_flags = {'83': 1, '99': 1, '73': 1, '137': 1, '89': 1, '153': 1}

    gene_reads = {}
    with open(input) as sam:
        for line in sam:
            try:
                mismatches = int(line.split("XM:i:")[1].split()[0])
            except (IndexError, ValueError):
                # No XM tag, or a value that is not an integer.
                continue
            if mismatches >= 1:
                continue
            fields = line.split()
            if len(fields) < 3 or fields[1] not in counted_flags:
                continue
            gene = fields[2]
            gene_reads[gene] = gene_reads.get(gene, 0) + counted_flags[fields[1]]

    # Per-gene table and per-category totals
    # [reads, summed rpkm, number of genes].
    taxo_id_ab = {}
    with open(input + '.taxonomy.abundance.genes.rpkm', 'w') as fo:
        for gene in gene_reads:
            n_reads = gene_reads[gene]
            rpkm = compute_rpkm(n_reads, good_reads, db.len[gene])
            category = db.taxo[gene]
            fo.write("\t".join([gene, str(category), str(n_reads),
                                str(rpkm), str(db.len[gene])]) + "\n")
            entry = taxo_id_ab.get(category)
            if entry is None:
                taxo_id_ab[category] = [n_reads, rpkm, 1]
            else:
                entry[0] += n_reads
                entry[1] += rpkm
                entry[2] += 1

    ABN = []
    with open(input + '.taxonomy.abundance.rpkm', 'w') as fo:
        for category in taxo_id_ab:
            reads, rpkm_sum, n_genes = taxo_id_ab[category]
            tokens = db.taxodb[category].split()
            row = [category, str(n_genes), str(reads), str(rpkm_sum),
                   tokens[0], tokens[1]]
            ABN.append(row)
            fo.write("\t".join(row) + "\n")

    root.flog('taxonomy abundance: done')
    return ABN