Example #1
File: parse.py Project: barichd/MetaStorm
def parse_diamond_blastx(input, db, good_reads):
    root.flog('processing blastx from DIAMOND')
    Genes={}
    eval=1e-5 # maximum e-value (column 11 of the DIAMOND tabular output)
    iden=80   # minimum percent identity (column 3)
    mlen=25   # minimum alignment length (column 4)
    for line in open(input):
        i=line.split()
        try:
            if float(i[2])>=iden and float(i[3])>=mlen and float(i[10])<=eval:
                Genes[i[1]]+=1
        except: # first read mapped to this gene
            if float(i[2])>=iden and float(i[3])>=mlen and float(i[10])<=eval:
                Genes[i[1]]=1
    fo=open(input+'.function.genes.abundance','w')
    for i in Genes:
        fo.write(i+'\t'+str(Genes[i])+'\t'+str(db.len[i])+'\n')
    fo.close()
    func_id_ab={}
    for gene in Genes:
        try:
            func_id_ab[db.func[gene]]+=Genes[gene] # put in the function ID the total number of reads that contains this category
        except:
            func_id_ab[db.func[gene]]=Genes[gene]
    fo=open(input+'.function.abundance','w')
    ABN=[]
    for id in func_id_ab:
        ABN.append([id,str(func_id_ab[id]),str(func_id_ab[id]),str(func_id_ab[id]),db.funcdb[id].split()[0],db.funcdb[id].split()[1]])
        fo.write("\t".join([id,str(func_id_ab[id]),db.funcdb[id].split()[1]])+"\n")
    fo.close()
    root.flog('taxonomy abundance: done')
    return ABN
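The column indexes used above (i[1], i[2], i[3], i[10]) follow the standard 12-column BLAST/DIAMOND tabular layout; a small reference sketch (field names taken from the conventional outfmt-6 order, assumed rather than from this project) is:

# Standard BLAST/DIAMOND tabular columns (0-based index -> field):
#  0 qseqid  1 sseqid  2 pident  3 length  4 mismatch  5 gapopen
#  6 qstart  7 qend    8 sstart  9 send   10 evalue   11 bitscore
BLAST_TAB_FIELDS = ["qseqid", "sseqid", "pident", "length", "mismatch",
                    "gapopen", "qstart", "qend", "sstart", "send",
                    "evalue", "bitscore"]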
Example #2
File: parse.py Project: barichd/MetaStorm
def parse_blast(filename,taxo, lens, taxodb,analysis,dbname, silvafile, good_reads,pip):
    f = open(filename, 'r')
    matches=0 # total number of reads whose best hit passes the filters
    eval=1e-5 # maximum e-value (column 11)
    mlen=25   # minimum alignment length (column 4)
    if pip=='matches':
        iden=80 # minimum percent identity (column 3)
    else:
        iden=60 # minimum percent identity (column 3)
    # Best hit: for each read keep the hit with the highest score (last column)
    BH={}
    for line in f:
        line=line.replace("\n","").split("\t")
        try:
            current_BH=float(line[-1])
            prev_BH=float(BH[line[0]][-1])
            if current_BH>prev_BH:
                BH[line[0]]=line
        except: # first hit seen for this read
            BH[line[0]]=line
    
    GENES={} # reads per genes dict structure id:gene -> reads number
    for hit in BH:
        line=BH[hit]
        if float(line[2])>=iden and float(line[3])>=mlen and float(line[10])<=eval:
            gene=line[1]
            matches+=1
            try:
                GENES[gene]+=1
            except:
                GENES[gene]=1

    # compute the gene-reads abundance
    #print matches
    lreads=100
    avg_reads_length=1432
    TAXO={}
    totalGenes=1
    #cnt=0
    #
    outf=open(filename+"."+analysis+".genes.abundance.rpkm",'w');
    root.flog('mode: Normalization by RPKM')
    for i in GENES:
        totalGenes+=1
        #rpkm=((GENES[i]*lreads)/float(lens[i]))/(matches*lreads/float(avg_reads_length))
        rpkm=(GENES[i]*1000000000)/(good_reads*float(lens[i]))
        outf.write("\t".join([i,  ",".join([k for k in taxo[i]])   , str(GENES[i]), str(rpkm), str(lens[i]), "\n"]))
        for k in taxo[i]:
            try:
                TAXO[k][0]+=GENES[i]
                TAXO[k][1]+=rpkm
                TAXO[k][2]+=1
            except:
                TAXO[k]=[GENES[i],rpkm,1]
    outf.close()
    #print filename
    #compute the gene-taxo abundance
    #print TAXO
    outf=open(filename+"."+analysis+".abundance.rpkm",'w');
    ABN=[]
    for i in TAXO:
        item=[i,str(TAXO[i][2]),str(TAXO[i][0]),str(TAXO[i][1]),taxodb[i].split()[0],taxodb[i].split()[1]]
        outf.write("\t".join(item+["\n"]))
        ABN.append(item)
    outf.close()
    OUT1=ABN
    #
    if analysis=="function":
        outf=open(filename+"."+analysis+".genes.abundance.16s",'w');
        root.flog('mode: Normalization by 16s rRNA abundance')
        N16s=1
        L16s=1432
        for line in open(silvafile):
            line=line.split()
            N16s+=float(line[2])
        #
        TAXO={}
        for i in GENES:
            #cnt+=1
            #print cnt
            totalGenes+=1
            rpkm=((GENES[i]*lreads)/float(lens[i]))/(N16s*lreads/float(L16s))
            #rpkm=(GENES[i]*1000000000)/(matches*float(lens[i]))
            #print "\t".join([i,taxo[i], str(GENES[i]), str(rpkm),"\n"])
            outf.write("\t".join([i,  ",".join([k for k in taxo[i]])  , str(GENES[i]), str(rpkm), str(lens[i]),"\n"]))
            for k in taxo[i]:
                try:
                    TAXO[k][0]+=GENES[i]
                    TAXO[k][1]+=rpkm
                    TAXO[k][2]+=1
                except:
                    TAXO[k]=[GENES[i],rpkm,1]
        outf.close()
        outf=open(filename+"."+analysis+".abundance.16s",'w');
        ABN=[]
        for i in TAXO:
            item=[i,str(TAXO[i][2]),str(TAXO[i][0]),str(TAXO[i][1]),taxodb[i].split()[0],taxodb[i].split()[1]]
            outf.write("\t".join(item+["\n"]))
            ABN.append(item)
        outf.close()
        OUT2=ABN
    if analysis=="function":
        return({'rpkm':OUT1, '16s':OUT2})
    else:
        return OUT1
Example #3
File: parse.py Project: barichd/MetaStorm
def parse_sam(input,db, good_reads):
    #root.flog('processing sam file')
    # here is implemented the best hit approach
    GenesA={}
    #flags={'83':1,'99':1,'73':1,'137':1,'89':1,'153':1}
    BH={}
    for line in open(input):
        values=line.replace("\n","").split("\t")
        try:
            matchedbp=sum([int(x.strip('M')) for x in re.findall(r'(\d*M)', values[5])])
            alignlen=float(len(values[9]))
            if matchedbp>25: # keep alignments with more than 25 matched bases (sum of CIGAR M lengths)
                try:
                    current_BH=int(line.split("NM:i:")[1].split()[0])
                    prev_BH=int(BH[values[0]].split("NM:i:")[1].split()[0])
                    if current_BH<prev_BH:
                        BH[values[0]]=line
                except:
                    BH[values[0]]=line
        except:
            pass
    
    for item in BH:
        line=BH[item]
        try:
            edit_distance=int(line.split("NM:i:")[1].split()[0])
            if edit_distance > 0:
               continue
            line=line.split("\t")
            try:
                p_edit_distance=GenesA[line[2]][line[0]]
                if edit_distance<p_edit_distance:
                    GenesA[line[2]]={line[0]:edit_distance}
            except:
                try:
                    GenesA[line[2]].update({line[0]:edit_distance})
                except:
                    GenesA[line[2]]={line[0]:edit_distance}
        except:
            pass
    #json.dump(GenesA, open(input+'.taxonomy.abundance.genes.json'))
    
    Genes={}
    for gene in GenesA:
        Genes[gene]=len(GenesA[gene])
    
    fo=open(input+'.taxonomy.abundance.genes.rpkm','w')
    taxo_id_ab={}
    for gene in Genes:
        rpkm=compute_rpkm(Genes[gene], good_reads, db.len[gene])
        fo.write("\t".join([gene, str(db.taxo[gene][0]), str(Genes[gene]), str(rpkm), str(db.len[gene])])+"\n")
        try:
            taxo_id_ab[db.taxo[gene][0]][0]+=Genes[gene] # put in the taxonomy ID the total number of reads that contains this taxonomy
            taxo_id_ab[db.taxo[gene][0]][1]+=rpkm
            taxo_id_ab[db.taxo[gene][0]][2]+=1
        except:
            taxo_id_ab[db.taxo[gene][0]]=[Genes[gene],rpkm,1] # number of reads in the gene, rpkm, and unique genes in the taxonomy category
    
    fo.close()
    
    fo=open(input+'.taxonomy.abundance.rpkm','w')
    ABN=[]
    for id in taxo_id_ab:
        ABN.append([id,str(taxo_id_ab[id][2]),str(taxo_id_ab[id][0]),str(taxo_id_ab[id][1]),db.taxodb[id].split()[0],db.taxodb[id].split()[1]]) # [category, unique_genes, number of reads, rpkm, lineage]
        fo.write("\t".join([id,str(taxo_id_ab[id][2]),str(taxo_id_ab[id][0]),str(taxo_id_ab[id][1]), db.taxodb[id].split()[0], db.taxodb[id].split()[1]])+"\n")
    fo.close()
    root.flog('taxonomy abundance: done')
    return ABN
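The compute_rpkm helper called above is not defined in these excerpts; a minimal sketch, assuming it applies the same RPKM formula used inline in parse_blast (Example #2), could look like this:

def compute_rpkm(reads, good_reads, gene_len):
    # RPKM = reads * 1e9 / (total good reads * gene length in bp),
    # mirroring the inline formula rpkm=(GENES[i]*1000000000)/(good_reads*float(lens[i]))
    return (reads * 1000000000) / (good_reads * float(gene_len))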
Example #4
def process(projectid, sampleid, db, protocol, reads1, reads2, good_reads):
    #db=root.dataset(db)
    x = sql.SQL(root.filedb())
    xpath = x.project(projectid)[0][4]
    val = x.exe('update samples set reads1="' + reads1 +
                '" where project_id="' + projectid + '" and sample_id="' +
                sampleid + '"')
    val = x.exe('update samples set reads2="' + reads2 +
                '" where project_id="' + projectid + '" and sample_id="' +
                sampleid + '"')
    samples = x.exe('select * from samples where project_id="' + projectid +
                    '" and sample_id="' + sampleid + '"')
    sample = samples[0]
    sample = root.samples(sample, xpath)
    root.mkdir(sample.matchesDir)
    rdir = root.__ROOTPRO__ + "/" + projectid + "/READS/"

    if db.name == "abcdefghij":
        #print "MetaPhlAnn"
        update_status(x, sampleid, db.id, protocol, "Processing")
        metaphlan = root.program('MetaPhlAnR', sample, db)
        if not root.isdir(metaphlan.out): metaphlan.run()
        G = txp.metaphlan_taxonomy_tree(metaphlan.out)
        abn = root.SampleResults(sample, G, protocol, db.name, "taxonomy",
                                 metaphlan.out)
        abn.start()
        update_status(x, sampleid, db.id, protocol, "Done")

    if not db.taxo == "none":
        # run bowtie using the paired end reads
        update_status(x, sampleid, db.id, 'matches', "Screening")
        cmd = " ".join([
            root.__ROOTEXEDIR__ + 'bowtie2',
            '--very-fast-local -p ' + p + ' --no-unal --no-hd --no-sq -x',
            db.bowtie, '-1', sample.reads1, '-2', sample.reads2, '-S',
            sample.matchesDir + '/alignment.' + db.id + '.matches >>',
            root.log, '2>&1'
        ])
        if not root.isdir(sample.matchesDir + '/alignment.' + db.id +
                          '.matches'):
            os.system(cmd)
        #process output in sam format to get genes and number of reads per gene.
        update_status(x, sampleid, db.id, protocol, "Quantification")
        if not root.isdir(sample.matchesDir + '/alignment.' + db.id +
                          '.matches.taxonomy.abundance.results.sqlite3.db'):
            abundance = parse_sam(
                sample.matchesDir + '/alignment.' + db.id + '.matches', db,
                good_reads)
            G = txp.taxonomy_tree(
                abundance,
                sample.matchesDir + '/alignment.' + db.id + '.matches',
                protocol, "taxonomy", db.id)
            abn = root.SampleResults(sample, G, protocol, db.id, "taxonomy",
                                     sample.matchesDir + '/alignment.' +
                                     db.id +
                                     '.matches')  # Store data in the sql TABLE
            abn.start()
        update_status(x, sampleid, db.id, protocol, "Done")
        return 'success'
    if not db.func == "none":
        fileso = root.result_files(projectid, "function", protocol, sampleid,
                                   db.name)
        #Merge paired ends
        update_status(x, sampleid, db.id, protocol, "Merge")
        cmd = " ".join([
            'python',
            root.__ROOTEXEDIR__ + "pairend_join.py -s -p " + p + " -m 8 -o ",
            sample.matchesDir + '/merged.reads.fastq', sample.reads1,
            sample.reads2
        ])
        #print cmd
        root.flog(cmd)  #print cmd
        if not root.isdir(sample.matchesDir + '/merged.reads.fastq'):
            os.system(cmd)
        #Get fasta files
        cmd = ' '.join([
            root.__ROOTEXEDIR__ + '/seqtk seq -a',
            sample.matchesDir + '/merged.reads.fastq >',
            sample.matchesDir + '/merged.reads.fasta'
        ])
        if not root.isdir(sample.matchesDir + '/merged.reads.fasta'):
            os.system(cmd)
        #BlastX from diamond
        update_status(x, sampleid, db.id, protocol, "Screening")
        dout = sample.matchesDir + 'alignment.' + db.id
        din = sample.matchesDir + '/merged.reads.fasta'
        cmd = ' '.join([
            root.__ROOTEXEDIR__ + '/diamond blastx --id 60 -p ' + p +
            ' -k 1 -e 1e-5 -d', db.diamond, '-a', dout + '.pre', '-q', din,
            '>>', root.log, "2>&1"
        ])
        if not root.isdir(dout + '.daa'): os.system(cmd)
        cmd = ' '.join([
            root.__ROOTEXEDIR__ + '/diamond view -a', dout + '.pre.daa', '-o',
            dout + '.matches -f tab', ">>", root.log, "2>&1"
        ])
        if not root.isdir(dout + '.matches'): os.system(cmd)
        # parse diamond output
        update_status(x, sampleid, db.id, protocol, "Quantification")
        if not root.isdir(sample.matchesDir + '/alignment.' + db.id +
                          '.matches.function.abundance.results.sqlite3.db'):
            abundance = pb(dout + '.matches', db.func, db.len, db.funcdb,
                           "function", db.name, fileso.GGenes + ".rpkm",
                           good_reads)
            #abundance=pdx(dout+'.matches', db, good_reads)
            abn = root.SampleResults(sample, 'none', protocol, db.name,
                                     "function", dout + '.matches')
            abn.createFuncDb(abundance)
        update_status(x, sampleid, db.id, protocol, "Done")
        os.system('rm ' + sample.matchesDir + '/merged.reads.fastq >> ' +
                  root.log + " 2>&1")
        os.system('rm ' + sample.matchesDir + '/merged.reads.fasta >> ' +
                  root.log + " 2>&1")
        return 'success'
Example #5
def parse_blast(filename, taxo, lens, taxodb, analysis, dbname, silvafile,
                good_reads):
    GENES = {}  # reads per genes dict structure id:gene -> reads number
    f = open(filename, 'r')
    matches = 0  # total number of reads passing the filters
    eval = 1e-10  # maximum e-value (column 11)
    iden = 90  # minimum percent identity (column 3)
    mlen = 25  # minimum alignment length (column 4)
    #
    for line in f:
        line = line.split()
        if float(line[2]) >= iden and float(line[3]) >= mlen and float(
                line[10]) <= eval:
            gene = line[1]
            matches += 1
            try:
                GENES[gene] += 1
            except:
                GENES[gene] = 1

    # compute the gene-reads abundance
    #print matches
    lreads = 100
    avg_reads_length = 1432
    TAXO = {}
    totalGenes = 0
    #cnt=0
    #
    outf = open(filename + "." + analysis + ".genes.abundance.rpkm", 'w')
    root.flog('mode: Normalization by RPKM')
    for i in GENES:
        totalGenes += 1
        #rpkm=((GENES[i]*lreads)/float(lens[i]))/(matches*lreads/float(avg_reads_length))
        rpkm = (GENES[i] * 1000000000) / (good_reads * float(lens[i]))
        outf.write("\t".join(
            [i, taxo[i],
             str(GENES[i]),
             str(rpkm),
             str(lens[i]), "\n"]))
        if taxo[i] in TAXO:
            TAXO[taxo[i]][0] += GENES[i]
            TAXO[taxo[i]][1] += rpkm
            TAXO[taxo[i]][2] += 1
        else:
            TAXO[taxo[i]] = [GENES[i], rpkm, 1]
    outf.close()
    #print filename
    #compute the gene-taxo abundance
    #print TAXO
    outf = open(filename + "." + analysis + ".abundance.rpkm", 'w')
    ABN = []
    for i in TAXO:
        item = [
            i,
            str(TAXO[i][2]),
            str(TAXO[i][0]),
            str(TAXO[i][1]), taxodb[i].split()[0], taxodb[i].split()[1]
        ]
        outf.write("\t".join(item + ["\n"]))
        ABN.append(item)
    outf.close()
    OUT1 = ABN
    #
    if analysis == "function":
        outf = open(filename + "." + analysis + ".genes.abundance.16s", 'w')
        root.flog('mode: Normalization by 16s rRNA abundance')
        N16s = 0
        L16s = 1432
        for line in open(silvafile):
            line = line.split()
            N16s += float(line[2])
        #
        TAXO = {}
        for i in GENES:
            #cnt+=1
            #print cnt
            totalGenes += 1
            rpkm = ((GENES[i] * lreads) / float(lens[i])) / (N16s * lreads /
                                                             float(L16s))
            #rpkm=(GENES[i]*1000000000)/(matches*float(lens[i]))
            #print "\t".join([i,taxo[i], str(GENES[i]), str(rpkm),"\n"])
            outf.write("\t".join(
                [i, taxo[i],
                 str(GENES[i]),
                 str(rpkm),
                 str(lens[i]), "\n"]))
            if taxo[i] in TAXO:
                TAXO[taxo[i]][0] += GENES[i]
                TAXO[taxo[i]][1] += rpkm
                TAXO[taxo[i]][2] += 1
            else:
                TAXO[taxo[i]] = [GENES[i], rpkm, 1]
        outf.close()
        outf = open(filename + "." + analysis + ".abundance.16s", 'w')
        ABN = []
        for i in TAXO:
            item = [
                i,
                str(TAXO[i][2]),
                str(TAXO[i][0]),
                str(TAXO[i][1]), taxodb[i].split()[0], taxodb[i].split()[1]
            ]
            outf.write("\t".join(item + ["\n"]))
            ABN.append(item)
        outf.close()
        OUT2 = ABN
    if analysis == "function":
        return ({'rpkm': OUT1, '16s': OUT2})
    else:
        return OUT1
Example #6
def parse_sam(input, db, good_reads):
    root.flog('processing sam file')
    Genes = {}
    flags = {'83': 1, '99': 1, '73': 1, '137': 1, '89': 1, '153': 1}  # SAM flag values accepted as a mapped read
    for line in open(input):
        try:
            # keep only alignments with no mismatches (XM:i:0); bowtie2 is run with
            # --no-unal, so the SAM output only contains aligned reads
            if int(line.split("XM:i:")[1].split()[0]) < 1:
                line = line.split()
                try:
                    Genes[line[2]] += flags[line[1]]
                except:
                    try:
                        Genes[line[2]] = flags[line[1]]
                    except:
                        pass
        except:
            pass

    fo = open(input + '.taxonomy.abundance.genes.rpkm', 'w')
    taxo_id_ab = {}
    for gene in Genes:
        rpkm = compute_rpkm(Genes[gene], good_reads, db.len[gene])
        fo.write("\t".join([
            gene,
            str(db.taxo[gene]),
            str(Genes[gene]),
            str(rpkm),
            str(db.len[gene])
        ]) + "\n")
        try:
            taxo_id_ab[db.taxo[gene]][0] += Genes[
                gene]  # put in the taxonomy ID the total number of reads that contains this taxonomy
            taxo_id_ab[db.taxo[gene]][1] += rpkm
            taxo_id_ab[db.taxo[gene]][2] += 1
        except:
            taxo_id_ab[db.taxo[gene]] = [
                Genes[gene], rpkm, 1
            ]  # number of reads in the gene, rpkm, and unique genes in the taxonomy category

    fo.close()

    fo = open(input + '.taxonomy.abundance.rpkm', 'w')
    ABN = []
    for id in taxo_id_ab:
        ABN.append([
            id,
            str(taxo_id_ab[id][2]),
            str(taxo_id_ab[id][0]),
            str(taxo_id_ab[id][1]), db.taxodb[id].split()[0],
            db.taxodb[id].split()[1]
        ])  # [category, unique_genes, number of reads, rpkm, lineage]
        fo.write("\t".join([
            id,
            str(taxo_id_ab[id][2]),
            str(taxo_id_ab[id][0]),
            str(taxo_id_ab[id][1]), db.taxodb[id].split()[0],
            db.taxodb[id].split()[1]
        ]) + "\n")
    fo.close()
    root.flog('taxonomy abundance: done')
    return ABN