Ejemplo n.º 1
0
def _confirm_rebuild(message):
    """Ask on the console whether an existing index should be rebuilt.

    Prints *message*, then reads single key presses with ``getch()`` until
    the key is Y or N in either case. The accepted key is echoed.

    Returns:
        True when Y/y was pressed, False when N/n was pressed.
    """
    print(message)

    while True:

        answer = getch()

        if answer.lower() in ("y", "n"):

            print(answer)

            # Decide on the lower-cased key: the prompt accepts upper-case
            # input, so an upper-case 'N' must also mean "do not rebuild".
            return answer.lower() == 'y'


def _build_kmer_runners(fastain, blocksize, **runnerargs):
    """Create one ``jellyfish.JFfpbruner`` per block of every sequence.

    Every sequence in *fastain* is cut into consecutive blocks of at most
    *blocksize* bases; ``start``/``end`` passed to the runner are 0-based and
    inclusive. *runnerargs* are forwarded unchanged as keyword arguments.

    Returns:
        List of runner objects, in sequence order then block order.
    """
    runners = list()

    for seqname in fastain.keys():

        chrlen = len(fastain[seqname])

        # range() stops before chrlen, so a sequence whose length is an exact
        # multiple of blocksize does not get a trailing block with end < start.
        for start in range(0, chrlen, blocksize):

            end = min(start + blocksize, chrlen) - 1

            runners.append(jellyfish.JFfpbruner(pyfasta=fastain, seqname=seqname,
                                                start=start, end=end, **runnerargs))

    return runners


def _run_kmer_filter(pool, runners, prefix=''):
    """Run ``jellyfish.kmerfilterprobe`` over *runners* in *pool*.

    Results are collected as they arrive (``imap_unordered``), and a
    "done/total" progress line, optionally preceded by *prefix*, is printed
    after each finished runner.

    Returns:
        One flat list with the probes returned by all runners.
    """
    probes = list()

    total = len(runners)

    for done, curpblist in enumerate(pool.imap_unordered(jellyfish.kmerfilterprobe, runners), start=1):

        probes.extend(curpblist)

        print(prefix + "Jellyfish filter: ", done, '/', total, sep='')

    return probes


def main():
    """Command-line entry point of the probe design pipeline.

    Steps, in order:

    1. Parse options and derive the k-mer size from the genome file size
       (never below 17).
    2. Build the Jellyfish k-mer index and the BWA index in ``args.saved``,
       or reuse existing ones (the user is asked unless ``args.docker``).
    3. Split the input sequences into blocks of at most 10 Mb and filter
       candidate probes with ``jellyfish.kmerfilterprobe`` in a process pool.
    4. Write the candidates to ``<input>_tmp_probe.fa`` and filter them with
       ``bwa.bwafilter``.
    5. Filter the remaining probes with ``probefilter``.
    6. Write ``<input>.len``, ``<input>_all.bed`` (every kept probe) and
       ``<input>.bed`` (probes spaced by more than probe + primer length).

    Exits with status 1 when the Jellyfish count fails.
    """
    args = check_options(get_options())

    # Genome size in units of 1e6 bytes, taken from the file size.
    genomesize = int(os.path.getsize(args.genome)/1e6)

    # A file smaller than 1e6 bytes gives genomesize == 0 and log(0) raises
    # ValueError; clamp the argument so tiny genomes get the minimum k-mer.
    kmer = max(int(log(max(genomesize, 1), 4)+1), 17)

    # jellyfish par
    lowercount = 2

    # jellyfish par
    jfsize = '100M'

    # split sequences longer than 10M
    spsize = 10000000

    maxkmerscore = int(args.length * args.homology / 100) - kmer

    # Path of the Jellyfish k-mer index; ask before overwriting an existing one.
    jfkmerfile = os.path.join(args.saved, (os.path.basename(args.genome)+'_'+str(kmer)+'mer.jf'))

    kmerbuild = True

    if os.path.isfile(jfkmerfile) and not args.docker:

        print("find:", jfkmerfile)

        kmerbuild = _confirm_rebuild("Found kmerfile "+jfkmerfile+". Do you want rebuild it?  Press Y or N to continue:")

    # The '.sa' file is used as the marker for an existing BWA index.
    bwaindexfile = os.path.basename(args.genome)

    bwatestindex = os.path.join(args.saved, bwaindexfile+'.sa')

    bwaindex = os.path.join(args.saved, bwaindexfile)

    bwabuild = True

    if os.path.isfile(bwatestindex) and not args.docker:

        print('find:', bwatestindex)

        bwabuild = _confirm_rebuild("Found bwa index file " + bwatestindex + ". Do you want rebuild it? Press Y or N to continue:")

    print("genomesize:",genomesize, "kmer:",kmer, "jfkmerfile:",
          jfkmerfile, "kmerbuild:", kmerbuild, "bwabuild:", bwabuild, "threads:", args.threads)

    # Build Jellyfish index
    if kmerbuild:

        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer, infile=args.genome, output=jfkmerfile,
                                    threads=args.threads, lowercount=lowercount, size=jfsize)

        if jfcount:

            print("JellyFish Count finished ...")

        else:

            print("JellyFish Count Error!!!")

            sys.exit(1)

    else:

        print("Use ", jfkmerfile)
    # End build Jellyfish index

    if bwabuild:

        bwa.bwaindex(args.bwa, args.genome, args.saved)

        print("bwa index build finished ...")

    else:

        print("Use", bwatestindex)

    # k-mer based filtering of candidate probes, one task per sequence block.
    jffpbrunerlist = _build_kmer_runners(Fasta(args.input), spsize,
                                         jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                                         pblength=args.length, maxkmerscore=maxkmerscore,
                                         step=args.step)

    # The context manager releases the worker processes even if a task raises.
    with Pool(args.threads) as jfpool:

        jffilteredprobe = _run_kmer_filter(jfpool, jffpbrunerlist)

    print('Jellyfish filter finished!!')

    # Dump the candidates as FASTA (records named seq0, seq1, ...) for BWA.
    tmppbfa = os.path.join(args.saved, os.path.basename(args.input)+'_tmp_probe.fa')

    with open(tmppbfa, 'w') as tmppbfaio:

        for seqnum, tmppb in enumerate(jffilteredprobe):

            print('>','seq',seqnum, sep='',file=tmppbfaio)

            print(tmppb,file=tmppbfaio)

    # The candidate list can be large; drop it before the BWA step.
    del jffilteredprobe

    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex, inputfile=tmppbfa, minas=args.length,
                                maxxs=int(args.length*args.homology/100), threadnumber=args.threads)

    tmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'.bed')

    alltmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'_all.bed')

    # Reference sequence lengths, one "name<TAB>length" line each.
    seqlenfile = os.path.join(args.saved, os.path.basename(args.input)+'.len')

    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)

    with open(seqlenfile, 'w') as seqlenio:

        for seqname in seqlength:

            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    # One work item per probe for probefilter.
    oligobefortmf = [{'seq': pbtmp, 'dTm': args.dtm, 'rprimer': args.primer}
                     for pbtmp in bwafiltedpb]

    keepedprobe = list()

    oligobefortmflen = len(oligobefortmf)

    print("oligobefortmflen:",oligobefortmflen)

    # NOTE(review): this pool uses the default worker count rather than
    # args.threads — confirm that is intended.
    with Pool() as pbftpool:

        for ctedpb, (pb, keep) in enumerate(pbftpool.imap_unordered(probefilter, oligobefortmf), start=1):

            if keep:

                keepedprobe.append(pb)

            if ctedpb % 10000 == 0:

                print(ctedpb,'/',oligobefortmflen)

    # Group kept probes as {chromosome: {start: sequence}}; each kept probe
    # is a tab-separated "sequence, chromosome, start" string.
    pbdictbychr = dict()

    for pb in keepedprobe:

        seq, chro, start = pb.split('\t')

        pbdictbychr.setdefault(chro, dict())[int(start)] = seq

    # An empty primer still reserves 5 bases of spacing.
    lenrprimer = len(args.primer)

    if lenrprimer == 0:

        lenrprimer = 5

    slidwindow = lenrprimer+args.length

    with open(alltmpbwaftlist, 'w') as allbwaftlistio, open(tmpbwaftlist, 'w') as tmpbwaftlistio:

        for chro in pbdictbychr:

            # Start of the last probe written to the spaced file.
            # NOTE(review): because this begins at 0, a probe starting at or
            # before slidwindow is never written to the spaced file — confirm.
            startn = 0

            for startnow in sorted(pbdictbychr[chro]):

                endnow = startnow + args.length - 1

                print(chro, startnow, endnow, pbdictbychr[chro][startnow],file=allbwaftlistio,sep='\t')

                if startnow > startn+slidwindow:

                    startn = startnow

                    print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')

    print("Job finshed!!")
Ejemplo n.º 2
0
    def run(self):
        """Run the probe design pipeline and report progress through signals.

        Steps, in order: optionally count k-mers with Jellyfish, optionally
        build the BWA index, split the input sequences into blocks of at most
        10 Mb and filter candidate probes with ``jellyfish.kmerfilterprobe``
        in ``self.pool``, write the candidates to ``<input>_tmp_probes.fa``,
        filter them with ``bwa.bwafilter`` and then ``probefilter``, and
        finally write ``<input>.len``, ``<input>_all.bed`` (every kept probe)
        and ``<input>.bed`` (probes spaced by more than probe + primer length)
        into ``self.samplefolder``.

        Progress values are emitted on ``self.notifyProgress`` and status
        texts on ``self.notifyMessage``. The method returns early, after
        emitting an error message, when the Jellyfish count fails.
        """
        if self.kmerbuild:

            jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath,
                                          mer=self.kmer,
                                          infile=self.genomefile,
                                          output=self.jfkmerfile,
                                          threads=self.threadsnumber,
                                          lowercount=self.lowercount,
                                          size=self.size)

            # Check that the Jellyfish count ran correctly.
            if not jfcounter:

                self.notifyMessage.emit("JellyFish Count Error!!!")

                # Every later step reads the k-mer index, so do not continue
                # with a failed count.
                return

            self.progressnumber = self.progressnumber + 5

            self.notifyProgress.emit(self.progressnumber)

            self.notifyMessage.emit("JellyFish Count finished...")

        else:

            self.progressnumber = self.progressnumber + 5

            self.notifyProgress.emit(self.progressnumber)

            self.notifyMessage.emit("Use " + self.jfkmerfile)

        if self.indexbuild:

            if self.aligner == 'BWA':

                bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)

                self.notifyMessage.emit("BWA Index build finished...")

                self.progressnumber = self.progressnumber + 5

                self.notifyProgress.emit(self.progressnumber)

            elif self.aligner == 'BLAT':

                # Index building for BLAT is not implemented.
                pass

        else:

            self.progressnumber = self.progressnumber + 5

            self.notifyProgress.emit(self.progressnumber)

        # Load the input file and split sequences longer than 10M.
        spsize = 10000000

        maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer

        fastain = Fasta(self.inputfile)

        jffpbrunerlist = list()

        for seqname in fastain.keys():

            chrlen = len(fastain[seqname])

            # range() stops before chrlen, so a sequence whose length is an
            # exact multiple of spsize gets no trailing block with end < start.
            for start in range(0, chrlen, spsize):

                # Inclusive end, clipped to the last base of the sequence.
                end = min(start + spsize, chrlen) - 1

                jffpbrunerlist.append(
                    jellyfish.JFfpbruner(jfpath=self.jellyfishpath,
                                         jfkmerfile=self.jfkmerfile,
                                         mer=self.kmer,
                                         pyfasta=fastain,
                                         seqname=seqname,
                                         pblength=self.pblength,
                                         maxkmerscore=maxkmerscore,
                                         start=start,
                                         end=end,
                                         step=self.step))

        jffilteredprobe = list()

        runnercount = len(jffpbrunerlist)

        # The k-mer filter accounts for 40 progress points; counting from 1
        # makes the reported value include the task that just finished.
        for jffinished, curpblist in enumerate(
                self.pool.imap_unordered(jellyfish.kmerfilterprobe,
                                         jffpbrunerlist), start=1):

            jffilteredprobe.extend(curpblist)

            self.notifyProgress.emit(
                round(self.progressnumber + (jffinished / runnercount * 40), 2))

        self.notifyMessage.emit('kmer filter finished!!')

        self.progressnumber = 50.0

        self.notifyProgress.emit(self.progressnumber)

        # Dump the candidates as FASTA (records named seq0, seq1, ...).
        tmppbfa = os.path.join(
            self.samplefolder,
            os.path.basename(self.inputfile) + '_tmp_probes.fa')

        with open(tmppbfa, 'w') as tmppbfaio:

            for seqnum, tmppb in enumerate(jffilteredprobe):

                print('>', 'seq', seqnum, sep='', file=tmppbfaio)

                print(tmppb, file=tmppbfaio)

        # delete jffilteredprobe and release memory
        del jffilteredprobe

        bwaindexfile = os.path.join(self.samplefolder,
                                    os.path.basename(self.genomefile))

        bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath,
                                    reffile=bwaindexfile,
                                    inputfile=tmppbfa,
                                    minas=self.pblength,
                                    maxxs=int(self.pblength * self.homology /
                                              100),
                                    threadnumber=self.threadsnumber)

        outputprefix = os.path.join(self.samplefolder,
                                    os.path.basename(self.inputfile))

        tmpbwaftlist = outputprefix + '.bed'

        alltmpbwaftlist = outputprefix + '_all.bed'

        # Reference sequence lengths, one "name<TAB>length" line each.
        seqlenfile = outputprefix + '.len'

        seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)

        with open(seqlenfile, 'w') as seqlenio:

            for seqname in seqlength:

                print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

        # One work item per probe for probefilter.
        oligobefortmf = [{'seq': pbtmp, 'dTm': self.dTm, 'rprimer': self.rprimer}
                         for pbtmp in bwafiltedpb]

        keepedprobe = list()

        self.progressnumber = 55

        self.notifyProgress.emit(self.progressnumber)

        oligobefortmflen = len(oligobefortmf)

        # This stage accounts for 30 progress points, reported every 10000
        # processed probes.
        for ctedpb, (pb, keep) in enumerate(
                self.pool.imap_unordered(probefilter, oligobefortmf), start=1):

            if keep:

                keepedprobe.append(pb)

            if ctedpb % 10000 == 0:

                self.notifyProgress.emit(
                    round(self.progressnumber + (ctedpb / oligobefortmflen * 30),
                          2))

        self.notifyProgress.emit(90)

        # Group kept probes as {chromosome: {start: sequence}}; each kept
        # probe is a tab-separated "sequence, chromosome, start" string.
        pbdictbychr = dict()

        for pb in keepedprobe:

            seq, chro, start = pb.split('\t')

            pbdictbychr.setdefault(chro, dict())[int(start)] = seq

        # get length of primer; an empty primer still reserves 5 bases
        lenrprimer = len(self.rprimer)

        if lenrprimer == 0:

            lenrprimer = 5

        slidwindow = lenrprimer + self.pblength

        with open(alltmpbwaftlist, 'w') as allbwaftlistio, \
                open(tmpbwaftlist, 'w') as tmpbwaftlistio:

            for chro in pbdictbychr:

                # Start of the last probe written to the spaced file.
                # NOTE(review): because this begins at 0, a probe starting at
                # or before slidwindow is never written there — confirm.
                startn = 0

                for startnow in sorted(pbdictbychr[chro]):

                    endnow = startnow + self.pblength - 1

                    print(chro,
                          startnow,
                          endnow,
                          pbdictbychr[chro][startnow],
                          file=allbwaftlistio,
                          sep='\t')

                    if startnow > startn + slidwindow:

                        startn = startnow

                        print(chro,
                              startnow,
                              endnow,
                              pbdictbychr[chro][startnow],
                              file=tmpbwaftlistio,
                              sep='\t')

        # The temporary FASTA file (tmppbfa) is intentionally left on disk.

        self.notifyProgress.emit(100)

        self.notifyMessage.emit('all finished!!')
Ejemplo n.º 3
0
    def run(self):
        """Run the probe design pipeline and report progress through signals.

        Steps, in order: optionally count k-mers with Jellyfish, optionally
        build the BWA index, split the input sequences into blocks of at most
        10 Mb and filter candidate probes with ``jellyfish.kmerfilterprobe``
        in ``self.pool``, write the candidates to ``<input>_tmp_probes.fa``,
        filter them with ``bwa.bwafilter`` and then ``probefilter``, and
        finally write ``<input>.len``, ``<input>_all.bed`` (every kept probe)
        and ``<input>.bed`` (probes spaced by more than probe + primer length)
        into ``self.samplefolder``.

        Progress values are emitted on ``self.notifyProgress`` and status
        texts on ``self.notifyMessage``. The method returns early, after
        emitting an error message, when the Jellyfish count fails.
        """
        if self.kmerbuild:

            jfcounter = jellyfish.jfcount(jfpath=self.jellyfishpath, mer=self.kmer,
                                          infile=self.genomefile, output=self.jfkmerfile, threads=self.threadsnumber,
                                          lowercount=self.lowercount, size=self.size)

            # Check that the Jellyfish count ran correctly.
            if not jfcounter:

                self.notifyMessage.emit("JellyFish Count Error!!!")

                # Every later step reads the k-mer index, so do not continue
                # with a failed count.
                return

            self.progressnumber = self.progressnumber + 5

            self.notifyProgress.emit(self.progressnumber)

            self.notifyMessage.emit("JellyFish Count finished...")

        else:

            self.progressnumber = self.progressnumber + 5

            self.notifyProgress.emit(self.progressnumber)

            self.notifyMessage.emit("Use " + self.jfkmerfile)

        if self.indexbuild:

            if self.aligner == 'BWA':

                bwa.bwaindex(self.alnpath, self.genomefile, self.samplefolder)

                self.notifyMessage.emit("BWA Index build finished...")

                self.progressnumber = self.progressnumber + 5

                self.notifyProgress.emit(self.progressnumber)

            elif self.aligner == 'BLAT':

                # Index building for BLAT is not implemented.
                pass

        else:

            self.progressnumber = self.progressnumber + 5

            self.notifyProgress.emit(self.progressnumber)

        # Load the input file and split sequences longer than 10M.
        spsize = 10000000

        maxkmerscore = int(self.pblength * self.homology / 100) - self.kmer

        fastain = Fasta(self.inputfile)

        jffpbrunerlist = list()

        for seqname in fastain.keys():

            chrlen = len(fastain[seqname])

            # range() stops before chrlen, so a sequence whose length is an
            # exact multiple of spsize gets no trailing block with end < start.
            for start in range(0, chrlen, spsize):

                # Inclusive end, clipped to the last base of the sequence.
                end = min(start + spsize, chrlen) - 1

                jffpbruner = jellyfish.JFfpbruner(jfpath=self.jellyfishpath, jfkmerfile=self.jfkmerfile, mer=self.kmer,
                                                  pyfasta=fastain, seqname=seqname, pblength=self.pblength,
                                                  maxkmerscore=maxkmerscore, start=start,
                                                  end=end, step=self.step)

                jffpbrunerlist.append(jffpbruner)

        jffilteredprobe = list()

        runnercount = len(jffpbrunerlist)

        # The k-mer filter accounts for 40 progress points; counting from 1
        # makes the reported value include the task that just finished.
        for jffinished, curpblist in enumerate(
                self.pool.imap_unordered(jellyfish.kmerfilterprobe, jffpbrunerlist), start=1):

            jffilteredprobe.extend(curpblist)

            tmpprogress = round(self.progressnumber + (jffinished/runnercount * 40), 2)

            self.notifyProgress.emit(tmpprogress)

        self.notifyMessage.emit('jelly fish finished!!')

        self.progressnumber = 50.0

        self.notifyProgress.emit(self.progressnumber)

        # Dump the candidates as FASTA (records named seq0, seq1, ...).
        tmppbfa = os.path.join(self.samplefolder, os.path.basename(self.inputfile)+'_tmp_probes.fa')

        with open(tmppbfa, 'w') as tmppbfaio:

            for seqnum, tmppb in enumerate(jffilteredprobe):

                print('>','seq',seqnum, sep='',file=tmppbfaio)

                print(tmppb,file=tmppbfaio)

        # delete jffilteredprobe and release memory
        del jffilteredprobe

        bwaindexfile = os.path.join(self.samplefolder, os.path.basename(self.genomefile))

        bwafiltedpb = bwa.bwafilter(bwabin=self.alnpath, reffile=bwaindexfile, inputfile=tmppbfa, minas=self.pblength,
                                    maxxs=int(self.pblength * self.homology / 100), threadnumber=self.threadsnumber)

        outputprefix = os.path.join(self.samplefolder, os.path.basename(self.inputfile))

        tmpbwaftlist = outputprefix+'.bed'

        alltmpbwaftlist = outputprefix+'_all.bed'

        # Reference sequence lengths, one "name<TAB>length" line each.
        seqlenfile = outputprefix+'.len'

        seqlength = bwa.bwareflength(bwabin=self.alnpath, reffile=bwaindexfile)

        with open(seqlenfile, 'w') as seqlenio:

            for seqname in seqlength:

                print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

        # One work item per probe for probefilter.
        oligobefortmf = [{'seq': pbtmp, 'dTm': self.dTm, 'rprimer': self.rprimer}
                         for pbtmp in bwafiltedpb]

        keepedprobe = list()

        self.progressnumber = 55

        self.notifyProgress.emit(self.progressnumber)

        oligobefortmflen = len(oligobefortmf)

        # This stage accounts for 30 progress points, reported every 10000
        # processed probes.
        for ctedpb, (pb, keep) in enumerate(self.pool.imap_unordered(probefilter, oligobefortmf), start=1):

            if keep:

                keepedprobe.append(pb)

            if ctedpb % 10000 == 0:

                tmpprogress = round(self.progressnumber + (ctedpb/oligobefortmflen * 30), 2)

                self.notifyProgress.emit(tmpprogress)

        self.notifyProgress.emit(90)

        # Group kept probes as {chromosome: {start: sequence}}; each kept
        # probe is a tab-separated "sequence, chromosome, start" string.
        pbdictbychr = dict()

        for pb in keepedprobe:

            seq, chro, start = pb.split('\t')

            pbdictbychr.setdefault(chro, dict())[int(start)] = seq

        # get length of primer; an empty primer still reserves 5 bases
        lenrprimer = len(self.rprimer)

        if lenrprimer == 0:

            lenrprimer = 5

        slidwindow = lenrprimer+self.pblength

        with open(alltmpbwaftlist, 'w') as allbwaftlistio, open(tmpbwaftlist, 'w') as tmpbwaftlistio:

            for chro in pbdictbychr:

                # Start of the last probe written to the spaced file.
                # NOTE(review): because this begins at 0, a probe starting at
                # or before slidwindow is never written there — confirm.
                startn = 0

                for startnow in sorted(pbdictbychr[chro]):

                    endnow = startnow + self.pblength - 1

                    print(chro, startnow, endnow, pbdictbychr[chro][startnow],file=allbwaftlistio,sep='\t')

                    if startnow > startn+slidwindow:

                        startn = startnow

                        print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')

        # The temporary FASTA file (tmppbfa) is intentionally left on disk.

        self.notifyProgress.emit(100)

        self.notifyMessage.emit('all finished!!')
Ejemplo n.º 4
0
def _confirm_rebuild(message):
    """Ask on the console whether an existing index should be rebuilt.

    Prints *message*, then reads single key presses with ``getch()`` until
    the key is Y or N in either case. The accepted key is echoed.

    Returns:
        True when Y/y was pressed, False when N/n was pressed.
    """
    print(message)

    while True:

        answer = getch()

        if answer.lower() in ("y", "n"):

            print(answer)

            # Decide on the lower-cased key: the prompt accepts upper-case
            # input, so an upper-case 'N' must also mean "do not rebuild".
            return answer.lower() == 'y'


def _build_kmer_runners(fastain, blocksize, **runnerargs):
    """Create one ``jellyfish.JFfpbruner`` per block of every sequence.

    Every sequence in *fastain* is cut into consecutive blocks of at most
    *blocksize* bases; ``start``/``end`` passed to the runner are 0-based and
    inclusive. *runnerargs* are forwarded unchanged as keyword arguments.

    Returns:
        List of runner objects, in sequence order then block order.
    """
    runners = list()

    for seqname in fastain.keys():

        chrlen = len(fastain[seqname])

        # range() stops before chrlen, so a sequence whose length is an exact
        # multiple of blocksize does not get a trailing block with end < start.
        for start in range(0, chrlen, blocksize):

            end = min(start + blocksize, chrlen) - 1

            runners.append(jellyfish.JFfpbruner(pyfasta=fastain, seqname=seqname,
                                                start=start, end=end, **runnerargs))

    return runners


def _run_kmer_filter(pool, runners, prefix=''):
    """Run ``jellyfish.kmerfilterprobe`` over *runners* in *pool*.

    Results are collected as they arrive (``imap_unordered``), and a
    "done/total" progress line, optionally preceded by *prefix*, is printed
    after each finished runner.

    Returns:
        One flat list with the probes returned by all runners.
    """
    probes = list()

    total = len(runners)

    for done, curpblist in enumerate(pool.imap_unordered(jellyfish.kmerfilterprobe, runners), start=1):

        probes.extend(curpblist)

        print(prefix + "Jellyfish filter: ", done, '/', total, sep='')

    return probes


def main():
    """Command-line entry point of the ploidy-aware probe design pipeline.

    Steps, in order:

    1. Parse options and derive the k-mer size from the genome file size
       (never below 17); the maximum k-mer score is scaled by
       ``args.ploidy / 2``.
    2. Build the Jellyfish k-mer index and the BWA index in ``args.saved``,
       or reuse existing ones (the user is asked unless ``args.docker``).
    3. Filter candidate probes with ``jellyfish.kmerfilterprobe`` in a
       process pool. When the genome file is 1000 (x 1e6 bytes) or larger,
       the input is first split with ``spgenome.spgenome`` and each
       sub-file is processed in turn.
    4. Write the candidates to ``<input>_tmp_probe.fa`` and filter them with
       ``bwa.bwafilter``.
    5. Filter the remaining probes with ``probefilter``.
    6. Write ``<input>.len``, ``<input>_all.bed`` (every kept probe) and
       ``<input>.bed`` (probes spaced by more than probe + primer length).

    Exits with status 1 when the Jellyfish count fails.
    """
    args = check_options(get_options())

    # Genome size in units of 1e6 bytes, taken from the file size.
    genomesize = int(os.path.getsize(args.genome)/1e6)

    # A file smaller than 1e6 bytes gives genomesize == 0 and log(0) raises
    # ValueError; clamp the argument so tiny genomes get the minimum k-mer.
    kmer = max(int(log(max(genomesize, 1), 4)+1), 17)

    # jellyfish par
    lowercount = 2

    # jellyfish par
    jfsize = '100M'

    # split sequences longer than 10M
    spsize = 10000000

    # The + 0.5 before int() rounds the ploidy-scaled score to nearest.
    maxkmerscore = int(((args.length * args.homology / 100) - kmer) * args.ploidy/2 + 0.5 )

    # Path of the Jellyfish k-mer index; ask before overwriting an existing one.
    jfkmerfile = os.path.join(args.saved, (os.path.basename(args.genome)+'_'+str(kmer)+'mer.jf'))

    kmerbuild = True

    if os.path.isfile(jfkmerfile) and not args.docker:

        print("find:", jfkmerfile)

        kmerbuild = _confirm_rebuild("Found kmerfile "+jfkmerfile+". Do you want rebuild it?  Press Y or N to continue:")

    # The '.sa' file is used as the marker for an existing BWA index.
    bwaindexfile = os.path.basename(args.genome)

    bwatestindex = os.path.join(args.saved, bwaindexfile+'.sa')

    bwaindex = os.path.join(args.saved, bwaindexfile)

    bwabuild = True

    if os.path.isfile(bwatestindex) and not args.docker:

        print('find:', bwatestindex)

        bwabuild = _confirm_rebuild("Found bwa index file " + bwatestindex + ". Do you want rebuild it? Press Y or N to continue:")

    print("genomesize:",genomesize, "kmer:",kmer, "jfkmerfile:",
          jfkmerfile, "kmerbuild:", kmerbuild, "bwabuild:", bwabuild, "threads:", args.threads)

    # Build Jellyfish index
    if kmerbuild:

        jfcount = jellyfish.jfcount(jfpath=args.jellyfish, mer=kmer, infile=args.genome, output=jfkmerfile,
                                    threads=args.threads, lowercount=lowercount, size=jfsize)

        if jfcount:

            print("JellyFish Count finished ...")

        else:

            print("JellyFish Count Error!!!")

            sys.exit(1)

    else:

        print("Use ", jfkmerfile)
    # End build Jellyfish index

    if bwabuild:

        bwa.bwaindex(args.bwa, args.genome, args.saved)

        print("bwa index build finished ...")

    else:

        print("Use", bwatestindex)

    # Keyword arguments shared by every k-mer filter runner.
    runnerargs = dict(jfpath=args.jellyfish, jfkmerfile=jfkmerfile, mer=kmer,
                      pblength=args.length, maxkmerscore=maxkmerscore, step=args.step)

    jffilteredprobe = list()

    # The context manager releases the worker processes even if a task raises.
    with Pool(args.threads) as jfpool:

        if genomesize < 1000:

            jffpbrunerlist = _build_kmer_runners(Fasta(args.input), spsize, **runnerargs)

            print(len(jffpbrunerlist))

            jffilteredprobe.extend(_run_kmer_filter(jfpool, jffpbrunerlist))

        else:

            # split fa file when genome size greater than 1 Gb
            print("genome size > 1G")

            subFas = spgenome.spgenome(args.input, args.saved)

            for subFafile in subFas:

                print(subFafile)

                jffpbrunerlist = _build_kmer_runners(Fasta(subFafile), spsize, **runnerargs)

                print(len(jffpbrunerlist))

                jffilteredprobe.extend(_run_kmer_filter(jfpool, jffpbrunerlist, prefix=subFafile + " "))

    print('Jellyfish filter finished!!')

    # Dump the candidates as FASTA (records named seq0, seq1, ...) for BWA.
    tmppbfa = os.path.join(args.saved, os.path.basename(args.input)+'_tmp_probe.fa')

    with open(tmppbfa, 'w') as tmppbfaio:

        for seqnum, tmppb in enumerate(jffilteredprobe):

            print('>','seq',seqnum, sep='',file=tmppbfaio)

            print(tmppb,file=tmppbfaio)

    # The candidate list can be large; drop it before the BWA step.
    del jffilteredprobe

    print("run bwafilter")
    print("maxxs:", int(args.length*args.homology/100))

    bwafiltedpb = bwa.bwafilter(bwabin=args.bwa, reffile=bwaindex, inputfile=tmppbfa, minas=args.length,
                                maxxs=int(args.length*args.homology/100), threadnumber=args.threads)

    print("bwafiltedpb len",len(bwafiltedpb))

    print(bwafiltedpb[0:10])

    tmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'.bed')

    alltmpbwaftlist = os.path.join(args.saved, os.path.basename(args.input)+'_all.bed')

    # Reference sequence lengths, one "name<TAB>length" line each.
    seqlenfile = os.path.join(args.saved, os.path.basename(args.input)+'.len')

    seqlength = bwa.bwareflength(bwabin=args.bwa, reffile=bwaindex)

    with open(seqlenfile, 'w') as seqlenio:

        for seqname in seqlength:

            print(seqname, seqlength[seqname], sep='\t', file=seqlenio)

    # One work item per probe for probefilter.
    oligobefortmf = [{'seq': pbtmp, 'dTm': args.dtm, 'rprimer': args.primer}
                     for pbtmp in bwafiltedpb]

    keepedprobe = list()

    oligobefortmflen = len(oligobefortmf)

    print("oligobefortmflen:",oligobefortmflen)

    # NOTE(review): this pool uses the default worker count rather than
    # args.threads — confirm that is intended.
    with Pool() as pbftpool:

        for ctedpb, (pb, keep) in enumerate(pbftpool.imap_unordered(probefilter, oligobefortmf), start=1):

            if keep:

                keepedprobe.append(pb)

            if ctedpb % 10000 == 0:

                print(ctedpb,'/',oligobefortmflen)

    # Group kept probes as {chromosome: {start: sequence}}; each kept probe
    # is a tab-separated "sequence, chromosome, start" string.
    pbdictbychr = dict()

    for pb in keepedprobe:

        seq, chro, start = pb.split('\t')

        pbdictbychr.setdefault(chro, dict())[int(start)] = seq

    # An empty primer still reserves 5 bases of spacing.
    lenrprimer = len(args.primer)

    if lenrprimer == 0:

        lenrprimer = 5

    slidwindow = lenrprimer+args.length

    with open(alltmpbwaftlist, 'w') as allbwaftlistio, open(tmpbwaftlist, 'w') as tmpbwaftlistio:

        for chro in pbdictbychr:

            # Start of the last probe written to the spaced file.
            # NOTE(review): because this begins at 0, a probe starting at or
            # before slidwindow is never written to the spaced file — confirm.
            startn = 0

            for startnow in sorted(pbdictbychr[chro]):

                endnow = startnow + args.length - 1

                print(chro, startnow, endnow, pbdictbychr[chro][startnow],file=allbwaftlistio,sep='\t')

                if startnow > startn+slidwindow:

                    startn = startnow

                    print(chro, startnow, endnow, pbdictbychr[chro][startnow], file=tmpbwaftlistio, sep='\t')

    print("Job finshed!!")
Ejemplo n.º 5
0
def _prepare_sample(args, stack, name, read1, read2, tmpfolder, bwaindex,
                    jfsize):
    """Run the per-sample preprocessing steps and return the sample record.

    The sample's two text outputs are opened here and registered on *stack*
    so that they are closed even when a later pipeline step raises.

    Args:
        args: parsed command-line options.
        stack: ``contextlib.ExitStack`` owning the opened output files.
        name: sample name, used as the prefix of every generated file.
        read1: first read file of the pair.
        read2: second read file of the pair.
        tmpfolder: directory receiving the bam/bcf/jf/bed intermediates.
        bwaindex: path prefix of the bwa index of the reference genome.
        jfsize: size value handed to the jellyfish counting step.

    Returns:
        dict with the file paths, open handles and the (initially empty)
        ``indelNprobelist`` of this sample.
    """
    bamfile = os.path.join(tmpfolder, name + '.bam')
    bcffile = os.path.join(tmpfolder, name + '.bcf')
    jffile = os.path.join(tmpfolder, name + '.jf')
    cnsprobe = os.path.join(args.saved, name + '_probe.txt')
    indelnprobe = os.path.join(args.saved, name + '_indel_probe.txt')
    mindepth = os.path.join(tmpfolder, name + '_mindepth.bed')

    info = {
        'read1': read1,
        'read2': read2,
        'bamfile': bamfile,
        'bcffile': bcffile,
        'jffile': jffile,
        'cnsprobe': cnsprobe,
        'cnsprobeio': stack.enter_context(open(cnsprobe, 'w')),
        # probes whose consensus length differs from args.length
        'indelNprobelist': [],
        'indelNprobeio': stack.enter_context(open(indelnprobe, 'w')),
        'mindepth': mindepth,
    }

    # run bwa mem
    bwa.bwamem_paired(bwabin=args.bwa,
                      samtoolsbin=args.samtools,
                      reffile=bwaindex,
                      outfile=bamfile,
                      inputfile1=read1,
                      inputfile2=read2,
                      samplename=name,
                      threadnumber=args.threads)
    print("bwa mem", name, 'finished')

    # get min depth bed file
    # NOTE(review): presumably keeps regions covered at >= args.mindepth over
    # at least 200 bp -- confirm against bamdepth.bamdepthtobed.
    bamdepth.bamdepthtobed(bamfile=bamfile,
                           outbed=mindepth,
                           mindepth=args.mindepth,
                           minlength=200)
    print(mindepth, 'done')

    # generate bcf file from bam file
    bcftools.bamtobcf(bcfbin=args.bcftools,
                      reffile=bwaindex,
                      bamfile=bamfile,
                      outbcf=bcffile)
    print(bcffile, "done")

    # generate jf file from both read files
    jellyfish.makegenerator(filenames=[read1, read2],
                            type='gz',
                            generators='generators')
    jellyfish.jfgeneratorscount(jfpath=args.jellyfish,
                                mer=args.length,
                                output=jffile,
                                generators='generators',
                                threads=args.threads,
                                size=jfsize)
    print(jffile, "done")

    return info


def _write_sample_probes(args, probe, name, info):
    """Compute, filter and write the consensus probes of one sample.

    Consensus sequences whose length differs from ``args.length`` are stored
    in ``info['indelNprobelist']``; those containing ``N`` are dropped. The
    remaining ones are written to ``info['cnsprobeio']`` when their k-mer
    score lies between ``args.minkmer`` and the 0.9 quantile of all scores.
    """
    reslist = []
    bcfpool = Pool(args.threads)
    try:
        bcfrunerlist = [
            bcftools.BcfConsensusRuner(probestr=str(interval).rstrip(),
                                       bcftoolspath=args.bcftools,
                                       bcffile=info['bcffile'],
                                       sample=name)
            for interval in probe
        ]
        for res in bcfpool.imap_unordered(bcftools.probestrtoconsensus,
                                          bcfrunerlist):
            consensus = res['consensusprobe']
            if len(consensus) != args.length:
                info['indelNprobelist'].append(res)
            elif 'N' not in consensus:
                reslist.append(res)
    finally:
        # always release the workers, also when a worker raised
        bcfpool.close()
        bcfpool.join()

    consensusprobekmerscore = jellyfish.jfquerylist(
        jfkmerfile=info['jffile'],
        jfpath=args.jellyfish,
        seqlist=[res['consensusprobe'] for res in reslist])

    kmerscoredict = dict()
    kmerscorelist = list()
    for score in consensusprobekmerscore:
        (subseq, kmerscore) = score.split(',')
        if 'N' not in subseq:
            kmerscoredict[subseq] = int(kmerscore)
            kmerscorelist.append(int(kmerscore))

    if not kmerscorelist:
        # no scored sequence means nothing can pass the filter below
        return

    maxkmer = pd.Series(kmerscorelist).quantile(0.9)
    minkmer = args.minkmer

    for res in reslist:
        seq = res['consensusprobe']
        if seq not in kmerscoredict:
            continue
        kmerscore = kmerscoredict[seq]
        if kmerscore <= maxkmer and kmerscore >= minkmer:
            print(res['probestr'], seq, kmerscore,
                  sep='\t', file=info['cnsprobeio'])


def _collect_probes_by_reference(sampleinfor):
    """Read back every sample's probe file, grouped by the 4th column.

    Returns:
        dict mapping the 4th tab-separated field of each line (written out
        later as ``refseq``) to ``{sample name: full line}``.
    """
    probdict = dict()
    for name, info in sampleinfor.items():
        with open(info['cnsprobe']) as inio:
            for infor in inio:
                infor = infor.rstrip()
                orgprb = infor.split('\t')[3]
                probdict.setdefault(orgprb, dict())[name] = infor
    return probdict


def _write_consensus_table(sampleinfor, probdict, cnsio):
    """Write the CSV of probes that every sample produced a consensus for."""
    print('chrom',
          'start',
          'end',
          'refseq',
          ','.join(sampleinfor),
          'consensusprobe',
          'consensusscore',
          'consensussite',
          'consensusdiff',
          sep=',',
          file=cnsio)

    samplecount = len(sampleinfor)

    for orgprb, bysample in probdict.items():
        if len(bysample) != samplecount:
            # probe is missing in at least one sample
            continue

        # chrom/start/end are taken from the first sample's record
        outinfo = next(iter(bysample.values())).split('\t')[0:3]

        probelist = [orgprb]
        for name in sampleinfor:
            # second-to-last column is the sample's consensus sequence
            speciesprobe = bysample[name].split('\t')[-2]
            if len(speciesprobe) == len(orgprb):
                probelist.append(speciesprobe)

        # reference + one sequence per sample, otherwise a length differed
        if len(probelist) != samplecount + 1:
            continue

        res = probecompare.getconsensusprobe(probelist)
        print(','.join(outinfo + probelist),
              res['consensusprobe'],
              res['consensusscore'],
              res['consensussite'],
              res['consensusdiff'],
              sep=',',
              file=cnsio)


def main():
    """Build consensus probes for several samples and tabulate shared ones.

    Steps, in order:
      1. build the bwa index of ``args.genome`` in ``args.tmp`` unless its
         ``.sa`` file already exists;
      2. for every sample (comma-separated ``args.names`` / ``args.reads1`` /
         ``args.reads2``) align the reads and create the depth bed, bcf and
         jellyfish files;
      3. keep the probes of ``args.probe`` that intersect every sample's
         depth bed;
      4. write each sample's filtered consensus probes to
         ``<name>_probe.txt`` and the length-changed ones to
         ``<name>_indel_probe.txt`` in ``args.saved``;
      5. write the probes present in all samples to
         ``<names>_cns_probe.csv``.

    Raises:
        ValueError: if the three comma-separated lists differ in length.
    """
    # local import so the block does not depend on the file's import header
    from contextlib import ExitStack

    args = check_options(get_options())

    # jellyfish par
    jfsize = '100M'

    names = args.names.split(',')
    reads1 = args.reads1.split(',')
    reads2 = args.reads2.split(',')

    # fail before any expensive step instead of part-way through the samples
    if not len(names) == len(reads1) == len(reads2):
        raise ValueError(
            "names, reads1 and reads2 must list the same number of entries "
            "(got %d, %d and %d)" % (len(names), len(reads1), len(reads2)))

    # ?build bwa index
    tmpfolder = args.tmp
    bwaindexfile = os.path.basename(args.genome)
    bwatestindex = os.path.join(tmpfolder, bwaindexfile + '.sa')
    bwaindex = os.path.join(tmpfolder, bwaindexfile)

    if os.path.isfile(bwatestindex):
        print("Use", bwatestindex)
    else:
        # build bwa index
        bwa.bwaindex(args.bwa, args.genome, tmpfolder)
        print("bwa index build finished ...")

    cnsfile = os.path.join(args.saved, '_'.join(names) + '_cns_probe.csv')
    print(cnsfile)

    # every output file is registered on the stack, so all of them are closed
    # when any step below raises
    with ExitStack() as stack:
        cnsio = stack.enter_context(open(cnsfile, 'w'))

        sampleinfor = dict()
        for name, read1, read2 in zip(names, reads1, reads2):
            if name in sampleinfor:
                print("error same name:", name)
                continue
            sampleinfor[name] = _prepare_sample(
                args, stack, name, read1, read2, tmpfolder, bwaindex, jfsize)

        # keep only the probes overlapping every sample's depth bed
        probe = BedTool(args.probe).sort()
        for info in sampleinfor.values():
            nowprobe = BedTool(info['mindepth']).sort()
            probe = probe.intersect(nowprobe, wa=True, u=True)

        # cnsprobe
        for name, info in sampleinfor.items():
            _write_sample_probes(args, probe, name, info)

        for name, info in sampleinfor.items():
            # must be flushed to disk before the files are read back below
            info['cnsprobeio'].close()
            for res in info['indelNprobelist']:
                print(res['probestr'],
                      name,
                      res['consensusprobe'],
                      sep='\t',
                      file=info['indelNprobeio'])
            info['indelNprobeio'].close()

        probdict = _collect_probes_by_reference(sampleinfor)
        _write_consensus_table(sampleinfor, probdict, cnsio)

    print("finished")