Esempio n. 1
0
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn,
                           queueOut):
        """Worker loop: write the delta-GC distribution of each queued genome.

        Genome ids are pulled from queueIn until a None sentinel arrives.
        For each genome the mean GC is computed, then for every window size
        numWindows randomly placed windows are sampled and the difference
        between each window's GC and the genome mean is written to
        ./deltaGC/<genomeId>.tsv. The genome id is put on queueOut once its
        file has been written.

        Args:
            windowSizes: iterable of window lengths. Iteration stops at the
                first size that does not fit in the genome, so sizes are
                presumably supplied in ascending order (confirm with caller).
            numWindows: number of accepted windows to sample per window size.
            genomeDir: directory holding <genomeId>/<genomeId>.fna files.
            queueIn: queue of genome ids, terminated by None.
            queueOut: queue that receives each processed genome id.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                # sentinel: no more genomes to process
                break

            seqs = readGenomicSeqsFromFasta(
                os.path.join(genomeDir, genomeId, genomeId + '.fna'))

            # calculate GC of genome
            # Bin 0 is counted as G/C and bin 1 as the remaining counted bases;
            # any other value is left out of the totals (presumably N or
            # ambiguous bases; the encoding lives in __createNumericGenome).
            # minlength=2 guarantees both bins exist even when the data holds
            # only value 0, where counts[1] would otherwise raise IndexError.
            numericGenome = self.__createNumericGenome(seqs)
            counts = np.bincount(numericGenome, minlength=2)
            gc = counts[0]
            totalBases = gc + counts[1]
            meanGC = float(gc) / totalBases

            # 'with' closes the file even if sampling raises part-way through
            with open('./deltaGC/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Mean GC = ' + str(meanGC) + '\n')

                # calculate GC distribution for different window sizes
                for windowSize in windowSizes:
                    endWindowPos = len(numericGenome) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and
                        # smallest genomes
                        break

                    # a window must have at least 90% counted bases
                    requiredBasePairs = 0.9 * windowSize

                    # NOTE(review): this loop never terminates if no window
                    # of this size can meet the threshold.
                    deltaGCs = []
                    while len(deltaGCs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)
                        window = numericGenome[startWindow:(startWindow + windowSize)]

                        # calculate GC
                        windowCounts = np.bincount(window, minlength=2)
                        windowGC = windowCounts[0]
                        windowBases = windowGC + windowCounts[1]

                        if windowBases < requiredBasePairs:
                            # there are N's in the window so skip it
                            continue

                        gcPer = float(windowGC) / windowBases
                        deltaGCs.append(gcPer - meanGC)

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaGCs)) + '\n')

            queueOut.put(genomeId)
Esempio n. 2
0
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn,
                           queueOut):
        """Worker loop: write the tetranucleotide-distance distribution per genome.

        Genome ids are pulled from queueIn until a None sentinel arrives.
        For each genome, all sequences are joined into one scaffold, the
        genome-wide signature is computed with GenomicSignatures(4), and for
        every window size numWindows randomly placed windows are compared
        against the genome signature. Results go to ./deltaTD/<genomeId>.tsv
        and the genome id is put on queueOut afterwards. Elapsed times for
        the sampling phase and for the whole genome are printed.

        A leftover debugging `fout.close(); sys.exit()` used to stop the
        worker right after the header line, leaving the sampling code
        unreachable and queueOut never notified; it has been removed.

        Args:
            windowSizes: iterable of window lengths. Iteration stops at the
                first size that does not fit in the scaffold, so sizes are
                presumably supplied in ascending order (confirm with caller).
            numWindows: number of windows to sample per window size.
            genomeDir: directory holding <genomeId>/<genomeId>.fna files.
            queueIn: queue of genome ids, terminated by None.
            queueOut: queue that receives each processed genome id.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                # sentinel: no more genomes to process
                break

            start = time.time()

            seqs = readGenomicSeqsFromFasta(
                os.path.join(genomeDir, genomeId, genomeId + '.fna'))
            # sequences are concatenated with an 'NNNN' spacer between them
            genomeScaffold = 'NNNN'.join(seqs.values())

            # calculate tetranucleotide signature of genome
            gsCalculator = GenomicSignatures(4)
            genomeSig = gsCalculator.seqSignature(genomeScaffold)

            # 'with' keeps the file open for the window results and closes it
            # even if signature calculation raises part-way through
            with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

                # calculate tetranucleotide distance distribution for
                # different window sizes
                startW = time.time()
                for windowSize in windowSizes:
                    endWindowPos = len(genomeScaffold) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and
                        # smallest genomes
                        break

                    deltaTDs = []
                    while len(deltaTDs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)
                        window = genomeScaffold[startWindow:(startWindow + windowSize)]

                        windowSig = gsCalculator.seqSignature(window)
                        deltaTDs.append(gsCalculator.distance(genomeSig, windowSig))

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaTDs)) + '\n')
            endW = time.time()
            # parenthesised single-argument print is valid on Python 2 and 3
            print(endW - startW)

            queueOut.put(genomeId)

            end = time.time()
            print(end - start)
Esempio n. 3
0
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
        """Consume genome ids from queueIn and record their delta-GC samples.

        Runs until None is read from queueIn. Each genome's mean GC is
        written as a header line to ./deltaGC/<genomeId>.tsv, followed by one
        pair of lines per window size: the size, then numWindows
        comma-separated values of (window GC - mean GC) for randomly placed
        windows. The genome id is then put on queueOut.

        Args:
            windowSizes: iterable of window lengths. Processing of a genome
                stops at the first size that does not fit, so sizes are
                presumably sorted ascending (confirm with caller).
            numWindows: number of accepted windows required per window size.
            genomeDir: directory holding <genomeId>/<genomeId>.fna files.
            queueIn: queue of genome ids, terminated by None.
            queueOut: queue that receives each processed genome id.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                # sentinel: stop this worker
                break

            seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))

            # calculate GC of genome
            # Value 0 is tallied as G/C and value 1 as the other counted
            # bases; remaining values are excluded (presumably N/ambiguous;
            # see __createNumericGenome for the actual encoding).
            # minlength=2 keeps counts[1] valid when only value 0 is present.
            numericGenome = self.__createNumericGenome(seqs)
            genomeCounts = np.bincount(numericGenome, minlength=2)
            genomeGC = genomeCounts[0]
            meanGC = float(genomeGC) / (genomeGC + genomeCounts[1])

            # the context manager closes the file on any exit path
            with open('./deltaGC/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Mean GC = ' + str(meanGC) + '\n')

                # calculate GC distribution for different window sizes
                for windowSize in windowSizes:
                    endWindowPos = len(numericGenome) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and smallest genomes
                        break

                    # windows with fewer than 90% counted bases are rejected
                    requiredBasePairs = 0.9*windowSize

                    # NOTE(review): loops forever if no window of this size
                    # can reach the required number of counted bases.
                    deltaGCs = []
                    while len(deltaGCs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)

                        # calculate GC
                        counts = np.bincount(numericGenome[startWindow:(startWindow+windowSize)],
                                             minlength=2)
                        gc = counts[0]
                        totalBases = gc + counts[1]

                        if totalBases < requiredBasePairs:
                            # there are N's in the window so skip it
                            continue

                        deltaGCs.append(float(gc) / totalBases - meanGC)

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaGCs)) + '\n')

            queueOut.put(genomeId)
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
        """Consume genome ids from queueIn and record tetranucleotide distances.

        Runs until None is read from queueIn. For each genome the sequences
        are joined into a single scaffold, its signature is computed with
        GenomicSignatures(4) and written as a header line to
        ./deltaTD/<genomeId>.tsv. For every window size, numWindows random
        windows are then scored with gsCalculator.distance against the genome
        signature and written out. The genome id is put on queueOut and the
        sampling and total elapsed times are printed.

        The previous version closed the file and called sys.exit() right
        after the header line (debugging residue), so no window was ever
        sampled and queueOut was never notified; that early exit is removed.

        Args:
            windowSizes: iterable of window lengths. Processing of a genome
                stops at the first size that does not fit, so sizes are
                presumably sorted ascending (confirm with caller).
            numWindows: number of windows sampled per window size.
            genomeDir: directory holding <genomeId>/<genomeId>.fna files.
            queueIn: queue of genome ids, terminated by None.
            queueOut: queue that receives each processed genome id.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                # sentinel: stop this worker
                break

            start = time.time()

            seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))
            # individual sequences are separated by an 'NNNN' spacer
            genomeScaffold = 'NNNN'.join(seqs.values())

            # calculate tetranucleotide signature of genome
            gsCalculator = GenomicSignatures(4)
            genomeSig = gsCalculator.seqSignature(genomeScaffold)

            # keep the file open for the whole genome; closed on any exit path
            with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

                # calculate tetranucleotide distance distribution for different window sizes
                startW = time.time()
                for windowSize in windowSizes:
                    endWindowPos = len(genomeScaffold) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and smallest genomes
                        break

                    deltaTDs = []
                    while len(deltaTDs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)

                        windowSig = gsCalculator.seqSignature(genomeScaffold[startWindow:(startWindow+windowSize)])
                        dist = gsCalculator.distance(genomeSig, windowSig)
                        deltaTDs.append(dist)

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaTDs)) + '\n')
            endW = time.time()
            # single-argument print(...) behaves the same on Python 2 and 3
            print(endW - startW)

            queueOut.put(genomeId)

            end = time.time()
            print(end - start)
Esempio n. 5
0
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
        """Worker loop: write the delta coding-density distribution per genome.

        Genome ids are pulled from queueIn until a None sentinel arrives.
        For each genome:
          1. all sequences are joined into one upper-cased scaffold and saved
             under ./deltaCD/genomes;
          2. prodigal is run on that scaffold, writing gene and GFF files
             under ./deltaCD/prodigal;
          3. the mean coding density is computed from the GFF, and for each
             window size numWindows random windows are sampled; the
             difference between window and mean coding density is written to
             ./deltaCD/<genomeId>.tsv.
        The genome id is put on queueOut once its file has been written.

        Args:
            windowSizes: iterable of window lengths. Iteration stops at the
                first size that does not fit in the scaffold, so sizes are
                presumably supplied in ascending order (confirm with caller).
            numWindows: number of accepted windows to sample per window size.
            genomeDir: directory holding <genomeId>/<genomeId>.fna files.
            queueIn: queue of genome ids, terminated by None.
            queueOut: queue that receives each processed genome id.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                # sentinel: no more genomes to process
                break

            seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))

            # for simplicity, create a single scaffold from all sequences
            genomeFile = os.path.join('./deltaCD/genomes', genomeId + '.single_scaffold.fna')
            genomeScaffold = 'NNNNNNNNNN'.join(seqs.values()).upper()
            with open(genomeFile, 'w') as scaffoldOut:
                scaffoldOut.write('>' + genomeId + '\n')
                scaffoldOut.write(genomeScaffold)

            # run prodigal on genome
            ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
            gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')

            # NOTE(review): paths are interpolated into the shell command
            # unquoted and the exit status is ignored.
            cmd = ('prodigal -q -c -m -f gff -d %s -i %s > %s' % (ntFile, genomeFile, gffFile))
            os.system(cmd)

            # calculate mean coding density of genome
            numericScaffold = self.__createNumericScaffold(genomeScaffold)

            prodigalParser = ProdigalGeneFeatureParser(gffFile)

            codingBases = prodigalParser.codingBases(genomeId)

            # bin 0 is used as the count of usable bases (encoding is defined
            # by __createNumericScaffold, not visible here)
            counts = np.bincount(numericScaffold)
            totalBases = counts[0]

            meanCD = float(codingBases) / totalBases

            # 'with' closes the file even if sampling raises part-way through
            with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Mean CD = ' + str(meanCD) + '\n')

                # calculate coding density distribution for different window sizes
                for windowSize in windowSizes:
                    endWindowPos = len(genomeScaffold) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and smallest genomes
                        break

                    # NOTE(review): this loop never terminates if every window
                    # of this size is rejected by the check below.
                    deltaCDs = []
                    while len(deltaCDs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)
                        endWindow = startWindow + windowSize

                        # calculate coding density
                        windowCoding = prodigalParser.codingBases(genomeId, startWindow, endWindow)
                        windowCounts = np.bincount(numericScaffold[startWindow:endWindow])
                        windowBases = windowCounts[0]

                        if windowBases != windowSize:
                            # there are N's in the window so skip it
                            continue

                        cdPer = float(windowCoding) / windowBases
                        deltaCDs.append(cdPer - meanCD)

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaCDs)) + '\n')

            queueOut.put(genomeId)
    def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
        """Consume genome ids from queueIn and record delta coding density.

        Runs until None is read from queueIn. Each genome is flattened into a
        single upper-cased scaffold (saved under ./deltaCD/genomes), gene
        calls are produced by running prodigal (outputs under
        ./deltaCD/prodigal), and the mean coding density is written as a
        header line to ./deltaCD/<genomeId>.tsv. For every window size,
        numWindows random windows are sampled and (window coding density -
        mean coding density) is written out. The genome id is then put on
        queueOut.

        Args:
            windowSizes: iterable of window lengths. Processing of a genome
                stops at the first size that does not fit, so sizes are
                presumably sorted ascending (confirm with caller).
            numWindows: number of accepted windows required per window size.
            genomeDir: directory holding <genomeId>/<genomeId>.fna files.
            queueIn: queue of genome ids, terminated by None.
            queueOut: queue that receives each processed genome id.
        """
        while True:
            genomeId = queueIn.get(block=True, timeout=None)
            if genomeId is None:
                # sentinel: stop this worker
                break

            seqs = readGenomicSeqsFromFasta(os.path.join(genomeDir, genomeId, genomeId + '.fna'))

            # for simplicity, create a single scaffold from all sequences
            genomeFile = os.path.join('./deltaCD/genomes', genomeId + '.single_scaffold.fna')
            genomeScaffold = 'NNNNNNNNNN'.join(seqs.values()).upper()
            # context manager guarantees the scaffold is flushed and closed
            # before prodigal reads it
            with open(genomeFile, 'w') as scaffoldFile:
                scaffoldFile.write('>' + genomeId + '\n')
                scaffoldFile.write(genomeScaffold)

            # run prodigal on genome
            ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
            gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')

            # NOTE(review): the command goes through the shell with unquoted
            # paths, and a failing prodigal run is not detected here.
            cmd = ('prodigal -q -c -m -f gff -d %s -i %s > %s' % (ntFile, genomeFile, gffFile))
            os.system(cmd)

            # calculate mean coding density of genome
            numericScaffold = self.__createNumericScaffold(genomeScaffold)
            prodigalParser = ProdigalGeneFeatureParser(gffFile)
            genomeCoding = prodigalParser.codingBases(genomeId)

            # the count of value 0 serves as the number of usable bases
            # (encoding comes from __createNumericScaffold, not visible here)
            genomeBases = np.bincount(numericScaffold)[0]
            meanCD = float(genomeCoding) / genomeBases

            # the results file is closed on any exit path
            with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
                fout.write('# Mean CD = ' + str(meanCD) + '\n')

                # calculate coding density distribution for different window sizes
                for windowSize in windowSizes:
                    endWindowPos = len(genomeScaffold) - windowSize
                    if endWindowPos <= 0:
                        # This might occur for the largest window sizes and smallest genomes
                        break

                    # NOTE(review): loops forever if no window of this size
                    # passes the check below.
                    deltaCDs = []
                    while len(deltaCDs) != numWindows:
                        # pick random window
                        startWindow = randint(0, endWindowPos)

                        # calculate coding density
                        codingBases = prodigalParser.codingBases(genomeId, startWindow, startWindow+windowSize)
                        counts = np.bincount(numericScaffold[startWindow:(startWindow+windowSize)])
                        totalBases = counts[0]

                        if totalBases != windowSize:
                            # there are N's in the window so skip it
                            continue

                        deltaCDs.append(float(codingBases) / totalBases - meanCD)

                    fout.write('Windows Size = ' + str(windowSize) + '\n')
                    fout.write(','.join(map(str, deltaCDs)) + '\n')

            queueOut.put(genomeId)