def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Worker loop that writes the delta-GC distribution of each queued genome.

    Genome ids are read from queueIn until a None sentinel arrives. For each
    genome, <genomeDir>/<genomeId>/<genomeId>.fna is read, the genome-wide
    mean GC is computed, and for every window size numWindows randomly placed
    windows are sampled. The difference between each window's GC and the mean
    GC is written to ./deltaGC/<genomeId>.tsv, after which the genome id is
    put on queueOut.

    Args:
        windowSizes: iterable of window lengths. Iteration stops at the first
            size that is not smaller than the genome, so sizes are presumably
            supplied in ascending order -- TODO confirm with caller.
        numWindows: number of accepted windows to sample per window size.
        genomeDir: directory holding one sub-directory per genome id.
        queueIn: queue yielding genome ids, terminated by None.
        queueOut: queue receiving each genome id once its file is written.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel: no more genomes to process
            break

        seqs = readGenomicSeqsFromFasta(
            os.path.join(genomeDir, genomeId, genomeId + '.fna'))

        # calculate GC of genome
        # NOTE(review): bin 0 is treated as G/C and bin 1 as the other
        # unambiguous bases; higher values (e.g. N) are ignored -- confirm
        # against __createNumericGenome.
        numericGenome = self.__createNumericGenome(seqs)

        # np.bincount only returns bins up to the largest value present, so
        # minlength=2 guarantees counts[1] exists even if no 1s occur.
        counts = np.bincount(numericGenome, minlength=2)
        gc = counts[0]
        totalBases = gc + counts[1]
        meanGC = float(gc) / totalBases

        # the context manager closes the file even if sampling raises
        with open('./deltaGC/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Mean GC = ' + str(meanGC) + '\n')

            # calculate GC distribution for different window sizes
            for windowSize in windowSizes:
                endWindowPos = len(numericGenome) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes
                    break

                # a window must consist of at least 90% unambiguous bases
                requiredBasePairs = 0.9 * windowSize

                # NOTE(review): this loop only ends once numWindows windows
                # have passed the filter below.
                deltaGCs = []
                while len(deltaGCs) != numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    window = numericGenome[startWindow:(startWindow + windowSize)]

                    # calculate GC
                    counts = np.bincount(window, minlength=2)
                    gc = counts[0]
                    totalBases = gc + counts[1]

                    if totalBases < requiredBasePairs:
                        # there are N's in the window so skip it
                        continue

                    gcPer = float(gc) / totalBases
                    deltaGCs.append(gcPer - meanGC)

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaGCs)) + '\n')

        queueOut.put(genomeId)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Worker loop that writes the delta-TD distribution of each queued genome.

    Genome ids are read from queueIn until a None sentinel arrives. For each
    genome, <genomeDir>/<genomeId>/<genomeId>.fna is read, all sequences are
    joined into one scaffold, and its tetranucleotide signature is computed.
    For every window size, numWindows randomly placed windows are sampled and
    the distance between each window's signature and the genome signature is
    written to ./deltaTD/<genomeId>.tsv. The genome id is then put on
    queueOut. Elapsed times (window sampling, whole genome) are printed.

    Args:
        windowSizes: iterable of window lengths. Iteration stops at the first
            size that is not smaller than the scaffold, so sizes are
            presumably supplied in ascending order -- TODO confirm.
        numWindows: number of windows to sample per window size.
        genomeDir: directory holding one sub-directory per genome id.
        queueIn: queue yielding genome ids, terminated by None.
        queueOut: queue receiving each genome id once its file is written.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel: no more genomes to process
            break

        start = time.time()

        seqs = readGenomicSeqsFromFasta(
            os.path.join(genomeDir, genomeId, genomeId + '.fna'))

        # sequences are separated by a run of N's in the combined scaffold
        genomeScaffold = 'NNNN'.join(seqs.values())

        # calculate tetranucleotide signature of genome
        gsCalculator = GenomicSignatures(4)
        genomeSig = gsCalculator.seqSignature(genomeScaffold)

        # the file stays open for the whole sampling phase and is closed by
        # the context manager even if a signature calculation raises
        with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

            # calculate tetranucleotide distance distribution for different
            # window sizes
            startW = time.time()
            for windowSize in windowSizes:
                endWindowPos = len(genomeScaffold) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes
                    break

                deltaTDs = []
                while len(deltaTDs) != numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    window = genomeScaffold[startWindow:(startWindow + windowSize)]

                    windowSig = gsCalculator.seqSignature(window)
                    deltaTDs.append(gsCalculator.distance(genomeSig, windowSig))

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaTDs)) + '\n')

        endW = time.time()
        print(endW - startW)

        # report completion so the consumer of queueOut can make progress
        queueOut.put(genomeId)

        end = time.time()
        print(end - start)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Consume genome ids from queueIn and write per-window delta-GC values.

    Runs until None is read from queueIn. Each genome is loaded from
    <genomeDir>/<genomeId>/<genomeId>.fna; its mean GC is written as a header
    line to ./deltaGC/<genomeId>.tsv, followed, for each window size, by a
    'Windows Size' line and a comma-separated line holding (window GC - mean
    GC) for numWindows random windows. The genome id is put on queueOut when
    the file is complete.

    Args:
        windowSizes: iterable of window lengths. Processing of a genome stops
            at the first size that is not smaller than the genome, so sizes
            are presumably sorted ascending -- TODO confirm with caller.
        numWindows: number of accepted windows to sample per window size.
        genomeDir: directory holding one sub-directory per genome id.
        queueIn: queue yielding genome ids, terminated by None.
        queueOut: queue receiving each genome id once its file is written.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel value marks the end of the work queue
            break

        fastaFile = os.path.join(genomeDir, genomeId, genomeId + '.fna')
        seqs = readGenomicSeqsFromFasta(fastaFile)

        # calculate GC of genome
        # NOTE(review): value 0 is counted as G/C and value 1 as the other
        # unambiguous bases; anything above 1 (e.g. N) is left out of the
        # totals -- confirm against __createNumericGenome.
        numericGenome = self.__createNumericGenome(seqs)

        # minlength=2 keeps counts[1] valid when the data contains no 1s;
        # without it np.bincount returns a single bin and indexing fails.
        counts = np.bincount(numericGenome, minlength=2)
        gc = counts[0]
        totalBases = gc + counts[1]
        meanGC = float(gc) / totalBases

        # 'with' guarantees the output file is closed on any exit path
        with open('./deltaGC/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Mean GC = ' + str(meanGC) + '\n')

            # calculate GC distribution for different window sizes
            for windowSize in windowSizes:
                endWindowPos = len(numericGenome) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes
                    break

                # windows with fewer than 90% unambiguous bases are rejected
                requiredBasePairs = 0.9 * windowSize

                # NOTE(review): sampling repeats until numWindows windows
                # have been accepted; rejected windows do not count.
                deltaGCs = []
                while len(deltaGCs) != numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    endWindow = startWindow + windowSize

                    # calculate GC
                    counts = np.bincount(numericGenome[startWindow:endWindow],
                                         minlength=2)
                    gc = counts[0]
                    totalBases = gc + counts[1]

                    if totalBases < requiredBasePairs:
                        # there are N's in the window so skip it
                        continue

                    deltaGCs.append(float(gc) / totalBases - meanGC)

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaGCs)) + '\n')

        queueOut.put(genomeId)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Consume genome ids from queueIn and write tetranucleotide distances.

    Runs until None is read from queueIn. Each genome is loaded from
    <genomeDir>/<genomeId>/<genomeId>.fna and its sequences are joined into a
    single scaffold. The scaffold's tetranucleotide signature is written as a
    header line to ./deltaTD/<genomeId>.tsv, followed, for each window size,
    by a 'Windows Size' line and a comma-separated line with the distance
    between the genome signature and the signature of numWindows random
    windows. The genome id is put on queueOut when the file is complete, and
    the time spent sampling and the total time per genome are printed.

    Args:
        windowSizes: iterable of window lengths. Processing of a genome stops
            at the first size that is not smaller than the scaffold, so sizes
            are presumably sorted ascending -- TODO confirm with caller.
        numWindows: number of windows to sample per window size.
        genomeDir: directory holding one sub-directory per genome id.
        queueIn: queue yielding genome ids, terminated by None.
        queueOut: queue receiving each genome id once its file is written.
    """
    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel value marks the end of the work queue
            break

        start = time.time()

        fastaFile = os.path.join(genomeDir, genomeId, genomeId + '.fna')
        seqs = readGenomicSeqsFromFasta(fastaFile)

        # individual sequences are joined with a short run of N's
        genomeScaffold = 'NNNN'.join(seqs.values())

        # calculate tetranucleotide signature of genome
        gsCalculator = GenomicSignatures(4)
        genomeSig = gsCalculator.seqSignature(genomeScaffold)

        # keep the output open until every window size has been written;
        # 'with' closes it on any exit path
        with open('./deltaTD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Tetra signature = ' + str(genomeSig) + '\n')

            # calculate tetranucleotide distance distribution for different
            # window sizes
            startW = time.time()
            for windowSize in windowSizes:
                endWindowPos = len(genomeScaffold) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes
                    break

                deltaTDs = []
                while len(deltaTDs) != numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    endWindow = startWindow + windowSize

                    windowSig = gsCalculator.seqSignature(
                        genomeScaffold[startWindow:endWindow])
                    dist = gsCalculator.distance(genomeSig, windowSig)
                    deltaTDs.append(dist)

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaTDs)) + '\n')

        endW = time.time()
        print(endW - startW)

        # signal that this genome is finished
        queueOut.put(genomeId)

        end = time.time()
        print(end - start)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Worker loop that writes the delta coding-density distribution per genome.

    Genome ids are read from queueIn until a None sentinel arrives. For each
    genome:
      1. <genomeDir>/<genomeId>/<genomeId>.fna is read and all sequences are
         joined (upper-cased, separated by ten N's) into a single scaffold
         written to ./deltaCD/genomes/<genomeId>.single_scaffold.fna.
      2. prodigal is run on that scaffold, producing
         ./deltaCD/prodigal/<genomeId>.genes.fna and <genomeId>.gff.
      3. The mean coding density and, for every window size, the deviation
         from it in numWindows random N-free windows are written to
         ./deltaCD/<genomeId>.tsv.
    The genome id is then put on queueOut.

    Args:
        windowSizes: iterable of window lengths. Iteration stops at the first
            size that is not smaller than the scaffold, so sizes are
            presumably supplied in ascending order -- TODO confirm.
        numWindows: number of accepted windows to sample per window size.
        genomeDir: directory holding one sub-directory per genome id.
        queueIn: queue yielding genome ids, terminated by None.
        queueOut: queue receiving each genome id once its file is written.

    Raises:
        OSError: if the prodigal executable cannot be started.
    """
    import subprocess  # local import keeps this block self-contained

    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel: no more genomes to process
            break

        seqs = readGenomicSeqsFromFasta(
            os.path.join(genomeDir, genomeId, genomeId + '.fna'))

        # for simplicity, create a single scaffold from all sequences
        genomeFile = os.path.join('./deltaCD/genomes',
                                  genomeId + '.single_scaffold.fna')
        genomeScaffold = 'NNNNNNNNNN'.join(seqs.values()).upper()
        with open(genomeFile, 'w') as scaffoldOut:
            scaffoldOut.write('>' + genomeId + '\n')
            scaffoldOut.write(genomeScaffold)

        # run prodigal on genome; an argument list (no shell) keeps paths
        # with spaces or shell metacharacters intact, and stdout is captured
        # into the GFF file exactly as the shell redirect would
        ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
        gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')
        with open(gffFile, 'w') as gffOut:
            subprocess.call(['prodigal', '-q', '-c', '-m', '-f', 'gff',
                             '-d', ntFile, '-i', genomeFile],
                            stdout=gffOut)

        # calculate mean coding density of genome
        # NOTE(review): bin 0 of the numeric scaffold is taken as the count
        # of unambiguous bases -- confirm against __createNumericScaffold.
        numericScaffold = self.__createNumericScaffold(genomeScaffold)
        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        codingBases = prodigalParser.codingBases(genomeId)
        counts = np.bincount(numericScaffold)
        totalBases = counts[0]
        meanCD = float(codingBases) / totalBases

        # the context manager closes the file even if sampling raises
        with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Mean CD = ' + str(meanCD) + '\n')

            # calculate coding density distribution for different window sizes
            for windowSize in windowSizes:
                endWindowPos = len(genomeScaffold) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes
                    break

                # NOTE(review): this loop only ends once numWindows windows
                # free of N's have been found.
                deltaCDs = []
                while len(deltaCDs) != numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    endWindow = startWindow + windowSize

                    counts = np.bincount(numericScaffold[startWindow:endWindow])
                    totalBases = counts[0]
                    if totalBases != windowSize:
                        # there are N's in the window so skip it (checked
                        # first so rejected windows never query the parser)
                        continue

                    # calculate coding density
                    codingBases = prodigalParser.codingBases(
                        genomeId, startWindow, endWindow)
                    cdPer = float(codingBases) / totalBases
                    deltaCDs.append(cdPer - meanCD)

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaCDs)) + '\n')

        queueOut.put(genomeId)
def __calculateResults(self, windowSizes, numWindows, genomeDir, queueIn, queueOut):
    """Consume genome ids from queueIn and write per-window delta coding density.

    Runs until None is read from queueIn. For each genome id the method
    reads <genomeDir>/<genomeId>/<genomeId>.fna, writes an upper-cased single
    scaffold (sequences separated by ten N's) to
    ./deltaCD/genomes/<genomeId>.single_scaffold.fna, runs prodigal on it
    (outputs under ./deltaCD/prodigal/), and writes ./deltaCD/<genomeId>.tsv
    containing the mean coding density followed, per window size, by the
    (window coding density - mean) of numWindows random N-free windows.
    Finally the genome id is put on queueOut.

    Args:
        windowSizes: iterable of window lengths. Processing of a genome stops
            at the first size that is not smaller than the scaffold, so sizes
            are presumably sorted ascending -- TODO confirm with caller.
        numWindows: number of accepted windows to sample per window size.
        genomeDir: directory holding one sub-directory per genome id.
        queueIn: queue yielding genome ids, terminated by None.
        queueOut: queue receiving each genome id once its file is written.

    Raises:
        OSError: if the prodigal executable cannot be started.
    """
    import subprocess  # local import keeps this block self-contained

    while True:
        genomeId = queueIn.get(block=True, timeout=None)
        if genomeId is None:
            # sentinel value marks the end of the work queue
            break

        fastaFile = os.path.join(genomeDir, genomeId, genomeId + '.fna')
        seqs = readGenomicSeqsFromFasta(fastaFile)

        # for simplicity, create a single scaffold from all sequences
        genomeFile = os.path.join('./deltaCD/genomes',
                                  genomeId + '.single_scaffold.fna')
        genomeScaffold = 'NNNNNNNNNN'.join(seqs.values()).upper()
        with open(genomeFile, 'w') as scaffoldOut:
            scaffoldOut.write('>' + genomeId + '\n')
            scaffoldOut.write(genomeScaffold)

        # run prodigal on genome without going through a shell, so unusual
        # characters in the paths cannot alter the command; prodigal's
        # stdout is redirected into the GFF file
        ntFile = os.path.join('./deltaCD/prodigal', genomeId + '.genes.fna')
        gffFile = os.path.join('./deltaCD/prodigal', genomeId + '.gff')
        prodigalArgs = ['prodigal', '-q', '-c', '-m', '-f', 'gff',
                        '-d', ntFile, '-i', genomeFile]
        with open(gffFile, 'w') as gffOut:
            subprocess.call(prodigalArgs, stdout=gffOut)

        # calculate mean coding density of genome
        # NOTE(review): bin 0 of the numeric scaffold is used as the number
        # of unambiguous bases -- confirm against __createNumericScaffold.
        numericScaffold = self.__createNumericScaffold(genomeScaffold)
        prodigalParser = ProdigalGeneFeatureParser(gffFile)

        codingBases = prodigalParser.codingBases(genomeId)
        totalBases = np.bincount(numericScaffold)[0]
        meanCD = float(codingBases) / totalBases

        # 'with' guarantees the output file is closed on any exit path
        with open('./deltaCD/' + genomeId + '.tsv', 'w') as fout:
            fout.write('# Mean CD = ' + str(meanCD) + '\n')

            # calculate coding density distribution for different window sizes
            for windowSize in windowSizes:
                endWindowPos = len(genomeScaffold) - windowSize
                if endWindowPos <= 0:
                    # This might occur for the largest window sizes and
                    # smallest genomes
                    break

                # NOTE(review): sampling repeats until numWindows windows
                # without N's have been accepted.
                deltaCDs = []
                while len(deltaCDs) != numWindows:
                    # pick random window
                    startWindow = randint(0, endWindowPos)
                    endWindow = startWindow + windowSize

                    totalBases = np.bincount(
                        numericScaffold[startWindow:endWindow])[0]
                    if totalBases != windowSize:
                        # there are N's in the window so skip it; testing
                        # this first avoids a parser query for rejects
                        continue

                    # calculate coding density
                    codingBases = prodigalParser.codingBases(
                        genomeId, startWindow, endWindow)
                    deltaCDs.append(float(codingBases) / totalBases - meanCD)

                fout.write('Windows Size = ' + str(windowSize) + '\n')
                fout.write(','.join(map(str, deltaCDs)) + '\n')

        queueOut.put(genomeId)