def addBarcodeClusterInfoToDatabase(self, barcodeClusters):
    """ Store the barcode cluster info in the database.

    The barcodeClusters table is dropped and recreated, then filled with one
    row per cluster (preceded by a placeholder row whose clusterId is None and
    whose barcode sequence is 'NoneCluster'). Afterwards the clusterId column
    of the reads table is set for every read pair listed in any cluster.

    barcodeClusters -- dict mapping clusterId to a dict holding the keys
        'clusterReadCount', 'readPairs', 'identities',
        'clusterBarcodeSequence' and 'clusterBarcodeQuality'.

    Always returns 0.
    """

    from dbs_analysis import misc
    from dbs_analysis import metadata

    def log(message):
        # logging is optional, messages are only written when a logfile is set
        if self.logfile:
            self.logfile.write(message)

    database = self.analysisfolder.database
    chunkSize = 10000

    #
    # start from an empty barcodeClusters table
    #
    log('Create barcodeClusters table (and drop old one if needed) ...\n')
    database.getConnection()
    database.c.execute("DROP TABLE IF EXISTS barcodeClusters")
    database.c.execute(
        '''CREATE TABLE barcodeClusters (clusterId,clusterTotalReadCount,readPairsList,readBarcodeIdentitiesList,clusterBarcodeSequence,clusterBarcodeQuality,contigSequencesList,annotations,PRIMARY KEY (clusterId))'''
    )
    log('commiting changes to database.\n')
    database.commitAndClose()

    #
    # flatten the cluster dictionary into table rows and a read pair -> cluster map
    #
    progress = misc.Progress(len(barcodeClusters), logfile=self.logfile, unit='clusters', mem=True)
    log('Converting the data ... \n')
    clusterRows = [(None, 0, '[]', '[]', 'NoneCluster', None, None, None)]
    clusterOfReadPair = {}
    for clusterId, cluster in barcodeClusters.iteritems():
        # a read pair listed in several clusters ends up with the cluster seen last
        clusterOfReadPair.update(dict.fromkeys(cluster['readPairs'], clusterId))
        clusterRows.append((
            clusterId,
            cluster['clusterReadCount'],
            str(cluster['readPairs']),
            str(cluster['identities']),
            cluster['clusterBarcodeSequence'],
            cluster['clusterBarcodeQuality'],
            None,
            None,
        ))
        progress.update()

    #
    # split the (clusterId, readPairId) updates, ordered by read pair id, into
    # chunks of chunkSize; the last chunk holds the remainder and may be empty
    #
    updateChunks = [[]]
    for readPairId in sorted(clusterOfReadPair):
        updateChunks[-1].append((int(clusterOfReadPair[readPairId]), int(readPairId)))
        if len(updateChunks[-1]) == chunkSize:
            updateChunks.append([])

    #
    # write the cluster rows
    #
    log('Adding cluster info to database ... \n')
    database.getConnection()
    database.c.executemany('INSERT INTO barcodeClusters VALUES (?,?,?,?,?,?,?,?)', clusterRows)
    log('commiting changes to database.\n')
    database.commitAndClose()

    #
    # write the cluster id of each read pair, one connection and commit per chunk
    #
    log('Updating read pair info in the database ... \n')
    progress = misc.Progress(len(clusterOfReadPair), logfile=self.logfile, unit='reads-updated', mem=True)
    with progress:
        for chunk in updateChunks:
            database.getConnection()
            database.c.executemany('UPDATE reads SET clusterId=? WHERE id=?', chunk)
            database.commitAndClose()
            # the progress meter counts single reads, so tick once per updated row
            for _ in chunk:
                progress.update()

    return 0
def generateBarcodeFastq(self, ):
    """ Load the barcode sequences from the database and write them to a fastq file ready for clustering.

    Every row of the reads table with a non-empty dbsSeq is collected. The
    per-position base counts of these raw barcodes are written to
    <dataPath>/rawBarcodeBaseFreq.dict.txt, and self.id2seq is rebuilt as a
    map from read pair id to barcode sequence. The barcodes are then written
    to self.analysisfolder.dbsfastq, most frequent sequence first, with one
    record per read pair and the header

        @<readPairId> bc=<barcode rank> rp=<index within barcode> bctrp=<read pairs with this barcode>

    Also stores the number of unique barcode sequences as the result
    'uniqueBarcodeSequences'.

    Returns the number of read pairs that have a barcode sequence.

    Raises ValueError if the analysis type is neither 'HLA' nor 'WFA'.
    Raises KeyError if a barcode holds a base other than A, T, G or C and
    IndexError if a barcode is longer than the DBS sequence, since the base
    frequency table only covers those bases and positions.
    """

    #
    # imports
    #
    import operator
    from dbs_analysis import misc
    analysisType = self.analysisfolder.settings.type
    if analysisType == 'HLA':
        from dbs_analysis.sequences import HLA_DBS as DBS
    elif analysisType == 'WFA':
        from dbs_analysis.sequences import WFA_DBS as DBS
    else:
        # without this DBS would be unbound and fail later with a NameError
        raise ValueError('Unknown analysis type ' + repr(analysisType) + ", expected 'HLA' or 'WFA'.")

    if self.logfile: self.logfile.write('Generating barcode fastq ...\n')

    #
    # setting initial values
    #
    uniqBarcodeSequences = {}  # barcode sequence -> list of read pair ids
    barcodesByReadCount = {}   # number of read pairs -> list of barcode sequences
    qualities = {}             # read pair id -> barcode quality string
    barcodeCounter = 0
    readPairHasBarcodeCounter = 0
    base_frequencies = [{'A': 0, 'T': 0, 'G': 0, 'C': 0} for i in xrange(len(DBS))]

    #
    # load the barcodes of all read pairs
    #
    if self.logfile: self.logfile.write('Loading read pairs ...\n')
    progress = misc.Progress(self.analysisfolder.results.totalReadCount, logfile=self.logfile, unit='reads-loaded-from-db', mem=True)
    database = self.analysisfolder.database
    with progress:
        database.getConnection()
        try:
            for pairid, barcodeSequence, qual in database.c.execute('SELECT id, dbsSeq, dbsQual FROM reads'):
                pairid = int(pairid)
                if barcodeSequence:
                    readPairHasBarcodeCounter += 1
                    qualities[pairid] = qual
                    uniqBarcodeSequences.setdefault(barcodeSequence, []).append(pairid)
                    for position, base in enumerate(barcodeSequence):
                        base_frequencies[position][base] += 1
                progress.update()
        finally:
            # close the connection when done reading, following the
            # getConnection/commitAndClose pairing used for this database object
            database.commitAndClose()
    if self.logfile: self.logfile.write('Done.\n')
    self.analysisfolder.results.setResult('uniqueBarcodeSequences', len(uniqBarcodeSequences))
    if self.logfile: self.logfile.write(str(self.analysisfolder.results.uniqueBarcodeSequences) + ' uniq barcode sequences found within the read pair population.\n')

    #
    # print base frequencies for raw barcodes to file
    #
    with open(self.analysisfolder.dataPath + '/rawBarcodeBaseFreq.dict.txt', 'w') as outfile:
        outfile.write(str(base_frequencies))

    #
    # group the barcodes by their read pair count and remember the sequence of each read pair
    #
    self.id2seq = {}
    if self.logfile: self.logfile.write('Sorting the barcodes by number of reads/sequence.\n')
    if self.logfile: self.logfile.write('Building the sorting dictionary ...\n')
    for barcode, idList in uniqBarcodeSequences.iteritems():
        barcodesByReadCount.setdefault(len(idList), []).append(barcode)
        for idnumber in idList:
            self.id2seq[idnumber] = barcode

    #
    # write the fastq, barcodes with the highest read pair count first
    #
    if self.logfile: self.logfile.write('Creating output ... \n')
    with open(self.analysisfolder.dbsfastq, 'w') as barcodeFastqFile:
        progress = misc.Progress(readPairHasBarcodeCounter, logfile=self.logfile, unit='reads-to-fastq', mem=True)
        with progress:
            # the read counts are unique dictionary keys, so reverse=True gives
            # exactly the descending order of a sorted-then-reversed list
            for count, barcodes in sorted(barcodesByReadCount.iteritems(), key=operator.itemgetter(0), reverse=True):
                for barcode in barcodes:
                    barcodeCounter += 1
                    for readPairCounter, readPairId in enumerate(uniqBarcodeSequences[barcode], 1):
                        barcodeFastqFile.write(
                            '@' + str(readPairId) +
                            ' bc=' + str(barcodeCounter) +
                            ' rp=' + str(readPairCounter) +
                            ' bctrp=' + str(count) + '\n' +
                            barcode + '\n+\n' +
                            qualities[readPairId] + '\n'
                        )
                        progress.update()

    return readPairHasBarcodeCounter
def parseBarcodeClusteringOutput(self, readPairsHasBarcode):
    """ Parse the output from the clustering program to find what read pair ids have been clustered together.

    Reads <dataPath>/clusteredBarcodeSequences.clstr. A line starting with
    '>' opens a new cluster, its id being the second space separated field.
    Every other non-empty line adds one read pair to the current cluster; the
    read pair id is the text between the first '>' and the following '.'.
    A member line starting with '0' is taken as the seed of the cluster: it
    must end with ' *', gets the identity 'seed' and defines the barcode
    sequence of the cluster through self.id2seq. All other member lines get
    the number in front of the final '%' as identity (float).

    self.id2seq (read pair id -> barcode sequence) must be available before
    calling this; generateBarcodeFastq builds it.

    readPairsHasBarcode -- number of read pairs with a barcode, only used as
        the total of the progress meter.

    Stores the results 'barcodeClusterCount' and 'singeltonBarcodeClusters'
    (clusters with at most one read pair).

    Returns a dict mapping clusterId to a dict with the keys
    'clusterReadCount', 'readPairs' (list of int), 'identities',
    'clusterBarcodeSequence' and 'clusterBarcodeQuality'. No consensus file
    is read, so 'clusterBarcodeQuality' is always None.
    """

    from dbs_analysis import misc

    #
    # initiate variables
    #
    barcodeClusters = {}
    singletonClusterIds = set()

    def storeCluster(finishedId, finishedCluster):
        # register a completely parsed cluster, at most one read pair makes it a singleton
        barcodeClusters[finishedId] = finishedCluster
        if finishedCluster['clusterReadCount'] <= 1:
            singletonClusterIds.add(finishedId)

    # the context manager makes sure the file is closed also if the parsing fails
    with open(self.analysisfolder.dataPath + '/clusteredBarcodeSequences.clstr') as clstrFile:

        if self.logfile: self.logfile.write('\nLoading barcode clusters and a barcode consesnsus sequences for each cluster ...\n')

        #
        # Load what readpairs are in each cluster
        #
        if self.logfile: self.logfile.write('\nLoading read pair to barcode cluster connections ...\n')
        progress = misc.Progress(readPairsHasBarcode, logfile=self.logfile, unit='reads-loaded', mem=True)
        clusterId = None
        barcodeCluster = None
        with progress:
            for line in clstrFile:
                line = line.rstrip()
                if not line:
                    # tolerate blank lines instead of failing on line[0]
                    continue

                if line[0] == '>':
                    # a new header closes the cluster collected so far
                    if barcodeCluster is not None:
                        storeCluster(clusterId, barcodeCluster)
                    clusterId = int(line.split(' ')[1])
                    barcodeCluster = {
                        'clusterReadCount': 0,
                        'readPairs': [],
                        'identities': [],
                        'clusterBarcodeSequence': None,
                        'clusterBarcodeQuality': None
                    }
                    continue

                readId = int(line.split('>')[1].split('.')[0])
                if line[0] == '0':
                    identity = 'seed'
                    assert line.split(' ')[-1] == '*', 'Error in file format of clstr file'
                    barcodeCluster['clusterBarcodeSequence'] = self.id2seq[readId]
                else:
                    identity = float(line.split(' ')[-1].split('/')[-1].split('%')[0])
                barcodeCluster['readPairs'].append(readId)
                barcodeCluster['identities'].append(identity)
                barcodeCluster['clusterReadCount'] += 1
                progress.update()

            # no header follows the last cluster, so it has to be stored here
            # or it would be missing from the result
            if barcodeCluster is not None:
                storeCluster(clusterId, barcodeCluster)

    # reported after the parsing so that the logged count is the real one
    totalClusterCount = len(barcodeClusters)
    if self.logfile: self.logfile.write('A total of ' + str(totalClusterCount) + ' clusters of barcode sequences were loaded into memory.\n')

    self.analysisfolder.results.setResult('barcodeClusterCount', totalClusterCount)
    self.analysisfolder.results.setResult('singeltonBarcodeClusters', len(singletonClusterIds))
    if self.logfile: self.logfile.write('All read pair to barcode cluster connections loaded.\n')

    return barcodeClusters