def writeClonoTypesToFile(clonoTypes, filename, top=100, overRepresented=True, stream=None):
    if exists(filename):
        printto(stream, "\tThe clonotype file " + os.path.basename(filename) + " was found!", LEVEL.WARN)
        return

    total = sum(clonoTypes.values()) * 1.0

    dic = defaultdict(list)
    t = 0
    for k in sorted(clonoTypes, key=clonoTypes.get, reverse=overRepresented):
        dic['Clonotype'].append(str(k))
        dic['Count'].append(clonoTypes[k])
        dic['Percentage (%)'].append(clonoTypes[k] / total * 100)
        t += 1
        if t >= top:
            break

    df = DataFrame(dic)
    # fixed format (fast read/write) sacrificing search
    # (should change to table format(t) if search is needed for clonotype clustering/comparison)
    # df.to_hdf(filename, "clonotype", mode="w", format="f")
    df.to_csv(filename + ".gz", mode="w", compression="gzip")
    printto(stream, "\tA clonotype file has been written to " + os.path.basename(filename))
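# Illustrative usage sketch (not part of the original module): writeClonoTypesToFile expects a
# mapping of clonotype (e.g. a CDR3 amino acid sequence) to read count. With the made-up counts
# below, it would produce "cdr3_clonotypes.csv.gz" containing the top 2 over-represented
# clonotypes together with their percentage of the total reads.
def _exampleWriteClonoTypes():
    counts = {"CARDYW": 120, "CAKGSYW": 45, "CTRDLW": 5}
    writeClonoTypesToFile(counts, "cdr3_clonotypes.csv", top=2, overRepresented=True)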
def flattenClonoTypeCountsDict(clonoTypes, stream=None):
    """
    reduces something of this structure:
        'IGHV1-3': {
            'FR1':  { 'FWGCGC': 12, 'EVILK': 1, ... }
            'CDR1': { 'FWGCGC': 12, 'EVILK': 1, ... }
        },
        'IGHV2-3': {
            'FR1':  { 'FWGCGC': 12, 'EVILK': 1, ... }
            'CDR1': { 'FWGCGC': 12, 'EVILK': 1, ... }
        },
        ...

    to this:
        {
            'FR1':  { 'FWGCGC': 24, 'EVILK': 2, ... }
            'CDR1': { 'FWGCGC': 24, 'EVILK': 2, ... }
        }

    :param clonoTypes: dict
        input nested dictionary
    :return: dict
        flattened dictionary
    """
    printto(stream, "Compressing clonotype table ... discarding IGV information ...")
    flattened = defaultdict(Counter)
    for geneName in clonoTypes:
        for region, counts in clonoTypes[geneName].items():
            flattened[region] += Counter(counts)
    printto(stream, "Finished compressing clonotype table")
    return flattened
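# Minimal sketch of the reduction performed by flattenClonoTypeCountsDict above, using the toy
# counts from its docstring (values are illustrative only): per-region counters are summed
# across germline genes.
def _exampleFlattenClonoTypes():
    nested = {
        'IGHV1-3': {'FR1': {'FWGCGC': 12, 'EVILK': 1}, 'CDR1': {'FWGCGC': 12, 'EVILK': 1}},
        'IGHV2-3': {'FR1': {'FWGCGC': 12, 'EVILK': 1}, 'CDR1': {'FWGCGC': 12, 'EVILK': 1}},
    }
    flat = flattenClonoTypeCountsDict(nested)
    assert flat['FR1']['FWGCGC'] == 24 and flat['CDR1']['EVILK'] == 2
    return flat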
def splitFastaFile(fastaFile, totalFiles, seqsPerFile, filesDir, prefix="", ext=".fasta", stream=None):
    if not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ext)) and \
            not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ".out")):
        # Split the FASTA file into multiple chunks
        printto(stream, "\tThe clones are distributed into multiple workers .. ")
        if not os.path.isdir(filesDir):
            os.makedirs(filesDir)
        if hasLargeMem():
            with safeOpen(fastaFile) as fp:
                recordsAll = SeqIO.to_dict(SeqIO.parse(fp, 'fasta'))
            queryIds = recordsAll.keys()
        else:
            # SeqIO.index can only open string filenames and they must be unzipped
            recordsAll = SeqIO.index(gunzip(fastaFile), 'fasta')
            # recordsAll.keys() is of type <dictionary-keyiterator object>, need to cast to list
            queryIds = list(recordsAll.keys())
        for i in range(totalFiles):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            records = map(lambda x: recordsAll[x], ids)
            out = os.path.join(filesDir, prefix + 'part' + str(i + 1) + ext)
            SeqIO.write(records, out, 'fasta')
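# Usage sketch (file names are hypothetical): distribute a FASTA file of 10 reads into 4 chunk
# files of at most 3 sequences each, named <filesDir>/sample_part1.fasta .. sample_part4.fasta.
# Nothing is re-split if the last expected chunk (or its ".out" result) already exists.
def _exampleSplitFasta():
    splitFastaFile("reads.fasta", totalFiles=4, seqsPerFile=3, filesDir="workers", prefix="sample_")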
def writeDAbundanceToFiles(stats, sampleName, outDir, stream=None):
    igdDist = Counter(stats["dgene"].tolist())
    igdDist = Counter(dict([(str(k), igdDist[k]) for k in igdDist]))
    if len(igdDist) == 0:
        printto(stream, "WARNING: No IGD hits were detected.", LEVEL.WARN)
        return

    # Write the counts of all IGDs into a text file
    # This isn't plotted by default, but we still write the csv file for it
    classes = sorted(igdDist, key=igdDist.get, reverse=True)
    total = sum(igdDist.values()) * 1.0
    writeCSV(os.path.join(outDir, sampleName + '_igd_dist_variant_level.csv'), "x,y\n", "{},{}\n",
             [(x, y) for x, y in zip(classes, map(lambda k: (igdDist[k] / total * 100), classes))])

    # Group IGDs based on the subfamilies (gene level) and then write into a text file
    igdDistSub = compressCountsGeneLevel(igdDist)
    plotDist(igdDistSub, sampleName, os.path.join(outDir, sampleName + '_igd_dist_gene_level.csv'),
             rotateLabels=False, vertical=False, title='IGD Abundance in Sample ' + sampleName,
             stream=stream)

    # Group IGDs based on the families and then write into a text file
    igdDistfam = compressCountsFamilyLevel(igdDistSub)
    # Plot the family level distribution
    plotDist(igdDistfam, sampleName, os.path.join(outDir, sampleName + '_igd_dist_family_level.csv'),
             title='IGD Abundance in Sample ' + sampleName, stream=stream)
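# Sketch of the three reporting levels used above (assumption: compressCountsGeneLevel collapses
# alleles, e.g. "IGHD3-10*01" and "IGHD3-10*02" -> "IGHD3-10", while compressCountsFamilyLevel
# further collapses genes into families, e.g. "IGHD3-10" -> "IGHD3"; counts are made up).
def _exampleDAbundanceLevels():
    variantLevel = Counter({"IGHD3-10*01": 7, "IGHD3-10*02": 3, "IGHD6-19*01": 5})
    geneLevel = compressCountsGeneLevel(variantLevel)    # e.g. {"IGHD3-10": 10, "IGHD6-19": 5}
    familyLevel = compressCountsFamilyLevel(geneLevel)   # e.g. {"IGHD3": 10, "IGHD6": 5}
    return geneLevel, familyLevel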
def writeJAbundanceToFiles(stats, sampleName, outDir, stream=None):
    igjDist = Counter(stats["jgene"].tolist())
    igjDist = dict([(str(k), igjDist[k]) for k in igjDist])
    if len(igjDist) == 0:
        printto(stream, "WARNING: No IGJ hits were detected.", LEVEL.WARN)
        return

    plotDist(igjDist, sampleName, os.path.join(outDir, sampleName + '_igj_dist_variant_level.csv'),
             rotateLabels=False, vertical=False, stream=stream)

    # Group IGJs based on the subfamilies (gene level) and then write into a text file
    igjDistSub = compressCountsGeneLevel(igjDist)
    # plotDist(igjDistSub, sampleName, outDir + sampleName +
    #          '_igj_dist_gene_level.csv', rotateLabels=False, vertical=False)

    # Group IGJs based on the families and then write into a text file
    igjDistfam = compressCountsFamilyLevel(igjDistSub)
    # Plot the family level distribution
    plotDist(igjDistfam, sampleName, os.path.join(outDir, sampleName + '_igj_dist_family_level.csv'),
             title='IGJ Abundance in Sample ' + sampleName, stream=stream)
def estimateDiversity(clonoTypes, flatClonoTypes, name, outDir, threads=2, segregate=False, stream=None):
    # create Germline gene level composition logos
    compositionLogos(name, clonoTypes, flatClonoTypes, outDir, threads=threads, detailed=segregate,
                     stream=stream)
    generateSeqMotifs(flatClonoTypes, name, outDir, threads=threads, stream=stream)
    generateRarefactionPlots(flatClonoTypes, name, outDir, threads=threads, stream=stream)
    printto(stream, "The diversity of the library is being estimated ... ")
def run(self): while True: nextTask = self.tasksQueue.get() # poison pill check if nextTask is None: printto(self.stream, "process has stopped ... " + self.name) self.exitQueue.put("exit") # self.terminate() break try: result = analyzeSmallFile(nextTask, self.chain, self.igBlastDB, self.seqType, self.threads, domainSystem=self.domainSystem, stream=self.stream) self.resultsQueue.put(result) except Exception: printto( self.stream, "An error occurred while processing " + os.path.basename(nextTask), LEVEL.EXCEPT) self.resultsQueue.put(None) continue return
def calcRSAOverlapOrder2(order1, sites, stream=None):
    """
    returns an n-by-n matrix of Jaccard indices, where n is len(sites)

    :param order1: dictionary of sets of ids
    :param sites: collection of enzymes
    :param stream: logging stream
    :return: n by n dataframe that has the form of a named (symmetric) matrix:
                    enz1    enz2    enz3
            enz1    1       0.3     0.4
            enz2    0.3     1       0.5
            enz3    0.4     0.5     1
    """
    printto(stream, "The 2nd order overlapping matrix is being calculated using Jaccard Index ... ")
    overlap = []
    for site1 in sites:
        overlap.append([])
        for site2 in sites:
            inter = len(order1[site1].intersection(order1[site2]))
            uni = len(order1[site1].union(order1[site2]))
            if uni != 0:
                overlap[-1].append(inter / uni)
            else:
                overlap[-1].append(1)
    overlap = DataFrame(overlap, columns=sites, index=sites)
    # overlap = linkage(overlap)
    return overlap
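# Worked Jaccard example for calcRSAOverlapOrder2 (read ids are made up): two enzymes sharing 2 of
# 4 distinct hit ids give an off-diagonal entry of |{read2, read3}| / |{read1..read4}| = 0.5
# (assuming the module uses true division); the diagonal entries are 1.
def _exampleJaccardOverlap():
    order1 = {"EcoRI": {"read1", "read2", "read3"}, "BamHI": {"read2", "read3", "read4"}}
    return calcRSAOverlapOrder2(order1, sites=["EcoRI", "BamHI"])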
def collectRSAResults(sitesInfo, resultsQueue, totalTasks, noSeqs, simple=True, stream=None): stats = initRSAStats(simple=simple) total = 0 while totalTasks: statsi = resultsQueue.get() if statsi is None: continue totalTasks -= 1 # -------- update relevant statistics ------- # # 1. total number of sequences that are cut by any sites at all (i.e. number of sequences that are cut by # *at least* one site) stats["seqsCutByAny"] += statsi["seqsCutByAny"] for site in sitesInfo.keys(): # 2. total number of "possible hits" of this 'site' on all sequences (note, multi-hits are counted) stats["siteHitsCount"][site] += statsi["siteHitsCount"][site] # 3. total number of "hits" of this 'site' on all sequences (note, multi-hits on one sequence are still # counted as one, not multi) - this is a "duplicate" field of siteHitsSeqsIDs, we could've taken the # length of sitHitsSeqsIDs, it would be equal to this. This is left here from legacy code. stats["siteHitSeqsCount"][site] += statsi["siteHitSeqsCount"][site] # 4. the ids of which this site has at least one match, this length of this value should be equal to # siteHitSeqsCount stats['siteHitsSeqsIDs'][site] = stats["siteHitsSeqsIDs"][ site].union(statsi["siteHitsSeqsIDs"][site]) if not simple: # these keys are only available for detailed RS analysis # 5. collect the total number of region where a match with this site has been registered # Counter object stats['hitRegion'][site] += statsi['hitRegion'][site] # 6. collect all the germline sequences that were recorded during a match with this site # list object stats['siteHitSeqsGermline'][site] += statsi[ 'siteHitSeqsGermline'][site] # 7. collect all the IGV sequences that were recorded during a match with this site # set object stats['siteHitsSeqsIGV'][site] = stats['siteHitsSeqsIGV'][ site].union(statsi['siteHitsSeqsIGV'][site]) total += statsi["total"] if total % 50000 == 0: printto( stream, '\t%d/%d records have been collected ... ' % (total, noSeqs)) printto(stream, '\t%d/%d sequences have been collected ... ' % (total, noSeqs)) assert total == noSeqs stats["total"] = noSeqs return stats
def readSeqFileIntoDict(seqFile, outDict=None, stream=None):
    printto(stream, "Processing {} ... loading sequences into dictionary".format(os.path.basename(seqFile)))
    fileFormat = detectFileFormat(seqFile)
    if outDict is None:
        outDict = {}
    with safeOpen(seqFile) as fp:
        for rec in SeqIO.parse(fp, fileFormat):
            outDict[rec.id] = str(rec.seq)
    return outDict
def extractProteinFrag(protein, start, end, offset=0, trimAtStop=False, stream=None):
    """
    Extract a protein fragment from a protein sequence based on DNA positions

    start and end are 1-based

    :param protein: protein (amino acid) sequence to slice from
    :param start: 1-based DNA start position (-1 if unavailable)
    :param end: 1-based DNA end position (-1 if unavailable)
    :param offset: value subtracted from start/end before converting them into codon positions
    :param trimAtStop: if True, truncate the fragment at the first stop codon ('*')
    :param stream: logging stream
    :return: the protein fragment, '' if the coordinates are empty/invalid, or None on error
    """
    if isnan(start) or isnan(end):
        return ''
    if start != -1 and end != -1 and end - start < 1:
        return ''
    # start and end are 1-based positions
    start = (start - offset) if start != -1 else start
    end = (end - offset) if end != -1 else end
    try:
        if start != -1:
            # s = int(round((start - 1.0 ) / 3))  # 0-based
            s = int(((start - 1) / 3))  # 0-based
        else:
            s = 0
        if end != -1:
            # e = int(round( (end*1.0) / 3))  # 1-based
            e = int(((end) / 3))  # 1-based
        else:
            e = len(protein)
        if (s + 1) < e:
            frag = protein[s:e]
        elif (s + 1) == e:
            frag = protein[s]
        else:
            return ''
        if trimAtStop and ('*' in frag):
            frag = frag[:frag.index('*')]
        return frag
    except Exception:
        printto(stream, "ERROR at Extract Protein Fragment {} {} {}".format(protein, start, end), LEVEL.ERR)
        return None
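# Worked example of the DNA-to-protein coordinate arithmetic in extractProteinFrag (values are
# illustrative): DNA positions 4..9 (1-based) cover codons 2 and 3, so s = (4 - 1) // 3 = 1 and
# e = 9 // 3 = 3, returning protein[1:3].
def _exampleExtractProteinFrag():
    assert extractProteinFrag("MKVLT", 4, 9) == "KV"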
def findMotifClusters(ighvMotifs, outputPrefix, stream=None): from TAMO.Clustering.UPGMA import UPGMA from TAMO.Clustering.UPGMA import DFUNC from TAMO.Clustering.UPGMA import print_tree_id # cluster using a variant of the UPGMA algorithm implemented in the TAMO package motifsFile = os.path.abspath(outputPrefix + '_motifs.tamo') if not exists(motifsFile): if len(ighvMotifs) > 0: pickle.dump(ighvMotifs, open(motifsFile, 'wb')) else: ighvMotifs = pickle.load(open(motifsFile, 'rb')) prefixName, sampleName = os.path.split(outputPrefix) dendrogramDirectory = os.path.join(prefixName, 'dendrograms') if not exists(dendrogramDirectory): os.makedirs(dendrogramDirectory) if len(ighvMotifs) > 0: groupedMotifs = defaultdict(list) for m in ighvMotifs: ighv = m.id.split('-')[0].split('/')[0] groupedMotifs[ighv].append(m) try: motifClustersFile = os.path.join(dendrogramDirectory, sampleName + '_pwm_clusters.txt') _old_stdout = sys.stdout sys.stdout = open(motifClustersFile, 'w') for ighv in groupedMotifs.keys(): newickdendrogramFile = os.path.join(dendrogramDirectory, sampleName + '_{}_newick.dnd'.format(ighv)) tree = UPGMA(groupedMotifs[ighv], DFUNC) print_tree_id(tree) saveNewickdendrogram(newickdendrogramFile, tree, sys.stdout, title=(ighv + " family clustering"), logger=stream) lists = groupedMotifs.values() tree = UPGMA([m for lst in lists for m in lst], DFUNC) print_tree_id(tree) newickdendrogramFile = os.path.join(dendrogramDirectory, sampleName + '_newick.dnd') saveNewickdendrogram(newickdendrogramFile, tree, sys.stdout, title="Clustering of all IGHV", logger=stream) sys.stdout.close() sys.stdout = _old_stdout printto(stream, "\tMotif clusters were written to " + os.path.basename(motifClustersFile)) except Exception as e: printto(stream, "Motifs couldn't be clustered! Error: {}".format(str(e)), LEVEL.ERR)
def writeClonotypeDiversityRegionAnalysis(clonoTypes, sampleName, outDir, stream=None):
    """
    For a given set of similar CDR3 clonotypes, a read may be classified as a different clonotype
    if the entire V region is considered. This writes the unique counts of regions other than the
    CDR3 to see if the clonotype would differ when the entire V region is considered. Consequently,
    it's possible to learn which region is (mostly) responsible for changing the clonotype if it
    was included.

    :param clonoTypes: DataFrame of clonotypes per read. Requires the CDRs and FRs columns
    :param sampleName: Sample name for output file
    :param outDir: Out directory for output file
    :param stream: debug stream
    :return: None. Produces an output gzipped csv file
    """
    fname = os.path.join(outDir, sampleName + "_clonotype_diversity_region_analysis.csv.gz")
    if os.path.exists(fname):
        printto(stream, "\t File found {}".format(fname), LEVEL.WARN)
        return

    # regions of analysis
    cols = ["cdr1", "cdr2", "fr1", "fr2", "fr3", "fr4"]

    def regionCounts(selectedRows):
        """
        returns a list of numbers that corresponds to the frequency of *UNIQUE* "CDR1", "CDR2",
        .. "FR4" (in the order of cols as defined above)

        :param selectedRows: this "DataFrame" of rows should have the same CDR3 region
        :return: a list of numbers, each representing the number of unique regions, in the order
                 of cols as defined above
        """
        return [str(len(set(selectedRows[region]))) for region in cols]

    # obtain all CDR3s
    cdr3s = set(clonoTypes['cdr3'])

    with gzip.open(fname, "wb") as fp:
        writeBuffer = ""
        # write csv header
        writeBuffer += "cdr3,count," + ','.join(cols) + "\n"
        # for each unique CDR3, find all rows(reads) that have the same CDR3
        for cdr3 in cdr3s:
            rows = clonoTypes[clonoTypes['cdr3'] == cdr3]
            writeBuffer += cdr3 + "," + str(len(rows)) + "," + ','.join(regionCounts(rows)) + '\n'
            if len(writeBuffer) > 4e9:
                fp.write(writeBuffer)
                writeBuffer = ""
        fp.write(writeBuffer)
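# Sketch of the per-CDR3 output produced above (toy data, sequences are made up): two reads share
# the CDR3 "ARDYW" but differ only in FR1, so the emitted row would read "ARDYW,2,1,1,2,1,1,1"
# (read count followed by the unique counts for cdr1, cdr2, fr1, fr2, fr3, fr4).
def _exampleRegionAnalysisInput():
    import pandas as pd
    return pd.DataFrame({
        "cdr3": ["ARDYW", "ARDYW"],
        "cdr1": ["GFTF", "GFTF"], "cdr2": ["ISSG", "ISSG"],
        "fr1": ["EVQLV", "QVQLV"], "fr2": ["WVRQA", "WVRQA"],
        "fr3": ["RFTIS", "RFTIS"], "fr4": ["WGQGT", "WGQGT"],
    })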
def run(self): printto(self.stream, self.name + " process is now ready to start a new job ...") while True: nextTask = self.tasksQueue.get() # poison pill check if nextTask is None: printto(self.stream, self.name + " process has stopped.") self.exitQueue.put("exit") break try: if not self.firstJobTaken: printto(self.stream, self.name + " process commenced a new task ... ") self.firstJobTaken = True qsRecs = [] seqsAll = [] recordLengths = defaultdict(_defaultdefaultInt) flags = {} for f in self.refineFlagNames: flags[f] = [] for (record, qsRec) in zip(nextTask[0], nextTask[1]): seqs = refineCloneAnnotation(qsRec, record, self.actualQstart, self.chain, self.fr4cut, self.trim5End, self.trim3End, flags, stream=self.stream) # out-of-frame clones are excluded if qsRec['v-jframe'] != 'Out-of-frame': stillInFrame = refineInFramePrediction( qsRec, record, self.actualQstart, flags, stream=self.stream) if stillInFrame: _recordFRLength(qsRec, recordLengths) # append the FR and CDR protein clones qsRec['queryid'] = record.id qsRecs.append( convertCloneRecordToOrderedList(qsRec, self.chain)) seqsAll.append(seqs) self.procCounter.increment(len(qsRecs)) self.resultsQueue.put((qsRecs, seqsAll, flags, recordLengths)) except Exception as e: printto(self.stream, "An error occurred while processing " + self.name, LEVEL.EXCEPT) self.resultsQueue.put(None) continue return
def _parsePrimerFile(primerFile, stream=None):
    if primerFile:
        primerids = []
        primerLengths = []
        primerSequences = []
        for rec in SeqIO.parse(primerFile, "fasta"):
            primerLengths.append(len(rec.seq))
            primerids.append(rec.id)
            primerSequences.append(str(rec.seq).upper())
        maxScores = calMaxIUPACAlignScores(primerSequences)
        if len(set(primerLengths)) != 1:
            printto(stream, "WARNING: Provided primer file {} has primers with different lengths. "
                            "Analysis assumes uniform primer length".format(primerFile), LEVEL.WARN)
        return max(primerLengths), zip(primerids, primerSequences, maxScores)
    return None, None
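# Sketch (hypothetical primer file): given a FASTA of equal-length primers, _parsePrimerFile
# returns the maximum primer length together with (id, sequence, max IUPAC alignment score)
# tuples; primers of unequal length only trigger the warning above, they are not dropped.
def _exampleParsePrimers(tmpFasta="example_primers.fasta"):
    with open(tmpFasta, "w") as fp:
        fp.write(">fwd_primer\nACGTGCTAGCTAGCTAGCTA\n>rev_primer\nTGCAGCTAGCTAGCTAGCTT\n")
    maxLength, primerInfo = _parsePrimerFile(tmpFasta)
    return maxLength, list(primerInfo)   # maxLength == 20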
def writeClonoTypesToFiles(clonoTypes, name, outDir, topClonotypes=100, stream=None): printto(stream, "Clonotype files are being written out ... ") cloneFolder = os.path.join(outDir, "clonotypes") if not os.path.exists(cloneFolder): os.makedirs(cloneFolder) for k in clonoTypes.keys(): # check if the required topClonotypes went overboard, if so, cap to the max length if topClonotypes != float('inf') and len(clonoTypes[k]) < topClonotypes: stringTopClonotypes = str(len(clonoTypes[k])) else: stringTopClonotypes = 'all' if topClonotypes == float('inf') else str(topClonotypes) # descending order filename = os.path.join(cloneFolder, name + ("_{}_clonotypes_{}_over.csv".format(k, stringTopClonotypes))) writeClonoTypesToFile(clonoTypes[k], filename, topClonotypes, overRepresented=True) # ascending order filename = os.path.join(cloneFolder, name + ("_{}_clonotypes_{}_under.csv".format(k, stringTopClonotypes))) writeClonoTypesToFile(clonoTypes[k], filename, topClonotypes, overRepresented=False)
def run(self):
    while True:
        nextTask = self.taskQueue.get()
        if nextTask is None:
            printto(self.stream, self.name + " process has stopped.")
            self.exitQueue.put("exit")
            break
        try:
            recs = []
            if not self.firstJobTaken:
                printto(self.stream, self.name + " process commenced a new task ... ")
                self.firstJobTaken = True
            for record, qsRec in zip(nextTask[0], nextTask[1]):
                qsRec['queryid'] = record.id
                recs.append(_matchClosestPrimer(qsRec, record, self.actualQstart, self.trim5end,
                                                self.trim3end, self.end5offset, self.fr4cut,
                                                self.maxPrimer5Length, self.maxPrimer3Length,
                                                self.primer5sequences, self.primer3sequences))
            self.resultsQueue.put(recs)
            self.procCounter.increment(len(recs))
        except Exception as e:
            printto(self.stream, "An error has occurred while processing " + self.name +
                    " with error {}".format(str(e)), LEVEL.EXCEPT)
            self.resultsQueue.put(None)
            continue
    return
def _collectPrimerResults(columns, queue, totalTasks, noSeqs, stream=None): processed = 0 cloneAnnot = [] totalUnexpected5 = totalUnexpected3 = 0 while totalTasks: result = queue.get() totalTasks -= 1 if result is None: continue for entry, unexpected5, unexpected3 in result: totalUnexpected5 += unexpected5 totalUnexpected3 += unexpected3 # put them as a list (in the ordering specified by 'columns') cloneAnnot.append([entry[col] for col in columns]) processed = len(cloneAnnot) if processed % 50000 == 0: printto( stream, "\t{:,}/{:,} records have been collected ... ".format( processed, noSeqs)) sys.stdout.flush() printto( stream, "\t{:,}/{:,} records have been collected ... ".format( processed, noSeqs)) printto( stream, "\tThere were {} unexpected 5' alignment and {} unexpected 3' alignment" .format(totalUnexpected5, totalUnexpected3), LEVEL.WARN) return cloneAnnot
def fastq2fasta(fastqFile, outputDir, stream=None): """ Converts a fastq file into fasta file. Fastq can be compressed if it was provided as such :param fastqFile: (un)compressed fastq file. If compressed, will leave original compressed untouched :param outputDir: Where to produce the new fasta file :param stream: debugging stream :return: fasta filename """ # FASTQ to FASTA # awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' my.fastq > my.fasta filename = os.path.basename(fastqFile) seqOut = os.path.join(outputDir, "seq") if not os.path.isdir(seqOut): os.makedirs(seqOut) # rename all fastq files to fasta, including gzipped files if filename.endswith(".gz"): filename = os.path.join( seqOut, filename.replace(filename.split('.')[-2] + ".gz", 'fasta')) fastqFile = gunzip(fastqFile) else: filename = os.path.join( seqOut, filename.replace(filename.split('.')[-1], 'fasta')) if exists(filename): printto(stream, "\tThe FASTA file was found!", LEVEL.WARN) return filename printto( stream, "\t" + os.path.basename(fastqFile) + " is being converted into FASTA ...") SeqIO.convert(fastqFile, 'fastq', filename, 'fasta') # not all systems have AWK by default (cough, windows) # command = ("awk 'NR % 4 == 1 {sub(\"@\", \"\", $0) ; print \">\" $0} NR % 4 == 2 " # "{print $0}' " + fastqFile + " > " + filename # ) # os.system(command) return filename
def alignListOfSeqs(signals, outDir, threads, name, stream=None):
    """
    perform multiple sequence alignment using CLUSTAL

    :param signals: list of sequence strings to align
    :param outDir: directory in which the temporary FASTA/alignment files are created
    :param threads: number of threads passed to CLUSTAL-OMEGA
    :param name: suffix used in the temporary file names
    :param stream: logging stream
    :return: list of aligned (gapped) sequence strings
    """
    L = list(map(len, signals))
    printto(stream, "\t\t{} sequences are being aligned using CLUSTAL-OMEGA (L in [{}, {}])... "
            .format(len(L), min(L), max(L)))
    tempSeq = os.path.join(outDir, "csl_temp_seq_" + name + ".fasta")
    tempAlign = tempSeq.replace('.fasta', '.aln')
    seqs = []
    for i in range(len(signals)):
        seqs.append(SeqRecord(Seq(signals[i]), id='seq' + str(i)))
    SeqIO.write(seqs, tempSeq, 'fasta')

    clustal = ShortOpts(CLUSTALOMEGA, i=quote(tempSeq), o=quote(tempAlign))\
        .append("--threads={} --outfmt=clustal".format(threads))
    # printto(stream, "Executing: " + str(clustal))
    # throw away stderr and stdout
    clustal(stdout=None, stderr=None)

    alignment = AlignIO.read(tempAlign, 'clustal')
    alignedSeq = []
    for rec in alignment:
        alignedSeq.append(str(rec.seq))
    os.remove(tempSeq)
    os.remove(tempAlign)
    return alignedSeq
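# For reference, the ShortOpts invocation above is roughly equivalent to running clustalo directly
# (paths are illustrative; this assumes CLUSTALOMEGA points at a clustalo binary):
#
#   clustalo -i csl_temp_seq_<name>.fasta -o csl_temp_seq_<name>.aln --threads=<threads> --outfmt=clustal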
def generateMotifs(seqGroups, align, outputPrefix, transSeq=False, extendAlphabet=False, clusterMotifs=False, protein=False, threads=2, stream=None): from TAMO.MotifTools import Motif ighvMotifs = [] if clusterMotifs and 'gene' in outputPrefix: findMotifClusters(ighvMotifs, outputPrefix, stream=stream) printto(stream, '\t\tPWMs, consensus and logos are being generated for {} motifs ... '.format(len(seqGroups))) pwmFile = open(outputPrefix + '_pwm.txt', 'w') consensusFile = open(outputPrefix + '_consensus.txt', 'w') logosFolder = outputPrefix + '_logos' if not os.path.exists(logosFolder): os.makedirs(logosFolder) # create the sequence alphabet: DNA or Protein alphabet = createAlphabet(align, transSeq, extendAlphabet, protein) groups = seqGroups.keys() groups.sort() for group in groups: filename = os.path.join(logosFolder, group.replace('/', '') + '.png') seqs = seqGroups[group] m = generateMotif(seqs, group, alphabet, filename, align, transSeq, protein, outDir=logosFolder, threads=threads, stream=stream) if m is None: # motif file found, no further work required return motifSeqs = m.instances pwm = m.counts.normalize(pseudocounts=None) # {'A':0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6} consensusMax = str(m.consensus) pwmFile.write('#{} {} sequences\n'.format(group, len(motifSeqs))) pwmFile.write(str(pwm)) consensusFile.write('>{} max_count\n'.format(group)) consensusFile.write(consensusMax + '\n') # print(str(m.anticonsensus)) # smallest values in the columns if not transSeq and not align and not protein: consensusIupac = str(m.degenerate_consensus) # print(consensusIupac) # IUPAC ambiguous nucleotides consensusFile.write('>{} degenerate\n'.format(group)) consensusFile.write(consensusIupac + '\n') pwmFile.flush() consensusFile.flush() gc.collect() if clusterMotifs and len(motifSeqs) > 10: motif = Motif(map(lambda x: str(x), motifSeqs), backgroundD={'A': 0.6, 'C': 0.4, 'G': 0.4, 'T': 0.6}, id=group) motif.addpseudocounts(0.1) ighvMotifs.append(motif) pwmFile.close() consensusFile.close() gc.collect() printto(stream, "\tPosition weight matrices are written to " + os.path.basename(outputPrefix + '_pwm.txt')) printto(stream, "\tConsensus sequences are written to " + os.path.basename(outputPrefix + '_consensus.txt')) if clusterMotifs: findMotifClusters(ighvMotifs, outputPrefix, stream=stream)
def saveNewickdendrogram(newickClusterFile, tree, stream, title="", logger=None): """ :param newickClusterFile: :param tree: UPGMA object :param stream: :param title: :param logger: :return: """ from TAMO.Clustering.UPGMA import create_tree_phylip desc = '' if not title else " for {} ".format(title) # get phylip newick syntax phylipTree = create_tree_phylip(tree) with open(newickClusterFile, 'w') as newickfp: newickfp.write(phylipTree) printto(logger, "Newick dendrogram{}written to ".format(desc) + os.path.basename(newickClusterFile)) # show ascii art phylipTree = Phylo.read(newickClusterFile, format='newick') try: print("\n\nASCII phylip tree{}:\n".format(desc), file=stream) Phylo.draw_ascii(phylipTree, file=stream) except ZeroDivisionError: # if the weights are 0 print("\t Not drawn because of 0 weights", file=stream) pass # plot dendrogram in matplotlib phylipTree.ladderize() fig, axes = plt.subplots(figsize=(8, 5)) Phylo.draw(phylipTree, do_show=False, axes=axes, show_confidence=True) axes.set_title(title) fig.savefig(newickClusterFile.replace('.dnd', '.png'), dpi=300) plt.close()
def printRefineFlags(flags, records, refineFlagNames, refineFlagMsgs, stream=None):
    # print statistics and a few of the flagged clones
    for f in refineFlagNames:
        if len(flags[f]) > 0:
            printto(stream, refineFlagMsgs[f].format(len(flags[f])), LEVEL.INFO)
            examples = random.choice(range(len(flags[f])), min(3, len(flags[f])), replace=False)
            for i in examples:
                printto(stream, ">" + flags[f][i], LEVEL.INFO)
                printto(stream, str(records[flags[f][i]].seq), LEVEL.INFO)
def collectRefineResults(resultsQueue, totalTasks, noSeqs, refineFlagNames, stream=None): total = 0 cloneAnnot = [] transSeqs = [] frameworkLengths = defaultdict(_defaultCounter) flags = {} for f in refineFlagNames: flags[f] = [] while totalTasks: result = resultsQueue.get() totalTasks -= 1 if result is None: continue qsRecsOrdered, seqs, flagsi, recordLengths = result # convert dict to Counter object for keys, regions in recordLengths.items(): for region in regions: frameworkLengths[keys][region] += Counter(recordLengths[keys][region]) # update relevant annotation fields cloneAnnot += qsRecsOrdered transSeqs += seqs # update flags for f in refineFlagNames: flags[f] += flagsi[f] total += len(qsRecsOrdered) if total % 50000 == 0: printto(stream, '\t{}/{} records have been collected ... '.format(total, noSeqs)) printto(stream, '\t{}/{} records have been collected ... '.format(total, noSeqs)) return cloneAnnot, transSeqs, flags, frameworkLengths
def run(self): printto(self.stream, self.name + " process is now ready to start a new job ...") while True: nextTask = self.tasksQueue.get() if nextTask is None: printto(self.stream, self.name + " process has stopped.") self.exitQueue.put("exit") break try: if self.simpleScan: self.runSimple(nextTask) else: self.runDetailed(nextTask) except Exception as e: printto( self.stream, "An error occurred while processing " + self.name + " error: {}".format(str(e)), LEVEL.ERR) self.resultsQueue.put(None) continue return
def loadRestrictionSites(sitesFile, stream=None):
    """
    given a whitespace separated file containing 2 columns, return a dictionary of restriction
    enzyme names to a regex-translated sequence. Ignores all lines that start with "#"

    :param sitesFile: file with 2 cols, enzyme <ws> seq. Any line that *starts* with # will be ignored
    :param stream: logging stream
    :return: dictionary of enzyme to precompiled regex mapping, for example:
            {
                "ENZYME1": re.compile("AC[GT]..A")   # assuming ENZYME1's IUPAC sequence was "ACKNNA"
            }
    """
    with open(sitesFile) as fp:
        sites = {}
        for line in fp:
            line = line.strip()
            if line and not line.startswith("#"):
                try:
                    enzyme, seq = line.split()
                    if enzyme in sites:
                        printto(stream, enzyme + " is duplicated, the older enzyme sequence {} "
                                .format(sites[enzyme]) + "will be overridden.", LEVEL.WARN)
                    sites[enzyme] = re.compile(replaceIUPACLetters(str(seq).upper().strip()))
                except Exception as e:
                    printto(stream, "Offending line: {}, {}".format(line, line.split()), LEVEL.EXCEPT)
                    raise e
    printto(stream, "Restriction sites have been loaded")
    return sites
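# Sketch of the expected sites file layout and the resulting patterns (enzyme names and sequences
# are illustrative; the IUPAC expansion is delegated to replaceIUPACLetters, assumed to map e.g.
# N -> "." and K -> "[GT]"):
#
#   # enzyme        recognition site (IUPAC)
#   BamHI           GGATCC
#   HypotheticalI   ACKNNA
#
# loadRestrictionSites("sites.txt") would then return something like
#   {"BamHI": re.compile("GGATCC"), "HypotheticalI": re.compile("AC[GT]..A")}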
def extractUpstreamSeqs(cloneAnnot, recordFile, upstream, upstreamFile, stream=None): """ extract the upstream DNA sequences and write them into a FASTA file named upstreamFile :param cloneAnnot: cloneAnnot DataFrame :param recordFile: raw record file (string) :param upstream: list of 2 numbers, denoting [start, end] inclusive in 1-index. np.Inf is also allowed for end value :param upstreamFile: output FASTA filename :param stream: logging stream object :return: None """ printto(stream, "\tExtracting the upstream sequences ... ") # alignments with - strand revAlign = 0 # num. seqs with trimmed beginning (vstart > 3) trimmedBegin = 0 # num. seqs with sequences shorter than expected upstream length (len(seq) < expectLength) # @see expectLength trimmedUpstream = 0 # excluded sequences because end <= 1 noSeq = 0 # num. processed sequences procSeqs = 0 # buffer to hold sequences before flushing into file recordsBuffer = [] # max buffer size allowed maxBufferSize = int(10.0**5) / 2 # expected upstream length = expectLegth (end - start + 1) where start,end are both 1-index expectLength = upstream[1] - upstream[0] + 1 queryIds = cloneAnnot.index # NOTE: SeqIO.index can only index string filenames and it has to be unzipped _, ext = os.path.splitext(os.path.basename(recordFile.rstrip(os.path.sep))) records = SeqIO.index(gunzip(recordFile), ext.lstrip('.')) with open(upstreamFile, 'w') as fp: for id_ in queryIds: record = records[id_] qsRec = cloneAnnot.loc[record.id] if qsRec.strand != 'forward': revAlign += 1 record.seq = record.seq.reverse_complement() if qsRec.vstart <= 3: end = qsRec.vqstart - upstream[0] - qsRec.vstart + 1 if end <= 1: noSeq += 1 else: start = max(1, qsRec.vqstart - upstream[1] - qsRec.vstart + 1) record.seq = record.seq[int(start - 1):int(end)] if expectLength != Inf and len(record.seq) < expectLength: trimmedUpstream += 1 record.id = record.id + _UPSTREAM_SEQ_FILE_SEP + qsRec.vgene record.description = "" recordsBuffer.append(record) procSeqs += 1 if procSeqs % maxBufferSize == 0: printto( stream, '{}/{} sequences have been processed ... '.format( procSeqs, len(queryIds))) SeqIO.write(recordsBuffer, fp, 'fasta') recordsBuffer = [] else: trimmedBegin += 1 # flush remaining sequences if len(recordsBuffer) > 0: printto( stream, '{}/{} sequences have been processed ... '.format( procSeqs, len(queryIds))) SeqIO.write(recordsBuffer, fp, 'fasta') if revAlign > 0: printto( stream, "\t\t\t{} sequences are in reversed alignment ... ".format( revAlign), LEVEL.INFO) if trimmedBegin > 0: printto( stream, "\t\t\tThe query sequence is not aligned within 3bp of the IGV start " "position ... {} found and excluded!".format(trimmedBegin), LEVEL.WARN) if trimmedUpstream > 0: printto( stream, "\t\t\tUpstream sequences shorter than the expected length are detected ... {} found" .format(trimmedUpstream), LEVEL.WARN) if noSeq > 0: printto( stream, "\t\t\tNo upstream sequence can be extracted (too short) for {} sequences." .format(noSeq), LEVEL.WARN) gc.collect()
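# Worked example of the upstream-window arithmetic above (values are illustrative): with
# upstream=[1, 60], a read whose V alignment starts at query position vqstart=61 and at germline
# position vstart=1 yields
#   end   = 61 - 1  - 1 + 1 = 60
#   start = max(1, 61 - 60 - 1 + 1) = 1
# so record.seq[0:60] keeps exactly the 60 bases immediately 5' of the V gene start; reads with
# vstart > 3 are counted in trimmedBegin and skipped.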
def findUpstreamMotifs(upstreamFile, sampleName, outAuxDir, outResDir, expectLength, level, startCodon=True, type='secsig', clusterMotifs=False, threads=2, stream=None): """ finds and visualizes motifs from the sequences provided in upstreamFile :param upstreamFile: string path to FASTA file containing upstream sequences :param sampleName: string name to refer the sample as :param outAuxDir: string path to aux directory :param outResDir: string path to result directory :param expectLength: tuple or list index-able of length 2 denoting start and end. If start == end, this implies that the analysis should be conducted ONLY on sequences with length == start == end, the rest are ignored. :param level: string one of 'gene', 'family' or 'variant' :param startCodon: bool whether or not to segregate sequences with start codon :param type: string one of upstream analysis types: '5utr' or 'secsig' :param clusterMotifs: bool whether or not to cluster sequences using TAMO :param threads: int number of threads to use :param stream: stream logging stream :return: None """ from abseqPy.IgRepAuxiliary.seqUtils import generateMotifs if level == 'variant': # single argument identity function compressor = lambda signals: signals elif level == 'gene': compressor = compressSeqGeneLevel elif level == 'family': compressor = compressSeqFamilyLevel else: raise ValueError( "Unknown level {} requested, accepted values are family, gene, or variant" .format(level)) if type not in ['secsig', '5utr']: raise ValueError( "Unknown parameter type={}, expected one of 'secsig', '5utr'". format(type)) # output files always have this format: <sampleName>_<type>_<exp[0]>_<exp[1]>_* OUTPUT_FILE_PACKET = (sampleName, type, expectLength[0], expectLength[1]) # only analyze motifs of secretion signals that have exactly length == expectLength[0] == expectLength[1] EXACT_LENGTH = expectLength[0] == expectLength[1] validSeqFile = os.path.join( outAuxDir, _VALID_SEQ_FASTA_TEMPLATE.format(*OUTPUT_FILE_PACKET)) faultySeqFile = os.path.join( outAuxDir, _FAULTY_SEQ_FASTA_TEMPLATE.format(*OUTPUT_FILE_PACKET)) noStartCodonFile = os.path.join( outAuxDir, _STARTCOD_SEQ_FASTA_TEMPLATE.format(*OUTPUT_FILE_PACKET)) allFiles = [validSeqFile, faultySeqFile, noStartCodonFile] if all(map(lambda x: os.path.exists(x), allFiles)): printto( stream, "Sequences were already analyzed at {}, loading from files instead ... " + ' '.join(allFiles), LEVEL.WARN) ighvSignals, faultySeq, noStartCodonSeq = _loadIGVSeqsFromFasta(validSeqFile),\ _loadIGVSeqsFromFasta(faultySeqFile),\ _loadIGVSeqsFromFasta(noStartCodonFile) else: printto(stream, "Sequences are being analyzed ... 
") ighvSignals, faultySeq, noStartCodonSeq = collectUpstreamSeqs( upstreamFile, sampleName, expectLength, outResDir, outAuxDir, startCodon, type, stream=stream) ighvSignals = compressor(ighvSignals) generateMotifs(ighvSignals, align=(expectLength[0] < expectLength[1]), outputPrefix=os.path.join( outResDir, ("{}_{}_{:.0f}_{:.0f}_dna_" + level).format(*OUTPUT_FILE_PACKET)), clusterMotifs=clusterMotifs, threads=threads, stream=stream) if EXACT_LENGTH and type == 'secsig': faultySeq = compressor(faultySeq) generateMotifs(faultySeq, align=True, outputPrefix=os.path.join( outResDir, ("{}_{}_{:.0f}_{:.0f}_faulty_" + level).format(*OUTPUT_FILE_PACKET)), transSeq=False, extendAlphabet=True, clusterMotifs=clusterMotifs, threads=threads, stream=stream) noStartCodonSeq = compressor(noStartCodonSeq) generateMotifs(noStartCodonSeq, align=True, outputPrefix=os.path.join( outResDir, ("{}_{}_{:.0f}_{:.0f}_untranslated_" + level).format(*OUTPUT_FILE_PACKET)), transSeq=False, extendAlphabet=True, clusterMotifs=clusterMotifs, threads=threads, stream=stream) generateMotifs(ighvSignals, align=False, outputPrefix=os.path.join( outResDir, ("{}_{}_{:.0f}_{:.0f}_protein_" + level).format(*OUTPUT_FILE_PACKET)), transSeq=True, clusterMotifs=clusterMotifs, threads=threads, stream=stream)
def collectUpstreamSeqs(upstreamFile, sampleName, expectLength, outResDir, outAuxDir, startCodon=True, type='secsig', plotDist=True, stream=None): """ segregates and plots upstream file sequences. They are segregated as sequences with no start codon, faulty sequences (stop codon post translation if type == secsig or X or N nucleotides in the sequence), and valid sequences. :param upstreamFile: string upstream FASTA file :param sampleName: string name of sampel :param expectLength: tuple or list index-able of length 2 denoting start and end :param outResDir: string name of result output directory :param outAuxDir: string name of auxiliary output directory :param startCodon: bool whether or not to care about start codons during segregation :param type: string either 'secsig' or '5utr' :param plotDist: bool whether or not to also save a txt and png file denoting the distribution of segregated sequences :param stream: stream debugging stream :return: tuple (ighvValidSignals : dict, faultySeqs : dict and noStartCodonSeqs: dict) """ if type not in ['secsig', '5utr']: raise ValueError( "Unknown parameter type={}, expected one of 'secsig', '5utr'". format(type)) printto( stream, "\tSequences between {} and {} are being extracted ... ".format( expectLength[0], expectLength[1])) START_CODON = "ATG" # valid sequences ighvSignals = defaultdict(list) ighvSignalsCounts = defaultdict(int) # no start codons ighvSignalsNoATG = defaultdict(list) noStartCodonCounts = defaultdict(int) # faulty translations faultyTrans = defaultdict(list) faultyTransCounts = defaultdict(int) ignoredSeqs = 0 records = SeqIO.index(gunzip(upstreamFile), 'fasta') for id_ in records: rec = records[id_] ighv = rec.id.split(_UPSTREAM_SEQ_FILE_SEP)[1] seq = rec.seq if expectLength[0] <= len(rec) <= expectLength[1]: if not startCodon or START_CODON in seq: if type == 'secsig': seq = seq[:len(seq) - (len(seq) % 3)].translate(to_stop=False)[1:] if 'X' in seq or '*' in seq: faultyTrans[ighv].append(rec) faultyTransCounts[ighv] += 1 elif 'N' not in rec.seq: ighvSignals[ighv].append(rec) ighvSignalsCounts[ighv] += 1 else: printto(stream, "Ignored: " + str(rec.seq) + ' ' + str(seq)) if type == 'secsig': faultyTrans[ighv].append(rec) faultyTransCounts[ighv] += 1 elif startCodon: # START_CODON not in seq ighvSignalsNoATG[ighv].append(rec) noStartCodonCounts[ighv] += 1 else: ignoredSeqs += 1 if ignoredSeqs: printto( stream, "\tThere are {} sequences that were ignored because the length of the provided upstream" "sequences were not {} <= length(upstream_seqs) <= {}".format( ignoredSeqs, *expectLength), LEVEL.WARN) if sum(ighvSignalsCounts.values()): flattenRecs = list(itertools.chain.from_iterable(ighvSignals.values())) assert len(flattenRecs) == sum(ighvSignalsCounts.values()) title = 'Valid Secretion Signals' if type == 'secsig' else "Valid 5'-UTRs" printto( stream, "\tThere are {} {} within expected " "length ({} to {}) and startCodon={}".format( sum(ighvSignalsCounts.values()), title, expectLength[0], expectLength[1], startCodon), LEVEL.INFO) validSeqFile = os.path.join( outAuxDir, _VALID_SEQ_FASTA_TEMPLATE.format(sampleName, type, *expectLength)) SeqIO.write(flattenRecs, validSeqFile, 'fasta') if plotDist: writeCountsCategoriesToFile( ighvSignalsCounts, sampleName, os.path.join( outResDir, "{}_{}_{:.0f}_{:.0f}_valid_".format( sampleName, type, expectLength[0], expectLength[1])), title) if sum(faultyTransCounts.values()): flattenRecs = (itertools.chain.from_iterable(faultyTrans.values())) assert len(flattenRecs) == 
sum(faultyTransCounts.values()) faultySeqFile = os.path.join( outAuxDir, _FAULTY_SEQ_FASTA_TEMPLATE.format(sampleName, type, *expectLength)) SeqIO.write(flattenRecs, faultySeqFile, 'fasta') if plotDist: writeCountsCategoriesToFile( faultyTransCounts, sampleName, os.path.join( outResDir, "{}_{}_{:.0f}_{:.0f}_faulty_".format( sampleName, type, *expectLength)), 'Faulty Translations') printto( stream, "\tTotal faulty secretion signals is {} (excluded)".format( len(flattenRecs)), LEVEL.INFO) for i in random.choice(range(len(flattenRecs)), min(5, len(flattenRecs)), replace=False): sequence = flattenRecs[i].seq printto( stream, "\t{}\n\tTranslated:{}".format( sequence, sequence[:len(sequence) - (len(sequence) % 3)].translate())) if sum(noStartCodonCounts.values()): flattenRecs = list( itertools.chain.from_iterable(ighvSignalsNoATG.values())) assert len(flattenRecs) == sum(noStartCodonCounts.values()) noStartCodonFile = os.path.join( outAuxDir, _STARTCOD_SEQ_FASTA_TEMPLATE.format(sampleName, type, *expectLength)) SeqIO.write(flattenRecs, noStartCodonFile, 'fasta') if plotDist: writeCountsCategoriesToFile( noStartCodonCounts, sampleName, os.path.join( outResDir, "{}_{}_{:.0f}_{:.0f}_no_atg_".format( sampleName, type, *expectLength)), "Upstream sequences without start codon") printto( stream, "\tThere is no ATG codon in {} sequences (excluded)".format( len(flattenRecs)), LEVEL.INFO) for i in random.choice(range(len(flattenRecs)), min(5, len(flattenRecs)), replace=False): printto(stream, "\t{}".format(flattenRecs[i].seq)) # the output of each ighv key's value should be a list of strings, not SeqRecord object for k in ighvSignals: ighvSignals[k] = map(lambda x: str(x.seq), ighvSignals[k]) for k in faultyTrans: faultyTrans[k] = map(lambda x: str(x.seq), faultyTrans[k]) for k in ighvSignalsNoATG: ighvSignalsNoATG[k] = map(lambda x: str(x.seq), ighvSignalsNoATG[k]) return ighvSignals, faultyTrans, ighvSignalsNoATG
def extractCDRInfo(blastOutput, chain, stream=None): # Extract the top hits printto(stream, '\tExtracting top hit tables ... ' + os.path.basename(blastOutput)) # process igblast output and extract top hit cloneAnnot = [] filteredIDs = [] line = "" warning = False # RE: parsing IGBLAST: # VDJ junction details MAY give N/A instead of just missing: # eg: # V-(D)-J junction details based on top germline gene matches # (V end, V-D junction, D region, D-J junction, J start). Note that possible # overlapping nucleotides at VDJ junction (i.e, nucleotides that could # be assigned to either rearranging gene) are indicated in parentheses (i.e., (TACT)) but are # not included under the V, D, or J gene itself # GGGTC TGTTCACGAGGGCATCTGTGTCCTGTTTTTAGGTTCTCCTCCC TTTTGAC N/A N/A # it also has a variable number of hits (depending on presence of region) # EG: # V-(D)-J junction details based on top germline gene matches (V end, V-J junction, J start). Note that possible overlapping nucleotides at VDJ junction (i.e, nucleotides that could be assigned to either rearranging gene) are indicated in parentheses (i.e., (TACT)) but are not included under the V, D, or J gene itself # CCTCT N/A GGTGT with open(blastOutput) as blast: while True: try: if not line.startswith('# Query'): line = blast.readline() if not line: break continue cloneRecord = createCloneRecord(chain) cloneRecord['queryid'] = line.split()[2].strip() # parse V-(D)-J rearrangement line = blast.readline() while (line and not line.startswith('# Query') and not line.startswith('# V-(D)-J rearrangement')): line = blast.readline() if not line: filteredIDs.append(cloneRecord['queryid']) break if line.startswith('# Query'): filteredIDs.append(cloneRecord['queryid']) continue line = blast.readline().strip().split('\t') cloneRecord[ 'strand'] = 'forward' if line[-1] == '+' else 'reversed' # print line, cloneRecord['strand'] # sys.exit() # XXX: the or len(line) == 8 may happen to light chains too, when there is # a rogue D-gene that was a hit. It then follows heavy chain's indexing if (chain == 'hv') or len(line) == 8: cloneRecord['stopcodon'] = line[4] cloneRecord['v-jframe'] = line[5] cloneRecord['vgene'] = line[0].split(',')[0] cloneRecord['dgene'] = line[1].split(',')[0] cloneRecord['jgene'] = line[2].split(',')[0] cloneRecord['chain'] = line[3] else: cloneRecord['stopcodon'] = line[3] cloneRecord['v-jframe'] = line[4] cloneRecord['vgene'] = line[0].split(',')[0] cloneRecord['jgene'] = line[1].split(',')[0] cloneRecord['chain'] = line[2] line = ' '.join(line) # Parse Sub-region analysis and ignore it if there's no CDR3 hit by IGBLAST while line and \ not line.startswith("# Alignment") and \ not line.startswith("# Sub-region") and \ not line.startswith("# Query"): line = blast.readline() # EOF if not line: filteredIDs.append(cloneRecord['queryid']) break # there's no # Sub-region, nor is there # Alignment. 
if line.startswith("# Query"): filteredIDs.append(cloneRecord['queryid']) continue # this implies that IGBLAST successfully classified a CDR3 sequence if line.startswith("# Sub-region"): line = blast.readline() subregionData = line.split() assert subregionData[0] == 'CDR3' if len(subregionData) >= 3 and subregionData[-1].isdigit( ) and subregionData[-2].isdigit(): cloneRecord['cdr3.start'] = to_int(subregionData[-2]) cloneRecord['cdr3.end'] = to_int(subregionData[-1]) # true FR3 end is at position cdr3.start - 1 # (the alignment table only tells us the FR3 germline) # but since fr3.start always begins in the germline, there's no special field for that # and is assumed that fr3.start == fr3g.start cloneRecord['fr3.end'] = cloneRecord['cdr3.start'] - 1 # parse Alignment Summary between query and top germline V gene while (line and not line.startswith('# Query') and not line.startswith("# Alignment")): line = blast.readline() if not line: filteredIDs.append(cloneRecord['queryid']) break if line.startswith('# Query'): filteredIDs.append(cloneRecord['queryid']) continue line = blast.readline() for i in range(1, 4): if line.lower().startswith('fr' + str(i)): line = line.split() cloneRecord['fr%d.start' % i] = to_int(line[1]) cloneRecord['fr%d%s.end' % (i, 'g' if i == 3 else '')] = to_int( line[2]) cloneRecord['fr%d%s.mismatches' % (i, 'g' if i == 3 else '')] = to_int( line[5]) cloneRecord['fr%d%s.gaps' % (i, 'g' if i == 3 else '')] = to_int( line[6]) line = blast.readline() if line.lower().startswith('cdr' + str(i)): # IgBLAST has parenthesis beside CDR3 germline start/end, depending on the domain system # if domain_system == imgt, (germline) # if domain_system == kabat, (V gene only) line = line.replace('(germline)', '').replace('(V gene only)', '').split() cloneRecord['cdr%d%s.start' % (i, 'g' if i == 3 else '')] = to_int( line[1]) cloneRecord['cdr%d%s.end' % (i, 'g' if i == 3 else '')] = to_int( line[2]) cloneRecord['cdr%d%s.mismatches' % (i, 'g' if i == 3 else '')] = to_int( line[5]) cloneRecord['cdr%d%s.gaps' % (i, 'g' if i == 3 else '')] = to_int( line[6]) line = blast.readline() # if the CDR3 region wasn't identified by IgBlast, we can't get FR3 end, so we fallback to # FR3 germline end. Since CDR3.start and CDR3.end isn't really used unitl the refinement process, # we ignore the fallback options for them. 
if np.isnan(cloneRecord['fr3.end']): cloneRecord['fr3.end'] = cloneRecord['fr3g.end'] # parse alignment information between query and V, D and J genes while (line and not line.startswith('# Query') and not line.startswith("# Fields")): line = blast.readline() if not line: filteredIDs.append(cloneRecord['queryid']) break if line.startswith('# Query'): filteredIDs.append(cloneRecord['queryid']) continue line = blast.readline() noHits = to_int(line.split()[1]) if noHits == 0: filteredIDs.append(cloneRecord['queryid']) continue # retrieve the top hit # parse the top V gene info line = blast.readline() if not line.startswith("V"): filteredIDs.append(cloneRecord['queryid']) continue hit = line.split() score = float(hit[-1]) align = to_int(hit[4]) sStart = to_int(hit[10]) cloneRecord['identity'] = float(hit[3]) cloneRecord['alignlen'] = align cloneRecord['bitscore'] = score cloneRecord['vqstart'] = to_int(hit[8]) cloneRecord['vstart'] = sStart cloneRecord['vmismatches'] = to_int(hit[5]) cloneRecord['vgaps'] = to_int(hit[7]) # parse the top D gene info line = blast.readline() while (line and not line.startswith("# Query") and not line.startswith("D") and not line.startswith("J")): line = blast.readline() if not line: cloneAnnot.append( convertCloneRecordToOrderedList(cloneRecord, chain)) break if line.startswith('# Query'): cloneAnnot.append( convertCloneRecordToOrderedList(cloneRecord, chain)) continue if line.startswith("D"): hit = line.split() cloneRecord['dqstart'] = to_int(hit[8]) cloneRecord['dqend'] = to_int(hit[9]) cloneRecord['dstart'] = to_int(hit[10]) cloneRecord['dmismatches'] = to_int(hit[5]) cloneRecord['dgaps'] = to_int(hit[7]) # parse the top J gene info while (line and not line.startswith("# Query") and not line.startswith("J")): line = blast.readline() if not line: cloneAnnot.append( convertCloneRecordToOrderedList(cloneRecord, chain)) break if line.startswith('# Query'): cloneAnnot.append( convertCloneRecordToOrderedList(cloneRecord, chain)) continue if line.startswith("J"): hit = line.split() cloneRecord['jqstart'] = to_int(hit[8]) cloneRecord['jqend'] = to_int(hit[9]) cloneRecord['jstart'] = to_int(hit[10]) # jend is a little special, we need it for FR4 end deduction cloneRecord['jend'] = to_int(hit[11]) cloneRecord['jmismatches'] = to_int(hit[5]) cloneRecord['jgaps'] = to_int(hit[7]) cloneAnnot.append( convertCloneRecordToOrderedList(cloneRecord, chain)) except Exception: warning = True continue if len(cloneAnnot) > 0: # productive = no stop and in-frame # v-jframe: in-frame, out-of-frame, N/A (no J gene) # stopcodon: yes, no cloneAnnot = DataFrame(cloneAnnot, columns=getAnnotationFields(chain)) cloneAnnot.set_index('queryid', drop=True, inplace=True) else: cloneAnnot = DataFrame() if warning: printto( stream, "WARNING: something went wrong while parsing {}".format( blastOutput), LEVEL.WARN) return cloneAnnot, filteredIDs
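# Illustrative sketch (field values are made up): how a heavy-chain "V-(D)-J rearrangement
# summary" row from IgBLAST maps onto the clone record fields parsed in extractCDRInfo above.
# Only the positions actually consumed by the parser are shown.
def _exampleRearrangementSummaryRow():
    row = ["IGHV1-2*02,IGHV1-2*04", "IGHD3-10*01", "IGHJ4*02", "VH", "No", "In-frame", "Yes", "+"]
    return {
        "vgene": row[0].split(',')[0],   # only the top hit of a comma-separated list is kept
        "dgene": row[1].split(',')[0],
        "jgene": row[2].split(',')[0],
        "chain": row[3],
        "stopcodon": row[4],
        "v-jframe": row[5],
        "strand": "forward" if row[-1] == "+" else "reversed",
    }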