def splitFastaFile(fastaFile, totalFiles, seqsPerFile, filesDir, prefix="", ext=".fasta", stream=None):
    """
    Split a (possibly gzipped) FASTA file into `totalFiles` chunks of
    `seqsPerFile` sequences each, written into `filesDir` as
    <prefix>part<i><ext> for i in 1..totalFiles.

    The split is skipped entirely when the final chunk (or its processed
    ".out" counterpart) already exists, so reruns do not redo the work.

    :param fastaFile: path to the input FASTA file
    :param totalFiles: number of chunk files to produce
    :param seqsPerFile: number of sequences per chunk
    :param filesDir: output directory (created if missing)
    :param prefix: filename prefix for each chunk
    :param ext: filename extension for each chunk (default ".fasta")
    :param stream: optional logging stream passed to printto()
    :return: None
    """
    # Last chunk (or its ".out" result) already present => nothing to do.
    if not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ext)) and \
            not exists(os.path.join(filesDir, prefix + "part" + str(int(totalFiles)) + ".out")):
        # Split the FASTA file into multiple chunks
        printto(stream, "\tThe clones are distributed into multiple workers .. ")
        if not os.path.isdir(filesDir):
            os.makedirs(filesDir)
        if hasLargeMem():
            # Plenty of memory: load the whole file into a dict in one pass.
            with safeOpen(fastaFile) as fp:
                recordsAll = SeqIO.to_dict(SeqIO.parse(fp, 'fasta'))
        else:
            # SeqIO.index can only open string filenames and they must be unzipped
            recordsAll = SeqIO.index(gunzip(fastaFile), 'fasta')
        # Cast keys to a list in BOTH branches: keys() may be a view/iterator
        # (Py3 dict, SeqIO.index) and must support slicing below.
        queryIds = list(recordsAll.keys())
        for i in range(totalFiles):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            records = [recordsAll[x] for x in ids]
            out = os.path.join(filesDir, prefix + 'part' + str(i + 1) + ext)
            SeqIO.write(records, out, 'fasta')
def writeRefineFlags(flags, records, refineFlagNames, refineFlagMsgs, outDir, sampleName):
    """
    Write refinement-flag reports for a sample.

    Produces two files in `outDir`:
      * <sampleName>_refinement_flagged_summary.txt - one formatted message
        per non-empty flag, with the number of affected reads.
      * <sampleName>_refinement_flagged.txt - the same message as a header
        comment, followed by a FASTA-like dump (">id" then sequence) of
        every flagged read.

    :param flags: dict mapping flag name -> list of flagged read ids
    :param records: mapping of read id -> sequence record (must expose .seq)
    :param refineFlagNames: iterable of flag names, in report order
    :param refineFlagMsgs: dict mapping flag name -> format string taking the count
    :param outDir: output directory
    :param sampleName: sample name used as the filename prefix
    :return: None
    """
    # 8 MiB buffer (1 << 23 bytes) if the system has large enough memory
    # (-1 implies the default system buffer size)
    with open(os.path.join(outDir, sampleName + "_refinement_flagged.txt"), 'w',
              buffering=int(1 << 23) if hasLargeMem() else -1) as flaggedFp, \
            open(os.path.join(outDir, sampleName + "_refinement_flagged_summary.txt"), "w") as summaryFp:
        for f in refineFlagNames:
            flagged = flags[f]
            if len(flagged) > 0:
                # Format the message once; it is shared by both output files.
                msg = refineFlagMsgs[f].format(len(flagged))
                summaryFp.write(msg + "\n")
                flaggedFp.write("# " + msg + "\n")
                # FASTA-like dump of each flagged read id and its sequence.
                for readId in flagged:
                    flaggedFp.write(">" + readId + "\n")
                    flaggedFp.write(str(records[readId].seq) + "\n")
                flaggedFp.write("\n")
def compositionLogos(name, clonoTypes, flatClonoTypes, outDir, threads=2, detailed=False, stream=None):
    """
    :param name: string
                sample name

    :param clonoTypes: dict
                dict with key for each V germline, each having keys of FR / CDR region,
                which in turn, each having a value of Counter() where the AA sequences are tallied
                For example:
                    {
                        'IGHV3-3': {
                            'FR1': Counter({"FGWSG": 32, ...}),
                            'CDR1': Counter(...)
                        },
                        'IGHV2-1': { ... }
                    }

    :param flatClonoTypes: dict
                dict with keys of FR / CDR region, each having a value of Counter()
                where the AA sequences are tallied
                For example:
                    {
                        'FR1': Counter({"FGWSG": 32, ...}),
                        'CDR1': Counter(...)
                    }

    :param outDir: string
                output directory

    :param threads: int
                number of pool processes for the buffered (large-memory) path

    :param detailed: bool
                segregate composition logo plots based on IGV gene, FR and CDR
                (all genes combined) composition logos will still be plotted.
                (If set to false, only FR and CDR composition logos)

    :param stream: stream object
                output stream

    :return: None
    """
    logosFolder = os.path.join(outDir, 'composition_logos')
    createIfNot(logosFolder)
    printto(stream, "Generating composition logos ...")
    if detailed:
        argBuffer = []
        for vgerm in clonoTypes:
            # sorted() rather than in-place .sort(): dict.keys() is an
            # unsortable view object on Python 3.
            for region in sorted(clonoTypes[vgerm].keys()):
                if region == 'v':
                    continue
                clonoType = clonoTypes[vgerm][region]
                # Materialize as lists: fixes the key/value pairing and keeps
                # the buffered args picklable for the multiprocessing pool.
                seqs = list(clonoType.keys())
                weights = list(clonoType.values())
                regionDirectory = os.path.join(logosFolder, region.upper())
                createIfNot(regionDirectory)
                filename = os.path.join(regionDirectory, name + "_{}_cumulative_logo.csv"
                                        .format(vgerm.replace(os.path.sep, '_')))
                if hasLargeMem():
                    # Defer to the process pool below.
                    printto(stream, "\tbuffering {} for {}".format(region, vgerm))
                    argBuffer.append((seqs, weights, region, filename))
                else:
                    printto(stream, "\tgenerating {} for {}".format(region, vgerm))
                    generateCumulativeLogo(seqs, weights, region, filename, stream=stream)
        if len(argBuffer):
            printto(stream, "Asynchronously generating composition logos from buffer ...")
            pool = multiprocessing.Pool(processes=threads)
            # Generate cumulative sequence logos using Toby's approach
            res = [pool.apply_async(generateCumulativeLogo, args=arg) for arg in argBuffer]
            [p.get() for p in res]  # join processes
            pool.close()
            pool.join()
        printto(stream, "Completed composition logos for IGV families")
    # composition logo for a region(CDR,FR) as a combination of all IGV - i.e. not segregated
    # combined AA counts(V family) of each region into one
    for region in sorted(flatClonoTypes.keys()):
        if region == 'v':
            continue
        clonoType = flatClonoTypes[region]
        seqs = list(clonoType.keys())
        weights = list(clonoType.values())
        regionDirectory = os.path.join(logosFolder, region.upper())
        createIfNot(regionDirectory)
        filename = os.path.join(regionDirectory, name + "_cumulative_logo.csv")
        generateCumulativeLogo(seqs, weights, region, filename, stream=stream)
def generateSeqMotifs(flatClonoTypes, name, outDir, threads=2, stream=None):
    """
    Create motif plots for FR and CDR regions

    :param flatClonoTypes: dict
                dict with keys of FR / CDR region, each having a value of Counter()
                where the AA sequences are tallied
                For example:
                    {
                        'FR1': Counter({"FGWSG": 32, ...}),
                        'CDR1': Counter(...)
                    }

    :param name: string
                name of sample

    :param outDir: string
                output directory

    :param threads: int
                number of threads to use

    :param stream: stream object
                output stream

    :return: None
    """
    motifsFolder = os.path.join(outDir, 'motifs')
    createIfNot(motifsFolder)
    printto(stream, "Generating motifs ...")
    # create motif logos
    # sorted() rather than in-place .sort(): dict.keys() is an unsortable
    # view object on Python 3.
    regions = sorted(flatClonoTypes.keys())
    argBuffer = []
    for region in regions:
        if region == 'v':
            continue
        clonoType = flatClonoTypes[region]
        # Materialize as lists: fixes the key/value pairing and keeps the
        # buffered args picklable for the multiprocessing pool.
        seqs = list(clonoType.keys())
        weights = list(clonoType.values())
        # Generate sequence motif logos using weblogo
        # generate logos without alignment
        filename = os.path.join(motifsFolder, name + ("_{}_motif_logo.png".format(region)))
        alphabet = createAlphabet(align=False, protein=True, extendAlphabet=True)
        if hasLargeMem():
            printto(stream, "\tbuffering data for {} motif".format(region))
            argBuffer.append((seqs, region, alphabet, filename, False, False, True,
                              weights, outDir, threads))
        else:
            printto(stream, "\tgenerating {} motif".format(region))
            generateMotif(seqs, region, alphabet, filename, align=False, protein=True,
                          weights=weights, outDir=outDir, threads=threads, stream=stream)
        # generate logos after alignment
        filename = os.path.join(motifsFolder, name + ("_{}_motif_aligned_logo.png".format(region)))
        alphabet = createAlphabet(align=True, protein=True, extendAlphabet=True)
        if hasLargeMem():
            argBuffer.append((seqs, region, alphabet, filename, True, False, True,
                              weights, outDir, threads))
        else:
            generateMotif(seqs, region, alphabet, filename, align=True, protein=True,
                          weights=weights, outDir=outDir, threads=threads, stream=stream)
    if len(argBuffer):
        printto(stream, "Asynchronously generating motifs from buffer ...")
        pool = multiprocessing.Pool(processes=threads)
        res = [pool.apply_async(generateMotif, args=arg) for arg in argBuffer]
        [p.get() for p in res]  # join processes
        pool.close()
        pool.join()
    printto(stream, "CDR/FR Motif analysis complete")
def addPrimerData(cloneAnnot, readFile, format, fr4cut, trim5end, trim3end, actualQstart,
                  end5, end3, end5offset, threads, stream=None):
    """
    Run primer specificity analysis over every clone in `cloneAnnot`.

    Reads are dispatched to PrimerWorker processes in chunks of 100, and the
    per-read results are collated into a fresh DataFrame indexed by query id.

    :param cloneAnnot: pandas DataFrame of clone annotations (index = query ids)
    :param readFile: path to the (possibly gzipped) reads file
    :param format: SeqIO file format of readFile (e.g. 'fasta', 'fastq')
    :param fr4cut: passed through to PrimerWorker
    :param trim5end: passed through to PrimerWorker
    :param trim3end: passed through to PrimerWorker
    :param actualQstart: passed through to PrimerWorker
    :param end5: 5' primer argument; also used by _addPrimerColumns
    :param end3: 3' primer argument; also used by _addPrimerColumns
    :param end5offset: passed through to PrimerWorker
    :param threads: requested worker count (capped by task count, and by 2
                    on low-memory systems)
    :param stream: optional logging stream passed to printto()
    :return: pandas DataFrame of primer annotations, indexed by 'queryid'
    :raises: re-raises any exception from the analysis after logging it
    """
    printto(stream, "Primer specificity analysis has begun ...")
    queryIds = cloneAnnot.index
    seqsPerFile = 100
    _addPrimerColumns(cloneAnnot, end5, end3)
    workers = []
    records = SeqIO.index(gunzip(readFile), format)
    newColumns = ['queryid'] + list(cloneAnnot.columns)
    try:
        printto(stream, "\t " + format + " index created and primer analysis started ...")
        noSeqs = len(queryIds)
        totalTasks = int(ceil(noSeqs * 1.0 / seqsPerFile))
        tasks = Queue()
        exitQueue = Queue()
        resultsQueue = Queue()
        procCounter = ProcCounter(noSeqs, stream=stream)
        # Never spawn more workers than there are tasks.
        threads = min(threads, totalTasks)
        if not hasLargeMem():
            # cap the worker count on low-memory machines
            threads = 2
        for _ in range(threads):
            w = PrimerWorker(procCounter, fr4cut, trim5end, trim3end, actualQstart, end5,
                             end3, end5offset, tasks, exitQueue, resultsQueue, stream=stream)
            workers.append(w)
            w.start()
        for i in range(totalTasks):
            ids = queryIds[i * seqsPerFile:(i + 1) * seqsPerFile]
            # Materialize as lists: lazy map objects cannot be pickled through
            # the task queue on Python 3.
            recs = [records[x] for x in ids]
            qsRecs = [cloneAnnot.loc[x].to_dict() for x in ids]
            tasks.put((recs, qsRecs))
        # poison pills so every worker eventually sees a None and exits
        for _ in range(threads + 10):
            tasks.put(None)
        # wait until each worker has reported 'exit'
        i = 0
        while i < threads:
            m = exitQueue.get()
            i += (m == 'exit')
        printto(stream, "All workers have completed their tasks successfully.")
        printto(stream, "Results are being collated from all workers ...")
        cloneAnnotList = _collectPrimerResults(newColumns, resultsQueue, totalTasks,
                                               noSeqs, stream=stream)
        printto(stream, "Results were collated successfully.")
    except Exception as e:
        printto(stream, "Something went wrong during the primer specificity analysis!",
                LEVEL.EXCEPT)
        raise e
    finally:
        # Always reap workers and release the SeqIO index handle.
        for w in workers:
            w.terminate()
        records.close()
    primerAnnot = DataFrame(cloneAnnotList, columns=newColumns)
    primerAnnot.set_index('queryid', drop=True, inplace=True)
    return primerAnnot