def writeDAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Report the IGD (D gene) abundance of a sample at variant, gene and family level.

    The variant-level percentages are only written to CSV via writeCSV; the
    gene-level and family-level counts are handed to plotDist together with
    their CSV paths.

    :param stats: table indexable by "dgene" whose column supports .tolist()
                  (presumably a pandas DataFrame - confirm against caller)
    :param sampleName: sample name, used as filename prefix and in the titles
    :param outDir: output directory
    :param stream: logging stream forwarded to printto and plotDist
    :return: None
    """
    rawCounts = Counter(stats["dgene"].tolist())
    # canonicalise every key to its string representation
    igdDist = Counter({str(gene): hits for gene, hits in rawCounts.items()})

    if not igdDist:
        printto(stream, "WARNING: No IGD hits were detected.", LEVEL.WARN)
        return

    # Variant level: percentage of each IGD allele, most abundant first.
    # This level is not plotted, only the CSV file is written.
    total = sum(igdDist.values()) * 1.0
    variantRows = [
        (gene, hits / total * 100)
        for gene, hits in igdDist.most_common()
    ]
    writeCSV(
        os.path.join(outDir, sampleName + '_igd_dist_variant_level.csv'),
        "x,y\n",
        "{},{}\n",
        variantRows
    )

    plotTitle = 'IGD Abundance in Sample ' + sampleName

    # Gene level: variants collapsed into their genes
    igdDistSub = compressCountsGeneLevel(igdDist)
    plotDist(
        igdDistSub,
        sampleName,
        os.path.join(outDir, sampleName + '_igd_dist_gene_level.csv'),
        rotateLabels=False,
        vertical=False,
        title=plotTitle,
        stream=stream
    )

    # Family level: genes collapsed into their families
    igdDistfam = compressCountsFamilyLevel(igdDistSub)
    plotDist(
        igdDistfam,
        sampleName,
        os.path.join(outDir, sampleName + '_igd_dist_family_level.csv'),
        title=plotTitle,
        stream=stream
    )
def writeJAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Report the IGJ (J gene) abundance of a sample at variant and family level.

    The variant-level counts and the family-level counts are each handed to
    plotDist. The gene-level counts are computed only as an intermediate step
    towards the family level and are not reported.

    :param stats: table indexable by "jgene" whose column supports .tolist()
                  (presumably a pandas DataFrame - confirm against caller)
    :param sampleName: sample name, used as filename prefix and in the title
    :param outDir: output directory
    :param stream: logging stream forwarded to printto and plotDist
    :return: None
    """
    rawCounts = Counter(stats["jgene"].tolist())
    # canonicalise every key to its string representation (plain dict)
    igjDist = {str(gene): hits for gene, hits in rawCounts.items()}

    if not igjDist:
        printto(stream, "WARNING: No IGJ hits were detected.", LEVEL.WARN)
        return

    # Variant level
    variantCsv = os.path.join(outDir, sampleName + '_igj_dist_variant_level.csv')
    plotDist(igjDist, sampleName, variantCsv,
             rotateLabels=False, vertical=False, stream=stream)

    # Gene level is intentionally not reported; it only feeds the family level
    igjDistSub = compressCountsGeneLevel(igjDist)

    # Family level
    igjDistfam = compressCountsFamilyLevel(igjDistSub)
    familyCsv = os.path.join(outDir, sampleName + '_igj_dist_family_level.csv')
    plotDist(igjDistfam, sampleName, familyCsv,
             title='IGJ Abundance in Sample ' + sampleName, stream=stream)
def writeCountsCategoriesToFile(countsVariant, sampleName, filePrefix, title=''):
    """
    Collapse variant-level counts to gene and family level and report both.

    :param countsVariant: variant-level counts, passed to compressCountsGeneLevel
    :param sampleName: sample name forwarded to plotDist
    :param filePrefix: path prefix; 'gene.csv' and 'family.csv' are appended to it
    :param title: title forwarded to plotDist as its fourth positional argument
    :return: None
    """
    # gene level
    geneCounts = compressCountsGeneLevel(countsVariant)
    plotDist(geneCounts, sampleName, filePrefix + 'gene.csv', title)

    # family level, derived from the gene-level counts
    familyCounts = compressCountsFamilyLevel(geneCounts)
    plotDist(familyCounts, sampleName, filePrefix + 'family.csv', title)
def generateProductivityReport(cloneAnnot, cloneSeqs, name, chain, outputDir, stream=None):
    """
    Run every productivity-related report for one sample.

    cloneAnnot is modified in place for the duration of the call: missing
    values are replaced by the string 'NaN' so they are counted as one
    category, and converted back to nan before returning. The restoration is
    done in a ``finally`` clause so that the caller's table is not left in
    the canonicalised state when one of the report helpers raises.

    NOTE(review): any cell that already held the literal string 'NaN' before
    the call is also turned into nan on exit.

    :param cloneAnnot: clone annotation table (modified in place, then restored)
    :param cloneSeqs: clone sequences table, forwarded to writeStopCodonStats
    :param name: sample name, used as filename prefix
    :param chain: chain type, forwarded to writeGeneStats
    :param outputDir: output directory
    :param stream: logging stream forwarded to the helpers
    :return: None
    """
    # since np.nan is considered different objects, canonicalize them using
    # the 'NaN' string representation
    nanString = 'NaN'
    cloneAnnot.fillna(nanString, inplace=True)

    try:
        productive = extractProductiveClones(cloneAnnot, name, outputDir, stream=stream)

        productiveFamilyDist = compressCountsFamilyLevel(
            Counter(productive['vgene'].tolist()))
        plotDist(productiveFamilyDist, name,
                 os.path.join(outputDir, name + '_igv_dist_productive.csv'),
                 title='IGV Abundance of Productive Clones',
                 proportion=True, stream=stream)
        del productiveFamilyDist

        writeProdStats(cloneAnnot, name, outputDir)
        writeCDRStats(productive, name, outputDir, suffix='productive', stream=stream)
        writeFRStats(productive, name, outputDir, suffix='productive', stream=stream)
        writeGeneStats(productive, name, chain, outputDir, suffix='productive', stream=stream)
        writeStopCodonStats(cloneAnnot, cloneSeqs, name, outputDir, inframe=True, stream=stream)
        writeStopCodonStats(cloneAnnot, cloneSeqs, name, outputDir, inframe=False, stream=stream)
    finally:
        # counting is complete (or was aborted): replace all 'NaN' strings
        # with nan again so the caller sees its original missing values
        cloneAnnot.replace(nanString, nan, inplace=True)
def writeStopCodonStats(cloneAnnot, cloneSeqs, name, outputDir, inframe, stream=None):
    """
    Count, per region, the clones that contain at least one stop codon.

    This function maintains the hypothesis that a stop codon is independent of
    previous stop codons. It increments the counter of a region as long as
    there is AT LEAST ONE stop codon ('*') in that region, so a clone with
    stop codons in several regions is counted once in each of them.

    Only clones whose 'v-jframe' annotation equals 'In-frame' (inframe=True)
    or 'Out-of-frame' (inframe=False) are considered.

    :param cloneAnnot: clone annotation table (.*_clone_annot.h5); must have a
                       'v-jframe' column
    :param cloneSeqs: clone sequences table (.*_clones_seq.h5); looked up by
                      the index labels of cloneAnnot and must have the
                      lower-cased region names as columns
    :param name: sample name
    :param outputDir: output directory
    :param inframe: True if only for inframe sequences, False if only for
                    out-of-frame sequences
    :param stream: debugging stream
    :return: None
    """
    regions = ['FR1', 'CDR1', 'FR2', 'CDR2', 'FR3', 'CDR3', 'FR4']
    frameStatus = 'In-frame' if inframe else 'Out-of-frame'
    frameTag = 'inframe' if inframe else 'outframe'

    # sequences of the clones that have the requested frame status
    selectedIds = cloneAnnot[cloneAnnot['v-jframe'] == frameStatus].index
    selectedSeqs = cloneSeqs.loc[selectedIds]

    # keep the biological region order (FR1 -> FR4) in the output
    stopCodonCounts = OrderedDict(
        (region, sum(selectedSeqs[region.lower()].str.contains("*", regex=False)))
        for region in regions
    )

    # Format only the filename template. Formatting the joined path would also
    # interpret any '{' / '}' that happens to be in outputDir or name.
    csvName = name + '_stopcodon_region_{}.csv'.format(frameTag)

    plotDist(
        stopCodonCounts,
        name,
        os.path.join(outputDir, csvName),
        title="Stop codon in FRs and CDRs of {} sequences".format(frameStatus),
        proportion=True,
        sortValues=False,
        maintainx=True,
        stream=stream
    )
def extractProductiveClones(cloneAnnot, name, outputDir, stream=None):
    """
    Report frame / stop-codon statistics and return the productive clones.

    Reports produced (each handed to plotDist with its CSV path):
      * distribution of the 'v-jframe' annotation
      * IGV family abundance of the clones that are not 'In-frame'
      * gap distributions of CDR1-3 / FR1-3 for the clones that are not 'In-frame'
      * stop codon distribution of the 'In-frame' clones
      * IGV family abundance of the 'In-frame' clones with stopcodon == 'Yes'

    Note that "out of frame" here means every row whose 'v-jframe' differs
    from 'In-frame', not only the rows labelled 'Out-of-frame'.

    :param cloneAnnot: clone annotation table
    :param name: sample name, used as filename prefix
    :param outputDir: output directory
    :param stream: logging stream forwarded to plotDist
    :return: the rows of cloneAnnot with 'v-jframe' == 'In-frame' and
             'stopcodon' == 'No'
    """
    # v-j rearrangement frame distribution
    vjframeDist = Counter(cloneAnnot['v-jframe'].tolist())
    plotDist(vjframeDist, name,
             os.path.join(outputDir, name + '_vjframe_dist.csv'),
             title='V-D-J Rearrangement',
             proportion=False, rotateLabels=False, stream=stream)
    del vjframeDist

    # family distribution of the out-of-frame clones
    outOfFrame = cloneAnnot[cloneAnnot['v-jframe'] != 'In-frame']
    outOfFrameFamilyDist = compressCountsFamilyLevel(
        Counter(outOfFrame['vgene'].tolist()))
    plotDist(outOfFrameFamilyDist, name,
             os.path.join(outputDir, name + '_igv_dist_out_of_frame.csv'),
             title='IGV Abundance of Out-Of-frame Clones',
             proportion=True, stream=stream)
    del outOfFrameFamilyDist

    # Indels per region of the out-of-frame clones: (column, file suffix, title).
    # Every column goes through .tolist() so that all Counters are built the
    # same way (the CDR3 one used to be built from the Series directly).
    gapReports = (
        ('cdr1.gaps', '_cdr1_gaps_dist_out_of_frame.csv', 'Gaps in CDR1'),
        ('fr1.gaps', '_fr1_gaps_dist_out_of_frame.csv', 'Gaps in FR1'),
        ('cdr2.gaps', '_cdr2_gaps_dist_out_of_frame.csv', 'Gaps in CDR2'),
        ('fr2.gaps', '_fr2_gaps_dist_out_of_frame.csv', 'Gaps in FR2'),
        ('cdr3g.gaps', '_cdr3_gaps_dist_out_of_frame.csv', 'Gaps in CDR3 (Germline)'),
        ('fr3g.gaps', '_fr3_gaps_dist_out_of_frame.csv', 'Gaps in FR3 (Germline)'),
    )
    for column, fileSuffix, title in gapReports:
        gapDist = Counter(outOfFrame[column].tolist())
        plotDist(gapDist, name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=title,
                 proportion=False, rotateLabels=False, stream=stream)
    del gapDist
    del outOfFrame

    # choose only In-frame RNA clones
    inFrame = cloneAnnot[cloneAnnot['v-jframe'] == 'In-frame']

    # stop codon distribution
    stopcodonInFrameDist = Counter(inFrame['stopcodon'].tolist())
    plotDist(stopcodonInFrameDist, name,
             os.path.join(outputDir, name + '_stopcodon_dist_in_frame.csv'),
             title='Stop Codons in In-frame Clones',
             proportion=False, rotateLabels=False, stream=stream)

    # family distribution of in-frame clones that carry a stop codon
    stopcodFamily = compressCountsFamilyLevel(
        Counter(inFrame[inFrame['stopcodon'] == 'Yes']['vgene'].tolist()))
    plotDist(stopcodFamily, name,
             os.path.join(outputDir, name + '_igv_dist_inframe_unproductive.csv'),
             title='IGV Abundance of In-frame Unproductive Clones',
             proportion=True, stream=stream)
    del stopcodonInFrameDist, stopcodFamily

    # choose only productive RNA sequences: in-frame and without stop codon
    productive = inFrame[inFrame['stopcodon'] == 'No']
    gc.collect()
    return productive
def writeFRStats(cloneAnnot, name, outputDir, suffix='', stream=None):
    """
    Report the gap and mismatch distributions of FR1, FR2 and FR3 (germline).

    :param cloneAnnot: clone annotation table with the 'fr1.*', 'fr2.*' and
                       'fr3g.*' gaps / mismatches columns
    :param name: sample name, used as filename prefix
    :param outputDir: output directory
    :param suffix: accepted but not used by this function
    :param stream: logging stream forwarded to plotDist
    :return: None
    """
    # (column, file suffix, title) in reporting order
    reports = (
        ('fr1.gaps', '_fr1_gaps_dist.csv', 'Gaps in FR1'),
        ('fr1.mismatches', '_fr1_mismatches_dist.csv', 'Mismatches in FR1'),
        ('fr2.gaps', '_fr2_gaps_dist.csv', 'Gaps in FR2'),
        ('fr2.mismatches', '_fr2_mismatches_dist.csv', 'Mismatches in FR2'),
        ('fr3g.gaps', '_fr3_gaps_dist.csv', 'Gaps in FR3 (Germline)'),
        ('fr3g.mismatches', '_fr3_mismatches_dist.csv', 'Mismatches in FR3 (Germline)'),
    )
    for column, fileSuffix, title in reports:
        dist = Counter(cloneAnnot[column].tolist())
        plotDist(dist, name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=title,
                 proportion=False, rotateLabels=False, stream=stream)
    gc.collect()
def writeCDRStats(cloneAnnot, name, outputDir, suffix='', stream=None):
    """
    Report the gap and mismatch distributions of CDR1, CDR2 and CDR3 (germline).

    :param cloneAnnot: clone annotation table with the 'cdr1.*', 'cdr2.*' and
                       'cdr3g.*' gaps / mismatches columns
    :param name: sample name, used as filename prefix
    :param outputDir: output directory
    :param suffix: accepted but not used by this function
    :param stream: logging stream forwarded to plotDist
    :return: None
    """
    # (column, file suffix, title) in reporting order.
    # Every column goes through .tolist() so that all Counters are built the
    # same way (the 'cdr3g.gaps' one used to be built from the Series directly).
    reports = (
        ('cdr1.gaps', '_cdr1_gaps_dist.csv', 'Gaps in CDR1'),
        ('cdr1.mismatches', '_cdr1_mismatches_dist.csv', 'Mismatches in CDR1'),
        ('cdr2.gaps', '_cdr2_gaps_dist.csv', 'Gaps in CDR2'),
        ('cdr2.mismatches', '_cdr2_mismatches_dist.csv', 'Mismatches in CDR2'),
        ('cdr3g.gaps', '_cdr3_gaps_dist.csv', 'Gaps in CDR3 (Germline)'),
        ('cdr3g.mismatches', '_cdr3_mismatches_dist.csv', 'Mismatches in CDR3 (Germline)'),
    )
    for column, fileSuffix, title in reports:
        dist = Counter(cloneAnnot[column].tolist())
        plotDist(dist, name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=title,
                 proportion=False, rotateLabels=False, stream=stream)
    gc.collect()
def writeGeneStats(cloneAnnot, name, chain, outputDir, suffix, stream=None):
    """
    Report the gap and mismatch distributions of the V, D and J genes.

    The D gene reports are only produced when chain == 'hv'.

    :param cloneAnnot: clone annotation table with the '[vdj]gaps' and
                       '[vdj]mismatches' columns
    :param name: sample name, used as filename prefix
    :param chain: chain type; 'hv' enables the D gene reports
    :param outputDir: output directory
    :param suffix: accepted but not used by this function
    :param stream: logging stream forwarded to plotDist
    :return: None
    """
    # plotDist options: V gene is reported as proportions limited to the top 20
    vGeneOpts = dict(proportion=True, rotateLabels=False, top=20)
    otherOpts = dict(proportion=False, rotateLabels=False)

    # (column, file suffix, title, plotDist options) in reporting order
    reports = [
        ('vgaps', '_igv_gaps_dist.csv', 'Gaps in V Gene', vGeneOpts),
        ('vmismatches', '_igv_mismatches_dist.csv', 'Mismatches in V Gene', vGeneOpts),
    ]
    if chain == 'hv':
        reports.extend([
            ('dgaps', '_igd_gaps_dist.csv', 'Gaps in D Gene', otherOpts),
            ('dmismatches', '_igd_mismatches_dist.csv', 'Mismatches in D Gene', otherOpts),
        ])
    reports.extend([
        ('jgaps', '_igj_gaps_dist.csv', 'Gaps in J Gene', otherOpts),
        ('jmismatches', '_igj_mismatches_dist.csv', 'Mismatches in J Gene', otherOpts),
    ])

    for column, fileSuffix, title, opts in reports:
        dist = Counter(cloneAnnot[column].tolist())
        plotDist(dist, name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=title, stream=stream, **opts)
def writeVAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Report the IGV (V gene) abundance and alignment quality of a sample.

    Produces, in this order: the variant-level percentage CSV, the gene-level
    and family-level distributions, alignment quality heatmaps, and the
    mismatch / gap distributions of the V gene.

    :param stats: table indexable by column name whose columns support
                  .tolist(); it is also forwarded to generateStatsHeatmap
                  (presumably a pandas DataFrame - confirm against caller)
    :param sampleName: sample name, used as filename prefix
    :param outDir: output directory
    :param stream: logging stream forwarded to the helpers
    :return: None
    """
    def outPath(fileSuffix):
        # <outDir>/<sampleName><fileSuffix>
        return os.path.join(outDir, sampleName + fileSuffix)

    igvDist = Counter(stats["vgene"].tolist())
    if not igvDist:
        printto(stream, "WARNING: No IGV hits were detected.", LEVEL.WARN)
        return

    # Variant level: percentage of each IGV allele, most abundant first.
    # This level is not plotted, only the CSV file is written.
    total = sum(igvDist.values()) * 1.0
    variantRows = [
        (gene, hits / total * 100)
        for gene, hits in igvDist.most_common()
    ]
    writeCSV(outPath('_igv_dist_variant_level.csv'), "x,y\n", "{},{}\n", variantRows)

    # Gene level: variants collapsed into their genes
    igvDistSub = compressCountsGeneLevel(igvDist)
    plotDist(igvDistSub, sampleName, outPath('_igv_dist_gene_level.csv'),
             rotateLabels=False, vertical=False, stream=stream)

    # Family level: genes collapsed into their families
    igvDistfam = compressCountsFamilyLevel(igvDistSub)
    plotDist(igvDistfam, sampleName, outPath('_igv_dist_family_level.csv'),
             stream=stream)

    # Alignment quality heatmaps: (columns, axis labels, file suffix)
    #   alignment length vs %identity, alignment length vs bitScore,
    #   query start vs subject start
    heatmaps = (
        (['alignlen', 'identity'], ['Alignment Length', '%Identity'],
         '_igv_align_quality_identity_hm.tsv'),
        (['alignlen', 'bitscore'], ['Alignment Length', 'bitScore'],
         '_igv_align_quality_bitscore_hm.tsv'),
        (['vqstart', 'vstart'], ['Query Start', 'Subject Start'],
         '_igv_align_quality_start_hm.tsv'),
    )
    for columns, labels, fileSuffix in heatmaps:
        generateStatsHeatmap(stats, sampleName, columns, labels,
                             outPath(fileSuffix), stream=stream)

    # Mismatches and gaps: a heatmap against the alignment length followed by
    # the distribution of the value itself.
    #   (column, axis label, heatmap suffix, distribution suffix, title)
    qualityReports = (
        ('vmismatches', 'Mismatches',
         '_igv_align_quality_mismatches_hm.tsv', '_igv_mismatches_dist.csv',
         'Number of Mismatches in V gene'),
        ('vgaps', 'Gaps',
         '_igv_align_quality_gaps_hm.tsv', '_igv_gaps_dist.csv',
         'Number of Gaps in V gene'),
    )
    for column, label, heatmapSuffix, distSuffix, title in qualityReports:
        generateStatsHeatmap(stats, sampleName,
                             ['alignlen', column], ['Alignment Length', label],
                             outPath(heatmapSuffix), stream=stream)
        dist = Counter(stats[column].tolist())
        plotDist(dist, sampleName, outPath(distSuffix),
                 title=title,
                 proportion=True, rotateLabels=False, top=20, stream=stream)
def writePrimerStats(end, name, cloneAnnot, fileprefix, category="All", stream=None):
    """
    Report the integrity of the primer found at one end of the clones.

    A clone's primer is "known" when its '<end>endPrimer' value differs from
    str(np.nan), i.e. the string 'nan'. Known primers are "indelled" when
    '<end>endIndelIndex' != 0 and "mismatched" when '<end>endMismatchIndex' != 0.

    Reports produced (each handed to plotDist with its CSV path):
      * integrity categories (Unknown / Indelled / Mismatched / Intact)
      * abundance of the indelled primers
      * abundance of the indel positions
      * for every known primer, the gene-level IGV abundance of its indelled clones

    :param end: primer end (e.g. 5 or 3); used to build the column names and titles
    :param name: sample name forwarded to plotDist
    :param cloneAnnot: clone annotation table with the '<end>endPrimer',
                       '<end>endMismatchIndex', '<end>endIndelIndex' and
                       'vgene' columns
    :param fileprefix: path prefix to which the report file names are appended
    :param category: label shown in the titles
    :param stream: logging stream forwarded to printto and plotDist
    :return: None
    """
    NA = str(np.nan)
    PRIMER = str(end) + 'endPrimer'
    MISMATCH = str(end) + 'endMismatchIndex'
    INDEL = str(end) + 'endIndelIndex'

    known = cloneAnnot[cloneAnnot[PRIMER] != NA]

    # computed once: the indel filter is needed by every report below
    hasIndel = known[INDEL] != 0
    indelled = known[hasIndel]

    integrity = {
        'Unknown': (len(cloneAnnot) - len(known)),
        'Indelled': sum(hasIndel),
        'Mismatched': sum(known[MISMATCH] != 0),
        'Intact': len(known[(known[INDEL] == 0) & (known[MISMATCH] == 0)])
    }
    plotDist(integrity, name, fileprefix + 'integrity_dist.csv',
             title="Integrity of {}'-end Primer Sequence ({})".format(end, category),
             proportion=True, rotateLabels=False, stream=stream)

    invalidClones = known.index[hasIndel].tolist()
    valid = known.index[known[INDEL] == 0].tolist()
    # show up to the first ten clone ids of each kind (starting at the first one)
    printto(stream,
            "Example of Indelled {}'-end: {}".format(end, str(invalidClones[:10])),
            LEVEL.INFO)
    printto(stream,
            "Example of non-indelled {}'-end: {}".format(end, str(valid[:10])),
            LEVEL.INFO)

    # which primers are indelled
    indelledPrimers = Counter(indelled[PRIMER].tolist())
    plotDist(indelledPrimers, name, fileprefix + 'indelled_dist.csv',
             title="Abundance of Indelled {}'-end Primers ({})".format(end, category),
             proportion=False, rotateLabels=False, vertical=False, top=50,
             stream=stream)

    # where the indels are located
    indelPositions = Counter(indelled[INDEL].tolist())
    plotDist(indelPositions, name, fileprefix + 'indel_pos_dist.csv',
             title="Abundance of Indel Positions in {}'-end Primers ({})".format(
                 end, category),
             proportion=False, rotateLabels=False, vertical=True,
             sortValues=False, top=50, stream=stream)

    # IGV abundance (gene level) of the indelled clones of every known primer
    primers = set(known[PRIMER].tolist())
    for primer in primers:
        primerClones = indelled[indelled[PRIMER] == primer]
        germLineDist = compressCountsGeneLevel(
            Counter(primerClones['vgene'].tolist()))
        plotDist(germLineDist, name, fileprefix + primer + '_igv_dist.csv',
                 title='IGV Abundance of indelled {} ({})'.format(primer, category),
                 proportion=False, vertical=False, top=20, rotateLabels=False,
                 stream=stream)