예제 #1
0
def writeDAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Write the IGD abundance of a sample at variant, gene and family level.

    The variant-level percentages are only written to a CSV file; the gene- and
    family-level counts are passed on to plotDist.
    :param stats: table-like object whose "dgene" column holds the D gene calls
    :param sampleName: sample name, used as output file prefix and in plot titles
    :param outDir: output directory
    :param stream: stream passed on to printto and plotDist
    :return: None
    """
    # Count the string form of every call instead of stringifying the keys of a
    # finished Counter: missing calls may be NaN floats, distinct NaN objects
    # end up as separate keys, and collapsing those keys to 'nan' afterwards
    # keeps only one of their counts instead of the sum.
    igdDist = Counter(str(gene) for gene in stats["dgene"].tolist())
    if not igdDist:
        printto(stream, "WARNING: No IGD hits were detected.", LEVEL.WARN)
        return

    # Write the percentage of every IGD variant into a text file.
    # This isn't plotted by default, but we still write the csv file for it
    classes = sorted(igdDist, key=igdDist.get, reverse=True)
    total = sum(igdDist.values()) * 1.0
    writeCSV(os.path.join(outDir, sampleName + '_igd_dist_variant_level.csv'),
             "x,y\n", "{},{}\n",
             [(gene, igdDist[gene] / total * 100) for gene in classes])

    # Group IGDs based on the subfamilies (gene level) and plot the result
    igdDistSub = compressCountsGeneLevel(igdDist)
    plotDist(igdDistSub,
             sampleName,
             os.path.join(outDir, sampleName + '_igd_dist_gene_level.csv'),
             rotateLabels=False,
             vertical=False,
             title='IGD Abundance in Sample ' + sampleName,
             stream=stream)

    # Group IGDs based on the families and plot the family level distribution
    igdDistfam = compressCountsFamilyLevel(igdDistSub)
    plotDist(igdDistfam,
             sampleName,
             os.path.join(outDir, sampleName + '_igd_dist_family_level.csv'),
             title='IGD Abundance in Sample ' + sampleName,
             stream=stream)
예제 #2
0
def writeJAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Plot the IGJ abundance of a sample at variant and family level.

    :param stats: table-like object whose "jgene" column holds the J gene calls
    :param sampleName: sample name, used as output file prefix and in plot titles
    :param outDir: output directory
    :param stream: stream passed on to printto and plotDist
    :return: None
    """
    # Count the string form of every call: stringifying the keys only after
    # counting would let distinct NaN objects (missing calls) collapse into a
    # single 'nan' key that keeps one of their counts instead of the sum.
    # The result is kept as a plain dict, which is what the callees received
    # before.
    igjDist = dict(Counter(str(gene) for gene in stats["jgene"].tolist()))
    if not igjDist:
        printto(stream, "WARNING: No IGJ hits were detected.", LEVEL.WARN)
        return

    plotDist(igjDist,
             sampleName,
             os.path.join(outDir, sampleName + '_igj_dist_variant_level.csv'),
             rotateLabels=False,
             vertical=False,
             stream=stream)

    # Group IGJs based on the subfamilies (gene level). The gene level itself
    # is not plotted; it only serves as input for the family level grouping.
    igjDistSub = compressCountsGeneLevel(igjDist)

    # Group IGJs based on the families and plot the family level distribution
    igjDistfam = compressCountsFamilyLevel(igjDistSub)
    plotDist(igjDistfam,
             sampleName,
             os.path.join(outDir, sampleName + '_igj_dist_family_level.csv'),
             title='IGJ Abundance in Sample ' + sampleName,
             stream=stream)
예제 #3
0
def writeCountsCategoriesToFile(countsVariant, sampleName, filePrefix, title=''):
    """
    Plot variant-level counts after compressing them to gene and family level.

    :param countsVariant: variant-level counts, fed to compressCountsGeneLevel
    :param sampleName: sample name passed on to plotDist
    :param filePrefix: output path prefix; 'gene.csv' and 'family.csv' are appended to it
    :param title: title passed on to plotDist for both plots
    :return: None
    """
    # gene level
    geneLevel = compressCountsGeneLevel(countsVariant)
    plotDist(geneLevel, sampleName, filePrefix + 'gene.csv', title)

    # family level, derived from the gene-level counts
    familyLevel = compressCountsFamilyLevel(geneLevel)
    plotDist(familyLevel, sampleName, filePrefix + 'family.csv', title)
예제 #4
0
def generateProductivityReport(cloneAnnot,
                               cloneSeqs,
                               name,
                               chain,
                               outputDir,
                               stream=None):
    """
    Generate the productivity statistics and plots of a sample.

    Missing values in cloneAnnot are temporarily replaced in place by the string
    'NaN' while the statistics are counted and are turned back into nan before
    this function returns, including when one of the writers raises.
    :param cloneAnnot: clone annotation table; modified in place and restored
    :param cloneSeqs: clone sequence table, passed on to writeStopCodonStats
    :param name: sample name
    :param chain: chain identifier, passed on to writeGeneStats
    :param outputDir: output directory
    :param stream: stream passed on to the individual writers
    :return: None
    """
    # since np.nan is considered different objects, canonicalize them using 'NaN' string representation
    nanString = 'NaN'
    cloneAnnot.fillna(nanString, inplace=True)

    try:
        productive = extractProductiveClones(cloneAnnot,
                                             name,
                                             outputDir,
                                             stream=stream)
        plotDist(compressCountsFamilyLevel(
                     Counter(productive['vgene'].tolist())),
                 name,
                 os.path.join(outputDir, name + '_igv_dist_productive.csv'),
                 title='IGV Abundance of Productive Clones',
                 proportion=True,
                 stream=stream)
        writeProdStats(cloneAnnot, name, outputDir)
        writeCDRStats(productive,
                      name,
                      outputDir,
                      suffix='productive',
                      stream=stream)
        writeFRStats(productive,
                     name,
                     outputDir,
                     suffix='productive',
                     stream=stream)
        writeGeneStats(productive,
                       name,
                       chain,
                       outputDir,
                       suffix='productive',
                       stream=stream)
        # in-frame sequences first, then out-of-frame sequences
        for inframe in (True, False):
            writeStopCodonStats(cloneAnnot,
                                cloneSeqs,
                                name,
                                outputDir,
                                inframe=inframe,
                                stream=stream)
    finally:
        # Replace all 'NaN' strings with nan again. Doing this in a finally
        # clause keeps the caller's table from being left with string
        # placeholders when a step above fails.
        cloneAnnot.replace(nanString, nan, inplace=True)
예제 #5
0
def writeStopCodonStats(cloneAnnot,
                        cloneSeqs,
                        name,
                        outputDir,
                        inframe,
                        stream=None):
    """
    Count, per region, the sequences that contain a stop codon and plot the result.

    This function maintains the hypothesis that a stop codon is independent of
    previous stop codons. It increments the counter for each region as long as there's
    AT LEAST ONE stop codon ('*') in the specified region. This is especially true if the
    sequence is in-frame.
    :param cloneAnnot: .*_clone_annot.h5
    :param cloneSeqs: .*_clones_seq.h5
    :param name: sample name
    :param outputDir: output directory
    :param inframe: True if only for inframe sequences, false if only for out-of-frame sequences
    :param stream: debugging stream
    :return: None
    """
    regions = ['FR1', 'CDR1', 'FR2', 'CDR2', 'FR3', 'CDR3', 'FR4']

    frameStatus = 'In-frame' if inframe else 'Out-of-frame'
    # keep only the sequences whose annotation has the requested frame status
    selected = cloneSeqs.loc[cloneAnnot[cloneAnnot['v-jframe'] ==
                                        frameStatus].index]

    # regions are inserted in plotting order, so the x axis follows `regions`
    counter = OrderedDict()
    for region in regions:
        # na=False: a missing region sequence cannot contain a stop codon and
        # must not turn the whole count into NaN
        hasStop = selected[region.lower()].str.contains("*",
                                                        regex=False,
                                                        na=False)
        counter[region] = int(hasStop.sum())

    # Only the file name template is formatted; formatting the joined path
    # would fail on an outputDir or sample name that contains braces.
    fileName = name + '_stopcodon_region_{}.csv'.format(
        'inframe' if inframe else 'outframe')
    plotDist(
        counter,
        name,
        os.path.join(outputDir, fileName),
        title="Stop codon in FRs and CDRs of {} sequences".format(frameStatus),
        proportion=True,
        sortValues=False,
        maintainx=True,
        stream=stream)
예제 #6
0
def extractProductiveClones(cloneAnnot, name, outputDir, stream=None):
    """
    Plot frame, indel and stop codon statistics and return the productive clones.

    A clone is treated as productive when its 'v-jframe' is 'In-frame' and its
    'stopcodon' is 'No'.
    :param cloneAnnot: clone annotation table
    :param name: sample name, used as output file prefix
    :param outputDir: output directory
    :param stream: stream passed on to plotDist
    :return: the rows of cloneAnnot that are in-frame and have no stop codon
    """
    # v-j rearrangement frame distribution
    plotDist(Counter(cloneAnnot['v-jframe'].tolist()),
             name,
             os.path.join(outputDir, name + '_vjframe_dist.csv'),
             title='V-D-J Rearrangement',
             proportion=False,
             rotateLabels=False,
             stream=stream)

    # plot the family distribution of out-of-frame
    outOfFrame = cloneAnnot[cloneAnnot['v-jframe'] != 'In-frame']
    plotDist(compressCountsFamilyLevel(Counter(outOfFrame['vgene'].tolist())),
             name,
             os.path.join(outputDir, name + '_igv_dist_out_of_frame.csv'),
             title='IGV Abundance of Out-Of-frame Clones',
             proportion=True,
             stream=stream)

    # Indels of the out-of-frame clones, one plot per region:
    # (annotation column, output file suffix, plot title)
    gapPlots = (
        ('cdr1.gaps', '_cdr1_gaps_dist_out_of_frame.csv', 'Gaps in CDR1'),
        ('fr1.gaps', '_fr1_gaps_dist_out_of_frame.csv', 'Gaps in FR1'),
        ('cdr2.gaps', '_cdr2_gaps_dist_out_of_frame.csv', 'Gaps in CDR2'),
        ('fr2.gaps', '_fr2_gaps_dist_out_of_frame.csv', 'Gaps in FR2'),
        ('cdr3g.gaps', '_cdr3_gaps_dist_out_of_frame.csv',
         'Gaps in CDR3 (Germline)'),
        ('fr3g.gaps', '_fr3_gaps_dist_out_of_frame.csv',
         'Gaps in FR3 (Germline)'),
    )
    for column, fileSuffix, title in gapPlots:
        # every column is counted through .tolist(), the same way for all regions
        plotDist(Counter(outOfFrame[column].tolist()),
                 name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=title,
                 proportion=False,
                 rotateLabels=False,
                 stream=stream)
    del outOfFrame

    # choose only In-frame RNA clones
    inFrame = cloneAnnot[cloneAnnot['v-jframe'] == 'In-frame']

    # Stop Codon
    plotDist(Counter(inFrame['stopcodon'].tolist()),
             name,
             os.path.join(outputDir, name + '_stopcodon_dist_in_frame.csv'),
             title='Stop Codons in In-frame Clones',
             proportion=False,
             rotateLabels=False,
             stream=stream)

    # stop codon family distribution
    stopcodFamily = compressCountsFamilyLevel(
        Counter(inFrame[inFrame['stopcodon'] == 'Yes']['vgene'].tolist()))
    plotDist(stopcodFamily,
             name,
             os.path.join(outputDir,
                          name + '_igv_dist_inframe_unproductive.csv'),
             title='IGV Abundance of In-frame Unproductive Clones',
             proportion=True,
             stream=stream)
    del stopcodFamily

    # choose only productive RNA sequences
    productive = inFrame[inFrame['stopcodon'] == 'No']
    gc.collect()

    return productive
예제 #7
0
def writeFRStats(cloneAnnot, name, outputDir, suffix='', stream=None):
    """
    Plot the gap and mismatch count distributions of FR1, FR2 and FR3 (germline).

    :param cloneAnnot: clone annotation table
    :param name: sample name, used as output file prefix
    :param outputDir: output directory
    :param suffix: accepted but not used by this function
    :param stream: stream passed on to plotDist
    :return: None
    """
    # (annotation column, output file suffix, plot title), in plotting order
    frPlots = (
        # FR1 statistics
        ('fr1.gaps', '_fr1_gaps_dist.csv', 'Gaps in FR1'),
        ('fr1.mismatches', '_fr1_mismatches_dist.csv', 'Mismatches in FR1'),
        # FR2 statistics
        ('fr2.gaps', '_fr2_gaps_dist.csv', 'Gaps in FR2'),
        ('fr2.mismatches', '_fr2_mismatches_dist.csv', 'Mismatches in FR2'),
        # FR3 statistics
        ('fr3g.gaps', '_fr3_gaps_dist.csv', 'Gaps in FR3 (Germline)'),
        ('fr3g.mismatches', '_fr3_mismatches_dist.csv',
         'Mismatches in FR3 (Germline)'),
    )
    for column, fileSuffix, plotTitle in frPlots:
        counts = Counter(cloneAnnot[column].tolist())
        plotDist(counts,
                 name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=plotTitle,
                 proportion=False,
                 rotateLabels=False,
                 stream=stream)

    gc.collect()
예제 #8
0
def writeCDRStats(cloneAnnot, name, outputDir, suffix='', stream=None):
    """
    Plot the gap and mismatch count distributions of CDR1, CDR2 and CDR3 (germline).

    :param cloneAnnot: clone annotation table
    :param name: sample name, used as output file prefix
    :param outputDir: output directory
    :param suffix: accepted but not used by this function
    :param stream: stream passed on to plotDist
    :return: None
    """
    # (annotation column, output file suffix, plot title), in plotting order
    cdrPlots = (
        # CDR1 statistics
        ('cdr1.gaps', '_cdr1_gaps_dist.csv', 'Gaps in CDR1'),
        ('cdr1.mismatches', '_cdr1_mismatches_dist.csv', 'Mismatches in CDR1'),
        # CDR2 stats
        ('cdr2.gaps', '_cdr2_gaps_dist.csv', 'Gaps in CDR2'),
        ('cdr2.mismatches', '_cdr2_mismatches_dist.csv', 'Mismatches in CDR2'),
        # CDR3 stats
        ('cdr3g.gaps', '_cdr3_gaps_dist.csv', 'Gaps in CDR3 (Germline)'),
        ('cdr3g.mismatches', '_cdr3_mismatches_dist.csv',
         'Mismatches in CDR3 (Germline)'),
    )
    for column, fileSuffix, plotTitle in cdrPlots:
        # every column, 'cdr3g.gaps' included, is counted through .tolist()
        # so all six distributions are built the same way
        counts = Counter(cloneAnnot[column].tolist())
        plotDist(counts,
                 name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=plotTitle,
                 proportion=False,
                 rotateLabels=False,
                 stream=stream)

    gc.collect()
예제 #9
0
def writeGeneStats(cloneAnnot, name, chain, outputDir, suffix, stream=None):
    """
    Plot the gap and mismatch count distributions of the V, D and J genes.

    The D gene plots are only produced when chain equals 'hv'.
    :param cloneAnnot: clone annotation table
    :param name: sample name, used as output file prefix
    :param chain: chain identifier; 'hv' enables the D gene statistics
    :param outputDir: output directory
    :param suffix: accepted but not used by this function
    :param stream: stream passed on to plotDist
    :return: None
    """

    def countAndPlot(column, fileSuffix, plotTitle, **options):
        # count the values of one annotation column and plot their distribution
        plotDist(Counter(cloneAnnot[column].tolist()),
                 name,
                 os.path.join(outputDir, name + fileSuffix),
                 title=plotTitle,
                 rotateLabels=False,
                 stream=stream,
                 **options)

    # V gene stats
    countAndPlot('vgaps', '_igv_gaps_dist.csv', 'Gaps in V Gene',
                 proportion=True, top=20)
    countAndPlot('vmismatches', '_igv_mismatches_dist.csv',
                 'Mismatches in V Gene', proportion=True, top=20)

    # D gene stats
    if chain == 'hv':
        countAndPlot('dgaps', '_igd_gaps_dist.csv', 'Gaps in D Gene',
                     proportion=False)
        countAndPlot('dmismatches', '_igd_mismatches_dist.csv',
                     'Mismatches in D Gene', proportion=False)

    # J gene stats
    countAndPlot('jgaps', '_igj_gaps_dist.csv', 'Gaps in J Gene',
                 proportion=False)
    countAndPlot('jmismatches', '_igj_mismatches_dist.csv',
                 'Mismatches in J Gene', proportion=False)
예제 #10
0
def writeVAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Write the IGV abundance of a sample and its alignment quality plots.

    Writes the variant-level percentages to a CSV file, plots the gene- and
    family-level distributions, then produces the alignment quality heatmaps
    and the mismatch / gap count distributions of the V gene.
    :param stats: table-like object providing the "vgene" column and the alignment columns
    :param sampleName: sample name, used as output file prefix
    :param outDir: output directory
    :param stream: stream passed on to printto, plotDist and generateStatsHeatmap
    :return: None
    """

    def outFile(fileSuffix):
        # every output file lives in outDir and is prefixed with the sample name
        return os.path.join(outDir, sampleName + fileSuffix)

    igvDist = Counter(stats["vgene"].tolist())
    if not igvDist:
        printto(stream, "WARNING: No IGV hits were detected.", LEVEL.WARN)
        return

    # Write the percentage of all IGVs into a text file - variant_level isn't plotted by default.
    ranked = sorted(igvDist, key=igvDist.get, reverse=True)
    total = sum(igvDist.values()) * 1.0
    percentages = [(gene, igvDist[gene] / total * 100) for gene in ranked]
    writeCSV(outFile('_igv_dist_variant_level.csv'), "x,y\n", "{},{}\n",
             percentages)

    # Group IGVs based on the subfamilies (gene level) and plot the result
    igvDistSub = compressCountsGeneLevel(igvDist)
    plotDist(igvDistSub,
             sampleName,
             outFile('_igv_dist_gene_level.csv'),
             rotateLabels=False,
             vertical=False,
             stream=stream)

    # Group IGVs based on the families and plot the family level distribution
    igvDistfam = compressCountsFamilyLevel(igvDistSub)
    plotDist(igvDistfam,
             sampleName,
             outFile('_igv_dist_family_level.csv'),
             stream=stream)

    # Alignment quality heatmaps: (columns, axis labels, output file suffix)
    heatmaps = (
        # alignment length vs %identity
        (['alignlen', 'identity'], ['Alignment Length', '%Identity'],
         '_igv_align_quality_identity_hm.tsv'),
        # alignment length vs bitScore
        (['alignlen', 'bitscore'], ['Alignment Length', 'bitScore'],
         '_igv_align_quality_bitscore_hm.tsv'),
        # query start vs. subject start
        (['vqstart', 'vstart'], ['Query Start', 'Subject Start'],
         '_igv_align_quality_start_hm.tsv'),
    )
    for columns, labels, fileSuffix in heatmaps:
        generateStatsHeatmap(stats,
                             sampleName,
                             columns,
                             labels,
                             outFile(fileSuffix),
                             stream=stream)

    # Mismatches and gaps each get a heatmap against the alignment length,
    # followed by the distribution of their counts:
    # (column, axis label, heatmap suffix, distribution suffix, plot title)
    countPlots = (
        ('vmismatches', 'Mismatches', '_igv_align_quality_mismatches_hm.tsv',
         '_igv_mismatches_dist.csv', 'Number of Mismatches in V gene'),
        ('vgaps', 'Gaps', '_igv_align_quality_gaps_hm.tsv',
         '_igv_gaps_dist.csv', 'Number of Gaps in V gene'),
    )
    for column, label, heatmapSuffix, distSuffix, plotTitle in countPlots:
        generateStatsHeatmap(stats,
                             sampleName,
                             ['alignlen', column],
                             ['Alignment Length', label],
                             outFile(heatmapSuffix),
                             stream=stream)
        plotDist(Counter(stats[column].tolist()),
                 sampleName,
                 outFile(distSuffix),
                 title=plotTitle,
                 proportion=True,
                 rotateLabels=False,
                 top=20,
                 stream=stream)
예제 #11
0
def writePrimerStats(end,
                     name,
                     cloneAnnot,
                     fileprefix,
                     category="All",
                     stream=None):
    """
    Plot integrity and indel statistics of the primer found at one end of the clones.

    Reads the columns '<end>endPrimer', '<end>endMismatchIndex' and
    '<end>endIndelIndex' of cloneAnnot. Rows whose primer column equals
    str(np.nan) are counted as 'Unknown' and excluded from all other statistics.
    :param end: end identifier, used to build the column names and the plot titles
    :param name: sample name passed on to plotDist
    :param cloneAnnot: clone annotation table
    :param fileprefix: output path prefix; the individual file names are appended to it
    :param category: label shown in brackets in the plot titles
    :param stream: stream passed on to printto and plotDist
    :return: None
    """
    NA = str(np.nan)
    PRIMER = str(end) + 'endPrimer'
    MISMATCH = str(end) + 'endMismatchIndex'
    INDEL = str(end) + 'endIndelIndex'

    known = cloneAnnot[cloneAnnot[PRIMER] != NA]
    hasIndel = known[INDEL] != 0
    # indelled subset, computed once and reused by every plot below
    indelled = known[hasIndel]

    integrity = {
        'Unknown': (len(cloneAnnot) - len(known)),
        'Indelled': int(hasIndel.sum()),
        'Mismatched': int((known[MISMATCH] != 0).sum()),
        'Intact': len(known[(known[INDEL] == 0) & (known[MISMATCH] == 0)])
    }

    plotDist(integrity,
             name,
             fileprefix + 'integrity_dist.csv',
             title="Integrity of {}'-end Primer Sequence ({})".format(
                 end, category),
             proportion=True,
             rotateLabels=False,
             stream=stream)

    invalidClones = indelled.index.tolist()
    valid = known.index[known[INDEL] == 0].tolist()
    # show the first ten clones of each kind; slicing from index 1 would
    # silently drop the first example
    printto(
        stream,
        "Example of Indelled {}'-end: {}".format(end,
                                                 str(invalidClones[:10])),
        LEVEL.INFO)
    printto(
        stream,
        "Example of non-indelled {}'-end: {}".format(end, str(valid[:10])),
        LEVEL.INFO)

    plotDist(Counter(indelled[PRIMER].tolist()),
             name,
             fileprefix + 'indelled_dist.csv',
             title="Abundance of Indelled {}'-end Primers ({})".format(
                 end, category),
             proportion=False,
             rotateLabels=False,
             vertical=False,
             top=50,
             stream=stream)

    plotDist(
        Counter(indelled[INDEL].tolist()),
        name,
        fileprefix + 'indel_pos_dist.csv',
        title="Abundance of Indel Positions in {}'-end Primers ({})".format(
            end, category),
        proportion=False,
        rotateLabels=False,
        vertical=True,
        sortValues=False,
        top=50,
        stream=stream)

    # one plot per known primer, including primers without any indelled clone
    for primer in set(known[PRIMER].tolist()):
        # get only ighv abundance of indelled primers
        df = indelled[indelled[PRIMER] == primer]

        germLineDist = compressCountsGeneLevel(Counter(df['vgene'].tolist()))
        plotDist(germLineDist,
                 name,
                 fileprefix + primer + '_igv_dist.csv',
                 title='IGV Abundance of indelled {} ({})'.format(
                     primer, category),
                 proportion=False,
                 vertical=False,
                 top=20,
                 rotateLabels=False,
                 stream=stream)