Esempio n. 1
0
def writeDAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Tally the values of the "dgene" column of stats and emit the IGD
    abundance at variant, gene and family level.

    The variant-level percentages are written directly to
    <outDir>/<sampleName>_igd_dist_variant_level.csv through writeCSV. The
    gene-level and family-level counts are handed to plotDist together with
    their respective output paths.

    :param stats: table-like object; stats["dgene"] must provide .tolist()
                  (presumably a pandas DataFrame - confirm against callers)
    :param sampleName: string used as file name prefix and in the plot titles
    :param outDir: directory in which the output files are placed
    :param stream: forwarded to printto and plotDist
    :return: None. When no "dgene" value is present a warning is emitted
             and nothing is written.
    """
    rawCounts = Counter(stats["dgene"].tolist())
    # Re-key the counter so that every label is a str.
    igdDist = Counter({str(gene): hits for gene, hits in rawCounts.items()})
    if not igdDist:
        printto(stream, "WARNING: No IGD hits were detected.", LEVEL.WARN)
        return

    def outputPath(suffix):
        return os.path.join(outDir, sampleName + suffix)

    # Variant level: not plotted by default, but the csv file is still
    # written. Labels are ordered by descending count and the values are
    # expressed as a percentage of all hits.
    ranked = sorted(igdDist, key=lambda gene: igdDist[gene], reverse=True)
    grandTotal = float(sum(igdDist.values()))
    percentages = [(gene, igdDist[gene] / grandTotal * 100) for gene in ranked]
    writeCSV(outputPath('_igd_dist_variant_level.csv'),
             "x,y\n", "{},{}\n", percentages)

    plotTitle = 'IGD Abundance in Sample ' + sampleName

    # Gene level (variants collapsed by compressCountsGeneLevel)
    igdDistSub = compressCountsGeneLevel(igdDist)
    plotDist(igdDistSub,
             sampleName,
             outputPath('_igd_dist_gene_level.csv'),
             title=plotTitle,
             rotateLabels=False,
             vertical=False,
             stream=stream)

    # Family level (genes collapsed by compressCountsFamilyLevel)
    igdDistFam = compressCountsFamilyLevel(igdDistSub)
    plotDist(igdDistFam,
             sampleName,
             outputPath('_igd_dist_family_level.csv'),
             title=plotTitle,
             stream=stream)
Esempio n. 2
0
def writeCountsCategoriesToFile(countsVariant, sampleName, filePrefix, title=''):
    """
    Collapse variant-level counts to gene level and then to family level,
    passing each collapsed distribution to plotDist.

    :param countsVariant: variant-level counts accepted by
                          compressCountsGeneLevel
    :param sampleName: sample name forwarded to plotDist
    :param filePrefix: string prefix; 'gene.csv' and 'family.csv' are
                       appended to it to form the two output names
    :param title: forwarded to plotDist as its fourth positional argument
    :return: None
    """
    # The family-level compression consumes the gene-level result, so the
    # two steps have to run in this order.
    steps = (
        (compressCountsGeneLevel, 'gene.csv'),
        (compressCountsFamilyLevel, 'family.csv'),
    )
    counts = countsVariant
    for compress, suffix in steps:
        counts = compress(counts)
        plotDist(counts, sampleName, filePrefix + suffix, title)
Esempio n. 3
0
def writeJAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Tally the values of the "jgene" column of stats and hand the IGJ
    abundance at variant level and at family level to plotDist.

    :param stats: table-like object; stats["jgene"] must provide .tolist()
                  (presumably a pandas DataFrame - confirm against callers)
    :param sampleName: string used as file name prefix and in the plot title
    :param outDir: directory in which the output files are placed
    :param stream: forwarded to printto and plotDist
    :return: None. When no "jgene" value is present a warning is emitted
             and plotDist is never called.
    """
    rawCounts = Counter(stats["jgene"].tolist())
    # Plain dict keyed by the str() of every observed value.
    igjDist = {str(gene): hits for gene, hits in rawCounts.items()}
    if not igjDist:
        printto(stream, "WARNING: No IGJ hits were detected.", LEVEL.WARN)
        return

    def outputPath(suffix):
        return os.path.join(outDir, sampleName + suffix)

    # Variant level
    plotDist(igjDist,
             sampleName,
             outputPath('_igj_dist_variant_level.csv'),
             stream=stream,
             vertical=False,
             rotateLabels=False)

    # The gene-level distribution is only an intermediate step here: it is
    # not passed to plotDist, it merely feeds the family-level compression.
    geneLevel = compressCountsGeneLevel(igjDist)
    familyLevel = compressCountsFamilyLevel(geneLevel)

    # Family level
    plotDist(familyLevel,
             sampleName,
             outputPath('_igj_dist_family_level.csv'),
             stream=stream,
             title='IGJ Abundance in Sample ' + sampleName)
Esempio n. 4
0
def writeVAbundanceToFiles(stats, sampleName, outDir, stream=None):
    """
    Emit the IGV abundance files and the V-gene alignment quality files for
    one sample.

    In order, this function:
      1. writes the variant-level percentages of the "vgene" column to
         <sampleName>_igv_dist_variant_level.csv through writeCSV,
      2. passes the gene-level and family-level counts to plotDist,
      3. calls generateStatsHeatmap for several pairs of stats columns,
      4. passes the value counts of "vmismatches" and "vgaps" to plotDist.

    :param stats: table-like object; the columns read directly here are
                  "vgene", "vmismatches" and "vgaps" (each must provide
                  .tolist()). The whole object is also forwarded to
                  generateStatsHeatmap. Presumably a pandas DataFrame -
                  confirm against callers.
    :param sampleName: string used as file name prefix
    :param outDir: directory in which the output files are placed
    :param stream: forwarded to printto, plotDist and generateStatsHeatmap
    :return: None. When no "vgene" value is present a warning is emitted
             and nothing else happens.
    """
    igvDist = Counter(stats["vgene"].tolist())
    if not igvDist:
        printto(stream, "WARNING: No IGV hits were detected.", LEVEL.WARN)
        return

    def outputPath(suffix):
        return os.path.join(outDir, sampleName + suffix)

    # Variant level: not plotted by default, only the csv file is written.
    # Labels are ordered by descending count, values are percentages.
    ranked = sorted(igvDist, key=lambda gene: igvDist[gene], reverse=True)
    grandTotal = float(sum(igvDist.values()))
    percentages = [(gene, igvDist[gene] / grandTotal * 100) for gene in ranked]
    writeCSV(outputPath('_igv_dist_variant_level.csv'),
             "x,y\n", "{},{}\n", percentages)

    # Gene level (variants collapsed by compressCountsGeneLevel)
    geneLevel = compressCountsGeneLevel(igvDist)
    plotDist(geneLevel,
             sampleName,
             outputPath('_igv_dist_gene_level.csv'),
             stream=stream,
             vertical=False,
             rotateLabels=False)

    # Family level (genes collapsed by compressCountsFamilyLevel)
    familyLevel = compressCountsFamilyLevel(geneLevel)
    plotDist(familyLevel,
             sampleName,
             outputPath('_igv_dist_family_level.csv'),
             stream=stream)

    # Heatmaps that have no accompanying distribution:
    # (stats columns, axis labels, output file suffix)
    heatmapOnly = (
        # alignment length vs %identity
        (['alignlen', 'identity'], ['Alignment Length', '%Identity'],
         '_igv_align_quality_identity_hm.tsv'),
        # alignment length vs bitScore
        (['alignlen', 'bitscore'], ['Alignment Length', 'bitScore'],
         '_igv_align_quality_bitscore_hm.tsv'),
        # query start vs subject start
        (['vqstart', 'vstart'], ['Query Start', 'Subject Start'],
         '_igv_align_quality_start_hm.tsv'),
    )
    for columns, labels, suffix in heatmapOnly:
        generateStatsHeatmap(stats, sampleName, columns, labels,
                             outputPath(suffix), stream=stream)

    # Heatmap against alignment length, immediately followed by the
    # distribution of the same column:
    # (stats column, axis label, heatmap suffix, distribution suffix, title)
    heatmapAndDist = (
        ('vmismatches', 'Mismatches',
         '_igv_align_quality_mismatches_hm.tsv',
         '_igv_mismatches_dist.csv',
         'Number of Mismatches in V gene'),
        ('vgaps', 'Gaps',
         '_igv_align_quality_gaps_hm.tsv',
         '_igv_gaps_dist.csv',
         'Number of Gaps in V gene'),
    )
    for column, label, hmSuffix, distSuffix, distTitle in heatmapAndDist:
        generateStatsHeatmap(stats, sampleName,
                             ['alignlen', column],
                             ['Alignment Length', label],
                             outputPath(hmSuffix),
                             stream=stream)
        valueCounts = Counter(stats[column].tolist())
        plotDist(valueCounts,
                 sampleName,
                 outputPath(distSuffix),
                 title=distTitle,
                 proportion=True,
                 rotateLabels=False,
                 top=20,
                 stream=stream)
Esempio n. 5
0
def writePrimerStats(end,
                     name,
                     cloneAnnot,
                     fileprefix,
                     category="All",
                     stream=None):
    """
    Summarise the primer annotation of one end of the clones and hand the
    resulting distributions to plotDist.

    The columns '<end>endPrimer', '<end>endMismatchIndex' and
    '<end>endIndelIndex' of cloneAnnot are read. Rows whose primer column
    equals str(np.nan) are counted as 'Unknown' and excluded from everything
    else. For the remaining ("known") rows this function passes to plotDist:
      * the integrity categories (Unknown / Indelled / Mismatched / Intact),
      * the primer counts among indelled rows,
      * the counts of the indel index values among indelled rows,
      * for every known primer, the gene-level 'vgene' counts of the
        indelled rows carrying that primer.
    It also logs up to ten example index labels of indelled and of
    non-indelled rows through printto.

    :param end: primer end identifier; str(end) prefixes the column names
                and the value is interpolated into titles and log messages
    :param name: sample name forwarded to plotDist
    :param cloneAnnot: table supporting column lookup, boolean-mask
                       selection and .index (presumably a pandas DataFrame -
                       confirm against callers). Must also contain 'vgene'.
    :param fileprefix: string prefix of every output file name
    :param category: label interpolated into the plot titles
    :param stream: forwarded to printto
    :return: None
    """
    NA = str(np.nan)
    PRIMER = str(end) + 'endPrimer'
    MISMATCH = str(end) + 'endMismatchIndex'
    INDEL = str(end) + 'endIndelIndex'

    known = cloneAnnot[cloneAnnot[PRIMER] != NA]

    # Compute the masks and the indelled subset once; they are reused by
    # every section below (previously the same filter was rebuilt each time,
    # including on every iteration of the per-primer loop).
    hasIndel = known[INDEL] != 0
    hasMismatch = known[MISMATCH] != 0
    indelled = known[hasIndel]

    integrity = {
        'Unknown': len(cloneAnnot) - len(known),
        # Series.sum() counts the True entries without a Python-level loop.
        'Indelled': hasIndel.sum(),
        'Mismatched': hasMismatch.sum(),
        'Intact': len(known[(known[INDEL] == 0) & (known[MISMATCH] == 0)])
    }

    # A single str.format call: chaining .format() with the % operator
    # misbehaves when `end` contains '%' or `category` is a tuple.
    plotDist(integrity,
             name,
             fileprefix + 'integrity_dist.csv',
             title="Integrity of {}'-end Primer Sequence ({})".format(
                 end, category),
             proportion=True,
             rotateLabels=False)

    invalidClones = known.index[hasIndel].tolist()
    valid = known.index[known[INDEL] == 0].tolist()
    # Slice from the start: the former [1:10] always dropped the first
    # example and printed an empty list when only one row qualified.
    printto(
        stream,
        "Example of Indelled {}'-end: {}".format(end,
                                                 str(invalidClones[:10])),
        LEVEL.INFO)
    printto(
        stream,
        "Example of non-indelled {}'-end: {}".format(end, str(valid[:10])),
        LEVEL.INFO)

    # Primer abundance among indelled rows
    indelledPrimers = Counter(indelled[PRIMER].tolist())
    plotDist(indelledPrimers,
             name,
             fileprefix + 'indelled_dist.csv',
             title='Abundance of Indelled {}\'-end Primers ({})'.format(
                 end, category),
             proportion=False,
             rotateLabels=False,
             vertical=False,
             top=50)

    # Abundance of the indel index values among indelled rows
    indelPositions = Counter(indelled[INDEL].tolist())
    plotDist(
        indelPositions,
        name,
        fileprefix + 'indel_pos_dist.csv',
        title='Abundance of Indel Positions in {}\'-end Primers ({})'.format(
            end, category),
        proportion=False,
        rotateLabels=False,
        vertical=True,
        sortValues=False,
        top=50)

    primers = set(known[PRIMER].tolist())

    for primer in primers:
        # IGV abundance restricted to the indelled rows of this primer
        primerRows = indelled[indelled[PRIMER] == primer]

        germLineDist = compressCountsGeneLevel(
            Counter(primerRows['vgene'].tolist()))
        plotDist(germLineDist,
                 name,
                 fileprefix + primer + '_igv_dist.csv',
                 title='IGV Abundance of indelled {} ({})'.format(
                     primer, category),
                 proportion=False,
                 vertical=False,
                 top=20,
                 rotateLabels=False)