Example no. 1

# Standard-library imports used below; SamtoolsTool, util.file, log,
# get_mpileup_allele_counts, fisher_exact, chi2_contingency, and
# AlleleFieldParser are assumed to come from the surrounding module.
import itertools
import os


def compute_library_bias(isnvs, inBam, inConsFasta):
    ''' For each variant, compute read counts in each library and p-value for
          library bias; append them to string for each variant.
        Format is allele:totalF:totalR:1stLibFCount:1stLibRCount:2ndLibFCount:...:p-val.
        Library counts are in alphabetical order of library IDs.
        Note: Total was computed by vphaser, library counts by samtools mpileup,
          so total might not be sum of library counts.
    '''
    alleleCol = 7 # First column of output with allele counts
    samtoolsTool = SamtoolsTool()
    rgs_by_lib = sorted((rg['LB'], rg['ID'])
        for rg in samtoolsTool.getReadGroups(inBam).values())
    rgs_by_lib = itertools.groupby(rgs_by_lib, lambda x: x[0])
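    # groupby needs sorted input (guaranteed by sorted() above); it yields one
    # (library_id, iterator of (LB, ID) pairs) group per library.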
    libBams = []
    header_sam = util.file.mkstempfname('.sam')
    samtoolsTool.dumpHeader(inBam, header_sam)
    for lib, rgs in rgs_by_lib:
        rgs = [rg_id for _lb, rg_id in rgs]
        
        # Create libBam containing all the readgroups in rgs.
        # In samtools 1.1, this can be done by including -r multiple times on
        # a single command line, but that doesn't work in 0.1.19, so instead
        # extract readgroups one by one and then concatenate.
        rgBams = []
        for rg_id in rgs:
            rgBam = util.file.mkstempfname('.bam')
            samtoolsTool.view(['-b', '-r', rg_id], inBam, rgBam)
            samtoolsTool.index(rgBam)
            if samtoolsTool.count(rgBam) > 0:
                rgBams.append(rgBam)
            else:
                # most samtools functions don't like empty input bams, so skip them
                os.unlink(rgBam)
        if rgBams:
            if len(rgBams) > 1:
                libBam = util.file.mkstempfname('.bam')
                samtoolsTool.merge(rgBams, libBam, ['-f', '-1', '-h', header_sam])
                for bam in rgBams:
                    os.unlink(bam)
            else:
                # samtools merge cannot deal with only one (or zero) input bams
                libBam = rgBams[0]
            samtoolsTool.index(libBam)
            n_reads = samtoolsTool.count(libBam)
            log.debug("LB:%s has %s reads in %s read groups (%s)",
                lib, n_reads, len(rgs), ', '.join(rgs))
            libBams.append(libBam)
        
    for row in isnvs:
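        # Count each allele's forward/reverse reads per library via mpileup,
        # then test for uneven distribution across libraries (library bias)
        # and append the counts and p-value to the allele's field.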
        consensusAllele = row[3]
        pos = int(row[1]) if consensusAllele != 'i' else int(row[1]) - 1
        chrom = row[0]
        libCounts = [get_mpileup_allele_counts(libBam, chrom, pos, inConsFasta)
                     for libBam in libBams]
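        # libCounts[i] maps each allele seen in library i at this position to
        # its [forward, reverse] read counts.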
        numAlleles = len(row) - alleleCol
        countsMatrix = [[0] * numAlleles for _ in libBams]
        libCountsByAllele = []
        for alleleInd in range(numAlleles):
            allele = row[alleleCol + alleleInd].split(':')[0]
            libCountsByAllele.append([])
            for libAlleleCounts, countsRow in zip(libCounts, countsMatrix):
                f, r = libAlleleCounts.get(allele, [0, 0])
                libCountsByAllele[-1].append([f, r])
                countsRow[alleleInd] += f + r
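        # Build a 2 x nLibraries contingency table per allele: row 0 is this
        # allele's count in each library, row 1 is all other alleles' counts.
        # Use Fisher's exact test when it is cheap enough, otherwise fall back
        # to a chi-squared test of independence.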
        for alleleInd in range(numAlleles):
            contingencyTable = [
                [         countsRow[alleleInd]         for countsRow in countsMatrix],
                [sum(countsRow) - countsRow[alleleInd] for countsRow in countsMatrix]]
            rowSums = [sum(tableRow) for tableRow in contingencyTable]
            dofs = len(libCounts) - 1
            if dofs < 1:
                pval = 1.0
            elif min(rowSums) ** dofs / dofs < 10000:
                # At this cutoff, fisher_exact should take <~ 0.1 sec
                pval = fisher_exact(contingencyTable)
            else:
                pval = chi2_contingency(contingencyTable)
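            # Re-serialize the allele field with the per-library counts and the
            # library-bias p-value appended (format described in the docstring).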
            row[alleleCol + alleleInd] = str(AlleleFieldParser(None,
                *(row[alleleCol + alleleInd].split(':') +
                  [pval, libCountsByAllele[alleleInd]])))
        yield row
    for bam in libBams:
        os.unlink(bam)
    os.unlink(header_sam)
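
A minimal usage sketch, not part of the original module: the BAM and FASTA paths below are placeholders, and the empty list stands in for parsed vphaser output rows; it only illustrates that compute_library_bias is a generator whose annotated rows are consumed by iterating.

if __name__ == '__main__':
    # Placeholder inputs -- substitute real vphaser output rows and file paths.
    example_isnvs = []
    for annotated_row in compute_library_bias(example_isnvs,
                                              'mapped.bam', 'consensus.fasta'):
        print('\t'.join(annotated_row))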