def compute_library_bias(isnvs, inBam, inConsFasta):
    ''' For each variant, compute read counts in each library and a p-value
        for library bias; append them to the string for each variant.

        Format is
        allele:totalF:totalR:1stLibFCount:1stLibRCount:2ndLibFCount:...:p-val.
        Library counts are in alphabetical order of library IDs.
        Note: Total was computed by vphaser, library counts by samtools
        mpileup, so total might not be sum of library counts.

        This is a generator. The read groups of inBam are first split into
        one temporary BAM per library (LB tag); read groups and libraries
        with no reads are skipped. Then each row of isnvs is processed and
        yielded.

        Args:
            isnvs: iterable of mutable rows. row[0] is used as the
                chromosome, row[1] as the position, row[3] as the consensus
                allele (for 'i' the position is shifted back by one), and
                every column from index 7 onward is a ':'-separated allele
                field whose first component is the allele.
            inBam: BAM file whose read groups carry 'LB' and 'ID' tags.
            inConsFasta: consensus FASTA, passed through to
                get_mpileup_allele_counts.

        Yields:
            The same row objects, modified in place: each allele column is
            replaced by str(AlleleFieldParser(...)) built from the original
            fields plus the p-value and per-library [forward, reverse]
            counts.

        The per-library BAMs and the dumped header file are removed when the
        generator finishes, raises, or is closed before exhaustion.
        Per-read-group BAMs that are still being assembled when an exception
        occurs are not covered by that cleanup.
    '''
    alleleCol = 7  # First column of output with allele counts
    samtoolsTool = SamtoolsTool()
    # Sorting by (LB, ID) both orders libraries alphabetically and satisfies
    # groupby's requirement that equal keys be adjacent.
    rgs_by_lib = sorted((rg['LB'], rg['ID'])
                        for rg in samtoolsTool.getReadGroups(inBam).values())
    rgs_by_lib = itertools.groupby(rgs_by_lib, lambda x: x[0])
    libBams = []
    header_sam = util.file.mkstempfname('.sam')
    samtoolsTool.dumpHeader(inBam, header_sam)
    try:
        for lib, rgs in rgs_by_lib:
            rgs = [rg_id for _, rg_id in rgs]

            # Create libBam containing all the readgroups in rgs.
            # In samtools 1.1, this can be done by including -r multiple
            # times on a single command line, but that doesn't work in
            # 0.1.19, so instead extract readgroups one by one and then
            # concatenate.
            rgBams = []
            for rg_id in rgs:
                rgBam = util.file.mkstempfname('.bam')
                samtoolsTool.view(['-b', '-r', rg_id], inBam, rgBam)
                samtoolsTool.index(rgBam)
                if samtoolsTool.count(rgBam) > 0:
                    rgBams.append(rgBam)
                else:
                    # most samtools functions don't like empty input bams,
                    # so skip them
                    os.unlink(rgBam)
            if not rgBams:
                continue
            if len(rgBams) > 1:
                libBam = util.file.mkstempfname('.bam')
                samtoolsTool.merge(rgBams, libBam,
                                   ['-f', '-1', '-h', header_sam])
                for bam in rgBams:
                    os.unlink(bam)
            else:
                # samtools merge cannot deal with only one (or zero) input
                # bams
                libBam = rgBams[0]
            samtoolsTool.index(libBam)
            n_reads = samtoolsTool.count(libBam)
            log.debug("LB:%s has %s reads in %s read groups (%s)",
                      lib, n_reads, len(rgs), ', '.join(rgs))
            libBams.append(libBam)

        # Degrees of freedom of the 2 x (number of libraries) table; it is
        # the same for every row and allele.
        dofs = len(libBams) - 1

        for row in isnvs:
            consensusAllele = row[3]
            pos = int(row[1]) if consensusAllele != 'i' else int(row[1]) - 1
            chrom = row[0]
            libCounts = [
                get_mpileup_allele_counts(libBam, chrom, pos, inConsFasta)
                for libBam in libBams]
            numAlleles = len(row) - alleleCol
            # countsMatrix[library][allele] = forward + reverse read count
            countsMatrix = [[0] * numAlleles for _ in libBams]
            libCountsByAllele = []
            alleleFields = []
            for alleleInd in range(numAlleles):
                fields = row[alleleCol + alleleInd].split(':')
                alleleFields.append(fields)
                allele = fields[0]
                perLibCounts = []
                for libAlleleCounts, countsRow in zip(libCounts,
                                                      countsMatrix):
                    f, r = libAlleleCounts.get(allele, [0, 0])
                    perLibCounts.append([f, r])
                    countsRow[alleleInd] += f + r
                libCountsByAllele.append(perLibCounts)

            # The second pass needs the completed countsMatrix, because each
            # table compares one allele against all other alleles combined.
            for alleleInd in range(numAlleles):
                contingencyTable = [
                    [countsRow[alleleInd] for countsRow in countsMatrix],
                    [sum(countsRow) - countsRow[alleleInd]
                     for countsRow in countsMatrix]]
                rowSums = [sum(tableRow) for tableRow in contingencyTable]
                if dofs < 1:
                    # Fewer than two libraries: no bias can be tested.
                    pval = 1.0
                elif min(rowSums) ** dofs / dofs < 10000:
                    # At this cutoff, fisher_exact should take <~ 0.1 sec
                    pval = fisher_exact(contingencyTable)
                else:
                    pval = chi2_contingency(contingencyTable)
                row[alleleCol + alleleInd] = str(AlleleFieldParser(
                    None,
                    *(alleleFields[alleleInd]
                      + [pval, libCountsByAllele[alleleInd]])))
            yield row
    finally:
        # A generator's finally clause also runs when the consumer closes or
        # abandons it early and when an exception propagates, so the
        # per-library BAMs and the header file are removed in those cases
        # too, not only after full iteration.
        for bam in libBams:
            os.unlink(bam)
        os.unlink(header_sam)