from collections import Counter
from random import shuffle

import pandas as pd

# `bam` and `utils` are assumed to be project-local helper modules providing
# bam.count, bam.open_samfile, and utils.reservoir_sample.


def starts_by_depth(bam_file, config, sample_size=None):
    """
    Return a pandas DataFrame of x, y points where x is the number of reads
    sampled and y is the number of unique start sites identified so far.
    If sample_size is smaller than the total reads in the file, the file
    will be downsampled.
    """
    # bin the curve into roughly 100 points across the whole file
    binsize = (bam.count(bam_file, config) // 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = filter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            # identify a start site by reference id and position
            return ":".join([str(read.tid), str(read.pos)])

        # if no sample size is set, use the whole file
        if not sample_size:
            samples = list(map(read_parser, filtered))
        else:
            samples = utils.reservoir_sample(filtered, sample_size, read_parser)
        shuffle(samples)
        for start in samples:
            counted += 1
            buffer.append(start)
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        # flush the remaining partial bin
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
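
# A minimal usage sketch, not part of the original module: the saturation
# curve returned by starts_by_depth can be plotted directly, since a
# flattening curve indicates library complexity is being exhausted. The
# BAM path and config dict passed in are hypothetical.
def plot_saturation_example(bam_file, config):
    df = starts_by_depth(bam_file, config, sample_size=1000000)
    # pandas' DataFrame.plot uses matplotlib under the hood
    return df.plot(x="reads", y="starts")
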
def count_duplicate_starts(bam_file, sample_size=10000000):
    """
    Return a Counter mapping each start site ("tid:pos") to the number of
    sampled reads sharing that start. If sample_size is smaller than the
    total reads in the file, the file will be downsampled.
    """
    count = Counter()
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = filter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            # identify a start site by reference id and position
            return ":".join([str(read.tid), str(read.pos)])

        samples = utils.reservoir_sample(filtered, sample_size, read_parser)
        count.update(samples)
    return count
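
# A minimal sketch of one way the Counter could be summarized, assuming that
# reads sharing a start site with another read count as duplicates; this
# helper is hypothetical and not part of the original module.
def duplicate_start_rate(bam_file, sample_size=10000000):
    counts = count_duplicate_starts(bam_file, sample_size)
    total = sum(counts.values())
    # reads at a start site seen more than once are treated as duplicates
    duplicated = sum(n for n in counts.values() if n > 1)
    return float(duplicated) / total if total else 0.0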