Example #1
from collections import Counter
from random import shuffle

import pandas as pd

# `bam` and `utils` are project-local helper modules supplying open_samfile,
# count, and reservoir_sample; the import path below is an assumption.
import bam
import utils


def starts_by_depth(bam_file, config, sample_size=None):
    """
    Return a DataFrame of (reads, starts) points, where reads is the number
    of reads processed and starts is the number of unique start sites seen.
    If sample_size < total reads in the file, the file will be downsampled.
    """
    # bin the curve into roughly 100 points, whatever the file size
    binsize = (bam.count(bam_file, config) // 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = filter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            # key each read by "reference_id:start_position"
            return ":".join([str(read.tid), str(read.pos)])

        # if no sample size is set, use the whole file
        if not sample_size:
            samples = list(map(read_parser, filtered))
        else:
            samples = utils.reservoir_sample(filtered, sample_size,
                                             read_parser)
        shuffle(samples)
        for read in samples:
            counted += 1
            buffer.append(read)
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        # flush whatever is left in the final, partial bin
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
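A minimal usage sketch, assuming starts_by_depth is importable and a BAM file is at hand: the returned DataFrame is a saturation curve, so plotting reads against starts shows whether deeper sequencing would still uncover new start sites. The file name, the empty config dict, and the sample size below are placeholders, not values from the source.

import matplotlib.pyplot as plt

# "sample.bam" and config={} are hypothetical inputs for illustration
df = starts_by_depth("sample.bam", config={}, sample_size=1000000)

# a curve that flattens out indicates the library is saturated
plt.plot(df["reads"], df["starts"])
plt.xlabel("reads sampled")
plt.ylabel("unique start sites")
plt.savefig("start_saturation.png")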
def count_duplicate_starts(bam_file, sample_size=10000000):
    """
    Return a Counter mapping each start site to the number of sampled reads
    that begin there. If sample_size < total reads in the file, the file
    will be downsampled.
    """
    count = Counter()
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = filter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            # key each read by "reference_id:start_position"
            return ":".join([str(read.tid), str(read.pos)])

        samples = utils.reservoir_sample(filtered, sample_size, read_parser)

    count.update(samples)
    return count
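Both functions depend on utils.reservoir_sample, which is project-local and not shown on this page. For reference, here is a minimal sketch of what such a helper typically looks like: Algorithm R over a stream of unknown length, with a transform applied to each item. The signature mirrors how it is called above; the body is an assumption, not the project's actual implementation.

import random

def reservoir_sample(stream, n, transform=None):
    # assumed sketch, not the project's implementation: keep a uniform
    # random sample of up to n items from an iterator of unknown length
    reservoir = []
    for i, item in enumerate(stream):
        if transform is not None:
            item = transform(item)
        if i < n:
            reservoir.append(item)
        else:
            # keep this item with probability n / (i + 1)
            j = random.randint(0, i)
            if j < n:
                reservoir[j] = item
    return reservoir

The Counter returned by count_duplicate_starts can then be summarized with stock Counter methods, for example count.most_common(10) for the ten most duplicated start sites.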