from collections import Counter
from random import shuffle

import pandas as pd

# `bam` and `utils` are assumed to be project-local helper modules providing
# bam.count, bam.open_samfile, and utils.reservoir_sample.


def starts_by_depth(bam_file, config, sample_size=None):
    """
    Return a pandas DataFrame of x, y points where x is the number of reads
    sampled and y is the number of unique start sites identified so far.
    If sample_size is smaller than the total reads in the file, the file
    will be downsampled.
    """
    # bin the curve into roughly 100 points across the whole file
    binsize = (bam.count(bam_file, config) // 100) + 1
    seen_starts = set()
    counted = 0
    num_reads = []
    starts = []
    buffer = []
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = filter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            # identify a start site by reference id and position
            return ":".join([str(read.tid), str(read.pos)])

        # if no sample size is set, use the whole file
        if not sample_size:
            samples = list(map(read_parser, filtered))
        else:
            samples = utils.reservoir_sample(filtered, sample_size, read_parser)
        shuffle(samples)
        for start in samples:
            counted += 1
            buffer.append(start)
            if counted % binsize == 0:
                seen_starts.update(buffer)
                buffer = []
                num_reads.append(counted)
                starts.append(len(seen_starts))
        # flush the remaining partial bin
        seen_starts.update(buffer)
        num_reads.append(counted)
        starts.append(len(seen_starts))
    return pd.DataFrame({"reads": num_reads, "starts": starts})
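
# A minimal usage sketch, not part of the original module: the saturation
# curve returned by starts_by_depth can be plotted directly, since a
# flattening curve indicates library complexity is being exhausted. The
# BAM path and config dict passed in are hypothetical.
def plot_saturation_example(bam_file, config):
    df = starts_by_depth(bam_file, config, sample_size=1000000)
    # pandas' DataFrame.plot uses matplotlib under the hood
    return df.plot(x="reads", y="starts")
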
def count_duplicate_starts(bam_file, sample_size=10000000):
    """
    Return a Counter mapping each start site ("tid:pos") to the number of
    sampled reads sharing that start. If sample_size is smaller than the
    total reads in the file, the file will be downsampled.
    """
    count = Counter()
    with bam.open_samfile(bam_file) as samfile:
        # unmapped reads should not be counted
        filtered = filter(lambda x: not x.is_unmapped, samfile)

        def read_parser(read):
            # identify a start site by reference id and position
            return ":".join([str(read.tid), str(read.pos)])

        samples = utils.reservoir_sample(filtered, sample_size, read_parser)
        count.update(samples)
    return count
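
# A minimal sketch of one way the Counter could be summarized, assuming that
# reads sharing a start site with another read count as duplicates; this
# helper is hypothetical and not part of the original module.
def duplicate_start_rate(bam_file, sample_size=10000000):
    counts = count_duplicate_starts(bam_file, sample_size)
    total = sum(counts.values())
    # reads at a start site seen more than once are treated as duplicates
    duplicated = sum(n for n in counts.values() if n > 1)
    return float(duplicated) / total if total else 0.0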