Example #1
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # Concatenate chunks
    if len(chunk_outs) == 1:
        subprocess.call(['mv', chunk_outs[0].phased_possorted_bam, outs.phased_possorted_bam])
    else:
        tk_bam.concatenate(outs.phased_possorted_bam, [out.phased_possorted_bam for out in chunk_outs])
    tk_bam.index(outs.phased_possorted_bam)
    outs.phased_possorted_bam_index = outs.phased_possorted_bam + ".bai"

    total_reads = 0
    phased_reads = 0
    molecule_tagged_reads = 0
    for chunk_out in chunk_outs:
        total_reads += chunk_out.total_reads
        phased_reads += chunk_out.phased_reads
        molecule_tagged_reads += chunk_out.molecule_tagged_reads

    outs.total_reads = total_reads
    outs.phased_reads = phased_reads
    outs.molecule_tagged_reads = molecule_tagged_reads

    fract_reads_phased = tk_stats.robust_divide(float(phased_reads), float(total_reads))
    fract_reads_molecule_id = tk_stats.robust_divide(float(molecule_tagged_reads), float(total_reads))

    stats = {
        "fract_reads_phased": fract_reads_phased,
        "fract_reads_molecule_id": fract_reads_molecule_id,
        }

    with open(outs.summary, 'w') as summary_file:
        json.dump(tenkit.safe_json.json_sanitize(stats), summary_file)
Example #2
def evaluate_snp_cluster_calls(cluster_assignment, thresholded_calls, actual):
    """ Args:
        - cluster_assignment: list(int)
        - thresholded_calls: list(int), None if no call
        - actual: list(int) """
    cluster_assignment = np.array(cluster_assignment, dtype=int)
    actual = np.array(actual, dtype=int)

    minor_called_class = 1 - sp_stats.mode(cluster_assignment).mode[0]
    minor_actual_class = 1 - sp_stats.mode(actual).mode[0]

    was_called = np.array([x is not None for x in thresholded_calls])

    called_pos = (cluster_assignment == minor_called_class)[was_called]
    actual_pos = (actual == minor_actual_class)[was_called]

    nc = sum(np.logical_not(was_called))
    tp = sum(called_pos & actual_pos)
    tn = sum(np.logical_not(called_pos) & np.logical_not(actual_pos))
    fp = sum(called_pos & np.logical_not(actual_pos))
    fn = sum(np.logical_not(called_pos) & actual_pos)

    return {
        'tp': tp,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'sensitivity': tk_stats.robust_divide(tp, tp + fn),
        'ppv': tk_stats.robust_divide(tp, tp + fp),
        'no_call_rate': tk_stats.robust_divide(nc, len(actual)),
    }
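
A minimal usage sketch, not part of the original source: toy inputs where class 1 is the minor class and one position is left uncalled. It assumes the function runs in its original module, so np, sp_stats, and tk_stats are already imported there.

cluster_assignment = [0, 0, 1, 1, 0, 0]    # cluster 1 is the minor (called) class
thresholded_calls = [0, 0, 1, None, 0, 0]  # one position has no call
actual = [0, 0, 1, 1, 0, 0]                # ground truth; class 1 is the minor class
metrics = evaluate_snp_cluster_calls(cluster_assignment, thresholded_calls, actual)
# Expected: tp=1, tn=4, fp=0, fn=0, sensitivity=1.0, ppv=1.0, no_call_rate=1/6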
Example #3
    def checkOverlap(self, chrom, start, end):
        # database_bed is the object returned by read_database_bed
        # Returns an OverlapInfo containing:
        #    1. True or False for finding or not finding an overlap
        #    2. Total overlapping base pairs
        #    3. The fraction of the query covered by the overlaps
        #    4. The largest fraction by which any single overlapping database region is covered
        #    5. The size of the largest overlapping database region
        #    6. Number of regions overlapped
        if start > end:
            raise Exception("errRegionForamt",
                            "the stop position is smaller than the start position " + " ".join([str(start), str(end)]))

        if chrom not in self.content:
            return OverlapInfo(False, 0, 0, 0, 0, 0, self.name)

        overlapping_regions = self.content[chrom].overlapping_regions(start, end)
        if len(overlapping_regions) == 0:
            return OverlapInfo(False, 0, 0, 0, 0, 0, self.name)

        for r in overlapping_regions:
            key = "_".join([chrom, str(r[0]), str(r[1])])
            self.found[key] = 1
        region_sizes = [r[1] - r[0] for r in overlapping_regions]
        overlapping_sizes = [min(end, r[1]) - max(start, r[0]) for r in overlapping_regions]
        overlapping_fractions = [robust_divide(o * 1.0, s) for s, o in zip(region_sizes, overlapping_sizes)]
        total_overlap_size = sum(overlapping_sizes)
        fraction_as_query = robust_divide(total_overlap_size * 1.0, (end - start))
        return OverlapInfo(total_overlap_size > 0, total_overlap_size, fraction_as_query,
                           max(overlapping_fractions), max(region_sizes), len(overlapping_sizes), self.name)
Example #4
def split(args):
    # default to downsampling by mapped reads
    downsample = True
    use_raw_reads = False

    if args.normalization_mode == cr_constants.NORM_MODE_RAW:
        use_raw_reads = True
    elif args.normalization_mode == cr_constants.NORM_MODE_NONE:
        downsample = False

    # compute downsample rates for each gem group
    downsample_map = args.detect_cells_gg_metrics
    with cr_mol_counter.MoleculeCounter.open(args.molecules, 'r') as mol_counter:
        for (gg, submetrics) in mol_counter.get_metric(cr_mol_counter.GEM_GROUPS_METRIC).iteritems():
            info = downsample_map[str(gg)]
            info['total_reads'] = submetrics[cr_mol_counter.GG_TOTAL_READS_METRIC]
            reads = info['total_reads'] if use_raw_reads else info['cmb_reads']
            cells = info['cells']
            info['rpc'] = tk_stats.robust_divide(reads, cells) if cells > 0 else 0.0

    lowest_rpc = min([gg['rpc'] for gg in downsample_map.values()])
    for gg, info in downsample_map.iteritems():
        if downsample and len(downsample_map) > 1:
            if lowest_rpc == 0:
                # one or more samples are empty. just do the naive thing for now.
                frac_reads_kept = 0.0
            else:
                frac_reads_kept = tk_stats.robust_divide(
                    lowest_rpc, info['rpc'])
        else:
            frac_reads_kept = 1.0
        info['frac_reads_kept'] = frac_reads_kept

    # Split the molecule info h5 into equi-RAM chunks, preserving (barcode, gem_group) boundaries
    # Assumes the molecule_info is sorted by (barcode, gem_group)
    chunks = []
    with cr_mol_counter.MoleculeCounter.open(args.molecules, 'r') as mol_counter:
        for chunk_start, chunk_len in mol_counter.get_chunks(
                cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK,
                preserve_boundaries=False):
            chunks.append({
                'downsample': downsample,
                'downsample_map': downsample_map,
                'chunk_start': str(chunk_start),
                'chunk_len': str(chunk_len),
                '__mem_gb': cr_mol_counter.MoleculeCounter.estimate_mem_gb(chunk_len),
            })
    return {'chunks': chunks}
Example #5
def compute_summary_metrics(misc_sm):
    """Called in the join step to extract summary metrics from the pooled summarizer objects"""
    metrics = misc_sm.get_summarizer('metrics')

    metrics['r1_q30_bases_fract'] = robust_divide(metrics['r1_q30_bases'],
                                                  metrics['r1_tot_bases'])
    metrics['r2_q30_bases_fract'] = robust_divide(metrics['r2_q30_bases'],
                                                  metrics['r2_tot_bases'])
    metrics['si_q30_bases_fract'] = robust_divide(metrics['si_q30_bases'],
                                                  metrics['si_tot_bases'])
    metrics['bc_q30_bases_fract'] = robust_divide(metrics['bc_q30_bases'],
                                                  metrics['bc_tot_bases'])

    return metrics
Example #6
def add_doublet_rate_metrics(summary_info, singlecell_df, species_list):
    """Infer doublet rate from observed doublets"""
    def infer_multiplets_from_observed(n_obs_multiplets, n_cells0, n_cells1):
        """Estimates the number of real multiplets based on the number observed from a barnyard (mixed species) experiment"""
        if n_cells0 == 0 or n_cells1 == 0 or n_obs_multiplets == 0:
            return 0

        # Prior probability of a doublet given counts for each cell type (ignore N_cells > 2)
        # float() guards against Python 2 integer division truncating the ratios to 0
        p_obs_multiplet = (2 * (float(n_cells0) / (n_cells0 + n_cells1)) *
                           (float(n_cells1) / (n_cells0 + n_cells1)))

        # Brute force MLE of binomial n
        likelihood = scipy.stats.binom.pmf(n_obs_multiplets,
                                           xrange(0, n_cells0 + n_cells1),
                                           p_obs_multiplet)
        return np.argmax(likelihood)

    has_species_info = (species_list != [""])
    if not has_species_info or len(species_list) < 2:
        return summary_info

    counts = []
    cell_barcodes_dict = {}
    for species in species_list:
        species_cell_mask = singlecell_df["is_%s_cell_barcode" % species] == 1
        print singlecell_df['barcode'][species_cell_mask].values.tolist()
        cell_barcodes_dict[species] = singlecell_df['barcode'][
            species_cell_mask].values.tolist()
        counts.append(len(cell_barcodes_dict[species]))
    total_unique_cell_barcodes = {
        bc
        for barcodes in cell_barcodes_dict.values() for bc in barcodes
    }
    total_cell_barcodes = sum(counts)
    summary_info['cells_detected'] = len(total_unique_cell_barcodes)
    if len(species_list) > 1:
        observed_doublets = total_cell_barcodes - len(
            total_unique_cell_barcodes)
        observed_doublet_rate = robust_divide(observed_doublets,
                                              total_cell_barcodes)
        inferred_doublets = infer_multiplets_from_observed(
            observed_doublets, counts[0], counts[1])
        inferred_doublet_rate = robust_divide(inferred_doublets,
                                              total_cell_barcodes)
        summary_info['observed_doublets'] = observed_doublets
        summary_info['observed_doublet_rate'] = observed_doublet_rate
        summary_info['inferred_doublets'] = inferred_doublets
        summary_info['inferred_doublet_rate'] = inferred_doublet_rate

    return summary_info
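
The nested MLE above can be exercised on its own; below is a standalone sketch of the same brute-force binomial estimate with toy numbers (not from the original source), written with explicit float division so it behaves the same under Python 2.

import numpy as np
import scipy.stats

def infer_multiplets_standalone(n_obs_multiplets, n_cells0, n_cells1):
    # Probability that a multiplet is observable, i.e. contains one cell of each species.
    p_obs = 2.0 * (float(n_cells0) / (n_cells0 + n_cells1)) * (float(n_cells1) / (n_cells0 + n_cells1))
    # Brute-force MLE of the binomial n given the observed cross-species multiplets.
    likelihood = scipy.stats.binom.pmf(n_obs_multiplets, np.arange(n_cells0 + n_cells1), p_obs)
    return np.argmax(likelihood)

# With 1000 cells per species, p_obs is 0.5, so 40 observed barnyard doublets
# imply roughly 80 total multiplets.
print(infer_multiplets_standalone(40, 1000, 1000))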
Example #7
    def _compute_count_purity(counts0, counts1):
        """ Compute fraction of counts in putative single-cell GEMs
        originating from the non-cell transcriptome """
        gem_occupancy = MultiGenomeAnalysis._classify_gems(counts0, counts1)
        frac0 = counts0.astype(float) / (counts0 + counts1).astype(float)
        purity0 = frac0[gem_occupancy == cr_constants.GEM_CLASS_GENOME0]
        purity1 = 1 - frac0[gem_occupancy == cr_constants.GEM_CLASS_GENOME1]
        overall_purity = np.concatenate([purity0, purity1])

        # Compute number of purity outliers
        threshold0, threshold1 = 1.0, 1.0
        fit_purity0 = purity0[np.logical_and(purity0 > 0, purity0 < 1)]
        fit_purity1 = purity1[np.logical_and(purity1 > 0, purity1 < 1)]
        if len(fit_purity0) > 1 and len(fit_purity1) > 1:
            try:
                alpha0, beta0, _, _ = scipy.stats.beta.fit(fit_purity0,
                                                           floc=0,
                                                           fscale=1)
                alpha1, beta1, _, _ = scipy.stats.beta.fit(fit_purity1,
                                                           floc=0,
                                                           fscale=1)
                threshold0 = scipy.stats.beta.ppf(
                    cr_constants.COUNT_PURITY_OUTLIER_PROB_THRESHOLD, alpha0,
                    beta0)
                threshold1 = scipy.stats.beta.ppf(
                    cr_constants.COUNT_PURITY_OUTLIER_PROB_THRESHOLD, alpha1,
                    beta1)
            except scipy.stats._continuous_distns.FitSolverError as e:
                print >> sys.stderr, e
                threshold0, threshold1 = 1.0, 1.0
            except scipy.stats._continuous_distns.FitDataError as e:
                print >> sys.stderr, e
                threshold0, threshold1 = 1.0, 1.0

        outlier0 = np.logical_and(
            gem_occupancy == cr_constants.GEM_CLASS_GENOME0,
            frac0 < threshold0)
        outlier1 = np.logical_and(
            gem_occupancy == cr_constants.GEM_CLASS_GENOME1,
            (1 - frac0) < threshold1)
        n_outlier0 = sum(outlier0)
        n_outlier1 = sum(outlier1)
        frac_outlier0 = tk_stats.robust_divide(n_outlier0, len(purity0))
        frac_outlier1 = tk_stats.robust_divide(n_outlier1, len(purity1))
        is_outlier = np.logical_or(outlier0, outlier1).astype(int)

        return (purity0.mean(), purity1.mean(), overall_purity.mean(),
                n_outlier0, n_outlier1, frac_outlier0, frac_outlier1,
                is_outlier)
Example #8
def join(args, outs, chunk_defs, chunk_outs):
    os.mkdir(outs.demultiplexed_fastq_path)

    # Move output file to final location
    for chunk_out in chunk_outs:
        for f in os.listdir(chunk_out.demultiplexed_fastq_path):
            in_file = os.path.join(chunk_out.demultiplexed_fastq_path, f)
            subprocess.call(['mv', in_file, outs.demultiplexed_fastq_path])

    # Combine result data
    r = {
        'num_reads': 0,
        'num_clusters': 0,
        'invalid_count': 0,
        'sample_index_counts': {}
    }
    for chunk_out in chunk_outs:
        # We count each end of a paired-end read separately in the summary file.
        summary_counts = json.load(open(chunk_out.demultiplex_summary))
        num_clusters = sum(summary_counts.values())
        num_reads = 2 * num_clusters
        invalid_reads = summary_counts[INVALID_SAMPLE_INDEX]
        del summary_counts[INVALID_SAMPLE_INDEX]
        summary_counts = {k: 2 * v for (k, v) in summary_counts.iteritems()}
        r['num_clusters'] += num_clusters
        r['num_reads'] += num_reads
        r['invalid_count'] += invalid_reads
        r['sample_index_counts'] = tk_dict.add_dicts(r['sample_index_counts'],
                                                     summary_counts,
                                                     depth=1)
    r['invalid_frac'] = tk_stats.robust_divide(r['invalid_count'],
                                               r['num_clusters'])

    json.dump(r, open(outs.demultiplex_summary, 'w'))
Example #9
def split(args):
    # determine the number of fastq files for each library and gem group: {gem_group: {library_type: count_of_fastq_files}}
    chunk_counts = defaultdict(lambda: defaultdict(int))
    for chunk in args.chunks:
        chunk_counts[chunk["gem_group"]][chunk["library_type"]] += 1

    single_library = True
    for gem_group in chunk_counts:
        if len(chunk_counts[gem_group]) > 1:
            single_library = False

    if single_library:
        martian.log_info(
            'Single library in input. No need to check barcode compatibility.')
        # `[]` for the chunks will skip the main stage
        return {'chunks': [], 'join': {}}

    num_reads_to_check_barcode = cr_constants.NUM_READS_TO_CHECK_BARCODE if args.num_reads_to_check_barcode is None else args.num_reads_to_check_barcode
    chunks = []
    for chunk in args.chunks:
        chunk_def = chunk
        chunk_def['num_reads_per_chunk_to_check_barcode'] = int(
            tk_stats.robust_divide(
                num_reads_to_check_barcode,
                chunk_counts[chunk["gem_group"]][chunk["library_type"]]))
        chunks.append(chunk_def)

    return {'chunks': chunks, 'join': {'__mem_gb': 4}}
Example #10
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    if args.confident_regions is None:
        confident_regions = None
    else:
        confident_regions = tk_io.get_target_regions(
            open(args.confident_regions))

    outfile = open(outs.confident_windows, "w")
    for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
        conf_regions = get_conf_regions(chrom, confident_regions)
        location = start
        while location < end:
            region = tk_regions.Regions(regions=[(location, location +
                                                  args.window_size)])
            isect = region.intersect(conf_regions)
            size = isect.get_total_size()
            percent = tk_stats.robust_divide(float(size),
                                             float(args.window_size))
            row = [chrom, location, location + args.window_size, percent]
            outfile.write("\t".join(map(str, row)) + "\n")
            location += args.window_size
    outfile.close()
Example #11
def infer_barcode_reverse_complement(barcode_whitelist, barcode_files):
    rc_valid_count = 0
    reg_valid_count = 0
    if barcode_whitelist:
        barcode_rc = []
        for barcode_file in barcode_files:
            read_num = 0

            if barcode_file[-3:] == ".gz":
                barcode_open_file = gzip.open(barcode_file)
            else:
                barcode_open_file = open(barcode_file, 'r')
            read_iter = tk_fasta.read_generator_fastq(barcode_open_file)
            for (name, seq, qual) in read_iter:
                if seq in barcode_whitelist:
                    reg_valid_count += 1
                if tk_seq.get_rev_comp(seq) in barcode_whitelist:
                    rc_valid_count += 1
                if read_num > 1000:
                    break
                read_num += 1

            if tk_stats.robust_divide(float(rc_valid_count), float(rc_valid_count + reg_valid_count)) > 0.75:
                barcode_rc.append(True)
            else:
                barcode_rc.append(False)
            barcode_open_file.close()
        return barcode_rc
    else:
        return [False] * len(barcode_files)
Example #12
def generate_cell_calling_metrics(parameters, cell_barcodes):
    summary_info = {}
    species_list = parameters.keys()
    for species in species_list:
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)

        # Cell calling metrics
        summary_info["fitted_mean_noise{}".format(key_suffix)] = parameters[species]["noise_mean"]
        summary_info["fitted_dispersion_noise{}".format(key_suffix)] = parameters[species]["noise_dispersion"]
        summary_info["fitted_mean_signal{}".format(key_suffix)] = parameters[species]["signal_mean"]
        summary_info["fitted_dispersion_signal{}".format(key_suffix)] = parameters[species]["signal_dispersion"]
        summary_info["fraction_cell_calling_noise{}".format(key_suffix)] = parameters[species]["fraction_noise"]
        summary_info["cell_threshold{}".format(key_suffix)] = parameters[species]["cell_threshold"]
        summary_info["goodness_of_fit{}".format(key_suffix)] = parameters[species]["goodness_of_fit"]
        summary_info["estimated_cells_present{}".format(key_suffix)] = parameters[species]["estimated_cells_present"]

        summary_info["annotated_cells{}".format(key_suffix)] = len(cell_barcodes[species])
        summary_info["estimated_fraction_cells_annotated{}".format(key_suffix)] = \
            robust_divide(len(cell_barcodes[species]), parameters[species]["estimated_cells_present"])

    summary_info["cells_detected"] = len(
        {bc for barcodes in cell_barcodes.values() for bc in barcodes})

    return summary_info
Example #13
def get_cov_frac(black_regions, chrom, start, stop):
    regions = tk_sv_utils.strictly_overlapping_regions(
        black_regions, chrom, start, stop)
    tot_black = np.sum([r[1] - r[0] for r in regions])
    tot_len = float(stop - start)
    black_frac = tk_stats.robust_divide(tot_black, tot_len)
    return black_frac
Example #14
def split(args):
    input_bam = tk_bam.create_bam_infile(args.bam_infile)

    chroms = input_bam.references
    chrom_lengths = input_bam.lengths

    cov_hist = p.read_csv(args.cov_hist)
    weighted_count = cov_hist.counts[1:] * cov_hist.coverage[1:]
    mean_pos_cov = tk_stats.robust_divide(weighted_count.sum(),
                                          cov_hist.counts[1:].sum())

    primary_contigs = tenkit.reference.load_primary_contigs(
        args.reference_path) - {'chrM', 'chrY'}

    loci = tk_chunks.chunk_by_locus(chroms,
                                    chrom_lengths,
                                    tenkit.constants.PARALLEL_LOCUS_SIZE * 2,
                                    contig_whitelist=primary_contigs,
                                    extra_args={'mean': mean_pos_cov})

    # Handle empty case
    if len(loci) == 0:
        loci = [{'locus': None, 'mean': None}]

    return {'chunks': loci, 'join': {'__mem_gb': 12.0}}
Example #15
def load_barcode_dist(filename, barcode_whitelist, gem_group, proportions=True):
    """ Load barcode count distribution from a json file """
    # Input barcode whitelist must be an ordered type;
    # safeguard against it going out of sync with the distribution file
    assert barcode_whitelist is None or isinstance(barcode_whitelist, list)

    if not os.path.isfile(filename):
        return None

    with open(filename, 'r') as f:
        values = json.load(f)

    start = (gem_group - 1) * len(barcode_whitelist)
    end = gem_group * len(barcode_whitelist)
    barcode_counts = {
        bc: value
        for bc, value in zip(barcode_whitelist, values[start:end])
    }
    if proportions:
        total_barcode_counts = sum(barcode_counts.values())
        barcode_dist = {
            bc: tk_stats.robust_divide(float(value),
                                       float(total_barcode_counts))
            for bc, value in barcode_counts.iteritems()
        }
        return barcode_dist
    else:
        return barcode_counts
Example #16
def _compute_frac_barcodes_on_whitelist(fastqs, barcode_whitelist_set, reads_interleaved, read_def):
    """ Compute fraction of observed barcodes on the barcode whitelist """
    num_reads = 0
    barcodes_on_whitelist = 0

    for fastq in fastqs:
        barcode_reads = cr_fastq.FastqReader({read_def.read_type: fastq},
                                             read_def,
                                             reads_interleaved,
                                             None, None)

        for read in barcode_reads.in_iter:
            if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
                break

            _, barcode, _ = read

            num_reads += 1
            if barcode in barcode_whitelist_set:
                barcodes_on_whitelist += 1

        if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
            break

    if num_reads > 0:
        return tk_stats.robust_divide(barcodes_on_whitelist, num_reads)
    else:
        return 0.0
Example #17
def merge_filtered_metrics(filtered_metrics):
    result = {
        'filtered_bcs': 0,
        'filtered_bcs_lb': 0,
        'filtered_bcs_ub': 0,
        'max_filtered_bcs': 0,
        'filtered_bcs_var': 0,
        'filtered_bcs_cv': 0,
    }
    for i, fm in enumerate(filtered_metrics):
        # Add per-gem group metrics
        result.update({
            'gem_group_%d_%s' % (i + 1, key): value
            for key, value in fm.iteritems()
        })

        # Compute metrics over all gem groups
        result['filtered_bcs'] += fm['filtered_bcs']
        result['filtered_bcs_lb'] += fm['filtered_bcs_lb']
        result['filtered_bcs_ub'] += fm['filtered_bcs_ub']
        result['max_filtered_bcs'] += fm['max_filtered_bcs']
        result['filtered_bcs_var'] += fm['filtered_bcs_var']

    # Estimate CV based on sum of variances and means
    result['filtered_bcs_cv'] = tk_stats.robust_divide(
        np.sqrt(result['filtered_bcs_var']), fm['filtered_bcs'])

    return result
Example #18
def join(args, outs, chunk_defs, chunk_outs):
    if args.output_format == 'bam':
        tenkit.bam.concatenate(outs.barcoded_unaligned,
                               [c.barcoded_unaligned for c in chunk_outs])
        outs.barcoded = None

    elif args.output_format == 'fastq':
        fqs = [c.barcoded for c in chunk_outs]
        subprocess.check_call('cat ' + ' '.join(fqs) + ' | bgzip -c > ' +
                              outs.barcoded,
                              shell=True)
        outs.barcoded_unaligned = None

    # Make a basic set of metrics
    num_pairs = sum(c.num_pairs for c in chunk_outs)
    correct_bc_pairs = sum(c.correct_bc_pairs for c in chunk_outs)

    stats = {}
    stats['num_read_pairs'] = num_pairs
    stats['bc_on_whitelist'] = tk_stats.robust_divide(float(correct_bc_pairs),
                                                      num_pairs)

    if args.bc_counts is not None:
        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        count_arrays = [
            np.array(gem_group['bc_counts'], dtype=np.float)
            for gem_group in counts.values()
        ]

        # Compute effective BC diversity and n90 bc count
        bc_df = pandas.DataFrame(
            {'bc_num_reads': np.concatenate(count_arrays)})

        # Read-based effective diversity
        reads = bc_df.bc_num_reads.values
        sum_sq = (reads**2.0).sum()
        effective_diversity = tk_stats.robust_divide((reads.sum()**2.0),
                                                     float(sum_sq))
        stats['barcode_diversity'] = effective_diversity
    else:
        stats['barcode_diversity'] = None

    basic_stats = pandas.DataFrame(stats, index=[0])
    basic_stats.to_csv(outs.basic_stats, index=False)
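
The "effective diversity" computed above is the inverse Simpson index of the per-barcode read counts, (Σx)² / Σx². A small self-contained check with toy counts (not from the original source):

import numpy as np

reads = np.array([100.0, 100.0, 100.0, 100.0])       # perfectly even across 4 barcodes
print((reads.sum() ** 2.0) / ((reads ** 2.0).sum()))  # 4.0

reads = np.array([400.0, 1.0, 1.0, 1.0])              # dominated by one barcode
print((reads.sum() ** 2.0) / ((reads ** 2.0).sum()))  # ~1.02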
Example #19
def _summarize_per_barcode(a):
    mean = np.mean(a)
    stddev = np.std(a)
    return {
        'mean': mean,
        'median': np.median(a),
        'cv': tk_stats.robust_divide(float(stddev), float(mean)),
        'iqr': np.percentile(a, 75) - np.percentile(a, 25),
    }
Example #20
def sample_bcs_and_het_snps(vcf, contigs):
    block_size = 1e6
    num_samples = 10
    total_recs = 0
    het_snp_recs = 0
    het_snp_bcs = 0
    total_size = 0
    for i in xrange(num_samples):
        (chrom, length) = random.choice(contigs)
        start = random.randint(0, max(0, length - block_size))
        end = min(start + block_size, length)
        (tot, snp, bcs) = sample_by_locus(vcf, (chrom, start, end))
        total_recs += tot
        het_snp_recs += snp
        het_snp_bcs += bcs
        total_size += (end - start)
    return (tk_stats.robust_divide(het_snp_recs, total_recs),
            tk_stats.robust_divide(het_snp_bcs, het_snp_recs),
            tk_stats.robust_divide(het_snp_recs, total_size))
Example #21
def summarize_bootstrapped_top_n(top_n_boot):
    top_n_bcs_mean = np.mean(top_n_boot)
    top_n_bcs_sd = np.std(top_n_boot)
    top_n_bcs_var = np.var(top_n_boot)
    result = {}
    result['filtered_bcs_var'] = top_n_bcs_var
    result['filtered_bcs_cv'] = tk_stats.robust_divide(top_n_bcs_sd, top_n_bcs_mean)
    result['filtered_bcs_lb'] = round(sp_stats.norm.ppf(0.025, top_n_bcs_mean, top_n_bcs_sd))
    result['filtered_bcs_ub'] = round(sp_stats.norm.ppf(0.975, top_n_bcs_mean, top_n_bcs_sd))
    result['filtered_bcs'] = int(round(top_n_bcs_mean))
    return result
Example #22
def get_protospacer_call_metrics(ps_calls_summary, num_gex_cbs, report_prefix):
    metrics_dict = {}
    num_cells_with_multiple_protospacers = ps_calls_summary.loc[
        'More than 1 protospacer expressed', 'num_cells']
    num_cells_with_protospacer = (
        ps_calls_summary.loc['1 protospacer expressed', 'num_cells'] +
        num_cells_with_multiple_protospacers)

    frac_cells_with_protospacer = tk_stats.robust_divide(
        num_cells_with_protospacer, num_gex_cbs)
    frac_cells_with_multiple_protospacer = tk_stats.robust_divide(
        num_cells_with_multiple_protospacers, num_gex_cbs)

    metrics_dict.update({
        report_prefix + 'frac_cells_with_protospacer':
        frac_cells_with_protospacer,
        report_prefix + 'frac_cells_with_multiple_protospacer':
        frac_cells_with_multiple_protospacer,
    })

    return metrics_dict
Example #23
def infer_barcode_reverse_complement(barcode_whitelist, read_iter):
    if barcode_whitelist is None:
        return False
    reg_valid_count = 0
    rc_valid_count = 0
    for name, seq, qual in itertools.islice(read_iter, cr_constants.NUM_CHECK_BARCODES_FOR_ORIENTATION):
        if seq in barcode_whitelist:
            reg_valid_count += 1
        if tk_seq.get_rev_comp(seq) in barcode_whitelist:
            rc_valid_count += 1

    frac_rc = tk_stats.robust_divide(rc_valid_count, rc_valid_count + reg_valid_count)
    return frac_rc >= cr_constants.REVCOMP_BARCODE_THRESHOLD
Example #24
    def checkCallPerformance(self, SENIDX=0, PPVIDX=2, trackAvoided=[], overlapThr={}):
        self.clearStatus()
        overlapThrsGood = True
        for n in self.genomeTracks:
            if n not in overlapThr:
                overlapThrsGood = False
                break
            if overlapThr[n] < 0.0 or overlapThr[n] > 1.0:
                overlapThrsGood = False
                break

        if not overlapThrsGood:
            overlapThr = self.genomeTracksOverlapThr


        validTrackAvoided = []
        for n in trackAvoided:
            if n in self.genomeTracks:
                validTrackAvoided.append(n)


        for i in range(self.TotalEvent):
            evt = self.AllEvents[i]
            isPassTrackFilter = True
            for n in self.genomeTracks:
                if self.Status[i].trackInfo[n].queryFraction > overlapThr[n]:
                    isPassTrackFilter = False
                    break

            if isPassTrackFilter:
                self.Status[i].isPass = True

                self.AllPos += 1
                for j in range(self.NumTD):
                    if self.TruthData[j].checkOverlap(evt.chrom, evt.start, evt.end)[0]:
                        self.NPos[j] += 1
                        self.Status[i].TDStatus[j].isTrue = True
                        self.Status[i].TDStatus[j].isFalse = False
                    else:
                        self.Status[i].TDStatus[j].isTrue = False
                        self.Status[i].TDStatus[j].isFalse = True
            else:
                self.Status[i].isPass = False
        self.HasPerformance = True

        #print len(self.AllEvents), numFailDMAS, numFailTrack, numFail

        if self.AllPos <= 0:
            return self.TruthData[SENIDX].getSensitivity(), 0.0, self.AllPos
        else:
            return self.TruthData[SENIDX].getSensitivity(), robust_divide(self.NPos[PPVIDX]*1.0, self.AllPos), self.AllPos
Example #25
def construct_perturbation_efficiency_summary(f_change, f_change_ci, num_cells_per_perturbation, by_feature,
                                              summary_columns=PERTURBATION_EFFICIENCY_SUMMARY_COLUMNS):
    if (f_change is None) or (f_change_ci is None):
        return None

    if by_feature:
        summary_columns[1] = 'Target Guide'
    else:
        summary_columns[1] = 'Target Gene'

    this_df = pd.DataFrame(columns = summary_columns)
    counter = 0
    control_num_cells = num_cells_per_perturbation['Non-Targeting']
    for key in sorted(f_change.keys()):
        this_key_results = f_change.get(key)
        this_key_ci = f_change_ci.get(key)
        if this_key_results is None:
            continue

        this_num_cells = num_cells_per_perturbation[key]

        for (ps, results) in this_key_results.iteritems():
            lower_bound = this_key_ci.get(ps)[0]
            upper_bound = this_key_ci.get(ps)[1]
            this_df.loc[counter] = (key,
                                    ps,
                                    results[0],
                                    results[1],
                                    lower_bound,
                                    upper_bound,
                                    this_num_cells,
                                    tk_stats.robust_divide(results[2], this_num_cells),
                                    control_num_cells,
                                    tk_stats.robust_divide(results[3], control_num_cells)
                                    )
            counter += 1
    this_df.sort_values(by=['Log2 Fold Change'], ascending = True, inplace = True)
    return this_df
Example #26
def get_depth_positional_cv(info, trim_tail):
    fixed_info = {int(x): y for (x, y) in info.iteritems()}
    total_count = sum(fixed_info.values())
    cutoff_count = total_count * trim_tail
    seen_count = 0
    for depth in sorted(fixed_info.iterkeys(), reverse=True):
        seen_count += fixed_info[depth]
        if seen_count >= cutoff_count:
            cutoff = depth
            break
    trimmed_info = {x: y for (x, y) in fixed_info.iteritems() if x <= cutoff}
    mean_val, var_val = tk_stats.mean_var_from_counts(trimmed_info)
    if mean_val > var_val:
        return float('NaN')
    return tk_stats.robust_divide(numpy.sqrt(var_val - mean_val), mean_val)
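
Subtracting the mean under the square root reads like a correction for Poisson counting noise (for which variance ≈ mean), leaving the CV of the underlying positional coverage; a toy calculation under that reading (not from the original source):

import math

# Hypothetical trimmed depth histogram summary: mean depth 100, variance 500.
mean_val, var_val = 100.0, 500.0
# The excess (non-Poisson) variance is 500 - 100 = 400, so the positional CV is
# sqrt(400) / 100 = 0.2.
print(math.sqrt(var_val - mean_val) / mean_val)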
Example #27
def split(args):
    # Need to store umi_info and a json with a dict containing 1 key per barcode
    umi_info_mem_gb = 2 * int(np.ceil(vdj_umi_info.get_mem_gb(args.umi_info)))

    bc_diversity = len(cr_utils.load_barcode_whitelist(args.barcode_whitelist))
    assemble_summary_mem_gb = tk_stats.robust_divide(bc_diversity,
                                                     DICT_BCS_PER_MEM_GB)

    return {
        'chunks': [{
            '__mem_gb':
            int(
                np.ceil(
                    max(cr_constants.MIN_MEM_GB,
                        umi_info_mem_gb + assemble_summary_mem_gb))),
        }]
    }
Example #28
def get_depth_info_json(info):
    fixed_info = {int(x): y for (x, y) in info.iteritems()}

    total_depth_counts = sum(fixed_info.values())
    median_depth = None
    sorted_depths = sorted(fixed_info.keys())
    seen_depth_count = 0
    mean_depth = 0.0
    for depth in sorted_depths:
        seen_depth_count += fixed_info[depth]
        mean_depth += float(
            depth * fixed_info[depth]) / float(total_depth_counts)
        if seen_depth_count > total_depth_counts / 2 and median_depth is None:
            median_depth = depth
    zero_cov_fract = tk_stats.robust_divide(float(fixed_info.get(0, 0.0)),
                                            float(total_depth_counts))

    return (mean_depth, median_depth, zero_cov_fract)
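
A quick toy check of the histogram summary above (not from the original source), assuming it runs in the original module so tk_stats is already imported there:

info = {"0": 10, "1": 20, "2": 40, "3": 30}   # depth -> number of positions
mean_depth, median_depth, zero_cov_fract = get_depth_info_json(info)
# mean = (0*10 + 1*20 + 2*40 + 3*30) / 100 = 1.9
# median = 2 (first depth where the running count exceeds 100/2)
# zero_cov_fract = 10 / 100 = 0.1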
Example #29
def _compute_frac_barcodes_on_whitelist(fastqs,
                                        barcode_whitelist_set,
                                        reads_interleaved,
                                        read_def,
                                        tolerate_n=True):
    """ Compute fraction of observed barcodes on the barcode whitelist """
    num_reads = 0
    barcodes_on_whitelist = 0

    for fastq in fastqs:
        barcode_reads = cr_fastq.FastqReader({read_def.read_type: fastq},
                                             read_def, reads_interleaved, None,
                                             None)

        for read in barcode_reads.in_iter:
            if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
                break

            _, barcode, _ = read

            num_reads += 1
            if barcode in barcode_whitelist_set:
                barcodes_on_whitelist += 1
            elif tolerate_n and 'N' in barcode:

                # If there's a single N in the barcode, check if
                # we can replace the N with a valid base & get
                # a whitelist hit. This makes us robust to N-cycles.
                npos = barcode.find("N")
                a = array.array('c', barcode)
                for base in ['A', 'C', 'G', 'T']:
                    a[npos] = base
                    new_barcode = a.tostring()
                    if new_barcode in barcode_whitelist_set:
                        barcodes_on_whitelist += 1
                        break

        if num_reads == cr_constants.DETECT_CHEMISTRY_INITIAL_READS:
            break

    if num_reads > 0:
        return tk_stats.robust_divide(barcodes_on_whitelist, num_reads)
    else:
        return 0.0
def add_bulk_targeting_metrics(summary_info, singlecell_df, species_list):
    """Take singlecell targeting data and calculate bulk targeting metrics from them.
    """
    for species in species_list:
        species_cell_mask = singlecell_df["is_%s_cell_barcode" % species] == 1
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)

        total = singlecell_df[species_cell_mask]["passed_filters"].sum()
        tss = singlecell_df[species_cell_mask]["TSS_fragments"].sum()
        dnase = singlecell_df[species_cell_mask]["DNase_sensitive_region_fragments"].sum()
        enhancer = singlecell_df[species_cell_mask]["enhancer_region_fragments"].sum()
        promoter = singlecell_df[species_cell_mask]["promoter_region_fragments"].sum()
        ontarget = singlecell_df[species_cell_mask]["on_target_fragments"].sum()
        blacklist = singlecell_df[species_cell_mask]["blacklist_region_fragments"].sum()
        peaks = singlecell_df[species_cell_mask]["peak_region_fragments"].sum()
        summary_info['frac_fragments_overlapping_targets{}'.format(key_suffix)] = robust_divide(ontarget, total)
        summary_info['frac_fragments_overlapping_tss{}'.format(key_suffix)] = robust_divide(tss, total)
        summary_info['frac_fragments_overlapping_dnase{}'.format(key_suffix)] = robust_divide(dnase, total)
        summary_info['frac_fragments_overlapping_enhancer{}'.format(key_suffix)] = robust_divide(enhancer, total)
        summary_info['frac_fragments_overlapping_promoter{}'.format(key_suffix)] = robust_divide(promoter, total)
        summary_info['frac_fragments_overlapping_blacklist{}'.format(key_suffix)] = robust_divide(blacklist, total)
        summary_info['frac_fragments_overlapping_peaks{}'.format(key_suffix)] = robust_divide(peaks, total)
    cell_mask = singlecell_df['cell_id'] != 'None'
    cut_frags_in_peaks = singlecell_df[cell_mask]["peak_region_cutsites"].sum()
    total = singlecell_df[cell_mask]["passed_filters"].sum()
    summary_info['frac_cut_fragments_in_peaks'] = robust_divide(
        cut_frags_in_peaks, 2 * total)

    return summary_info