Example #1
def split(args):
    ref = contig_manager.contig_manager( args.reference_path )
    constants = load_h5(args.sc_norm_profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]
    # maximum memory usage is the maximum of these four estimates
    # (het, X + Y + Z, merged_bed, unmerged_bed):
    # sumbins = sum(len(c) for c in primary_contigs)/window_size
    # maxbins = max(len(c) for c in all_contigs)/window_size
    # X + Q + H = ((2*sizeof(i8) + 2*sizeof(f32)) * ncells * sumbins)
    # occupancy = sizeof(f32) * levels(=6) * (ncells - 1) * sumbins / nchunks(=100)
    # het = X + Q + H + occupancy
    # X + Y + Z = ((2*sizeof(float)) * ncells * maxbins)
    # merged_bed = sc_cnv_calls_bed + internal_cnv_calls_bed
    # unmerged_bed = sc_unmerged_cnv_calls_bed + internal_unmerged_cnv_calls_bed
    # * NOTE: ask for double the matrix sizes to account for intermediate values
    f32sz = 4
    sumbins = sum(ref.contig_lengths[c]/window_size+1 for c in ref.primary_contigs())
    maxbins = max(ref.contig_lengths[c]/window_size+1 for c in ref.list_all_contigs())
    XQH_mem_gb = float((2 + 2*f32sz) * ncells * sumbins)/1e9
    occ_mem_gb = float(f32sz * 6 * (ncells - 1) * sumbins/100)/1e9
    het_mem_gb = XQH_mem_gb + occ_mem_gb
    XYZ_mem_gb = 2 * float(f32sz * ncells * maxbins) / 1e9
    merged_bed_gb = os.path.getsize(args.sc_cnv_calls)/1e9 + \
                    os.path.getsize(args.internal_cnv_calls)/1e9 + 1
    unmerged_bed_gb = os.path.getsize(args.sc_unmerged_cnv_calls)/1e9 + \
                      os.path.getsize(args.internal_unmerged_cnv_calls)/1e9 + 1
    mem_gb = int(np.ceil(max(het_mem_gb, XYZ_mem_gb, merged_bed_gb, unmerged_bed_gb))) + 3
    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
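
As a rough sanity check of the estimate above, here is a worked evaluation with made-up inputs; the cell count, window size, contig lengths, and 1 GB BED placeholders are hypothetical, not taken from a real run.

# Hypothetical back-of-the-envelope check of the split() estimate above.
import numpy as np

ncells = 1000
window_size = 20000
primary_contig_lengths = [249250621, 243199373, 198022430]   # made-up subset
all_contig_lengths = primary_contig_lengths + [16569]         # e.g. plus chrM

f32sz = 4
sumbins = sum(l // window_size + 1 for l in primary_contig_lengths)
maxbins = max(l // window_size + 1 for l in all_contig_lengths)

XQH_mem_gb = float((2 + 2 * f32sz) * ncells * sumbins) / 1e9
occ_mem_gb = float(f32sz * 6 * (ncells - 1) * sumbins / 100) / 1e9
het_mem_gb = XQH_mem_gb + occ_mem_gb
XYZ_mem_gb = 2 * float(f32sz * ncells * maxbins) / 1e9

# BED sizes would come from os.path.getsize(); use 1 GB placeholders here.
mem_gb = int(np.ceil(max(het_mem_gb, XYZ_mem_gb, 1.0, 1.0))) + 3
print(mem_gb)   # 4 with these toy numbers
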
Example #2
def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    profiles, gc, mask = load_data(args.raw_profiles, args.tracks, chroms)
    gc_norm_params = json.load(open(args.gc_norm_params, "r"))
    scale = gc_norm_params["scale"]
    linear = gc_norm_params["linear"]
    quadratic = gc_norm_params["quadratic"]

    norm_profiles = gc_normalize(profiles, gc, linear, quadratic, chroms)

    bin_size = coverage_matrix.get_bin_size(args.raw_profiles)

    coverage_matrix.store_matrix(file_name=outs.normalized_profiles,
                                 chroms=chroms,
                                 profiles=norm_profiles,
                                 tracks=None,
                                 window_size=bin_size,
                                 masks=mask,
                                 dtype="float32")

    store = pd.HDFStore(outs.normalized_profiles, "a")
    constants = load_h5(args.raw_profiles, "constants")
    store["constants"] = constants
    store["/gc_params/scale"] = pd.Series(scale)
    store["/gc_params/linear"] = pd.Series(linear)
    store["/gc_params/quadratic"] = pd.Series(quadratic)
    store.close()
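
The exact behavior of gc_normalize is defined elsewhere in the pipeline; as a rough illustration of the idea (a per-cell quadratic correction in GC fraction, clamped at the extremes), a minimal sketch might look like the following. The gc0, min_gc, and max_gc values are placeholders, not the pipeline's constants.

# Hypothetical sketch of a quadratic GC correction; the real gc_normalize()
# is not shown in this example and may differ.
import numpy as np

def gc_correct_cell(counts, gc, linear, quadratic, gc0=0.45,
                    min_gc=0.30, max_gc=0.65):
    """Divide per-bin counts by a parabola in GC fraction, clamped outside
    [min_gc, max_gc] so extreme-GC bins are not over-corrected."""
    g = np.clip(gc, min_gc, max_gc)
    expected = 1.0 + linear * (g - gc0) + quadratic * (g - gc0) ** 2
    expected = np.maximum(expected, 1e-6)   # guard against division by ~0
    return counts / expected

# toy usage with made-up numbers
counts = np.array([10.0, 12.0, 8.0, 11.0])
gc = np.array([0.35, 0.45, 0.55, 0.70])
print(gc_correct_cell(counts, gc, linear=0.5, quadratic=-1.0))
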
def split(args):
    constants = load_h5(args.cluster_data, "constants").to_dict()
    matsize_gb = float(constants["ncells"] * constants["genomebins"]) / 1e9
    return {
        'chunks': [],
        'join': {
            '__mem_gb': int(np.ceil(4 * matsize_gb + 1))
        }
    }
Example #4
def split(args):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.list_all_contigs()
    max_chrom_size = max([ref.contig_lengths[chrom] for chrom in chroms])
    constants = load_h5(args.profiles, "constants").to_dict()
    ncells = constants["ncells"]
    window_size = constants["window_size"]
    max_mat_size_gb = float(
        2 * ncells * max_chrom_size / window_size) / 1e9 * 4
    mem_gb = int(np.ceil(max_mat_size_gb * 4 + 1))
    return {'chunks': [], 'join': {'__mem_gb': mem_gb}}
Example #5
def split(args):
    constants = load_h5(args.cnv_tracks, "constants").to_dict()
    matsize_gb = float(constants["ncells"]*constants["genomebins"])/1e9
    return {'chunks': [], 'join': {'__mem_gb' : int(np.ceil(matsize_gb * 12 + 2))}}
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    ## merge gc params jsons
    node_gc_params = {}
    sc_gc_params = json.load(open(args.sc_gc_params, "r"))
    internal_gc_params = json.load(open(args.internal_gc_params, "r"))

    ncells = len(sc_gc_params['linear'])
    nnodes = 2*ncells - 1  # nodes in a binary tree with ncells leaves

    for key in ["scale", "linear", "quadratic"]:
        node_gc_params[key] = sc_gc_params[key] + internal_gc_params[key]
    with open(outs.node_gc_params, "w") as out:
        json.dump(node_gc_params, out, indent=4)

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    index_chrom = dict([(str(i), c) for i, c in enumerate(chroms)])
    chrom_index = dict([(c, str(i)) for i, c in enumerate(chroms)])
    tmp = martian.make_path('tmp.bed')
    tmp_dir = os.path.dirname(tmp)
    tmp_sorted = martian.make_path('tmp_sorted.bed')
    calls = [[args.sc_cnv_calls, args.internal_cnv_calls],
             [args.sc_unmerged_cnv_calls, args.internal_unmerged_cnv_calls]]
    out_calls = [outs.node_cnv_calls, outs.node_unmerged_cnv_calls]
    for call_files, out in zip(calls, out_calls):
        with open(tmp, 'w') as outf:
            for f in call_files:
                for l in open(f):
                    fields = l.split()
                    # offset internal node indices by ncells
                    if f == call_files[1]:
                        fields[3] = str(int(fields[3]) + ncells)
                    # fix type of confidence field to integer
                    fields[-1] = str(int(float(fields[-1])))
                    # replace chromosome name with its index for numeric sorting
                    fields[0] = chrom_index[fields[0]]
                    outf.write('\t'.join(fields) + '\n')

        no_unicode = dict(LC_ALL='C')
        tmp_mem_gib = max(1, int(np.ceil(float(os.path.getsize(tmp)) / (1024**3))))
        try:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   '--parallel=1',  # force sort to use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)
        # on some systems, --parallel is unavailable
        except subprocess.CalledProcessError:
            subprocess.check_call(['sort', '-k1,1n', '-k2,2n', '-k3,3n',
                                   # will by default only use 1 thread
                                   '-S', '{}G'.format(tmp_mem_gib),
                                   '-T', tmp_dir,
                                   '-o', tmp_sorted, tmp],
                                  env=no_unicode, stderr=sys.stderr)

        # strip index column into outfile
        with open(out, 'w') as outf:
            version = martian.get_pipelines_version()
            outf.write("#cellranger-dna {}\n".format(version))
            outf.write("#reference genome: {}\n".format(args.reference_path))
            outf.write("#chrom\tstart\tend\tid\tcopy_number\tevent_confidence\n")
            for l in open(tmp_sorted):
                l = l.split('\t')
                l[0] = index_chrom[l[0]]
                outf.write('\t'.join(l))

    os.remove(tmp)
    os.remove(tmp_sorted)

    ## cnv tracks file
    sc_windows = load_h5(args.sc_cnv_tracks, "windows")
    internal_windows = load_h5(args.internal_cnv_tracks, "windows")
    windows = sc_windows.append(internal_windows).values
    constants = load_h5(args.sc_cnv_tracks, "constants")
    
    sc_ploidy_conf = scale_confidence_score(load_h5(args.sc_cnv_tracks, 
        "ploidy_conf").values)
    internal_ploidy_conf = scale_confidence_score(load_h5(
        args.internal_cnv_tracks, "ploidy_conf").values)
    
    sc_scale_factor = load_h5(args.sc_cnv_tracks, "scale_factor")
    internal_scale_factor = load_h5(args.internal_cnv_tracks, "scale_factor")

    sc_rpb = load_h5(args.sc_cnv_tracks, "reads_per_bin")
    internal_rpb = load_h5(args.internal_cnv_tracks, "reads_per_bin")
    
    X = load_h5(args.sc_cnv_tracks, "cnv_tracks").values
    nbins = X.shape[1]
    Q = np.zeros((nnodes, nbins), dtype=X.dtype)
    Q[0:ncells, :] = X
    del X
    Q[ncells:, :] = load_h5(args.internal_cnv_tracks, "cnv_tracks").values

    store = pd.HDFStore(outs.node_cnv_tracks, "w")
    store["constants"] = constants
    store["windows"] = sc_windows.append(internal_windows)
    store["ploidy_conf"] = sc_ploidy_conf.append(internal_ploidy_conf)
    store["scale_factor"] = sc_scale_factor.append(internal_scale_factor)
    store["reads_per_bin"] = sc_rpb.append(internal_rpb)
    store["cnv_tracks"] = pd.DataFrame(Q)
    store.close()
    
    ## Compute heterogeneity and store in tree_data
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    if args.tracks is None:
        gmask = np.ones(nbins, dtype=bool)
    else:
        gmask = []
        maptrack = pd.HDFStore(args.tracks, "r")
        for chrom in chroms:
            gmask.extend(maptrack["/map/"+chrom].values > MAPPABILITY_THRESHOLD)
        maptrack.close()
        gmask = np.array(gmask)

    ## update tree data
    # load tree
    store = pd.HDFStore(args.tree_data, "r")
    Z = store["/Z"].values
    distances = store["/distances"].values
    constants = store["/constants"]
    store.close()

    # Compute the heterogeneity at every *internal* node of the tree.
    # Heterogeneity is zero at every leaf, so leaf values are not stored.
    levels = 6
    het = compute_heterogeneity(Q, Z, gmask, windows, levels=levels)

    del Q

    # dump to disk
    store = pd.HDFStore(outs.tree_data, "w")
    store["Z"] = pd.DataFrame(Z)
    store["het"] = pd.DataFrame(het)
    store["distances"] = pd.Series(distances)
    store["windows"] = pd.Series(windows)
    store["constants"] = constants
    store.close()

    del het

    ## normalized profiles
    sc_store = pd.HDFStore(args.sc_norm_profiles, "r")
    internal_store = pd.HDFStore(args.internal_norm_profiles, "r")
    out_store = pd.HDFStore(outs.norm_node_profiles, "w")
    out_store["/constants"] = sc_store["/constants"]
    for chrom in chroms:
        ## first do the /contigs
        X = sc_store["/contigs/"+chrom].values
        Y = internal_store["/contigs/"+chrom].values
        assert X.shape[1] == Y.shape[1]
        nbins = X.shape[1]
        Z = np.zeros((2*ncells-1, nbins), dtype=X.dtype)
        Z[:ncells, :] = X
        Z[ncells:, :] = Y
        out_store["/contigs/"+chrom] = pd.DataFrame(Z)
        del X, Y, Z

        ## next do the /masks
        out_store["/masks/"+chrom] = sc_store["/masks/"+chrom]
    ## gc params
    for key in ["scale", "linear", "quadratic"]:
        out_store["/gc_params/"+key] = pd.concat([sc_store["/gc_params/"+key],
            internal_store["/gc_params/"+key]], ignore_index=True)

    ## do the normalization metrics
    out_store["/normalization_metrics"] =sc_store["normalization_metrics"].append(internal_store["/normalization_metrics"], ignore_index=True)

    out_store.close()
    sc_store.close()
    internal_store.close()
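
The offsets used above follow the usual convention for a binary tree over cells: leaves (cells) get ids 0..ncells-1 and internal nodes get ids ncells..2*ncells-2, hence nnodes = 2*ncells - 1 and the +ncells shift applied to internal-node ids. A minimal sketch, assuming scipy is available and a scipy-style linkage matrix rather than the pipeline's own tree code:

# Illustration of the leaf/internal node id convention; not the pipeline's
# tree code. Assumes scipy's linkage convention: merge i creates node ncells+i.
import numpy as np
from scipy.cluster.hierarchy import linkage

ncells = 5
rng = np.random.RandomState(0)
profiles = rng.poisson(10, size=(ncells, 50)).astype(float)

Z = linkage(profiles, method="ward")        # (ncells - 1) merges
nnodes = 2 * ncells - 1

node_of_merge = [ncells + i for i in range(Z.shape[0])]
print(nnodes, node_of_merge)                 # 9 [5, 6, 7, 8]
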
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    start_cell = args.chunk["start"]
    end_cell = args.chunk["end"]

    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## load genome data and cell profiles
    X, gmask, bdy = load_genome_data(
        args.profiles,
        args.tracks,
        chroms,
        start_cell=start_cell,
        end_cell=end_cell,
        integer=False,
        rounding=False,
        mappability_threshold=crdna.constants.MAPPABILITY_THRESHOLD)

    # compute chromosome boundaries after masking by gmask
    cbdy = np.zeros_like(bdy)
    for i in xrange(1, len(bdy)):
        cbdy[i] = (gmask[0:bdy[i]].sum())

    ## load GC info and create GC track
    gctrack = []
    store = pd.HDFStore(args.tracks, "r")
    for chrom in chroms:
        gctrack.extend(store["/GC/" + chrom].values)
    gctrack = np.array(gctrack)[gmask]
    store.close()

    ## if calling on nodes use scale factors from cells
    if args.is_singlecell:
        scale_guess_chunk = [None for _ in xrange(start_cell, end_cell)]
        cell_offset = 0
    else:
        scale_guess = load_h5(args.profiles, "scale_guess")
        scale_guess_chunk = [[s] for s in scale_guess[start_cell:end_cell]]
        ## num cells = num internal nodes + 1
        cell_offset = args.chunk["ncells"] + 1

    ncells = X.shape[0]
    nbins = gmask.sum()
    P = 2 * np.ones((ncells, nbins), dtype="int8")
    S = np.zeros((ncells, nbins), dtype=bool)
    sdfs = []
    scale_factors = np.zeros(ncells)
    pconf = np.zeros(ncells)
    windows = np.zeros(ncells, dtype=int)

    gc_norm_params = json.load(open(args.gc_norm_params, "r"))

    ## initialize parameters
    read_threshold = crdna.constants.BREAKPOINT_READ_THRESHOLD
    heuristics = crdna.constants.BREAKPOINT_CALLER_HEURISTICS
    ## override/augment heuristics by supplied params
    if args.params is not None:
        for k, v in args.params.iteritems():
            heuristics[k] = v

    ## log heuristics used
    martian.log_info("Heuristics used:")
    for k, v in heuristics.iteritems():
        martian.log_info("%s: %s" % (str(k), str(v)))

    debug_out = open(outs.debug, "w")

    if len(ref.list_species()) == 1:
        for i in xrange(ncells):
            debug_out.write("-" * 80 + "\n")
            debug_out.write("Cell %d\n" % (cell_offset + start_cell + i))

            ## GC coefficients
            gc_linear = gc_norm_params["linear"][start_cell + i]
            gc_quadratic = gc_norm_params["quadratic"][start_cell + i]

            ## GC correction track for cell
            xi = parabola(gctrack, crdna.constants.GC_ORIGIN, gc_linear,
                          gc_quadratic)
            xi_low = parabola(crdna.constants.MIN_GC,
                              crdna.constants.GC_ORIGIN, gc_linear,
                              gc_quadratic)
            xi_high = parabola(crdna.constants.MAX_GC,
                               crdna.constants.GC_ORIGIN, gc_linear,
                               gc_quadratic)
            xi[gctrack < crdna.constants.MIN_GC] = xi_low
            xi[gctrack > crdna.constants.MAX_GC] = xi_high

            y = X[i][gmask]

            ## do the CNV calling
            ploidy, S[i], gap, sdf, sf = call_cnvs(
                y,
                xi,
                ref,
                cbdy,
                scale_guess=scale_guess_chunk[i],
                log_func=debug_out.write,
                **heuristics)

            scale_factors[i] = sf
            sdfs.append(sdf)
            P[i] = np.clip(ploidy, 0, np.iinfo("int8").max - 1)
            pconf[i] = gap
            windows[i] = get_segment_window_size(y, read_threshold)
            debug_out.flush()
    debug_out.close()

    out = pd.HDFStore(outs.denoised_profiles, "w")
    out["/quantized"] = pd.DataFrame(P)
    out["/segment_index"] = pd.DataFrame(S)
    out["/scaling_data"] = pd.Series(sdfs)
    out["/scale_factor"] = pd.Series(scale_factors)
    out["/ploidy_conf"] = pd.Series(pconf)
    out["/windows"] = pd.Series(np.clip(windows, 1, None))
    out.close()
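
The cbdy computation above remaps per-chromosome bin boundaries from full-genome coordinates to mappable-bin coordinates after masking with gmask. A toy illustration with made-up values:

# bdy holds cumulative bin counts per chromosome over all genome bins;
# cbdy holds the same boundaries after dropping low-mappability bins.
import numpy as np

bdy = np.array([0, 4, 7])                       # chr1 bins 0..3, chr2 bins 4..6
gmask = np.array([True, False, True, True,      # chr1: 3 mappable bins
                  True, True, False])           # chr2: 2 mappable bins

cbdy = np.zeros_like(bdy)
for i in range(1, len(bdy)):
    cbdy[i] = gmask[0:bdy[i]].sum()
print(cbdy)                                     # [0 3 5]
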
def join(args, outs, chunk_defs, chunk_outs):
    ref = contig_manager.contig_manager(args.reference_path)
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)

    ## load genome data and cell profiles
    X, gmask, bdy = load_genome_data(
        args.profiles,
        args.tracks,
        chroms,
        integer=False,
        rounding=False,
        mappability_threshold=crdna.constants.MAPPABILITY_THRESHOLD)
    nbins = gmask.sum()

    # compute chromosome boundaries after masking by gmask
    cbdy = np.zeros_like(bdy)
    for i in xrange(1, len(bdy)):
        cbdy[i] = (gmask[0:bdy[i]].sum())

    ## load GC info and create GC track
    gctrack = []
    store = pd.HDFStore(args.tracks, "r")
    for chrom in chroms:
        gctrack.extend(store["/GC/" + chrom].values)
    gctrack = np.array(gctrack)[gmask]
    store.close()
    gc_norm_params = json.load(open(args.gc_norm_params, "r"))

    ## Aggregate data structures from individual chunks
    P = np.zeros((0, nbins), dtype="int8")  # ploidy per cell
    S = np.zeros((0, nbins), dtype=bool)  # segment index per cell
    sdfs = []  # scaling dataframes per cell
    scale_factors = []  # scale factor per cell
    pconf = np.zeros((0, ), dtype=float)  # scaling confidence per cell
    windows = np.zeros((0, ), dtype=int)  # segment window size per cell

    logger = sys.stdout.write

    ## add logging info from chunks
    for chunk_out in chunk_outs:
        with open(chunk_out.debug, "r") as debug_in:
            for line in debug_in:
                logger(line)
    logger("\n" + "*" * 80 + "\n")

    for chunk_out, chunk_def in zip(chunk_outs, chunk_defs):
        start_cell = chunk_def.chunk["start"]
        end_cell = chunk_def.chunk["end"]
        chunk_store = pd.HDFStore(chunk_out.denoised_profiles, "r")
        p_chunk = chunk_store["/quantized"].values
        s_chunk = chunk_store["/segment_index"].values
        sf_chunk = chunk_store["/scale_factor"].values
        pc_chunk = chunk_store["/ploidy_conf"].values
        sd_chunk = list(chunk_store["/scaling_data"])
        w_chunk = chunk_store["/windows"].values
        chunk_store.close()

        if P.shape[0] == 0:
            ncells = chunk_def.chunk["ncells"]
            nbins = p_chunk.shape[1]
            P = np.zeros((ncells, nbins), dtype="int8")
            S = np.zeros((ncells, nbins), dtype=bool)
            scale_factors = np.zeros(ncells, dtype=float)
            pconf = np.zeros(ncells, dtype=float)
            windows = np.zeros(ncells, dtype=int)
        P[start_cell:end_cell, :] = p_chunk
        S[start_cell:end_cell, :] = s_chunk
        scale_factors[start_cell:end_cell] = sf_chunk
        sdfs.extend(sd_chunk)
        pconf[start_cell:end_cell] = pc_chunk
        windows[start_cell:end_cell] = w_chunk

    ## Find cells with low scaling confidence

    fix_scaling = np.zeros(0, dtype=int)
    if args.is_singlecell:
        cell_offset = 0
        fix_scaling = np.where((pconf >= 0) & (pconf <= 0.02))[0]
    else:
        cell_offset = X.shape[0] + 1

    good_cells = np.where(np.logical_or(pconf == -2, pconf > 0.10))[0]

    agg_window = int(np.median(windows if len(windows) > 0 else [sum(gmask)]))
    X_agg = aggregate_matrix(X[:, gmask], agg_window)

    ## initialize parameters
    heuristics = crdna.constants.BREAKPOINT_CALLER_HEURISTICS
    ## override/augment heuristics by supplied params
    if args.params is not None:
        for k, v in args.params.iteritems():
            heuristics[k] = v
    logger("%d cells with low ploidy confidence\n" % len(fix_scaling))

    for cell in fix_scaling:
        if len(good_cells) == 0:
            continue
        logger("-" * 80 + "\n")
        logger("Fixing cell %d\n" % (cell + cell_offset))

        ## GC coefficients
        gc_linear = gc_norm_params["linear"][cell]
        gc_quadratic = gc_norm_params["quadratic"][cell]

        ## GC correction track for cell
        xi = parabola(gctrack, crdna.constants.GC_ORIGIN, gc_linear,
                      gc_quadratic)
        xi_low = parabola(crdna.constants.MIN_GC, crdna.constants.GC_ORIGIN,
                          gc_linear, gc_quadratic)
        xi_high = parabola(crdna.constants.MAX_GC, crdna.constants.GC_ORIGIN,
                           gc_linear, gc_quadratic)
        xi[gctrack < crdna.constants.MIN_GC] = xi_low
        xi[gctrack > crdna.constants.MAX_GC] = xi_high

        y = X[cell][gmask]

        ## find the correlation distance to all cells that were scaled
        ## confidently. Then take all matches with > 90% correlation and
        ## compute the median ploidy over these cells. Find the closest
        ## scaling solution to the median and declare that the answer.
        all_corrs = compute_corr_dist_to_all(X_agg[cell][np.newaxis, :], X_agg)
        good_corrs = all_corrs[good_cells]
        best_matches = good_cells[good_corrs > 0.90]
        if len(best_matches) == 0:
            continue
        best_guess_ploidy = np.median(P[best_matches, :].mean(axis=1))

        best_scaling_soln = np.argmin(
            np.abs(sdfs[cell]["aploidy"].values - best_guess_ploidy))
        lam_best = sdfs[cell].loc[best_scaling_soln]["lam"]

        sindex = S[cell]
        segment_bdy2 = get_segment_bdy_from_index(sindex)
        window = get_segment_window_size(y, heuristics["ll_read_threshold"])

        segment_means2, _ = compute_segment_data(y, xi, segment_bdy2, window)
        ploidy = get_ploidy_vector(y, segment_means2, segment_bdy2, lam_best)
        delta_ploidy = np.abs(P[cell].mean() - ploidy.mean())
        logger("Ploidy: %.2f -> %.2f\n" % (P[cell].mean(), ploidy.mean()))

        P[cell] = np.clip(ploidy, 0, np.iinfo("int8").max - 1).astype("int8")
        if delta_ploidy > 0.1:
            pconf[cell] = -4

    ## Compute read depth
    depth = np.zeros_like(pconf)
    for cell in xrange(len(pconf)):
        depth[cell] = X[cell][gmask].mean()

    ## Write data to h5
    store = pd.HDFStore(outs.denoised_profiles, "w")
    store["/quantized"] = pd.DataFrame(P)
    store["/scale_factor"] = pd.Series(scale_factors)
    store["/reads_per_bin"] = pd.Series(depth)
    store["/segment_index"] = pd.DataFrame(S)
    store["/ploidy_conf"] = pd.Series(pconf)
    store["/scaling_data"] = pd.Series(sdfs)
    store["/windows"] = pd.Series(windows)
    segment_windows = int(np.median(windows)) if len(windows) else 1
    constants = load_h5(args.profiles, "constants").to_dict()
    constants["segment_windows"] = segment_windows
    store["constants"] = pd.Series(constants)
    store.close()
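
The rescue step above correlates a coarse, window-aggregated profile of each low-confidence cell against confidently scaled cells. The real aggregate_matrix and compute_corr_dist_to_all are defined elsewhere in the pipeline; a minimal sketch of the aggregation and correlation, using hypothetical helpers, might be:

# Hypothetical sketch only: sums consecutive bins into windows, then
# correlates one cell's coarse profile against all cells. The pipeline's
# own helpers may handle edges and distances differently.
import numpy as np

def aggregate_matrix_sketch(X, window):
    ncells, nbins = X.shape
    nwin = nbins // window
    trimmed = X[:, :nwin * window]                 # drop the ragged tail
    return trimmed.reshape(ncells, nwin, window).sum(axis=2)

rng = np.random.RandomState(0)
X = rng.poisson(5, size=(3, 40)).astype(float)
X_agg = aggregate_matrix_sketch(X, window=5)       # shape (3, 8)

# correlation of cell 0's coarse profile against all cells
corrs = np.array([np.corrcoef(X_agg[0], X_agg[i])[0, 1]
                  for i in range(X_agg.shape[0])])
print(X_agg.shape, corrs.round(2))
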
Example #9
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    stats = pd.read_csv(args.barnyard)
    stats = stats[stats['cell_id'] != 'None'].copy()
    ncells = len(stats)
    martian.log_info('Subsetting per-barcode statistics to %d cells' % ncells)

    ref = contig_manager.contig_manager(args.reference_path)
    contig_lengths = [
        ref.contig_lengths[k]
        for k in ref.primary_contigs(allow_sex_chromosomes=True)
    ]
    tot_ref_len = float(sum(contig_lengths))
    martian.log_info('Reference sequence at %s has %d bp' %
                     (args.reference_path, tot_ref_len))

    #
    # Accumulate per-cell summary stats
    #

    PER_CELL_HEADER = [
        'barcode', 'cell_id', 'total_num_reads', 'num_unmapped_reads',
        'num_lowmapq_reads', 'num_duplicate_reads', 'num_mapped_dedup_reads',
        'frac_mapped_duplicates', 'effective_depth_of_coverage',
        'effective_reads_per_1Mbp', 'raw_mapd', 'normalized_mapd',
        'raw_dimapd', 'normalized_dimapd', 'mean_ploidy', 'ploidy_confidence',
        'is_high_dimapd', 'is_noisy'
    ]

    # unusable reads are reads from non-cell barcodes that are mapped, low-mapq, or duplicates
    # no_barcode reads are reads whose barcode is not on the whitelist
    num_dups = stats['dups']
    num_lowmapq = stats['low_mapq_lt_30']
    num_unmapped = stats['no_barcode'] + stats['unusable_read'] + stats[
        'unmapped']
    num_mapped = stats['mapped']

    assert all(num_unmapped + num_dups + num_lowmapq +
               num_mapped == stats['denominator'])

    per_cell = pd.DataFrame(columns=PER_CELL_HEADER)
    per_cell['barcode'] = stats['BC']
    per_cell['cell_id'] = np.arange(0, ncells)
    per_cell['num_mapped_dedup_reads'] = num_mapped
    per_cell['frac_mapped_duplicates'] = stats['dups_frac']
    per_cell['num_unmapped_reads'] = num_unmapped
    per_cell['num_lowmapq_reads'] = num_lowmapq
    per_cell['num_duplicate_reads'] = num_dups
    per_cell['total_num_reads'] = stats['denominator']
    per_cell['effective_depth_of_coverage'] = stats['num_mapped_bases'].astype(
        float) / tot_ref_len
    per_cell['effective_reads_per_1Mbp'] = np.round(
        stats['mapped'] / (tot_ref_len / 1e6)).astype(int)

    flat_metrics = load_h5(args.norm_node_profiles, 'normalization_metrics')
    per_cell['raw_mapd'] = flat_metrics['raw_mapd'].iloc[0:ncells].values
    per_cell['normalized_mapd'] = flat_metrics['norm_mapd'].iloc[
        0:ncells].values
    per_cell['raw_dimapd'] = flat_metrics['raw_dimapd'].iloc[0:ncells].values
    per_cell['normalized_dimapd'] = flat_metrics['norm_dimapd'].iloc[
        0:ncells].values

    mean_ploidy, num_altevents = process_cnv_metrics(ncells,
                                                     args.node_cnv_calls,
                                                     DEFAULT_CONFIDENCE)
    per_cell['mean_ploidy'] = mean_ploidy

    # per cell confidence score
    pconf = load_h5(args.node_cnv_tracks, "ploidy_conf").values[0:ncells]
    per_cell['ploidy_confidence'] = pconf

    # is noisy cell flag
    high_dimapd = flat_metrics['is_high_dimapd'].iloc[0:ncells].values
    per_cell['is_high_dimapd'] = high_dimapd
    # cells with low confidence, or cells whose ploidy estimate was
    # overruled using high confidence cells
    low_ploidy_conf = np.logical_or(pconf == -4, (pconf >= 0) & (pconf <= 2))
    per_cell['is_noisy'] = np.logical_or(high_dimapd == 1,
                                         low_ploidy_conf).astype(int)
    with open(outs.per_cell_summary_metrics, 'w') as outfile:
        per_cell.to_csv(outfile, columns=PER_CELL_HEADER, index=False)

    #
    # Accumulate per-analysis summary stats
    #

    # combine mask vectors to calculate genome-wide mappability
    chroms = ref.primary_contigs(allow_sex_chromosomes=True)
    masks, _ = load_mask_bdy(args.norm_node_profiles, chroms)

    with open(args.report_basic, 'r') as infile:
        report_basic = json.load(infile)

    with open(args.singlecell_summary, 'r') as infile:
        singlecell_summary = json.load(infile)

    per_analysis = {
        'total_num_bases_R1':
        report_basic['r1_tot_bases'],
        'total_num_bases_R1_Q30':
        report_basic['r1_q30_bases'],
        'total_num_bases_R2':
        report_basic['r2_tot_bases'],
        'total_num_bases_R2_Q30':
        report_basic['r2_q30_bases'],
        'frac_bases_R1_Q30':
        tk_stats.robust_divide(report_basic['r1_q30_bases'],
                               report_basic['r1_tot_bases']),
        'frac_bases_R2_Q30':
        tk_stats.robust_divide(report_basic['r2_q30_bases'],
                               report_basic['r2_tot_bases']),
        'total_num_reads':
        report_basic['num_reads'],
        'total_num_reads_in_cells':
        per_cell['total_num_reads'].sum(),
        'total_num_mapped_dedup_reads_in_cells':
        per_cell['num_mapped_dedup_reads'].sum(),
        'mean_mapped_dedup_reads_per_cell':
        per_cell['num_mapped_dedup_reads'].mean(),
        'median_frac_mapped_duplicates_per_cell':
        np.median(per_cell['frac_mapped_duplicates']),
        'num_cells':
        ncells,
        'median_effective_reads_per_1Mbp':
        np.median(per_cell['effective_reads_per_1Mbp']),
        'frac_mappable_bins':
        tk_stats.robust_divide(sum(masks), len(masks)),
        'frac_noisy_cells':
        tk_stats.robust_divide(sum(per_cell['is_noisy']),
                               len(per_cell['is_noisy'])),
        'shortest_primary_contig':
        min(contig_lengths),
        'frac_non_cell_barcode':
        singlecell_summary['frac_waste_non_cell_barcode'],
        'correct_bc_rate':
        report_basic['correct_bc_rate'],
        'median_unmapped_frac':
        singlecell_summary['median_unmapped_frac']
    }

    for prefix in ["normalized", "raw"]:
        for metric in ["mapd", "dimapd"]:
            per_cell_key = "%s_%s" % (prefix, metric)
            for perc in [25, 50, 75]:
                summary_key = "%s_%s_p%d" % (prefix, metric, perc)
                per_analysis[summary_key] = tk_stats.robust_percentile(
                    per_cell[per_cell_key], perc)

    for cutoff in (25, 50, 75):
        k = 'mean_ploidy_p{:.2g}'.format(cutoff)
        per_analysis[k] = tk_stats.robust_percentile(per_cell['mean_ploidy'],
                                                     cutoff)
    per_analysis['median_ploidy'] = per_analysis['mean_ploidy_p50']

    with open(outs.summary, 'w') as outfile:
        outfile.write(
            tk_json.safe_jsonify(per_analysis, pretty=True) + os.linesep)
    SUMMARY_METRICS = [
        'total_num_reads', 'frac_bases_R1_Q30', 'frac_bases_R2_Q30',
        'correct_bc_rate', 'frac_non_cell_barcode', 'shortest_primary_contig',
        'frac_mappable_bins', 'num_cells', 'total_num_reads_in_cells',
        'total_num_mapped_dedup_reads_in_cells',
        'median_frac_mapped_duplicates_per_cell',
        'mean_mapped_dedup_reads_per_cell', 'median_effective_reads_per_1Mbp',
        'median_unmapped_frac', 'mean_ploidy_p25', 'mean_ploidy_p50',
        'mean_ploidy_p75', 'raw_mapd_p25', 'raw_mapd_p50', 'raw_mapd_p75',
        'normalized_mapd_p25', 'normalized_mapd_p50', 'normalized_mapd_p75',
        'normalized_dimapd_p25', 'normalized_dimapd_p50',
        'normalized_dimapd_p75', 'raw_dimapd_p25', 'raw_dimapd_p50',
        'raw_dimapd_p75', 'frac_noisy_cells'
    ]
    with open(outs.summary_cs, 'w') as outfile:
        values = [per_analysis[key] for key in SUMMARY_METRICS]
        outfile.write(",".join(SUMMARY_METRICS) + "\n")
        outfile.write(",".join(map(str, values)) + "\n")