Example #1
def get_cell_barcodes(filename, ref, with_species=False):
    """Read singlecell.csv and emit barcodes"""
    scdf = pd.read_csv(filename, sep=',')
    ctg_mgr = ReferenceManager(ref)
    if not with_species:
        cell_barcodes = set()
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes.update(scdf[species_cell_mask]['barcode'].values.tolist())
    else:
        cell_barcodes = {}
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes[species] = set(scdf[species_cell_mask]['barcode'].values.tolist())
    return cell_barcodes
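A minimal usage sketch, assuming the singlecell.csv produced by the pipeline and a reference directory; the paths below are hypothetical:

# Hypothetical paths, calling the get_cell_barcodes() defined above.
cells = get_cell_barcodes('outs/singlecell.csv', '/refs/GRCh38')
cells_by_species = get_cell_barcodes('outs/singlecell.csv', '/refs/GRCh38', with_species=True)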
Example #2
def split(args):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    chunks = []
    matrix_mem_gb = 0.
    if args.filtered_tf_bc_matrix is not None:
        matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_tf_bc_matrix) * 1.5
    matrix_mem_gb += cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_peak_bc_matrix)
    chunk_mem_gb = int(np.ceil(max(matrix_mem_gb, h5_constants.MIN_MEM_GB)))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    # create a chunk for each method x clustering key x cluster combination
    for method in args.factorization:
        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, key)
            for cluster in set(clustering.clusters):
                chunks.append({
                    'method': method,
                    'clustering_key': key,
                    'cluster': cluster,
                    '__mem_gb': chunk_mem_gb,
                    '__vmem_gb': chunk_mem_gb + int(np.ceil(ctg_mgr.get_vmem_est())) + 1,
                    '__threads': 1,
                })

    return {'chunks': chunks, 'join': {'__mem_gb': 3}}
Example #3
def check_reference_format(reference_path):
    """Check file formats for files present in the reference"""
    try:
        contig_manager = ReferenceManager(reference_path)
    except Exception as e:
        martian.exit("Contig manager could not be initialized, Error:\n%s" % str(e))

    # formatting
    error_msg = contig_manager.verify_contig_defs()
    if error_msg is not None:
        martian.exit(error_msg)

    # filecheck
    contig_manager.genes

    # check if motif file is in right format (naming convention)
    if len(contig_manager.list_species()) == 1:
        motif_format_checker(contig_manager.motifs)

    # checks for valid bed file formats in regions/
    faidx_file = os.path.join(reference_path, 'fasta', 'genome.fa.fai')

    bed_format_checker(contig_manager.tss_track, faidx_file)
    bed_format_checker(contig_manager.transcripts_track, faidx_file)
    bed_format_checker(contig_manager.ctcf_track, faidx_file)
    bed_format_checker(contig_manager.blacklist_track, faidx_file)
    bed_format_checker(contig_manager.dnase_track, faidx_file)
    bed_format_checker(contig_manager.enhancer_track, faidx_file)
    bed_format_checker(contig_manager.promoter_track, faidx_file)
Example #4
def split(args):
    """Compute base background in split and use it in each chunk."""

    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0
    if len(ref_mgr.list_species()) > 1 or npeaks == 0 or ref_mgr.motifs is None:
        chunk_def = [{'skip': True}]
        return {'chunks': chunk_def}

    with open(args.globalGCdict, 'r') as f:
        GCdict = pickle.load(f)

    GCdict_paths = {}
    GCbins = sorted(GCdict.keys())
    for gc in GCbins:
        GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format(
            gc[0], gc[1]))
        with open(GCdict_paths[gc], 'w') as dump:
            pickle.dump(GCdict[gc], dump)

    # write rows of each chunk to a new peak file
    mem_in_gb = 8
    chunk_def = [{'__mem_gb': mem_in_gb,
                  '__vmem_gb': mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1,
                  'skip': False,
                  'GCdict': GCdict_paths[chunk]} for chunk in GCbins]
    return {'chunks': chunk_def}
Example #5
def join(args, outs, chunk_defs, chunk_outs):
    ref_mgr = ReferenceManager(args.reference_path)
    if (args.filtered_matrix is None or args.peak_motif_hits is None
            or len(ref_mgr.list_species()) > 1):
        outs.filtered_tf_bc_matrix = None
        outs.filtered_tf_bc_matrix_mex = None
        outs.tf_propZ_matrix = None
        return

    # motif scan is completed in ANNOTATE_PEAKS

    peaks = BedTool(args.peaks)
    motifs = Motifs(args.reference_path)

    peak_motif_hits = BedTool(args.peak_motif_hits)

    # extract peak coordinate to numerical index map
    peak_idx, n_peaks = _get_peak_indexes(peaks)

    # extract motif names to numerical index map
    motif_idx, n_motifs = _get_motif_indexes(motifs)

    # extract 3 lists: peak indexes, motif indexes and counts; each entry corresponds to a peak-motif pair
    peak_coor, motif_coor, values = motifscan_bed_to_sparse_matrix(
        peak_motif_hits, peak_idx, motif_idx, format='binary')

    # convert it to a sparse matrix, default is binary format, motifs are rows and peaks are columns
    tf_peak_matrix = sp.csr_matrix((values, (motif_coor, peak_coor)),
                                   shape=(n_motifs, n_peaks),
                                   dtype='int32')

    # compute the motif-BC matrix via pooling
    # The current method simply counts the number of hits for a motif inside the peaks in a barcode
    # cast as a CountMatrix
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)
    motif_names = motif_idx.keys()
    barcodes = peak_matrix.bcs
    genomes = utils.generate_genome_tag(args.reference_path)
    motifs_def = atac_feature_ref.from_motif_list(motif_names, genomes)
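    # sparse product: (n_motifs x n_peaks) * (n_peaks x n_barcodes) yields a motif-by-barcode count matrix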
    tf_matrix = cr_matrix.CountMatrix(motifs_def, barcodes,
                                      tf_peak_matrix * peak_matrix.m)

    # perform MAD-zscoring of proportion values
    propZ_matrix = np.array(tf_matrix.m / peak_matrix.m.sum(axis=0))
    propZ_matrix = MADzscore(propZ_matrix)

    outs.coerce_strings()

    # save to h5 and csv
    tf_matrix.save_h5_file(outs.filtered_tf_bc_matrix,
                           sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_tf_bc_matrix_mex):
        os.mkdir(outs.filtered_tf_bc_matrix_mex)
    atac_matrix.save_mex(
        tf_matrix,
        outs.filtered_tf_bc_matrix_mex,
        feature_type=cr_lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
        sw_version=martian.get_pipelines_version())
    # save propZ matrix as gz
    np.savetxt(outs.tf_propZ_matrix, propZ_matrix)
Example #6
def generate_genome_tag(ref_path):
    """Replace empty genome name for single genomes with valid genome name"""
    # For a single species reference, use contents of <reference_path>/genome
    ref_contig_manager = ReferenceManager(ref_path)
    genomes = ref_contig_manager.list_species()
    if (len(genomes) == 1 and genomes[0] == '') or len(genomes) == 0:
        genomes = [ref_contig_manager.genome]
    return genomes
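An illustrative call, assuming a single-species reference at a hypothetical path:

# When the species list is empty, or is a single empty string, this falls back to
# ReferenceManager.genome, e.g. ['GRCh38']; path and genome name are hypothetical.
genomes = generate_genome_tag('/refs/GRCh38')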
Example #7
def main(args, outs):
    reference = ReferenceManager(args.reference_path)
    species_list = reference.list_species()
    is_barnyard = len(species_list) > 1 and args.singlecell is not None

    summary_data = None
    if args.summary_results:
        with open(args.summary_results, 'r') as infile:
            summary_data = json.load(infile)

    # Pull up the correct template information
    template_path = os.path.dirname(os.path.abspath(__file__))
    template_file = os.path.join(
        template_path,
        '{}{}.html'.format('barnyard' if is_barnyard else 'single',
                           '_debug' if args.debug else ''))
    with open(template_file, 'r') as infile:
        template = infile.read()

    metadata = MetricAnnotations()
    websummary_data = {
        'alarms': {
            'alarms': []
        },
        'sample': {
            'id': args.sample_id,
            'description': args.sample_desc,
            'pipeline': "Cell Ranger ATAC Renalyzer"
        }
    }

    singlecell_df = pd.read_csv(
        args.singlecell) if args.singlecell is not None else None

    add_data(
        websummary_data,
        get_hero_metric_data(metadata, summary_data, species_list, args.debug))

    add_data(websummary_data, get_pipeline_info(args, reference, args.debug))

    add_data(
        websummary_data,
        get_clustering_plots(metadata, summary_data, args.analysis,
                             args.filtered_peak_bc_matrix, species_list,
                             singlecell_df, is_barnyard))

    # Modify the plot titles to add the sample ID/description and apply consistent plot styling
    for key, subdata in websummary_data.iteritems():
        if "layout" in subdata:
            subdata["layout"][
                "title"] += '<br><sup>Sample {} - {}</sup>'.format(
                    args.sample_id, args.sample_desc)
            subdata["layout"]["hovermode"] = "closest"
            subdata["config"] = PLOT_CONFIG_KWARGS

    with open(outs.web_summary, 'w') as outfile:
        summarize.generate_html_summary(websummary_data, template,
                                        template_path, outfile)
Example #8
def main(args, outs):
    if args.singlecell_mapping is None or args.singlecell_targets is None or args.singlecell_cells is None:
        outs.singlecell = None
        outs.summary = None
        return

    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    # Merge the input singlecell data into a single dataframe and write it out
    mapping = pd.read_csv(args.singlecell_mapping)
    cells = pd.read_csv(args.singlecell_cells)
    targeting = pd.read_csv(args.singlecell_targets)

    merged = mapping.merge(cells,
                           how="left",
                           on="barcode",
                           sort=False,
                           validate="one_to_one")
    merged["cell_id"] = merged["cell_id"].fillna("None")
    for column in merged.columns:
        if column.endswith("_cell_barcode") or column.startswith(
                "passed_filters_") or column.startswith(
                    "peak_region_fragments_"):
            merged[column] = merged[column].fillna(0).astype(int)

    merged = merged.merge(targeting,
                          how="left",
                          on="barcode",
                          sort=False,
                          validate="one_to_one")
    keys = [
        "{}_fragments".format(region) for region in [
            "TSS", "DNase_sensitive_region", "enhancer_region",
            "promoter_region", "on_target", "blacklist_region", "peak_region"
        ]
    ] + ["peak_region_cutsites"]
    for column in keys:
        merged[column] = merged[column].fillna(0).astype(int)
    merged.to_csv(outs.singlecell, index=None)

    summary_info = {}

    summary_info = add_bulk_targeting_metrics(summary_info, merged,
                                              species_list)
    summary_info = add_doublet_rate_metrics(summary_info, merged, species_list)
    summary_info = add_purity_metrics(summary_info, merged, species_list)
    summary_info = add_bulk_mapping_metrics(summary_info, merged, species_list)
    summary_info = add_singlecell_sensitivity_metrics(summary_info, merged,
                                                      species_list)

    with open(outs.summary, 'w') as summary_file:
        summary_file.write(json.dumps(summary_info, indent=4))
Example #9
def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix) if args.filtered_tf_bc_matrix is not None else None
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}
    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv
        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):

            chunk_outs_def_method_clustering = sorted([[chunk_out, chunk_def] for
                                                       chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                                                       if chunk_def.clustering_key == key], key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # load 1 vs rest tests in sorted order of chunks and combine into one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3] for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5, [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                                      analysis_constants.ANALYSIS_H5_MAP_DE[method]])
Example #10
def split(args):
    """Compute base background in split and use it in each chunk
    """

    n_peaks = utils.quick_line_count(args.peaks) if args.peaks else 0
    ref_mgr = ReferenceManager(args.reference_path)
    if len(ref_mgr.list_species()) > 1 or n_peaks == 0 or ref_mgr.tss_track is None:
        chunk_def = [{'skip': True}]
        return {'chunks': chunk_def}

    # write rows of each chunk to a new peak file
    mem_in_gb = 4.0
    chunk_def = [{'__mem_gb': mem_in_gb,
                  'skip': False,
                  'chunk_start': chunk[0],
                  'chunk_end': chunk[1]} for chunk in utils.get_chunks(n_peaks, chunks=20)]
    return {'chunks': chunk_def}
Example #11
def main(args, outs):
    """Run this for each method x clustering key combination from split"""
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return

    # Load the peak-BC matrix and a clustering and perform DE
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    clustering_h5 = args.clustering_summary['h5'][args.method]
    clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, args.clustering_key)
    mask = clustering.clusters == args.cluster
    clustering.clusters[mask] = 1
    clustering.clusters[np.logical_not(mask)] = 2
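    # clusters are now binary: 1 = the cluster under test, 2 = all other barcodes (one-vs-rest)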

    # find depth using peak matrix and normalize
    scale = np.array(peak_matrix.m.sum(axis=0)).squeeze()
    depth = (scale + 1) / np.median(scale)

    cov_peak = [np.log(depth)]
    diffexp_peak = nb2_diffexp.run_differential_expression(peak_matrix.m, clustering.clusters, model='poisson',
                                                           impute_rest=True, test_params={'cov': cov_peak}, verbose=True)

    # find empirical estimates of alpha
    tf_matrix = None
    diffexp_tf = None
    # do DE on tf-BC matrix
    if args.filtered_tf_bc_matrix is not None:
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix)
        ntfmatrix = normalize_matrix(tf_matrix.m, scale)
        alpha_tf = nb2_diffexp.empirical_dispersion(ntfmatrix)
        barcode_GC = get_barcode_gc(args.reference_path, args.peaks, peak_matrix)
        cov_tf = [barcode_GC, np.log(depth)]
        diffexp_tf = nb2_diffexp.run_differential_expression(tf_matrix.m, clustering.clusters, model='nb', impute_rest=True,
                                                             test_params={'cov': cov_tf, 'alpha': alpha_tf}, verbose=True)

    # vstack
    diffexp = diffexp_peak if tf_matrix is None else cr_diffexp.DIFFERENTIAL_EXPRESSION(np.vstack([diffexp_peak.data, diffexp_tf.data]))

    # write out temp file
    np.savetxt(outs.tmp_diffexp, diffexp.data, delimiter=',')
    outs.enrichment_analysis = None
    outs.enrichment_analysis_summary = None
Example #12
def split(args):
    ref_mgr = ReferenceManager(args.reference_path)
    if (args.filtered_matrix is None or args.peak_motif_hits is None
            or len(ref_mgr.list_species()) > 1):
        return {'chunks': []}

    matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(
        args.filtered_matrix)
    npeaks, nbcs, nnz = cr_matrix.CountMatrix.load_dims_from_h5(
        args.filtered_matrix)
    # assume we will never test more than 1000 TFs and
    # the relative hit-rate of a TF is a generous 1 out of every 10 peaks
    MAX_TF_COUNT = 1000
    MAX_TF_PEAK_SPARSITY = 0.1
    BYTES_PER_INT = np.dtype(int).itemsize
    BYTES_PER_FLOAT = np.dtype(float).itemsize
    BYTES_PER_GB = 1024**3
    ENTRIES_PER_VAL = 3
    predicted_tf_peak_matrix_mem_gb = ENTRIES_PER_VAL * MAX_TF_PEAK_SPARSITY * npeaks * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_propZ_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_FLOAT / BYTES_PER_GB
    chunk_mem_gb = int(
        np.ceil(
            max(
                matrix_mem_gb + predicted_tf_peak_matrix_mem_gb * 2 +
                predicted_tf_matrix_mem_gb * 2 +
                predicted_tf_propZ_matrix_mem_gb * 2,
                h5_constants.MIN_MEM_GB)))
    vmem_peak_motif_hits = int(
        np.ceil(predicted_tf_peak_matrix_mem_gb) * 3 +
        predicted_tf_peak_matrix_mem_gb)

    # HACK - give big jobs more threads in order to avoid overloading a node
    threads = cr_io.get_thread_request_from_mem_gb(chunk_mem_gb)

    return {
        'chunks': [],
        'join': {
            '__mem_gb': chunk_mem_gb,
            '__vmem_gb': chunk_mem_gb + vmem_peak_motif_hits + 1,
            '__threads': threads
        }
    }
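A rough back-of-the-envelope check of the memory estimate above, using hypothetical matrix dimensions:

# Hypothetical dimensions; mirrors the arithmetic in split() above (8-byte values).
npeaks, nbcs = 100000, 10000
tf_peak_gb = 3 * 0.1 * npeaks * 1000 * 8 / 1024.0 ** 3   # ~0.22 GB
tf_gb = 3 * nbcs * 1000 * 8 / 1024.0 ** 3                # ~0.22 GB
tf_propZ_gb = 3 * nbcs * 1000 * 8 / 1024.0 ** 3          # ~0.22 GB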
Example #13
def join(args, outs, chunk_defs, chunk_outs):
    """Compute base background in each peak."""
    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0

    if len(ref_mgr.list_species()) > 1 or npeaks == 0 or ref_mgr.motifs is None:
        outs.GCdist = None
        return

    # get peak-GC distribution
    genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0])
    GCdist = [
        utils.get_peak_GC_counts(peak, genome_fa, counts=False)
        for peak in peak_reader(args.peaks)
    ]

    # compute base background from peaks in bins
    # merge extreme GC bins with adjoining ones if they're too narrow for motif scanner to work correctly
    GCbounds = []
    nbins = NBINS
    for n, gc in enumerate(
            np.percentile(GCdist,
                          np.linspace(0, 100, nbins + 1, endpoint=True),
                          interpolation='lower')):
        if n == 0 or n == nbins:
            GCbounds += [gc]
            continue
        if gc >= LOW_GC and gc < HIGH_GC:
            GCbounds += [gc]
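    # pair consecutive GC bounds into (low, high) bins; set() drops duplicate bins when percentiles tie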
    GCbins = sorted(list(set(zip(GCbounds, GCbounds[1:]))))  # uniqify
    peaks = peak_reader(args.peaks)
    GCdict = get_GCbinned_peaks_and_bg(peaks, genome_fa, GCbins)

    # dump
    with open(outs.GCdict, 'w') as f:
        pickle.dump(GCdict, f)
Example #14
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()

    if args.fragments is None:
        outs.regenerated_metrics = None
        outs.singlecell = None
        return

    target_counts_by_barcode = {}
    ref_mgr = ReferenceManager(args.reference_path)
    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        with open(chunk_out.target_counts_by_barcode, 'r') as infile:
            chunk_counts = pickle.load(infile)
        for barcode, barcode_counts in chunk_counts.iteritems():
            if barcode not in target_counts_by_barcode:
                target_counts_by_barcode[barcode] = barcode_counts
            else:
                for key, value in barcode_counts.iteritems():
                    if key == 'cell_id':
                        target_counts_by_barcode[barcode][key] = value
                    else:
                        target_counts_by_barcode[barcode][key] += value

    species_list = ref_mgr.list_species()
    keys = ["{region}_fragments".format(region=reg)
            for reg in ["TSS", "DNase_sensitive_region", "enhancer_region",
                        "promoter_region", "on_target", "blacklist_region", "peak_region"]] +\
           ["peak_region_cutsites", "passed_filters", "duplicate", "cell_id"] +\
           ["is_{}_cell_barcode".format(species) for species in species_list]
    if len(species_list) > 1:
        keys += ["passed_filters_{}".format(species) for species in species_list] +\
                ["peak_region_fragments_{}".format(species) for species in species_list]

    with open(outs.singlecell, 'w') as outfile:
        outfile.write("barcode,")
        outfile.write(",".join(keys))
        outfile.write("\n")
        for barcode in sorted(target_counts_by_barcode.keys()):
            outfile.write("{},".format(barcode))
            outfile.write(",".join(
                [str(target_counts_by_barcode[barcode][key]) for key in keys]))
            outfile.write("\n")

    # write cell barcodes if uniques > 0 (i.e. subsampling didn't lose barcodes)
    # overwrite the singlecell.csv with updated cell calls
    scdf = pd.read_csv(outs.singlecell, sep=',')
    scdf['cell_id'] = np.full(len(scdf), "None")
    ctg_mgr = ReferenceManager(args.reference_path)
    for species in ctg_mgr.list_species():
        species_cell_mask = (scdf['is_{}_cell_barcode'.format(species)] >=
                             1) & (scdf['passed_filters'] > 0)
        scdf['is_{}_cell_barcode'.format(species)] = np.where(
            species_cell_mask, 1, 0)
        scdf['cell_id'][species_cell_mask] = np.array([
            "{}_cell_{}".format(species, num)
            for num in xrange(np.sum(species_cell_mask))
        ])
    scdf.to_csv(outs.singlecell, sep=',', index=False)
    cell_barcodes = get_cell_barcodes(outs.singlecell,
                                      args.reference_path,
                                      with_species=True)
    with open(outs.cell_barcodes, 'w') as f:
        for species in cell_barcodes:
            f.write(species + "," + ",".join(cell_barcodes[species]) + "\n")

    # write frag metrics
    summary_info = {}
    summary_info = add_bulk_targeting_metrics(summary_info, scdf, species_list)
    summary_info = add_doublet_rate_metrics(summary_info, scdf, species_list)
    summary_info = add_purity_metrics(summary_info, scdf, species_list)
    summary_info = add_singlecell_sensitivity_metrics(summary_info, scdf,
                                                      species_list)
    for species in species_list:
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)
        summary_info['annotated_cells{}'.format(key_suffix)] = scdf[
            'is_{}_cell_barcode'.format(species)].sum()

    with open(outs.regenerated_metrics, 'w') as summary_file:
        summary_file.write(json.dumps(summary_info, indent=4))
Example #15
def join(args, outs, chunk_defs, chunk_outs):
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells, ulimit=10000000)  # allow arbitrarily large limit for reanalyzer

    # # Reference
    # ref directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # formatting
    check_reference_format(args.reference_path)
    contig_manager = ReferenceManager(args.reference_path)

    # peaks format check and nonoverlapping
    if args.peaks is None:
        martian.exit("peaks file not provided")
    exists_and_readable(args.peaks, "peaks")
    bed_format_checker(args.peaks, contig_manager.fasta_index)
    contain_three_columns(args.peaks)
    if is_overlapping(args.peaks):
        martian.exit("{} contains overlapping peak regions".format(args.peaks))

    # check parameters files
    if args.parameters is not None:
        if not os.path.exists(args.parameters):
            martian.exit("{} does not exist".format(args.parameters))

    # fragments checks
    whitelist_barcodes = load_barcode_whitelist(args.barcode_whitelist)
    species_list = contig_manager.list_species()
    observed_gem_groups = set()
    observed_species = set()
    if args.fragments is None:
        martian.exit("fragments file not provided")
    exists_and_readable(args.fragments, "fragments")
    contig_lens = contig_manager.get_contig_lengths()
    # check bounds and matching contigs in reference and species
    for chrom, start, stop, bc, _ in open_fragment_file(args.fragments):
        spec = chrom.split("_")
        observed_species.add(spec[0] if spec[0] != chrom else "")
        barcode, gem_group = bc.split("-")
        observed_gem_groups.add(gem_group)
        if args.check_executables:  # run this only non-locally
            if barcode not in whitelist_barcodes:
                martian.exit("{} is not a valid whitelist barcode".format(barcode))
            if chrom not in contig_lens:
                martian.exit("contig {} not present in reference".format(chrom))
            if stop > contig_lens[chrom]:
                martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)".format(chrom, start, stop, contig_lens[chrom]))
    # ensure fragments are on the correct reference
    for species in observed_species:
        if species not in species_list:
            martian.exit("{} contains fragments mapped to species not recognized in the reference".format(args.fragments))
    if len(observed_gem_groups) > 1:
        martian.log_info("multiple gem groups present in {}, likely generated in a previous aggregation run".format(args.fragments))

    # fragments index is synced with fragments
    if args.fragments_index is None:
        martian.exit("fragments index file not provided")
    if not os.path.exists(args.fragments_index):
        martian.exit("{} does not exist".format(args.fragments_index))
    try:
        all_contigs = contig_manager.primary_contigs(allow_sex_chromosomes=True)
        for contig in all_contigs:
            en = 0
            for chrom, start, end, bc, dups in parsed_fragments_from_contig(contig, args.fragments, index=args.fragments_index):
                if en >= FRAGMENTS_SCAN_SIZE:
                    break
                en += 1
    except:
        martian.exit("fragments index is not in sync with the fragments file")

    # aggr csv checks
    if args.aggregation_csv is not None:
        check_aggr_csv(args.aggregation_csv, args.reference_path, cursory=True)

    # cell barcode checks
    if args.cell_barcodes is not None:
        if not os.path.exists(args.cell_barcodes):
            martian.exit("{} does not exist".format(args.cell_barcodes))
        check_singlecell_format(args.cell_barcodes, species_list, whitelist_barcodes)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
Example #16
def join(args, outs, chunk_defs, chunk_outs):
    if args.fragments is None:
        outs.connect_matrix = None
        outs.gel_bead_doublet_summary = None
        outs.gel_bead_doublet_barcodes = None
        return

    with open(args.barcode_counts, "r") as infile:
        barcode_counts = Counter(json.load(infile))

    with open(chunk_defs[0].valid_barcodes, 'r') as f:
        valid_barcodes = np.array(f.readlines()[0].strip("\n").split(","))

    barcode_seqs, gem_groups = query_barcodes_and_gem_groups(valid_barcodes)
    barcode_seq_count = max([len(barcode_seqs[gg]) for gg in gem_groups])
    n_gem_groups = len(gem_groups)
    index_by_barcode = {
        gg: {bc: i
             for i, bc in enumerate(barcode_seqs[gg])}
        for gg in gem_groups
    }
    index_by_gg = {gg: i for i, gg in enumerate(gem_groups)}

    connect_matrix = np.zeros(
        (n_gem_groups, barcode_seq_count, barcode_seq_count), dtype=np.uint32)

    # This can be memory intensive: each chunk's matrix is loaded in full before it is summed into connect_matrix
    for chunk_out in chunk_outs:
        with open(chunk_out.connect_matrix, "r") as infile:
            connect_matrix += np.load(infile)

    # Write out the raw matrix
    with open(outs.connect_matrix, "w") as outfile:
        for gg in gem_groups:
            outfile.write(",".join([
                merge_barcode_and_gem_group(bc, gg) for bc in barcode_seqs[gg]
            ]))
            outfile.write("\n")
            for i in range(len(barcode_seqs[gg])):
                outfile.write(",".join(
                    (str(count)
                     for count in connect_matrix[index_by_gg[gg], i, :])))
                outfile.write("\n")

    # Identify mutual nearest neighbors as putative doublets
    putative_doublets = []
    for barcode in valid_barcodes:
        bc_seq, gg = split_barcode_and_gem_group(barcode)
        gg_index = index_by_gg[gg]
        bc_index = index_by_barcode[gg][bc_seq]
        neighbor = nearest_neighbor(connect_matrix, bc_index, gg_index)
        if nearest_neighbor(connect_matrix, neighbor, gg_index) == bc_index:
            if bc_index < neighbor:
                putative_doublets.append(
                    (barcode,
                     merge_barcode_and_gem_group(barcode_seqs[gg][neighbor],
                                                 gg)))

    # Generate the exclusions.  Note we write it out once per species since
    # cell calling is species-specific but these exclusions are not.
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()
    excluded_barcodes = {
        "label": "gel_bead_doublet",
        "data": {species: {}
                 for species in species_list}
    }
    for pair in putative_doublets:
        if barcode_counts[pair[0]] < barcode_counts[pair[1]]:
            excluded_bc, major_bc = pair
        else:
            major_bc, excluded_bc = pair
        for species in species_list:
            excluded_barcodes["data"][species][excluded_bc] = major_bc
    with open(outs.gel_bead_doublet_barcodes, "w") as outfile:
        outfile.write(json.dumps(excluded_barcodes))

    estimated_doublet_gelbeads = len(putative_doublets)

    metrics = {"putative_gelbead_doublets_found": estimated_doublet_gelbeads}
    with open(outs.gel_bead_doublet_summary, "w") as outfile:
        outfile.write(json.dumps(metrics))
Example #17
def main(args, outs):
    """Compute the depth and signal per library"""
    # read
    lib_id = args.n + 1
    aggr_df = pd.read_csv(args.aggr_csv, sep=',')
    library_info = {lib_id: {}}
    for label in aggr_df.columns.values.tolist():
        library_info[lib_id][label] = str(aggr_df.iloc[args.n][label])

    # if no normalization, don't waste compute
    if args.normalization is None:
        with open(outs.library_info, 'w') as f:
            pickle.dump(library_info, f)
        return

    # set ref properties
    ctg_mgr = ReferenceManager(args.reference_path)
    contig_lens = ctg_mgr.get_contig_lengths()
    max_contig_len = max(contig_lens.values())
    curr_chrom = None
    count_dict = Counter()
    chrom_len = 1
    half_window = WINDOW_SIZE // 2

    # traverse fragments file and count stats
    fragments_f = aggr_df.iloc[args.n]['fragments']
    Cuts = None
    special_normalization = (args.normalization
                             in ["signal_mean", "signal_noise_threshold"])
    if special_normalization:
        Cuts = np.zeros(max_contig_len, dtype='int32')
    for chrom, start, stop, bc, dups in open_fragment_file(
            filename=fragments_f):
        if chrom != curr_chrom:
            curr_chrom = chrom
            if chrom not in contig_lens:
                martian.exit(
                    "fragment {}:{}-{} in {} is mapped to a contig not in the reference"
                    .format(chrom, start, stop, fragments_f))
            if special_normalization:
                count_dict += Counter(
                    Cuts[i] for i in xrange(chrom_len)
                    if Cuts[i] > 0)  # only traverse chrom len
                Cuts[:] = 0  # reset and reuse
                chrom_len = contig_lens[chrom]
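        # add +/- half_window of coverage around each fragment end (cut site), clipped to the contig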
        if special_normalization:
            Cuts[max(0, start - half_window):min(start + half_window + 1, chrom_len)] += 1
            Cuts[max(0, stop - half_window):min(stop + half_window + 1, chrom_len)] += 1
    if special_normalization:
        count_dict += Counter(Cuts[i] for i in xrange(chrom_len)
                              if Cuts[i] > 0)  # only traverse chrom len

    scdf = pd.read_csv(library_info[lib_id]['cells'], sep=',')
    cell_mask = np.full(len(scdf), False)
    for species in ctg_mgr.list_species():
        cell_mask |= scdf['is_{}_cell_barcode'.format(species)] == 1
    library_info[lib_id]['total_fragments_per_cell'] = np.median(
        scdf[cell_mask]['total'] if 'total' in scdf[cell_mask].columns
        else scdf[cell_mask]['passed_filters'] + scdf[cell_mask]['duplicate'])
    library_info[lib_id]['unique_fragments_per_cell'] = np.median(
        scdf[cell_mask]['passed_filters'])

    # do peak calling fit on the count dict and get signal fit
    if args.normalization in ["signal_mean", "signal_noise_threshold"]:
        threshold, params = estimate_final_threshold(count_dict,
                                                     PEAK_ODDS_RATIO)
        library_info[lib_id]['original_threshold'] = threshold
        library_info[lib_id]['signal_mean'] = 1 / params.p_signal

    # dump library info
    with open(outs.library_info, 'w') as f:
        pickle.dump(library_info, f)
Example #18
def check_aggr_csv(aggr_csv, reference_path, cursory=False):
    """Check aggr csv has correct columns, then progressively stronger checks on duplicates and formating of files.
    These stronger checks are enabled by default, unless you want to test the basic minimum, for example in reanalyzer"""
    contig_manager = ReferenceManager(reference_path)

    # aggr_csv checks
    exists_and_readable(aggr_csv, "aggr_csv")

    if cursory:
        nlibs, library_info, msg = parse_aggr_csv(aggr_csv, whitelist=["library_id"], blacklist=None)
    else:
        nlibs, library_info, msg = parse_aggr_csv(aggr_csv)
    if msg is not None:
        martian.exit(msg)

    # At least one library should be there
    if nlibs == 0:
        martian.exit("aggregation csv does not include any library. Provide at least two libraries.")

    if cursory:
        return
    # Enable aggr(count1) to run
    if nlibs == 1:
        martian.log_info("Aggregator should be run on more than one library")

    # avoid aggr of duplicate files (assessed by filename).
    species_list = contig_manager.list_species()
    for aggr_key in library_info[1]:  # at least one library is present
        files = {}
        for lib_id in library_info:
            fname = library_info[lib_id][aggr_key]
            if fname in files:
                martian.exit("File {} already specified for a different library under {}".format(fname, aggr_key))

            # singlecell.csv should contain 'barcode' and 'is_{}_cell_barcode' columns with the correct type
            if aggr_key == "cells":
                check_singlecell_format(fname, species_list, allow_multi_gem_groups=False)

            # peaks.bed need to be formatted correctly with right contigs if provided in aggr.csv
            # also check if peaks are non overlapping
            if aggr_key == "peaks":
                exists_and_readable(fname, "peaks")
                bed_format_checker(fname, contig_manager.fasta_index)
                contain_three_columns(fname)
                if is_overlapping(fname):
                    martian.exit("{} contains overlapping peak regions".format(fname))

            # checks on fragments
            contig_lens = contig_manager.get_contig_lengths()
            if aggr_key == "fragments":
                observed_gem_groups = set()
                observed_species = set()
                exists_and_readable(fname, "fragments")
                en = 0
                for chrom, start, stop, bc, _ in open_fragment_file(fname):
                    if en >= FRAGMENTS_SCAN_SIZE:
                        break
                    spec = chrom.split("_")
                    observed_species.add(spec[0] if spec[0] != chrom else "")
                    observed_gem_groups.add(bc.split("-")[1])
                    if chrom not in contig_lens:
                        martian.exit("fragment {}:{}-{} in {} is mapped to a contig not in the reference".format(chrom, start, stop, fname))
                    if stop > contig_lens[chrom]:
                        martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)".format(chrom, start, stop, contig_lens[chrom]))
                    en += 1
                for species in observed_species:
                    if species not in species_list:
                        martian.exit("{} contains fragments mapped to species not recognized in the reference".format(fname))
                if len(observed_gem_groups) > 1:
                    martian.exit("multiple gem groups present in {}, likely generated in a previous aggregation run".format(fname))
Example #19
def get_counts_by_barcode(reference_path, peaks, fragments, fragments_index=None, contig=None, known_cells=None):
    """Generate targeting, raw and dup counts per barcode. If cell identity is known, then also return that as part of
    the counts
    """
    def load_reference_track(track, padding=0):
        if track is not None:
            with open(track, 'r') as infile:
                regions = regtools.get_target_regions(infile, padding=padding)
        else:
            regions = None
        return regions

    def point_is_in_target(contig, position, target_regions):
        if target_regions is None:
            return False
        if contig not in target_regions:
            return False
        return target_regions[contig].contains_point(position)

    def fragment_overlaps_target(contig, start, stop, target_regions):
        if target_regions is None:
            return False
        if contig not in target_regions:
            return False
        return target_regions[contig].overlaps_region(start, stop)

    ref_manager = ReferenceManager(reference_path)

    # Load in and pad TSS/CTCF regions if present
    tss_regions = load_reference_track(ref_manager.tss_track, padding=2000)
    ctcf_regions = load_reference_track(ref_manager.ctcf_track, padding=250)

    # Load in regions from reference-associated tracks
    dnase_regions = load_reference_track(ref_manager.dnase_track)
    enhancer_regions = load_reference_track(ref_manager.enhancer_track)
    promoter_regions = load_reference_track(ref_manager.promoter_track)
    blacklist_regions = load_reference_track(ref_manager.blacklist_track)
    peak_regions = load_reference_track(peaks)

    # load cell - species map
    cell_barcodes = {}
    species_list = ref_manager.list_species()
    if known_cells is not None:
        with open(known_cells, 'r') as infile:
            for line in infile:
                items = line.strip("\n").split(",")
                for barcode in items[1:]:
                    if barcode != "null":
                        if barcode not in cell_barcodes:
                            cell_barcodes[barcode] = []
                        cell_barcodes[barcode] += [items[0]]

    # get cell index
    cell_index = {}
    spnum = {species: 0 for species in species_list}
    for species in species_list:
        for barcode in cell_barcodes:
            if species in cell_barcodes[barcode]:
                label = "{}_cell_{}".format(species, spnum[species])
                spnum[species] += 1
                cell_index[barcode] = label if barcode not in cell_index else '_'.join([cell_index[barcode], label])

    counts_by_barcode = {}
    tss_relpos = Counter()
    ctcf_relpos = Counter()

    read_count = 0

    iterator = open_fragment_file(fragments) if contig is None else \
        parsed_fragments_from_contig(contig, fragments, index=fragments_index)
    for contig, start, stop, barcode, dups in iterator:
        read_count += 2
        if barcode not in counts_by_barcode:
            counts_by_barcode[barcode] = Counter()
            if known_cells is not None:
                cell_species = cell_barcodes.get(barcode, [])
                counts_by_barcode[barcode]["cell_id"] = cell_index.get(barcode, "None")
                for species in species_list:
                    if species in cell_species:
                        counts_by_barcode[barcode]["is_{}_cell_barcode".format(species)] = 1
                    else:
                        counts_by_barcode[barcode]["is_{}_cell_barcode".format(species)] = 0

        # species splits
        if known_cells is not None and len(species_list) > 1:
            contig_species = ref_manager.species_from_contig(contig)
            counts_by_barcode[barcode]["passed_filters_{}".format(contig_species)] += 1
            if fragment_overlaps_target(contig, start, stop, peak_regions):
                counts_by_barcode[barcode]["peak_region_fragments_{}".format(contig_species)] += 1

        # raw mapping
        counts_by_barcode[barcode]["passed_filters"] += 1
        counts_by_barcode[barcode]["total"] += dups
        counts_by_barcode[barcode]["duplicate"] += dups - 1

        # Count up transposition site targeting
        for position in (start, stop):
            if point_is_in_target(contig, position, tss_regions):
                region = tss_regions[contig].get_region_containing_point(position)
                tss_relpos[region.get_relative_position(position)] += 1
            if point_is_in_target(contig, position, ctcf_regions):
                region = ctcf_regions[contig].get_region_containing_point(position)
                ctcf_relpos[region.get_relative_position(position)] += 1
            if point_is_in_target(contig, position, peak_regions):
                counts_by_barcode[barcode]["peak_region_cutsites"] += 1

        # Count up fragment overlap targeting
        is_targeted = False
        if fragment_overlaps_target(contig, start, stop, tss_regions):
            counts_by_barcode[barcode]["TSS_fragments"] += 1
            is_targeted = True
        if fragment_overlaps_target(contig, start, stop, dnase_regions):
            counts_by_barcode[barcode]["DNase_sensitive_region_fragments"] += 1
            is_targeted = True
        if fragment_overlaps_target(contig, start, stop, enhancer_regions):
            counts_by_barcode[barcode]["enhancer_region_fragments"] += 1
            is_targeted = True
        if fragment_overlaps_target(contig, start, stop, promoter_regions):
            counts_by_barcode[barcode]["promoter_region_fragments"] += 1
            is_targeted = True
        if is_targeted:
            counts_by_barcode[barcode]["on_target_fragments"] += 1
        if fragment_overlaps_target(contig, start, stop, blacklist_regions):
            counts_by_barcode[barcode]["blacklist_region_fragments"] += 1
        if fragment_overlaps_target(contig, start, stop, peak_regions):
            counts_by_barcode[barcode]["peak_region_fragments"] += 1
    return read_count, counts_by_barcode, tss_relpos, ctcf_relpos
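A hedged usage sketch with hypothetical file paths; passing contig and fragments_index instead restricts the scan to a single contig:

# Hypothetical inputs; scans the whole fragments file across all contigs.
read_count, counts, tss_relpos, ctcf_relpos = get_counts_by_barcode(
    '/refs/GRCh38', 'outs/peaks.bed', 'outs/fragments.tsv.gz')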
Example #20
def main(args, outs):
    reference = ReferenceManager(args.reference_path)
    species_list = reference.list_species()
    is_barnyard = len(species_list) > 1 and args.singlecell is not None

    singlecell_df = pd.read_csv(
        args.singlecell) if args.singlecell is not None else None

    summary_data = None
    if args.summary_results:
        with open(args.summary_results, 'r') as infile:
            summary_data = json.load(infile)

    # Pull up the correct template information
    template_path = os.path.dirname(os.path.abspath(__file__))
    template_file = os.path.join(
        template_path,
        '{}{}.html'.format('barnyard' if is_barnyard else 'single',
                           '_debug' if args.debug else ''))
    with open(template_file, 'r') as infile:
        template = infile.read()

    metadata = MetricAnnotations()
    websummary_data = {
        'alarms': {
            'alarms': []
        },
        'sample': {
            'id': args.sample_id,
            'description': args.sample_desc,
            'pipeline': "Cell Ranger ATAC"
        }
    }

    # Pull out all the general-purpose information
    add_data(
        websummary_data,
        get_hero_metric_data(metadata, summary_data, species_list, args.debug))
    add_data(websummary_data, get_pipeline_info(args, reference, args.debug))
    add_data(
        websummary_data,
        get_sequencing_info(metadata, summary_data, species_list, args.debug))
    add_data(
        websummary_data,
        get_cell_metrics_data(metadata, summary_data, species_list,
                              singlecell_df, args.excluded_barcodes,
                              args.debug))
    add_data(
        websummary_data,
        get_clustering_plots(args.analysis, args.filtered_peak_bc_matrix,
                             species_list, singlecell_df, is_barnyard))
    add_data(
        websummary_data,
        get_insertsize_data(metadata, summary_data, singlecell_df,
                            args.insert_sizes, species_list, args.debug))
    add_data(
        websummary_data,
        get_targeting_data(metadata, summary_data, species_list, singlecell_df,
                           args.tss_relpos, args.ctcf_relpos, args.debug,
                           DOWNSAMPLE_TARGETING))
    add_data(
        websummary_data,
        get_complexity_data(metadata, summary_data, args.bulk_complexity,
                            args.singlecell_complexity, species_list,
                            args.debug))

    # For barnyard samples only
    if is_barnyard:
        add_data(
            websummary_data,
            get_barnyard_data(metadata, summary_data, species_list,
                              singlecell_df, args.debug, DOWNSAMPLE_BARNYARD))

    # For PD runs only
    if args.debug:
        add_data(
            websummary_data,
            get_peakcalling_data(metadata, summary_data, species_list,
                                 args.debug))
        add_data(
            websummary_data,
            get_wasted_data(metadata, summary_data, singlecell_df,
                            species_list, args.debug))
        add_data(
            websummary_data,
            get_master_table(metadata, summary_data, species_list, is_barnyard,
                             args.debug))

    # Modify the plot titles to add the sample ID/description and apply consistent plot styling
    for key, subdata in websummary_data.iteritems():
        if "layout" in subdata:
            subdata["layout"][
                "title"] += '<br><sup>Sample {} - {}</sup>'.format(
                    args.sample_id, args.sample_desc)
            subdata["layout"]["hovermode"] = "closest"
            subdata["config"] = PLOT_CONFIG_KWARGS

    with open(outs.web_summary, 'w') as outfile:
        summarize.generate_html_summary(websummary_data, template,
                                        template_path, outfile)
Example #21
def main(args, outs):
    metrics = {}
    for fname in args.metrics:
        if fname is not None:
            with open(fname, 'r') as f:
                metrics.update(json.load(f))

    # Normalize "NaN" values
    for key in metrics:
        value = metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            metrics[key] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if len(metrics) > 0:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics
    # load library info and fake libraries as species
    metric_registry = MetricAnnotations()
    metrics_csv_dict = {}
    if args.library_info is not None:
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_list = [library_info[n]['library_id'] for n in library_info.keys()]
        metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=library_list))

    # load species level metrics
    ctg_mgr = ReferenceManager(args.reference_path)
    metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=ctg_mgr.list_species()))
    write_dict_to_csv(outs.metrics_csv, metrics_csv_dict, sort=True)
Example #22
def join(args, outs, chunk_defs, chunk_outs):
    if args.fragments is None:
        outs.barcode_multiplets = None
        outs.barcode_multiplets_summary = None
        return

    with open(args.barcode_counts, "r") as infile:
        barcode_counts = Counter(json.load(infile))

    valid_barcodes = barcode_counts.keys()
    part_a_seqs, part_c_seqs, part_b_seqs, gem_group_seqs = query_barcode_subsequences(
        valid_barcodes)

    part_a_count = max([len(part_a_seqs[c]) for c in part_c_seqs])
    part_b_count = max([len(part_b_seqs[c]) for c in part_c_seqs])
    part_c_count = len(part_c_seqs)

    index_by_part_a = {
        part_c: {part_a: i
                 for i, part_a in enumerate(part_a_seqs[part_c])}
        for part_c in part_c_seqs
    }
    index_by_part_b = {
        part_c: {part_b: i
                 for i, part_b in enumerate(part_b_seqs[part_c])}
        for part_c in part_c_seqs
    }
    index_by_part_c = {part_c: i for i, part_c in enumerate(part_c_seqs)}

    part_a_linkage_matrix = np.zeros(
        (part_c_count, part_b_count, part_a_count, part_a_count),
        dtype=np.uint32)
    part_b_linkage_matrix = np.zeros(
        (part_c_count, part_a_count, part_b_count, part_b_count),
        dtype=np.uint32)

    # Search for contaminants: barcodes whose similarity to a major barcode (with
    # some minimum signal) is higher than their self-similarity.
    barcode_multiplets = {}

    # group chunks by gem group and aggregate across contigs for post-processing
    for gem_group_seq in gem_group_seqs:
        part_a_linkage_matrix[:, :, :, :] = 0
        part_b_linkage_matrix[:, :, :, :] = 0

        for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
            if gem_group_seq != chunk_in.gem_group:
                continue

            # aggregate across contigs
            infile = gzip.GzipFile(chunk_out.part_a_linkage_matrix, 'r')
            part_a_linkage_matrix += np.load(infile)
            infile.close()

            infile = gzip.GzipFile(chunk_out.part_b_linkage_matrix, 'r')
            part_b_linkage_matrix += np.load(infile)
            infile.close()

        for major_barcode, count in barcode_counts.iteritems():
            if count < MINIMUM_COUNT:
                continue
            part_a, part_c, part_b, gem_group = split_barcode(major_barcode,
                                                              return_gg=True)
            if gem_group != gem_group_seq:
                continue

            part_a_index = index_by_part_a[part_c][part_a]
            part_b_index = index_by_part_b[part_c][part_b]
            part_c_index = index_by_part_c[part_c]

            for other_part_a in part_a_seqs[part_c]:
                if other_part_a == part_a:
                    continue
                minor_barcode = merge_barcode(other_part_a, part_c, part_b,
                                              gem_group)

                other_part_a_index = index_by_part_a[part_c][other_part_a]
                self_signal = part_a_linkage_matrix[part_c_index, part_b_index,
                                                    other_part_a_index,
                                                    other_part_a_index]
                major_signal = part_a_linkage_matrix[part_c_index,
                                                     part_b_index,
                                                     other_part_a_index,
                                                     part_a_index]
                if major_signal > (self_signal *
                                   SELF_SIGNAL_THRESHOLD_MULTIPLIER):
                    if minor_barcode not in barcode_multiplets:
                        barcode_multiplets[minor_barcode] = major_barcode
                    else:
                        old_major = barcode_multiplets[minor_barcode]
                        old_a, _, _ = split_barcode(old_major)
                        old_a_index = index_by_part_a[part_c][old_a]
                        old_signal = part_a_linkage_matrix[part_c_index,
                                                           part_b_index,
                                                           other_part_a_index,
                                                           old_a_index]
                        if major_signal > old_signal:
                            barcode_multiplets[minor_barcode] = major_barcode

            for other_part_b in part_b_seqs[part_c]:
                if other_part_b == part_b:
                    continue
                minor_barcode = merge_barcode(part_a, part_c, other_part_b,
                                              gem_group)

                other_part_b_index = index_by_part_b[part_c][other_part_b]
                self_signal = part_b_linkage_matrix[part_c_index, part_a_index,
                                                    other_part_b_index,
                                                    other_part_b_index]
                major_signal = part_b_linkage_matrix[part_c_index,
                                                     part_a_index,
                                                     other_part_b_index,
                                                     part_b_index]
                if major_signal > (self_signal *
                                   SELF_SIGNAL_THRESHOLD_MULTIPLIER):
                    if minor_barcode not in barcode_multiplets:
                        barcode_multiplets[minor_barcode] = major_barcode
                    else:
                        old_major = barcode_multiplets[minor_barcode]
                        _, _, old_b = split_barcode(old_major)
                        old_b_index = index_by_part_b[part_c][old_b]
                        old_signal = part_b_linkage_matrix[part_c_index,
                                                           part_a_index,
                                                           other_part_b_index,
                                                           old_b_index]
                        if major_signal > old_signal:
                            barcode_multiplets[minor_barcode] = major_barcode

    # Post-screen the contaminants for pairs that are linked to each other.  In that
    # case, drop the entry that would exclude the larger of the two barcodes.
    for minor_barcode in barcode_multiplets.keys():
        if minor_barcode not in barcode_multiplets:
            # Because we've popped it off before we got here
            continue
        major_barcode = barcode_multiplets[minor_barcode]
        if major_barcode in barcode_multiplets and barcode_multiplets[
                major_barcode] == minor_barcode:
            if barcode_counts[major_barcode] > barcode_counts[minor_barcode]:
                barcode_multiplets.pop(major_barcode)
            else:
                barcode_multiplets.pop(minor_barcode)

    # Post-screen barcode multiplets for those where the major barcode is itself
    # linked to another barcode
    for minor_barcode, major_barcode in barcode_multiplets.iteritems():
        if major_barcode in barcode_multiplets:
            major_barcode = barcode_multiplets[major_barcode]
            barcode_multiplets[minor_barcode] = major_barcode

    # Generate the exclusions.  Note we write it out once per species since
    # cell calling is species-specific but these exclusions are not.
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()
    excluded_barcodes = {
        "label": "whitelist_contam",
        "data": {species: barcode_multiplets
                 for species in species_list}
    }
    with open(outs.barcode_multiplets, "w") as outfile:
        outfile.write(json.dumps(excluded_barcodes))

    # Generate some reporting metrics
    summary_metrics = {
        "putative_barcode_multiplets_found": len(barcode_multiplets),
    }
    with open(outs.barcode_multiplets_summary, "w") as outfile:
        outfile.write(json.dumps(summary_metrics))
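
The two post-screening passes above (resolving mutually linked pairs, then flattening minor-to-major chains) are easiest to follow on a toy mapping. The sketch below is a hypothetical, self-contained illustration with made-up barcodes and counts; it mirrors the logic above but does not use the pipeline's data structures.

# Hypothetical illustration (not pipeline code): resolve mutual pairs, then flatten chains.
multiplets = {"A": "B", "B": "A", "C": "D", "D": "E"}   # minor barcode -> major barcode
counts = {"A": 100, "B": 500, "C": 50, "D": 200, "E": 900}

# Pass 1: for mutually linked pairs, drop the entry that would exclude the larger barcode.
for minor in list(multiplets.keys()):
    if minor not in multiplets:
        continue  # already removed while handling its partner
    major = multiplets[minor]
    if multiplets.get(major) == minor:
        multiplets.pop(major if counts[major] > counts[minor] else minor)

# Pass 2: if a major barcode is itself flagged as a minor barcode, follow the link once.
for minor in list(multiplets.keys()):
    major = multiplets[minor]
    if major in multiplets:
        multiplets[minor] = multiplets[major]

print(multiplets)  # {'A': 'B', 'C': 'E', 'D': 'E'} (key order may vary)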
Ejemplo n.º 23
0
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.fragments is None:
        outs.cell_barcodes = None
        outs.cell_calling_summary = None
        outs.singlecell = None
        return

    if args.excluded_barcodes is not None:
        with open(args.excluded_barcodes, 'r') as infile:
            excluded_barcodes = json.load(infile)
    else:
        excluded_barcodes = None

    # Merge the chunk inputs
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    barcode_counts_by_species = {
        species: Counter()
        for species in species_list
    }
    targeted_counts_by_species = {
        species: Counter()
        for species in species_list
    }
    fragment_depth = 0
    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        species = ref.species_from_contig(chunk_in.contig)
        with open(chunk_out.barcode_counts, 'r') as infile:
            barcode_counts_by_species[species] += pickle.load(infile)
        with open(chunk_out.targeted_counts, 'r') as infile:
            targeted_counts_by_species[species] += pickle.load(infile)
        fragment_depth += chunk_out.fragment_depth
    print('Total fragments across all chunks: {}'.format(fragment_depth))

    barcodes = list({
        bc
        for species in species_list
        for bc in barcode_counts_by_species[species]
    })
    non_excluded_barcodes = {
        species: [bc for bc in barcodes
                  if excluded_barcodes is None or bc not in excluded_barcodes[species]]
        for species in species_list
    }
    print('Total barcodes observed: {}'.format(len(barcodes)))

    retained_counts = {}
    for species in species_list:
        if excluded_barcodes is None:
            retained_counts[species] = np.array(
                [targeted_counts_by_species[species][bc] for bc in barcodes])
        else:
            retained_counts[species] = np.array([
                targeted_counts_by_species[species][bc] for bc in barcodes
                if bc not in excluded_barcodes[species]
            ])
            print('Barcodes excluded for species {}: {}'.format(
                species, len(excluded_barcodes[species])))
            print('Barcodes remaining for species {}: {}'.format(
                species, len(non_excluded_barcodes[species])))

    parameters = {}

    whitelist_length = len(load_barcode_whitelist(args.barcode_whitelist))
    count_shift = max(
        MINIMUM_COUNT,
        int(fragment_depth * WHITELIST_CONTAM_RATE / whitelist_length))
    print('Count shift for whitelist contamination: {}'.format(count_shift))

    for (species, count_data) in retained_counts.iteritems():
        print('Analyzing species {}'.format(species))
        # Subtract count_shift from all counts to remove the effects of whitelist contamination
        shifted_data = count_data[count_data >= count_shift] - count_shift
        print('Number of barcodes analyzed: {}'.format(len(shifted_data)))
        count_dict = Counter(shifted_data)
        parameters[species] = {}

        forced_cell_count = None
        if args.force_cells is not None:
            if species in args.force_cells:
                forced_cell_count = int(args.force_cells[species])
            elif "default" in args.force_cells:
                forced_cell_count = int(args.force_cells["default"])
            if (forced_cell_count is not None
                    and forced_cell_count > MAXIMUM_CELLS_PER_SPECIES):
                martian.log_info(
                    'Attempted to force cells to {}.  Overriding to maximum allowed cells.'
                    .format(forced_cell_count))
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES

        # Initialize parameters to empty
        parameters[species]['noise_mean'] = None
        parameters[species]['noise_dispersion'] = None
        parameters[species]['signal_mean'] = None
        parameters[species]['signal_dispersion'] = None
        parameters[species]['fraction_noise'] = None
        parameters[species]['cell_threshold'] = None
        parameters[species]['goodness_of_fit'] = None
        parameters[species]['estimated_cells_present'] = 0

        # Corner case where FRIP is 0 because the number of peaks is tiny (fuzzer tests)
        if len(count_dict) < 10:
            parameters[species]['cells_detected'] = 0
            forced_cell_count = None
        elif forced_cell_count is None:
            print('Estimating parameters')
            fitted_params = estimate_parameters(count_dict)
            signal_threshold = estimate_threshold(
                fitted_params, CELL_CALLING_THRESHOLD) + count_shift
            print('Primary threshold: {}'.format(signal_threshold))
            parameters[species]['noise_mean'] = fitted_params.mu_noise
            parameters[species]['noise_dispersion'] = fitted_params.alpha_noise
            parameters[species]['signal_mean'] = fitted_params.mu_signal
            parameters[species][
                'signal_dispersion'] = fitted_params.alpha_signal
            parameters[species]['fraction_noise'] = fitted_params.frac_noise
            parameters[species]['cell_threshold'] = signal_threshold
            parameters[species]['goodness_of_fit'] = goodness_of_fit(
                shifted_data, fitted_params)
            called_cell_count = np.sum(count_data >= signal_threshold)
            parameters[species]['cells_detected'] = called_cell_count
            parameters[species]['estimated_cells_present'] = int(
                (1 - fitted_params.frac_noise) * len(shifted_data))
            if called_cell_count > MAXIMUM_CELLS_PER_SPECIES:
                # Abort the model fitting and instead force cells to the maximum
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES

        if forced_cell_count is not None:
            print('Forcing cells to {}'.format(forced_cell_count))

            if forced_cell_count <= 0:
                raise ValueError("Force cells must be positive")
            else:
                adj_data = shifted_data[shifted_data > 0]
                print('Total barcodes considered for forcing cells: {}'.format(
                    len(adj_data)))
                if forced_cell_count >= len(adj_data):
                    parameters[species]['cell_threshold'] = min(adj_data)
                else:
                    parameters[species]['cell_threshold'] = sorted(
                        adj_data, reverse=True)[forced_cell_count - 1]
                parameters[species]['cell_threshold'] += count_shift
                parameters[species]['cells_detected'] = np.sum(
                    count_data >= parameters[species]['cell_threshold'])

    # For barnyard samples, mask out the noise distribution and re-fit to get cleaner separation
    if len(retained_counts) == 2 and (args.force_cells is None
                                      or not args.force_cells):
        print('Estimating secondary thresholds')
        sp1, sp2 = species_list

        sp1_threshold = (-1 if parameters[sp1]['cell_threshold'] is None
                         else parameters[sp1]['cell_threshold'])
        sp2_threshold = (-1 if parameters[sp2]['cell_threshold'] is None
                         else parameters[sp2]['cell_threshold'])

        if parameters[sp1]['cell_threshold'] is not None:
            sp1_counts = np.array([
                targeted_counts_by_species[sp1][bc]
                for bc in non_excluded_barcodes[sp1]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and (
                    targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp1_params = estimate_parameters(Counter(sp1_counts),
                                             threshold=sp1_threshold)
            if not np.isnan(sp1_params.frac_noise):
                parameters[sp1]['cell_threshold'] = max(
                    sp1_threshold, estimate_threshold(sp1_params, 20))
            parameters[sp1]['cells_detected'] = np.sum(
                sp1_counts >= parameters[sp1]['cell_threshold'])
        else:
            parameters[sp1]['cells_detected'] = 0

        if parameters[sp2]['cell_threshold'] is not None:
            sp2_counts = np.array([
                targeted_counts_by_species[sp2][bc]
                for bc in non_excluded_barcodes[sp2]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and (
                    targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp2_params = estimate_parameters(Counter(sp2_counts),
                                             threshold=sp2_threshold)
            if not np.isnan(sp2_params.frac_noise):
                parameters[sp2]['cell_threshold'] = max(
                    sp2_threshold, estimate_threshold(sp2_params, 20))
            parameters[sp2]['cells_detected'] = np.sum(
                sp2_counts >= parameters[sp2]['cell_threshold'])
        else:
            parameters[sp2]['cells_detected'] = 0

        print('Secondary threshold ({}): {}'.format(
            sp1, parameters[sp1]['cell_threshold']))
        print('Secondary threshold ({}): {}'.format(
            sp2, parameters[sp2]['cell_threshold']))

    print('Writing out cell barcodes')
    cell_barcodes = {}
    for (species, count_data) in retained_counts.iteritems():
        threshold = parameters[species]['cell_threshold']
        cell_barcodes[species] = {}
        print('Cell threshold for species {}: {}'.format(species, threshold))
        if threshold is not None:
            for count, barcode in zip(count_data,
                                      non_excluded_barcodes[species]):
                if count >= threshold:
                    print('{} - Total {}, Targeted {}, Count {}, Threshold {}'.
                          format(barcode,
                                 barcode_counts_by_species[species][barcode],
                                 targeted_counts_by_species[species][barcode],
                                 count, threshold))
                    cell_barcodes[species][barcode] = count
        if len(cell_barcodes[species]
               ) != parameters[species]['cells_detected']:
            print(len(cell_barcodes[species]),
                  parameters[species]['cells_detected'])
            raise ValueError(
                'Mismatch in called cells identified - failure in threshold setting'
            )
        print('Selected {} barcodes of species {}'.format(
            len(cell_barcodes[species]), species))

    with open(outs.cell_barcodes, 'w') as outfile:
        # write one line per species: the species name followed by its cell barcodes
        for species in cell_barcodes.keys():
            outfile.write(species + ",")
            outfile.write(",".join(cell_barcodes[species]) + "\n")

    cell_index = compute_cell_index(species_list, cell_barcodes)

    with open(outs.singlecell, 'w') as outfile:
        outfile.write("barcode,cell_id,")
        outfile.write(",".join([
            "is_{}_cell_barcode".format(species) for species in species_list
        ]))
        if len(species_list) > 1:
            for species in species_list:
                outfile.write(",passed_filters_{}".format(species))
                outfile.write(",peak_region_fragments_{}".format(species))
        outfile.write("\n")
        for barcode in [NO_BARCODE] + sorted(barcodes):
            outfile.write("{},".format(barcode))
            outfile.write("{},".format(cell_index.get(barcode, "None")))
            values = [
                str(
                    int(species in cell_barcodes
                        and barcode in cell_barcodes[species]))
                for species in species_list
            ]
            outfile.write(",".join(values))
            if len(species_list) > 1:
                for species in species_list:
                    outfile.write(",{:d}".format(
                        barcode_counts_by_species[species][barcode]))
                    outfile.write(",{:d}".format(
                        targeted_counts_by_species[species][barcode]))
            outfile.write("\n")

    # process data into summary metrics
    summary_info = {}
    summary_info.update(
        generate_cell_calling_metrics(parameters, cell_barcodes))
    summary_info.update(generate_gb_metrics(cell_barcodes, excluded_barcodes))

    with open(outs.cell_calling_summary, 'w') as outfile:
        outfile.write(json.dumps(summary_info, indent=4))
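
The count shift in the example above scales a whitelist-contamination floor with total sequencing depth before the signal/noise fit. A hypothetical worked example, using made-up values for MINIMUM_COUNT, WHITELIST_CONTAM_RATE, and the whitelist size (the pipeline's own constants may differ):

# Hypothetical values for illustration only; the pipeline defines its own constants.
MINIMUM_COUNT = 5
WHITELIST_CONTAM_RATE = 0.02
whitelist_length = 737280          # e.g. number of barcodes in the whitelist
fragment_depth = 400 * 10**6       # total fragments summed across chunks

count_shift = max(MINIMUM_COUNT,
                  int(fragment_depth * WHITELIST_CONTAM_RATE / whitelist_length))
print(count_shift)  # int(8e6 / 737280) = 10, so the depth-scaled term wins over MINIMUM_COUNT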
Ejemplo n.º 24
0
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.fragments is None:
        outs.low_targeting_barcodes = None
        outs.low_targeting_summary = None
        return

    # Merge the chunk inputs
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    barcode_counts_by_species = {species: Counter() for species in species_list}
    targeted_counts_by_species = {species: Counter() for species in species_list}

    peak_bp_by_species = {species: 0 for species in species_list}
    genome_bp_by_species = {species: 0 for species in species_list}

    fragment_lengths = {padding: Counter() for padding in PADDING_VALUES}
    covered_bases = {padding: Counter() for padding in PADDING_VALUES}

    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        species = ref.species_from_contig(chunk_in.contig)

        with open(chunk_out.fragment_counts, "r") as infile:
            barcode_counts_by_species[species] += pickle.load(infile)
        with open(chunk_out.targeted_counts, "r") as infile:
            targeted_counts_by_species[species] += pickle.load(infile)

        with open(chunk_out.fragment_lengths, "r") as infile:
            data = pickle.load(infile)
            for padding in PADDING_VALUES:
                fragment_lengths[padding] += data[padding]

        with open(chunk_out.covered_bases, "r") as infile:
            data = pickle.load(infile)
            for padding in PADDING_VALUES:
                covered_bases[padding] += data[padding]

        peak_bp_by_species[species] += chunk_out.peak_coverage
        genome_bp_by_species[species] += ref.contig_lengths[chunk_in.contig]

    frac_genome_in_peaks_by_species = {
        species: peak_bp_by_species[species] / genome_bp_by_species[species]
        for species in species_list
    }

    # Identify barcodes that have lower fraction of reads overlapping peaks than the
    # genomic coverage of the peaks
    low_targeting_barcodes = {
        "label": "low_targeting",
        "data": {species: {} for species in species_list}
    }
    for species in species_list:
        for barcode, total_count in barcode_counts_by_species[species].iteritems():
            barcode_frac_peaks = (
                targeted_counts_by_species[species][barcode] / total_count
            )
            if barcode_frac_peaks < frac_genome_in_peaks_by_species[species]:
                low_targeting_barcodes["data"][species][barcode] = barcode_frac_peaks

    # Sum up the total fragment counts per barcode across all species
    total_barcode_counts = Counter()
    for species, barcode_counts in barcode_counts_by_species.iteritems():
        total_barcode_counts += barcode_counts
    with open(outs.barcode_counts, "w") as outfile:
        outfile.write(json.dumps(total_barcode_counts, indent=4))

    summary_data = {}
    for species in species_list:
        key_suffix = "" if len(species_list) == 1 else "_{}".format(species)
        summary_data["number_of_low_targeting_barcodes{}".format(key_suffix)] = len(
            low_targeting_barcodes["data"][species]
        )
        summary_data[
            "fraction_of_genome_within_{}bp_of_peaks{}".format(DISTANCE, key_suffix)
        ] = frac_genome_in_peaks_by_species[species]
    with open(outs.low_targeting_summary, "w") as outfile:
        outfile.write(json.dumps(summary_data, indent=4))
    with open(outs.low_targeting_barcodes, "w") as outfile:
        outfile.write(json.dumps(low_targeting_barcodes, indent=4))
    with open(outs.fragment_lengths, "w") as outfile:
        outfile.write(json.dumps(fragment_lengths, indent=4))
    with open(outs.covered_bases, "w") as outfile:
        outfile.write(json.dumps(covered_bases, indent=4))
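
The exclusion rule in the example above flags a barcode when its fraction of fragments overlapping peaks is lower than the fraction of the genome covered by peaks, i.e. when it targets peaks no better than random genomic coverage. A minimal, hypothetical sketch of that comparison with made-up numbers:

from __future__ import division  # true division for the per-barcode fractions under Python 2

# Hypothetical counts for one species; the real counts are merged from the chunk outputs.
total_fragments = {"AAACGGCG-1": 1000, "TTTCCAGT-1": 800}
fragments_in_peaks = {"AAACGGCG-1": 400, "TTTCCAGT-1": 20}
frac_genome_in_peaks = 0.05        # peak bp / genome bp for this species

low_targeting = {}
for barcode, total in total_fragments.items():
    frac_in_peaks = fragments_in_peaks[barcode] / total
    if frac_in_peaks < frac_genome_in_peaks:
        low_targeting[barcode] = frac_in_peaks

print(low_targeting)  # {'TTTCCAGT-1': 0.025}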
Ejemplo n.º 25
0
def main(args, outs):
    reference = ReferenceManager(args.reference_path)

    martian.log_info('Writing analysis parameters')
    write_analysis_parameters(outs.analysis_params)

    martian.log_info('Initializing summary metrics')
    summary_metrics = {}
    summary_metrics = simple_load_metrics(summary_metrics, args.basic_results)

    if args.singlecell_results is not None:
        martian.log_info('Loading single cell results')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.singlecell_results)

    if args.insert_summary is not None:
        martian.log_info('Loading insert summary')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.insert_summary)

    if args.complexity_summary is not None:
        martian.log_info('Loading complexity summary')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.complexity_summary)

    if args.error_results_summary is not None:
        martian.log_info('Loading error summary')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.error_results_summary)

    if args.downsample_info is not None:
        martian.log_info('Loading downsampling information')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.downsample_info)

    if args.contam_results is not None:
        martian.log_info('Loading contamination results')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.contam_results)

    if args.peak_results is not None:
        martian.log_info('Loading peak results')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.peak_results)

    if args.enrichment_results is not None:
        martian.log_info('Loading TSS and CTCF scores')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.enrichment_results)

    if args.cell_calling_summary is not None:
        martian.log_info('Loading cell calling parameters')
        summary_metrics = simple_load_metrics(summary_metrics,
                                              args.cell_calling_summary)

    # Normalize "NaN" values
    for key in summary_metrics:
        value = summary_metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float)
                                   and np.isnan(value)):
            summary_metrics[key] = None

    if reference.metadata:
        # If we have reference metadata - copy over the data to summary.json
        for (key, value) in reference.metadata.items():
            summary_metrics["reference_" + key] = value

    martian.log_info('Writing out summary_metrics')
    with open(outs.summary, 'w') as outfile:
        outfile.write(
            tenkit.safe_json.safe_jsonify(summary_metrics, pretty=True))

    # compile summary.csv metrics
    metric_registry = MetricAnnotations()

    species_list = reference.list_species()
    summary_csv_dict = metric_registry.compile_summary_metrics(
        summary_metrics, species_list=species_list)
    write_dict_to_csv(outs.summary_csv, summary_csv_dict, sort=True)
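
The NaN normalization in the example above matters because json.dumps writes the bare token NaN for float("nan"), which is not valid JSON; mapping those values to None lets them serialize as null. A small, self-contained sketch of the same pattern, with hypothetical metric names:

import json
import math

# Hypothetical metrics dict; the key names are made up for illustration.
metrics = {"tss_enrichment_score": float("nan"), "frac_mapped_confidently": 0.97}

# Replace NaN values with None so they serialize as null instead of the invalid token NaN.
for key, value in metrics.items():
    if isinstance(value, float) and math.isnan(value):
        metrics[key] = None

print(json.dumps(metrics))  # {"tss_enrichment_score": null, "frac_mapped_confidently": 0.97} (key order may vary)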