Example #1
def split(args):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    chunks = []
    matrix_mem_gb = 0.
    if args.filtered_tf_bc_matrix is not None:
        matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_tf_bc_matrix) * 1.5
    matrix_mem_gb += cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(args.filtered_peak_bc_matrix)
    chunk_mem_gb = int(np.ceil(max(matrix_mem_gb, h5_constants.MIN_MEM_GB)))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    # create a chunk for each method x clustering x cluster combo
    for method in args.factorization:
        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, key)
            for cluster in set(clustering.clusters):
                chunks.append({
                    'method': method,
                    'clustering_key': key,
                    'cluster': cluster,
                    '__mem_gb': chunk_mem_gb,
                    '__vmem_gb': chunk_mem_gb + int(np.ceil(ctg_mgr.get_vmem_est())) + 1,
                    '__threads': 1,
                })

    return {'chunks': chunks, 'join': {'__mem_gb': 3}}
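
Most of the examples on this page follow the same Martian split contract seen above: inspect the stage inputs, estimate per-chunk memory, and return a dict of 'chunks' plus optional 'join' resources. Below is a minimal stand-alone sketch of that chunk-per-(method, cluster) shape, with made-up arguments rather than the real stage args.

import numpy as np

def split_sketch(matrix_mem_gb, methods, n_clusters, min_mem_gb=4):
    """Toy illustration of the chunking pattern above; not the real stage code."""
    chunk_mem_gb = int(np.ceil(max(matrix_mem_gb, min_mem_gb)))
    chunks = [{'method': m, 'cluster': c, '__mem_gb': chunk_mem_gb}
              for m in methods
              for c in range(1, n_clusters + 1)]
    return {'chunks': chunks, 'join': {'__mem_gb': 3}}

# split_sketch(2.5, ['lsa', 'pca'], 3) -> 6 chunks, each requesting 4 GB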
Example #2
def check_reference_format(reference_path):
    """Check file formats for files present in the reference"""
    try:
        contig_manager = ReferenceManager(reference_path)
    except Exception as e:
        martian.exit("Contig manager could not be initialized, Error:\n%s" % str(e))

    # formatting
    error_msg = contig_manager.verify_contig_defs()
    if error_msg is not None:
        martian.exit(error_msg)

    # file check: accessing the genes property exercises loading of the gene annotation file
    contig_manager.genes

    # check if motif file is in right format (naming convention)
    if len(contig_manager.list_species()) == 1:
        motif_format_checker(contig_manager.motifs)

    # checks for valid bed file formats in regions/
    faidx_file = os.path.join(reference_path, 'fasta', 'genome.fa.fai')

    bed_format_checker(contig_manager.tss_track, faidx_file)
    bed_format_checker(contig_manager.transcripts_track, faidx_file)
    bed_format_checker(contig_manager.ctcf_track, faidx_file)
    bed_format_checker(contig_manager.blacklist_track, faidx_file)
    bed_format_checker(contig_manager.dnase_track, faidx_file)
    bed_format_checker(contig_manager.enhancer_track, faidx_file)
    bed_format_checker(contig_manager.promoter_track, faidx_file)
Example #3
def main(args, outs):
    '''Find cut sites on a per chromosome basis and write out a bedgraph'''
    if args.fragments is None:
        outs.count_dict = None
        outs.cut_sites = None
        return

    ctg_mgr = ReferenceManager(args.reference_path)
    contig_len = ctg_mgr.get_contig_lengths()
    chrom_len = contig_len[args.contig]
    half_window = WINDOW_SIZE // 2
    Cuts = np.zeros(chrom_len, dtype='int32')

    # find windowed cut sites
    for _, start, stop, _, _ in parsed_fragments_from_contig(contig=args.contig, filename=args.fragments, index=args.fragments_index):
        Cuts[max(0, start - half_window): min(start + half_window + 1, chrom_len)] += 1
        Cuts[max(0, stop - half_window): min(stop + half_window + 1, chrom_len)] += 1

    # get count dict
    count_dict = Counter(v for v in Cuts if v > 0)
    with open(outs.count_dict, 'w') as count_dict_out:
        pickle.dump(count_dict, count_dict_out)

    # write bedgraph of windowed cut sites
    if len(count_dict):
        write_chrom_bedgraph(args.contig, chrom_len, Cuts, outs.cut_sites)
    else:
        outs.cut_sites = None
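
The windowed counting above relies on a numpy slice-increment idiom: every fragment end adds 1 to all positions within half a window of it, with clipping at the contig boundaries. A toy, self-contained version with made-up coordinates:

import numpy as np
from collections import Counter

chrom_len, half_window = 50, 2
cuts = np.zeros(chrom_len, dtype='int32')
for pos in [10, 12, 48]:  # toy cut positions
    cuts[max(0, pos - half_window): min(pos + half_window + 1, chrom_len)] += 1

# histogram of nonzero windowed coverage, analogous to count_dict above
print(Counter(int(v) for v in cuts if v > 0))  # Counter({1: 8, 2: 3})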
Example #4
def main(args, outs):
    metrics = {}
    for fname in args.metrics:
        if fname is not None:
            with open(fname, 'r') as f:
                metrics.update(json.load(f))

    # Normalize "NaN" values
    for key in metrics:
        value = metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            metrics[key] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if len(metrics) > 0:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics
    # load library info and treat each library as a "species" for metric compilation
    metric_registry = MetricAnnotations()
    metrics_csv_dict = {}
    if args.library_info is not None:
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_list = [library_info[n]['library_id'] for n in library_info.keys()]
        metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=library_list))

    # load species level metrics
    ctg_mgr = ReferenceManager(args.reference_path)
    metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=ctg_mgr.list_species()))
    write_dict_to_csv(outs.metrics_csv, metrics_csv_dict, sort=True)
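
The NaN normalization above matters because a plain json.dumps serializes float NaN as the bare token NaN, which is not valid JSON for downstream consumers; mapping it to None produces null instead. A standard-library-only illustration:

import json
import math

metrics = {'frac_mapped': 0.97, 'tss_enrichment': float('nan')}
print(json.dumps(metrics))  # {"frac_mapped": 0.97, "tss_enrichment": NaN} -- not strict JSON

cleaned = {k: (None if isinstance(v, float) and math.isnan(v) else v)
           for k, v in metrics.items()}
print(json.dumps(cleaned))  # {"frac_mapped": 0.97, "tss_enrichment": null}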
Example #5
def split(args):
    """Compute base background in split and use it in each chunk."""

    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0
    if len(ref_mgr.list_species()) > 1 or npeaks == 0 or ref_mgr.motifs is None:
        chunk_def = [{'skip': True}]
        return {'chunks': chunk_def}

    with open(args.globalGCdict, 'r') as f:
        GCdict = pickle.load(f)

    GCdict_paths = {}
    GCbins = sorted(GCdict.keys())
    for gc in GCbins:
        GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format(
            gc[0], gc[1]))
        with open(GCdict_paths[gc], 'w') as dump:
            pickle.dump(GCdict[gc], dump)

    # define one chunk per GC bin, each pointing at its pickled background
    mem_in_gb = 8
    chunk_def = [{'__mem_gb': mem_in_gb,
                  '__vmem_gb': mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1,
                  'skip': False,
                  'GCdict': GCdict_paths[chunk]} for chunk in GCbins]
    return {'chunks': chunk_def}
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or len(ref_mgr.list_species()) > 1:
        outs.filtered_tf_bc_matrix = None
        outs.filtered_tf_bc_matrix_mex = None
        outs.tf_propZ_matrix = None
        return

    # motif scan is completed in ANNOTATE_PEAKS

    peaks = BedTool(args.peaks)
    motifs = Motifs(args.reference_path)

    peak_motif_hits = BedTool(args.peak_motif_hits)

    # extract peak coordinate to numerical index map
    peak_idx, n_peaks = _get_peak_indexes(peaks)

    # extract motif names to numerical index map
    motif_idx, n_motifs = _get_motif_indexes(motifs)

    # extract 3 lists: peak indexes, motif indexes and counts; each entry corresponds to a peak-motif pair
    peak_coor, motif_coor, values = motifscan_bed_to_sparse_matrix(
        peak_motif_hits, peak_idx, motif_idx, format='binary')

    # convert it to a sparse matrix, default is binary format, motifs are rows and peaks are columns
    tf_peak_matrix = sp.csr_matrix((values, (motif_coor, peak_coor)),
                                   shape=(n_motifs, n_peaks),
                                   dtype='int32')

    # compute the motif-BC matrix via pooling
    # The current method simply counts the number of hits for a motif inside the peaks in a barcode
    # cast as a CountMatrix
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)
    motif_names = motif_idx.keys()
    barcodes = peak_matrix.bcs
    genomes = utils.generate_genome_tag(args.reference_path)
    motifs_def = atac_feature_ref.from_motif_list(motif_names, genomes)
    tf_matrix = cr_matrix.CountMatrix(motifs_def, barcodes,
                                      tf_peak_matrix * peak_matrix.m)

    # perform MAD-zscoring of proportion values
    propZ_matrix = np.array(tf_matrix.m / peak_matrix.m.sum(axis=0))
    propZ_matrix = MADzscore(propZ_matrix)

    outs.coerce_strings()

    # save to h5 and csv
    tf_matrix.save_h5_file(outs.filtered_tf_bc_matrix,
                           sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_tf_bc_matrix_mex):
        os.mkdir(outs.filtered_tf_bc_matrix_mex)
    atac_matrix.save_mex(
        tf_matrix,
        outs.filtered_tf_bc_matrix_mex,
        feature_type=cr_lib_constants.ATACSEQ_LIBRARY_DERIVED_TYPE,
        sw_version=martian.get_pipelines_version())
    # save propZ matrix as gz
    np.savetxt(outs.tf_propZ_matrix, propZ_matrix)
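
The pooling step above is a single sparse matrix product: a binary motif x peak matrix multiplied by a peak x barcode count matrix yields motif hit counts per barcode. A toy scipy.sparse version with made-up dimensions:

import numpy as np
import scipy.sparse as sp

tf_peak = sp.csr_matrix(np.array([[1, 0, 1],          # 2 motifs x 3 peaks, binary hits
                                  [0, 1, 0]], dtype='int32'))
peak_bc = sp.csr_matrix(np.array([[2, 0, 1, 0],       # 3 peaks x 4 barcodes, counts
                                  [0, 3, 0, 1],
                                  [1, 1, 0, 0]], dtype='int32'))

tf_bc = tf_peak * peak_bc  # motif x barcode counts
print(tf_bc.toarray())
# [[3 1 1 0]
#  [0 3 0 1]]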
Example #7
def generate_genome_tag(ref_path):
    """Replace empty genome name for single genomes with valid genome name"""
    # For a single species reference, use contents of <reference_path>/genome
    ref_contig_manager = ReferenceManager(ref_path)
    genomes = ref_contig_manager.list_species()
    if (len(genomes) == 1 and genomes[0] == '') or len(genomes) == 0:
        genomes = [ref_contig_manager.genome]
    return genomes
Example #8
def main(args, outs):
    reference = ReferenceManager(args.reference_path)
    species_list = reference.list_species()
    is_barnyard = len(species_list) > 1 and args.singlecell is not None

    summary_data = None
    if args.summary_results:
        with open(args.summary_results, 'r') as infile:
            summary_data = json.load(infile)

    # Pull up the correct template information
    template_path = os.path.dirname(os.path.abspath(__file__))
    template_file = os.path.join(
        template_path,
        '{}{}.html'.format('barnyard' if is_barnyard else 'single',
                           '_debug' if args.debug else ''))
    with open(template_file, 'r') as infile:
        template = infile.read()

    metadata = MetricAnnotations()
    websummary_data = {
        'alarms': {
            'alarms': []
        },
        'sample': {
            'id': args.sample_id,
            'description': args.sample_desc,
            'pipeline': "Cell Ranger ATAC Renalyzer"
        }
    }

    singlecell_df = pd.read_csv(
        args.singlecell) if args.singlecell is not None else None

    add_data(
        websummary_data,
        get_hero_metric_data(metadata, summary_data, species_list, args.debug))

    add_data(websummary_data, get_pipeline_info(args, reference, args.debug))

    add_data(
        websummary_data,
        get_clustering_plots(metadata, summary_data, args.analysis,
                             args.filtered_peak_bc_matrix, species_list,
                             singlecell_df, is_barnyard))

    # Modify plot titles to add the sample ID/description and apply consistent plot styling
    for key, subdata in websummary_data.iteritems():
        if "layout" in subdata:
            subdata["layout"]["title"] += '<br><sup>Sample {} - {}</sup>'.format(
                args.sample_id, args.sample_desc)
            subdata["layout"]["hovermode"] = "closest"
            subdata["config"] = PLOT_CONFIG_KWARGS

    with open(outs.web_summary, 'w') as outfile:
        summarize.generate_html_summary(websummary_data, template,
                                        template_path, outfile)
Example #9
def split(args):
    ref_mgr = ReferenceManager(args.reference_path)
    return {
        'chunks': [],
        'join': {
            '__mem_gb': 4,
            '__vmem_gb': int(np.ceil(ref_mgr.get_vmem_est())) + 3
        }
    }
Example #10
def main(args, outs):
    if args.singlecell_mapping is None or args.singlecell_targets is None or args.singlecell_cells is None:
        outs.singlecell = None
        outs.summary = None
        return

    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    # Merge the input singlecell data into a single dataframe and write it out
    mapping = pd.read_csv(args.singlecell_mapping)
    cells = pd.read_csv(args.singlecell_cells)
    targeting = pd.read_csv(args.singlecell_targets)

    merged = mapping.merge(cells,
                           how="left",
                           on="barcode",
                           sort=False,
                           validate="one_to_one")
    merged["cell_id"] = merged["cell_id"].fillna("None")
    for column in merged.columns:
        if column.endswith("_cell_barcode") or column.startswith(
                "passed_filters_") or column.startswith(
                    "peak_region_fragments_"):
            merged[column] = merged[column].fillna(0).astype(int)

    merged = merged.merge(targeting,
                          how="left",
                          on="barcode",
                          sort=False,
                          validate="one_to_one")
    keys = [
        "{}_fragments".format(region) for region in [
            "TSS", "DNase_sensitive_region", "enhancer_region",
            "promoter_region", "on_target", "blacklist_region", "peak_region"
        ]
    ] + ["peak_region_cutsites"]
    for column in keys:
        merged[column] = merged[column].fillna(0).astype(int)
    merged.to_csv(outs.singlecell, index=None)

    summary_info = {}

    summary_info = add_bulk_targeting_metrics(summary_info, merged,
                                              species_list)
    summary_info = add_doublet_rate_metrics(summary_info, merged, species_list)
    summary_info = add_purity_metrics(summary_info, merged, species_list)
    summary_info = add_bulk_mapping_metrics(summary_info, merged, species_list)
    summary_info = add_singlecell_sensitivity_metrics(summary_info, merged,
                                                      species_list)

    with open(outs.summary, 'w') as summary_file:
        summary_file.write(json.dumps(summary_info, indent=4))
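
The merges above follow one pandas pattern throughout: left-join per-barcode tables on 'barcode', then fill the per-barcode count columns that have no match with 0 and cast them back to int (a left join leaves them as NaN, which silently turns the column into float). A toy version with illustrative column names:

import pandas as pd

mapping = pd.DataFrame({'barcode': ['AAA-1', 'CCC-1', 'GGG-1'], 'total': [120, 45, 9]})
cells = pd.DataFrame({'barcode': ['AAA-1', 'CCC-1'], 'passed_filters': [100, 30]})

merged = mapping.merge(cells, how='left', on='barcode', sort=False, validate='one_to_one')
merged['passed_filters'] = merged['passed_filters'].fillna(0).astype(int)
print(merged)  # GGG-1 gets passed_filters == 0 rather than NaN, and the column stays integer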
Example #11
def split(args):
    if args.fragments is None:
        return {"chunks": [], "join": {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    chunks = []
    for contig in all_contigs:
        chunks.append({"contig": contig, "__mem_gb": 5})

    return {"chunks": chunks, "join": {"__mem_gb": 5}}
Example #12
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    chunks = []
    for contig in all_contigs:
        chunks.append({'contig': contig, '__mem_gb': 5})

    return {'chunks': chunks, 'join': {'__mem_gb': 5}}
Example #13
def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_peak_bc_matrix)
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(args.filtered_tf_bc_matrix) if args.filtered_tf_bc_matrix is not None else None
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}
    # for each method, we merge h5 files and copy csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv
        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):

            chunk_outs_def_method_clustering = sorted([[chunk_out, chunk_def] for
                                                       chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                                                       if chunk_def.clustering_key == key], key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # load 1 vs rest tests in sorted order of chunks and combine into one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3] for com in chunk_outs_method_clustering]))

            # write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # write out csv
            cr_diffexp.save_differential_expression_csv_from_features(key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(chunk_h5s, _h5, [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
                                                      analysis_constants.ANALYSIS_H5_MAP_DE[method]])
Example #14
def get_cell_barcodes(filename, ref, with_species=False):
    """Read singlecell.csv and emit barcodes"""
    scdf = pd.read_csv(filename, sep=',')
    ctg_mgr = ReferenceManager(ref)
    if not with_species:
        cell_barcodes = set()
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes.update(scdf[species_cell_mask]['barcode'].values.tolist())
    else:
        cell_barcodes = {}
        for species in ctg_mgr.list_species():
            species_cell_mask = scdf['is_{}_cell_barcode'.format(species)] == 1
            cell_barcodes[species] = set(scdf[species_cell_mask]['barcode'].values.tolist())
    return cell_barcodes
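
The masks above assume singlecell.csv carries one is_<species>_cell_barcode column per genome in the reference. A stand-alone illustration of the same masking on a toy frame, with no ReferenceManager and made-up species names:

import pandas as pd

scdf = pd.DataFrame({
    'barcode': ['AAA-1', 'CCC-1', 'GGG-1', 'TTT-1'],
    'is_hg19_cell_barcode': [1, 0, 1, 0],
    'is_mm10_cell_barcode': [0, 1, 0, 0],
})

cell_barcodes = {}
for species in ['hg19', 'mm10']:
    mask = scdf['is_{}_cell_barcode'.format(species)] == 1
    cell_barcodes[species] = set(scdf[mask]['barcode'])

print(cell_barcodes)  # hg19 -> {'AAA-1', 'GGG-1'}, mm10 -> {'CCC-1'}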
Example #15
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)
    contig_len = ctg_mgr.get_contig_lengths()
    BYTES_PER_INT32_WITH_SAFETY = 5

    chunks = []
    for contig in all_contigs:
        chunks.append({'contig': contig,
                       '__mem_gb': int(np.ceil(BYTES_PER_INT32_WITH_SAFETY * contig_len[contig] / 1024 / 1024 / 1024))})

    return {'chunks': chunks, 'join': {'__mem_gb': 5}}
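
As a worked example of the per-contig estimate above (using a roughly chr1-sized contig of about 249 Mb, a made-up figure for illustration): 5 bytes per position works out to about 1.16 GB, which np.ceil rounds up to a 2 GB chunk request. The float cast below guards against integer division under Python 2.

import numpy as np

BYTES_PER_INT32_WITH_SAFETY = 5
contig_len = 249000000  # ~human chr1, illustrative only
mem_gb = int(np.ceil(BYTES_PER_INT32_WITH_SAFETY * contig_len / float(1024 ** 3)))
print(mem_gb)  # 2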
Example #16
def split(args):
    if args.fragments is None:
        return {"chunks": [], "join": {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    with open(args.barcode_counts, "r") as infile:
        barcode_counts = Counter(json.load(infile))
    barcode_array = np.array([bc for bc in barcode_counts])
    gem_group_array = np.array(
        [get_barcode_gem_group(bc) for bc in barcode_counts])
    gem_groups = set(gem_group_array)
    frag_count_array = np.array([barcode_counts[bc] for bc in barcode_array])

    valid_barcodes = list()
    for gem_group in gem_groups:
        count_mask = (frag_count_array > MINIMUM_COUNTS) & (gem_group_array == gem_group)
        # keep at most the top N barcodes, ranked by fragment count (argsort the counts, not the barcode strings)
        topN_indices = frag_count_array[count_mask].argsort()[-min(MAXIMUM_BARCODES, len(count_mask)):]
        valid_barcodes.extend(list(barcode_array[count_mask][topN_indices]))

    # mem allocs
    JOIN_LOAD_FACTOR = 2
    BUFFER_GB = 2
    BYTES_PER_ENTRY = 4  # this depends on the dtype
    chunk_mem_gb = BUFFER_GB + np.ceil(
        BYTES_PER_ENTRY * len(gem_groups) * MAXIMUM_BARCODES**2 /
        1024**3).astype('int32')
    join_mem_gb = BUFFER_GB + np.ceil(
        JOIN_LOAD_FACTOR * BYTES_PER_ENTRY * len(gem_groups) *
        MAXIMUM_BARCODES**2 / 1024**3).astype('int32')

    valid_barcodes_path = martian.make_path("valid_barcodes.txt")
    with open(valid_barcodes_path, 'w') as f:
        f.write(",".join(valid_barcodes))

    chunks = []
    for contig in all_contigs:
        chunks.append({
            "contig": contig,
            "valid_barcodes": valid_barcodes_path,
            "__mem_gb": chunk_mem_gb,
        })

    return {"chunks": chunks, "join": {"__mem_gb": join_mem_gb}}
Example #17
def main(args, outs):
    """Downsample each fragments file to produce a sorted file, while computing the pre and post complexity metrics"""
    with open(args.library_info, 'r') as f:
        library_info = pickle.load(f)[args.n]

    # read cells
    cell_barcodes = get_cell_barcodes(library_info['cells'],
                                      args.reference_path)

    # get chrom key from fasta index
    chrom_order = {}
    ctg_mgr = ReferenceManager(args.reference_path)
    with open(ctg_mgr.fasta_index, 'r') as f:
        for en, line in enumerate(f):
            chrom = line.split('\t')[0]
            chrom_order[chrom] = en

    downsampling_metrics = subsample_fragments(
        infile=library_info['fragments'],
        rate=library_info['rate'],
        outfile=outs.fragments,
        group=args.n,
        cells=cell_barcodes,
        kind=library_info['kind'],
        key=chrom_order)

    with open(outs.normalization_metrics, 'w') as f:
        json.dump(downsampling_metrics, f, indent=4)
Example #18
def split(args):
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    if args.peaks is None:
        martian.throw("peaks BED file expected")
    if args.cell_barcodes is None:
        martian.throw("cell barcodes CSV file expected")

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    chunks = []
    for contig in all_contigs:
        chunks.append({'contig': contig, '__mem_gb': 4})

    return {'chunks': chunks, 'join': {'__mem_gb': 8}}
Example #19
def main(args, outs):
    args.coerce_strings()
    outs.coerce_strings()

    with open(args.barcodes, 'r') as barcode_file:
        barcodes_dict = OrderedDict(
            (bc.strip('\n'), num) for num, bc in enumerate(barcode_file))
    outs.insert_summary = None

    if args.fragments is None or len(barcodes_dict) == 0:
        outs.insert_sizes = None
        outs.total = None
        return

    ref_contig_manager = ReferenceManager(args.reference_path)

    # iterate over fragments and count fragment sizes for each barcode
    insert_sizes = {bc: Counter() for bc in barcodes_dict.iterkeys()}
    primary_contigs = set(
        ref_contig_manager.primary_contigs(allow_sex_chromosomes=True))
    for contig, start, stop, barcode, _ in open_fragment_file(args.fragments):
        if barcode not in barcodes_dict:
            continue
        if args.exclude_non_nuclear and contig not in primary_contigs:
            continue
        size = stop - start
        insert_sizes[barcode][
            str(size) if size <= MAX_INSERT_SIZE else GT_MAX_INSERT_SIZE] += 1

    # compute total and write out csv
    total = np.zeros(MAX_INSERT_SIZE)
    with open(outs.insert_sizes, 'w') as outfile:
        outfile.write(','.join(['Barcode'] +
                               [str(n)
                                for n in range(1, MAX_INSERT_SIZE + 1)] +
                               ['>{}'.format(MAX_INSERT_SIZE)]) + '\n')
        for barcode in insert_sizes:
            outfile.write(','.join([barcode] + [
                str(insert_sizes[barcode][str(n)])
                for n in range(1, MAX_INSERT_SIZE + 1)
            ] + [str(insert_sizes[barcode][GT_MAX_INSERT_SIZE])]) + '\n')
            for n in range(1, MAX_INSERT_SIZE + 1):
                total[n - 1] += insert_sizes[barcode][str(n)]

    # write out totals for reduce in join
    np.savetxt(outs.total, total, delimiter=',')
Example #20
def get_barcode_gc(ref_f, peaks_f, matrix):
    """Get mean GC% of peaks in a barcode"""
    ref_mgr = ReferenceManager(ref_f)
    genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0])
    peak_GC = np.array([get_peak_GC_counts(peak, genome_fa, counts=False)
                        for peak in peak_reader(peaks_f)])
    barcode_GC = ((peak_GC * matrix.m) / np.array(matrix.m.sum(axis=0))).squeeze()
    return barcode_GC
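
The expression above is a weighted column average: each peak's GC fraction is weighted by that peak's counts in the barcode, then divided by the barcode's total counts. A dense numpy equivalent with toy numbers (the stage itself does this against a sparse matrix):

import numpy as np

peak_gc = np.array([0.40, 0.60, 0.55])  # GC fraction per peak
counts = np.array([[2, 0],              # peaks x barcodes
                   [1, 4],
                   [1, 0]], dtype=float)

barcode_gc = peak_gc.dot(counts) / counts.sum(axis=0)
print(barcode_gc)  # column 0: 0.4875, column 1: 0.6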
Example #21
def split(args):
    """Compute base background in split and use it in each chunk
    """

    n_peaks = utils.quick_line_count(args.peaks) if args.peaks else 0
    ref_mgr = ReferenceManager(args.reference_path)
    if len(ref_mgr.list_species()) > 1 or n_peaks == 0 or ref_mgr.tss_track is None:
        chunk_def = [{'skip': True}]
        return {'chunks': chunk_def}

    # split the peaks into at most 20 chunks of contiguous rows
    mem_in_gb = 4.0
    chunk_def = [{'__mem_gb': mem_in_gb,
                  'skip': False,
                  'chunk_start': chunk[0],
                  'chunk_end': chunk[1]} for chunk in utils.get_chunks(n_peaks, chunks=20)]
    return {'chunks': chunk_def}
Example #22
def count_bases_in_peaks(reference_path, peaks_file):
    """Count the total number of bases in peak regions (0-indexed)"""
    bases_in_peaks = 0
    ctg_mgr = ReferenceManager(reference_path)
    genome_fa = pyfasta.Fasta(ctg_mgr.fasta, key_fn=lambda x: x.split()[0])
    for peak in peak_reader(peaks_file):
        bases_in_peaks += len(genome_fa[peak.chrom][peak.start:peak.end])
    return bases_in_peaks
Example #23
def main(args, outs):
    """Run this for each method x clustering key combination from split"""
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        return

    # Load the peak-BC matrix and a clustering and perform DE
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_peak_bc_matrix)
    clustering_h5 = args.clustering_summary['h5'][args.method]
    clustering = SingleGenomeAnalysis.load_clustering_from_h5(clustering_h5, args.clustering_key)
    mask = clustering.clusters == args.cluster
    clustering.clusters[mask] = 1
    clustering.clusters[np.logical_not(mask)] = 2

    # find depth using peak matrix and normalize
    scale = np.array(peak_matrix.m.sum(axis=0)).squeeze()
    depth = (scale + 1) / np.median(scale)

    cov_peak = [np.log(depth)]
    diffexp_peak = nb2_diffexp.run_differential_expression(peak_matrix.m, clustering.clusters, model='poisson',
                                                           impute_rest=True, test_params={'cov': cov_peak}, verbose=True)

    # find empirical estimates of alpha
    tf_matrix = None
    diffexp_tf = None
    # do DE on tf-BC matrix
    if args.filtered_tf_bc_matrix is not None:
        tf_matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_tf_bc_matrix)
        ntfmatrix = normalize_matrix(tf_matrix.m, scale)
        alpha_tf = nb2_diffexp.empirical_dispersion(ntfmatrix)
        barcode_GC = get_barcode_gc(args.reference_path, args.peaks, peak_matrix)
        cov_tf = [barcode_GC, np.log(depth)]
        diffexp_tf = nb2_diffexp.run_differential_expression(tf_matrix.m, clustering.clusters, model='nb', impute_rest=True,
                                                             test_params={'cov': cov_tf, 'alpha': alpha_tf}, verbose=True)

    # vstack
    diffexp = diffexp_peak if tf_matrix is None else cr_diffexp.DIFFERENTIAL_EXPRESSION(np.vstack([diffexp_peak.data, diffexp_tf.data]))

    # write out temp file
    np.savetxt(outs.tmp_diffexp, diffexp.data, delimiter=',')
    outs.enrichment_analysis = None
    outs.enrichment_analysis_summary = None
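
The relabeling above reduces a k-cluster assignment to a two-group "cluster of interest vs rest" labeling before the differential test, so every cluster is tested one-vs-rest in its own chunk. A small numpy illustration with a toy label vector:

import numpy as np

clusters = np.array([1, 3, 2, 3, 1, 3])  # toy clustering labels
target = 3

relabeled = clusters.copy()
mask = relabeled == target
relabeled[mask] = 1                      # the cluster being tested
relabeled[np.logical_not(mask)] = 2      # everything else
print(relabeled)  # [2 1 2 1 2 1]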
Example #24
def split(args):
    ref_mgr = ReferenceManager(args.reference_path)
    if args.filtered_matrix is None or args.peak_motif_hits is None or len(ref_mgr.list_species()) > 1:
        return {'chunks': []}

    matrix_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5(
        args.filtered_matrix)
    npeaks, nbcs, nnz = cr_matrix.CountMatrix.load_dims_from_h5(
        args.filtered_matrix)
    # assume we will never test more than 1000 TFs and
    # the relative hit-rate of a TF is a generous 1 out of every 10 peaks
    MAX_TF_COUNT = 1000
    MAX_TF_PEAK_SPARSITY = 0.1
    BYTES_PER_INT = np.dtype(int).itemsize
    BYTES_PER_FLOAT = np.dtype(float).itemsize
    BYTES_PER_GB = 1024**3
    ENTRIES_PER_VAL = 3
    predicted_tf_peak_matrix_mem_gb = ENTRIES_PER_VAL * MAX_TF_PEAK_SPARSITY * npeaks * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_INT / BYTES_PER_GB
    predicted_tf_propZ_matrix_mem_gb = ENTRIES_PER_VAL * nbcs * MAX_TF_COUNT * BYTES_PER_FLOAT / BYTES_PER_GB
    chunk_mem_gb = int(
        np.ceil(
            max(
                matrix_mem_gb + predicted_tf_peak_matrix_mem_gb * 2 +
                predicted_tf_matrix_mem_gb * 2 +
                predicted_tf_propZ_matrix_mem_gb * 2,
                h5_constants.MIN_MEM_GB)))
    vmem_peak_motif_hits = int(
        np.ceil(predicted_tf_peak_matrix_mem_gb) * 3 +
        predicted_tf_peak_matrix_mem_gb)

    # HACK - give big jobs more threads in order to avoid overloading a node
    threads = cr_io.get_thread_request_from_mem_gb(chunk_mem_gb)

    return {
        'chunks': [],
        'join': {
            '__mem_gb': chunk_mem_gb,
            '__vmem_gb': chunk_mem_gb + vmem_peak_motif_hits + 1,
            '__threads': threads
        }
    }
Example #25
    def __init__(self, ref_path, bg=None):
        ref_manager = ReferenceManager(ref_path)
        self.all_motifs = []
        if ref_manager.motifs is not None:
            with open(ref_manager.motifs, "r") as infile:
                self.all_motifs = list(motifs.parse(infile, "jaspar"))

        # for large sequence header, only keep the text before the first space
        self.genome_seq = pyfasta.Fasta(ref_manager.fasta,
                                        key_fn=lambda x: x.split()[0])
        self.bg = bg
Example #26
def split(args):
    """We just align each chunk independently -- joining will happen in the join step of SORT_READS"""

    # Pull some reads from the fastq files -- warn if any are shorter than 25bp, since BWA-MEM will leave them unmapped
    fastq_tests = [x['read1'] for x in args.chunks]

    for fastq_test in fastq_tests:
        with open(fastq_test) as in_file:
            reader = tk_fasta.read_generator_fastq(in_file)
            for name, read, qual in itertools.islice(reader, 10):
                if len(read) < MIN_READ_LENGTH:
                    martian.alarm("BWA-MEM can't handle reads <25bp -- reads will be unmapped.")
                    continue

    # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB
    ctg_mgr = ReferenceManager(args.reference_path)
    base_mem_in_gb = int(math.ceil(2 * ctg_mgr.get_vmem_est()))

    mem_in_gb = base_mem_in_gb + 4
    chunks = [{'chunk': x, '__threads': args.num_threads, '__mem_gb': mem_in_gb} for x in args.chunks]
    return {'chunks': chunks}
Example #27
def annotate_peaks(peaks, ref_path):
    """
    peak to gene annotation strategy:
        1. if a peak overlaps with promoter region (-1kb, + 100) of any TSS, call it a promoter peak
        2. if a peak is within 200kb of the closest TSS, AND if it is not a promoter peak, call it a distal peak
        3. if a peak overlaps of a transcript, AND it is not a promoter nor a distal peak of the gene, call it a distal peak
            This step is optional
        4. call it an intergenic peak
    """

    ref_mgr = ReferenceManager(ref_path)
    tss = BedTool(ref_mgr.tss_track)

    # if tss.bed contains the 7th column (gene type), then apply filter. Otherwise use all tss sites
    if tss.field_count() == 7:
        tss_filtered = tss.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
    else:
        df_tss = tss.to_dataframe()
        df_tss['gene_type'] = '.'
        tss_filtered = BedTool.from_dataframe(df_tss).saveas()

    # including transcripts.bed is optional
    if ref_mgr.transcripts_track is None:
        transcripts_filtered = BedTool([])
    else:
        transcripts = BedTool(ref_mgr.transcripts_track)
        if transcripts.field_count() == 7:
            transcripts_filtered = transcripts.filter(lambda x: x[6] in TRANSCRIPT_ANNOTATION_GENE_TYPES).saveas()
        else:
            df_tx = transcripts.to_dataframe()
            df_tx['gene_type'] = '.'
            transcripts_filtered = BedTool.from_dataframe(df_tx).saveas()

    # run bedtools closest for peaks against filtered tss, group by peaks and summarize annotations from select columns
    peaks_nearby_tss = peaks.closest(tss_filtered, D='b', g=ref_mgr.fasta_index).groupby(g=[1, 2, 3], c=[7, 11], o=['collapse']).saveas()

    results = []
    peaks_nearby_tss_butno_tx = peaks_nearby_tss.intersect(transcripts_filtered, v=True).saveas()

    # avoid error when no peaks overlap with any transcripts
    if len(peaks_nearby_tss_butno_tx) < len(peaks_nearby_tss):
        peaks_nearby_tss_and_tx = peaks_nearby_tss \
            .intersect(transcripts_filtered, wa=True, wb=True) \
            .groupby(g=[1, 2, 3, 4, 5], c=[9], o=['distinct'])

        for peak in peaks_nearby_tss_and_tx:
            results.append(get_peak_nearby_genes(peak))

    for peak in peaks_nearby_tss_butno_tx:
        results.append(get_peak_nearby_genes(peak))

    return results
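
The rules in the docstring can be illustrated with a tiny stand-alone classifier for a single chromosome (strand and the optional transcript-overlap step are ignored; the thresholds come straight from the docstring). The real stage implements this with bedtools closest/intersect as above:

def classify_peak(peak_start, peak_end, tss_positions,
                  promoter_window=(-1000, 100), distal_max_dist=200000):
    """Toy peak annotation following the docstring rules; illustration only."""
    for tss in tss_positions:
        # promoter: peak overlaps [tss - 1kb, tss + 100]
        if peak_start <= tss + promoter_window[1] and peak_end >= tss + promoter_window[0]:
            return 'promoter'
    if any(min(abs(peak_start - tss), abs(peak_end - tss)) <= distal_max_dist
           for tss in tss_positions):
        return 'distal'
    return 'intergenic'

print(classify_peak(10500, 11000, [11200]))    # promoter
print(classify_peak(150000, 150500, [11200]))  # distal
print(classify_peak(900000, 900500, [11200]))  # intergenic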
Example #28
def split(args):
    """split into a chunk for each library in aggr csv, and define a unique gem group"""
    aggr_df = pd.read_csv(args.aggr_csv, sep=',')
    nchunks = len(aggr_df)

    ctg_mgr = ReferenceManager(args.reference_path)
    max_contig_len = max(ctg_mgr.get_contig_lengths().values())
    BYTES_PER_INT32_WITH_SAFETY = 5
    mem_gb = 2 * int(
        np.ceil(
            BYTES_PER_INT32_WITH_SAFETY * max_contig_len / 1024 / 1024 / 1024))

    return {
        'chunks': [{
            'n': group,
            '__mem_gb': mem_gb,
            '__vmem_gb': mem_gb + 6
        } for group in range(nchunks)],
        'join': {
            '__mem_gb': 12
        }
    }
Example #29
def join(args, outs, chunk_defs, chunk_outs):
    """Compute base background in each peak."""
    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0

    if len(ref_mgr.list_species()) > 1 or npeaks == 0 or ref_mgr.motifs is None:
        outs.GCdist = None
        return

    # get peak-GC distribution
    genome_fa = pyfasta.Fasta(ref_mgr.fasta, key_fn=lambda x: x.split()[0])
    GCdist = [
        utils.get_peak_GC_counts(peak, genome_fa, counts=False)
        for peak in peak_reader(args.peaks)
    ]

    # compute base background from peaks in bins
    # merge extreme GC bins with adjoining ones if they're too narrow for motif scanner to work correctly
    GCbounds = []
    nbins = NBINS
    for n, gc in enumerate(
            np.percentile(GCdist,
                          np.linspace(0, 100, nbins + 1, endpoint=True),
                          interpolation='lower')):
        if n == 0 or n == nbins:
            GCbounds += [gc]
            continue
        if gc >= LOW_GC and gc < HIGH_GC:
            GCbounds += [gc]
    GCbins = sorted(list(set(zip(GCbounds, GCbounds[1:]))))  # uniqify
    peaks = peak_reader(args.peaks)
    GCdict = get_GCbinned_peaks_and_bg(peaks, genome_fa, GCbins)

    # dump
    with open(outs.GCdict, 'w') as f:
        pickle.dump(GCdict, f)
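
The binning above uses GC-content percentiles as bin edges, drops interior edges that fall outside [LOW_GC, HIGH_GC), and pairs consecutive bounds into bins, so extreme GC ranges are absorbed into their neighbors. A toy numpy version with made-up thresholds:

import numpy as np

gc_values = np.random.RandomState(0).uniform(0.2, 0.8, size=1000)  # toy peak GC fractions
NBINS, LOW_GC, HIGH_GC = 5, 0.35, 0.65

bounds = []
for n, gc in enumerate(np.percentile(gc_values, np.linspace(0, 100, NBINS + 1, endpoint=True))):
    # always keep the outermost edges; drop interior edges outside [LOW_GC, HIGH_GC)
    if n == 0 or n == NBINS or (LOW_GC <= gc < HIGH_GC):
        bounds.append(gc)

bins = sorted(set(zip(bounds, bounds[1:])))
print(bins)  # the extreme GC ranges get merged into the neighboring bins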
Example #30
def split(args):
    if args.fragments is None:
        return {"chunks": [], "join": {}}

    with open(args.barcode_counts, "r") as infile:
        barcode_counts = Counter(json.load(infile))

    valid_barcodes = barcode_counts.keys()
    part_a_seqs, part_c_seqs, part_b_seqs, gem_group_seqs = query_barcode_subsequences(
        valid_barcodes)

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    chunks = []
    for gem_group in gem_group_seqs:
        for contig in all_contigs:
            chunks.append({
                "contig": contig,
                "gem_group": gem_group,
                "__mem_gb": 4,
            })

    return {"chunks": chunks, "join": {"__mem_gb": 16}}