Beispiel #1
0
def split(args):
    """Build one chunk definition per primary contig of the reference.

    When ``args.fragments`` is None there is nothing to process, so an empty
    chunk list and an empty join definition are returned. Otherwise the
    contigs are requested with ``allow_sex_chromosomes=True`` and every chunk,
    as well as the join step, carries a ``__mem_gb`` value of 5.
    """
    no_input = args.fragments is None
    if no_input:
        return {"chunks": [], "join": {}}

    reference = ReferenceManager(args.reference_path)
    chunk_defs = [
        {"contig": name, "__mem_gb": 5}
        for name in reference.primary_contigs(allow_sex_chromosomes=True)
    ]
    return {"chunks": chunk_defs, "join": {"__mem_gb": 5}}
def split(args):
    """Return the chunk/join plan: one chunk per primary contig.

    The plan starts out empty (no chunks, empty join definition) and is only
    filled in when ``args.fragments`` is provided. In that case each primary
    contig (requested with ``allow_sex_chromosomes=True``) becomes a chunk
    with ``__mem_gb`` set to 5, and the join step gets the same value.
    """
    plan = {'chunks': [], 'join': {}}
    if args.fragments is not None:
        contig_names = ReferenceManager(args.reference_path).primary_contigs(
            allow_sex_chromosomes=True)
        plan['chunks'] = [{'contig': name, '__mem_gb': 5}
                          for name in contig_names]
        plan['join'] = {'__mem_gb': 5}
    return plan
Beispiel #3
0
def split(args):
    """Build one chunk per primary contig with a length-scaled memory value.

    Returns an empty plan (no chunks, empty join definition) when
    ``args.fragments`` is None. Otherwise each chunk's ``__mem_gb`` is
    ``ceil(BYTES_PER_INT32_WITH_SAFETY * contig_length / 1024**3)`` and the
    join step gets a fixed ``__mem_gb`` of 5.
    """
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)
    contig_len = ctg_mgr.get_contig_lengths()
    BYTES_PER_INT32_WITH_SAFETY = 5
    # Float divisor on purpose: with integer operands, Python 2 floor division
    # would truncate the quotient *before* np.ceil, yielding 0 GB for every
    # contig shorter than 1024**3 / 5 bases. Dividing by a power of two is
    # exact in floating point, so results are unchanged where true division
    # was already in effect.
    BYTES_PER_GB = 1024.0 ** 3

    chunks = [
        {'contig': contig,
         '__mem_gb': int(np.ceil(
             BYTES_PER_INT32_WITH_SAFETY * contig_len[contig] / BYTES_PER_GB))}
        for contig in all_contigs
    ]

    return {'chunks': chunks, 'join': {'__mem_gb': 5}}
def split(args):
    """Pick the candidate barcodes per gem group and plan one chunk per contig.

    Returns an empty plan (no chunks, empty join definition) when
    ``args.fragments`` is None. Otherwise:

    * barcode fragment counts are loaded from the JSON file at
      ``args.barcode_counts``;
    * within each gem group, barcodes with more than ``MINIMUM_COUNTS``
      fragments are ranked by fragment count and at most ``MAXIMUM_BARCODES``
      of the highest-count ones are kept;
    * the kept barcodes are written comma-separated to ``valid_barcodes.txt``
      (path obtained from ``martian.make_path``);
    * every primary contig becomes a chunk referencing that file.

    The ``__mem_gb`` values are plain Python ints.
    """
    if args.fragments is None:
        return {"chunks": [], "join": {}}

    ctg_mgr = ReferenceManager(args.reference_path)
    all_contigs = ctg_mgr.primary_contigs(allow_sex_chromosomes=True)

    with open(args.barcode_counts, "r") as infile:
        barcode_counts = Counter(json.load(infile))
    # The three arrays below are index-aligned: position i describes one barcode.
    barcode_array = np.array(list(barcode_counts))
    gem_group_array = np.array(
        [get_barcode_gem_group(bc) for bc in barcode_counts])
    gem_groups = set(gem_group_array)
    frag_count_array = np.array([barcode_counts[bc] for bc in barcode_array])

    valid_barcodes = []
    for gem_group in gem_groups:
        count_mask = ((frag_count_array > MINIMUM_COUNTS) &
                      (gem_group_array == gem_group))
        group_barcodes = barcode_array[count_mask]
        group_counts = frag_count_array[count_mask]
        # Find at most top N barcodes. The ranking must be done on the
        # fragment counts: argsort on the barcode strings would pick the
        # lexicographically last barcodes instead of the most abundant ones.
        # argsort is ascending, so the tail holds the highest counts; a slice
        # longer than the array simply returns everything.
        top_n_indices = group_counts.argsort()[-MAXIMUM_BARCODES:]
        valid_barcodes.extend(list(group_barcodes[top_n_indices]))

    # mem allocs
    JOIN_LOAD_FACTOR = 2
    BUFFER_GB = 2
    BYTES_PER_ENTRY = 4  # this depends on the dtype
    # Size of MAXIMUM_BARCODES**2 entries per gem group, in GB (presumably a
    # barcode-by-barcode table built downstream -- confirm against the chunk
    # code). The float divisor keeps Python 2 from floor-dividing before the
    # ceil; int() turns the numpy scalar into a plain int so the returned
    # definition stays JSON-serialisable.
    payload_gb = (BYTES_PER_ENTRY * len(gem_groups) * MAXIMUM_BARCODES**2 /
                  1024.0**3)
    chunk_mem_gb = BUFFER_GB + int(np.ceil(payload_gb))
    join_mem_gb = BUFFER_GB + int(np.ceil(JOIN_LOAD_FACTOR * payload_gb))

    valid_barcodes_path = martian.make_path("valid_barcodes.txt")
    with open(valid_barcodes_path, 'w') as f:
        f.write(",".join(valid_barcodes))

    chunks = [
        {
            "contig": contig,
            "valid_barcodes": valid_barcodes_path,
            "__mem_gb": chunk_mem_gb,
        }
        for contig in all_contigs
    ]

    return {"chunks": chunks, "join": {"__mem_gb": join_mem_gb}}
Beispiel #5
0
def split(args):
    """Validate the required inputs and plan one chunk per primary contig.

    With ``args.fragments`` set to None an empty plan is returned right away.
    Otherwise ``args.peaks`` and ``args.cell_barcodes`` must both be present;
    a missing one is reported through ``martian.throw``. Each chunk gets a
    ``__mem_gb`` of 4 and the join step a ``__mem_gb`` of 8.
    """
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    # Inputs that are mandatory once a fragments file is supplied.
    peaks_missing = args.peaks is None
    if peaks_missing:
        martian.throw("peaks BED file expected")
    barcodes_missing = args.cell_barcodes is None
    if barcodes_missing:
        martian.throw("cell barcodes CSV file expected")

    reference = ReferenceManager(args.reference_path)
    per_contig = [
        {'contig': name, '__mem_gb': 4}
        for name in reference.primary_contigs(allow_sex_chromosomes=True)
    ]
    return {'chunks': per_contig, 'join': {'__mem_gb': 8}}
Beispiel #6
0
def main(args, outs):
    """Count fragment sizes per barcode and write them out as CSV.

    Reads one barcode per line from ``args.barcodes``. If ``args.fragments``
    is None or the barcode file is empty, ``outs.insert_sizes`` and
    ``outs.total`` are set to None and nothing is written. Otherwise:

    * every fragment from ``args.fragments`` whose barcode is in the list is
      binned by ``stop - start``; sizes above ``MAX_INSERT_SIZE`` go into the
      ``GT_MAX_INSERT_SIZE`` bin;
    * when ``args.exclude_non_nuclear`` is truthy, fragments on contigs that
      are not primary contigs are skipped;
    * ``outs.insert_sizes`` receives one CSV row per barcode (in barcode-file
      order) with columns for sizes 1..MAX_INSERT_SIZE plus the overflow bin;
    * ``outs.total`` receives the per-size totals over all barcodes, written
      with ``np.savetxt``.

    ``outs.insert_summary`` is always set to None.
    """
    args.coerce_strings()
    outs.coerce_strings()

    with open(args.barcodes, 'r') as barcode_file:
        barcodes_dict = OrderedDict(
            (bc.strip('\n'), num) for num, bc in enumerate(barcode_file))
    outs.insert_summary = None

    if args.fragments is None or len(barcodes_dict) == 0:
        outs.insert_sizes = None
        outs.total = None
        return

    ref_contig_manager = ReferenceManager(args.reference_path)

    # iterate over fragments and count fragment sizes for each barcode;
    # an OrderedDict keeps the output rows in barcode-file order
    insert_sizes = OrderedDict((bc, Counter()) for bc in barcodes_dict)
    primary_contigs = set(
        ref_contig_manager.primary_contigs(allow_sex_chromosomes=True))
    for contig, start, stop, barcode, _ in open_fragment_file(args.fragments):
        if barcode not in barcodes_dict:
            continue
        if args.exclude_non_nuclear and contig not in primary_contigs:
            continue
        size = stop - start
        insert_sizes[barcode][
            str(size) if size <= MAX_INSERT_SIZE else GT_MAX_INSERT_SIZE] += 1

    # compute total and write out csv
    size_keys = [str(n) for n in range(1, MAX_INSERT_SIZE + 1)]
    total = np.zeros(MAX_INSERT_SIZE)
    with open(outs.insert_sizes, 'w') as outfile:
        outfile.write(','.join(['Barcode'] + size_keys +
                               ['>{}'.format(MAX_INSERT_SIZE)]) + '\n')
        for barcode in insert_sizes:
            counts = insert_sizes[barcode]
            # Counter returns 0 for sizes that were never observed.
            row = [counts[key] for key in size_keys]
            outfile.write(','.join([barcode] + [str(c) for c in row] +
                                   [str(counts[GT_MAX_INSERT_SIZE])]) + '\n')
            # Accumulate over the same 1..MAX_INSERT_SIZE range as the CSV
            # columns instead of a hard-coded 1000.
            total += row

    # write out totals for reduce in join
    np.savetxt(outs.total, total, delimiter=',')
def split(args):
    """Plan one chunk for every (gem group, primary contig) pair.

    With ``args.fragments`` set to None an empty plan is returned. Otherwise
    the barcodes are the keys of the JSON counts file at
    ``args.barcode_counts``; ``query_barcode_subsequences`` is called on them
    and only the fourth element of its result (the gem group sequences) is
    used. Each chunk carries ``__mem_gb`` 4 and the join step ``__mem_gb`` 16.
    """
    if args.fragments is None:
        return {"chunks": [], "join": {}}

    with open(args.barcode_counts, "r") as handle:
        counts_by_barcode = Counter(json.load(handle))

    # The first three components of the result are not needed here.
    _, _, _, gem_group_seqs = query_barcode_subsequences(
        counts_by_barcode.keys())

    reference = ReferenceManager(args.reference_path)
    contig_names = reference.primary_contigs(allow_sex_chromosomes=True)

    # Gem group is the outer loop, contig the inner one.
    chunk_defs = [
        {"contig": name, "gem_group": group, "__mem_gb": 4}
        for group in gem_group_seqs
        for name in contig_names
    ]
    return {"chunks": chunk_defs, "join": {"__mem_gb": 16}}
Beispiel #8
0
def join(args, outs, chunk_defs, chunk_outs):
    """Run the preflight checks on the inputs and stop on the first failure.

    Checks, in order: sample id, ``force_cells``, the reference (structure and
    format), the peaks file, the optional parameters file, the fragments file,
    the fragments index, the optional aggregation CSV, the optional cell
    barcodes file and, when ``args.check_executables`` is set, the open file
    handle limit. Failures detected here are reported via ``martian.exit``;
    the ``check_*`` helpers report their own failures. ``outs``,
    ``chunk_defs`` and ``chunk_outs`` are not used.
    """
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells, ulimit=10000000)  # allow arbitrarily large limit for reanalyzer

    # # Reference
    # ref directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # formatting
    check_reference_format(args.reference_path)
    contig_manager = ReferenceManager(args.reference_path)

    # peaks format check and nonoverlapping
    if args.peaks is None:
        martian.exit("peaks file not provided")
    exists_and_readable(args.peaks, "peaks")
    bed_format_checker(args.peaks, contig_manager.fasta_index)
    contain_three_columns(args.peaks)
    if is_overlapping(args.peaks):
        martian.exit("{} contains overlapping peak regions".format(args.peaks))

    # check parameters files
    if args.parameters is not None:
        if not os.path.exists(args.parameters):
            martian.exit("{} does not exist".format(args.parameters))

    # fragments checks
    whitelist_barcodes = load_barcode_whitelist(args.barcode_whitelist)
    species_list = contig_manager.list_species()
    observed_gem_groups = set()
    observed_species = set()
    if args.fragments is None:
        martian.exit("fragments file not provided")
    exists_and_readable(args.fragments, "fragments")
    contig_lens = contig_manager.get_contig_lengths()
    # check bounds and matching contigs in reference and species
    for chrom, start, stop, bc, _ in open_fragment_file(args.fragments):
        # the text before the first "_" is taken as the species prefix;
        # contigs without "_" are recorded as the empty species
        spec = chrom.split("_")
        observed_species.add(spec[0] if spec[0] != chrom else "")
        barcode, gem_group = bc.split("-")
        observed_gem_groups.add(gem_group)
        if args.check_executables:  # run this only non-locally
            if barcode not in whitelist_barcodes:
                martian.exit("{} is not a valid whitelist barcode".format(barcode))
            if chrom not in contig_lens:
                martian.exit("contig {} not present in reference".format(chrom))
            if stop > contig_lens[chrom]:
                martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)".format(chrom, start, stop, contig_lens[chrom]))
    # ensure fragments are on the correct reference
    for species in observed_species:
        if species not in species_list:
            martian.exit("{} contains fragments mapped to species not recognized in the reference".format(args.fragments))
    if len(observed_gem_groups) > 1:
        martian.log_info("multiple gem groups present in {}, likely generated in a previous aggregation run".format(args.fragments))

    # fragments index is synced with fragments
    if args.fragments_index is None:
        martian.exit("fragments index file not provided")
    if not os.path.exists(args.fragments_index):
        martian.exit("{} does not exist".format(args.fragments_index))
    try:
        all_contigs = contig_manager.primary_contigs(allow_sex_chromosomes=True)
        for contig in all_contigs:
            # read only the first FRAGMENTS_SCAN_SIZE records per contig; the
            # records themselves are ignored, only a parse failure matters
            en = 0
            for _record in parsed_fragments_from_contig(contig, args.fragments, index=args.fragments_index):
                if en >= FRAGMENTS_SCAN_SIZE:
                    break
                en += 1
    except Exception:
        # Exception rather than a bare except, so SystemExit and
        # KeyboardInterrupt are not misreported as an out-of-sync index.
        martian.exit("fragments index is not in sync with the fragments file")

    # aggr csv checks
    if args.aggregation_csv is not None:
        check_aggr_csv(args.aggregation_csv, args.reference_path, cursory=True)

    # cell barcode checks
    if args.cell_barcodes is not None:
        if not os.path.exists(args.cell_barcodes):
            martian.exit("{} does not exist".format(args.cell_barcodes))
        check_singlecell_format(args.cell_barcodes, species_list, whitelist_barcodes)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())