Example 1
def join(args, outs, chunk_defs, chunk_outs):
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells, ulimit=10000000)  # allow arbitrarily large limit for reanalyzer

    # Reference
    # ref directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # formatting
    check_reference_format(args.reference_path)
    contig_manager = ReferenceManager(args.reference_path)

    # peaks format check and nonoverlapping
    if args.peaks is None:
        martian.exit("peaks file not provided")
    exists_and_readable(args.peaks, "peaks")
    bed_format_checker(args.peaks, contig_manager.fasta_index)
    contain_three_columns(args.peaks)
    if is_overlapping(args.peaks):
        martian.exit("{} contains overlapping peak regions".format(args.peaks))

    # check parameters file
    if args.parameters is not None:
        if not os.path.exists(args.parameters):
            martian.exit("{} does not exist".format(args.parameters))

    # fragments checks
    whitelist_barcodes = load_barcode_whitelist(args.barcode_whitelist)
    species_list = contig_manager.list_species()
    observed_gem_groups = set()
    observed_species = set()
    if args.fragments is None:
        martian.exit("fragments file not provided")
    exists_and_readable(args.fragments, "fragments")
    contig_lens = contig_manager.get_contig_lengths()
    # check bounds and matching contigs in reference and species
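    # each fragment record is (chrom, start, stop, barcode, duplicate count); the duplicate-count field is unused here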
    for chrom, start, stop, bc, _ in open_fragment_file(args.fragments):
        spec = chrom.split("_")
        observed_species.add(spec[0] if spec[0] != chrom else "")
        barcode, gem_group = bc.split("-")
        observed_gem_groups.add(gem_group)
        if args.check_executables:  # only run these checks when not running locally
            if barcode not in whitelist_barcodes:
                martian.exit("{} is not a valid whitelist barcode".format(barcode))
            if chrom not in contig_lens:
                martian.exit("contig {} not present in reference".format(chrom))
            if stop > contig_lens[chrom]:
                martian.exit("fragment {}:{}-{} boundaries exceed contig size ({} bp)".format(chrom, start, stop, contig_lens[chrom]))
    # ensure fragments are on the correct reference
    for species in observed_species:
        if species not in species_list:
            martian.exit("{} contains fragments mapped to species not recognized in the reference".format(args.fragments))
    if len(observed_gem_groups) > 1:
        martian.log_info("multiple gem groups present in {}, likely generated in a previous aggregation run".format(args.fragments))

    # fragments index is synced with fragments
    if args.fragments_index is None:
        martian.exit("fragments index file not provided")
    if not os.path.exists(args.fragments_index):
        martian.exit("{} does not exist".format(args.fragments_index))
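    # Spot-check up to FRAGMENTS_SCAN_SIZE fragments on each primary contig; any failure
    # during indexed lookup means the index does not match the fragments file.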
    try:
        all_contigs = contig_manager.primary_contigs(allow_sex_chromosomes=True)
        for contig in all_contigs:
            en = 0
            for chrom, start, end, bc, dups in parsed_fragments_from_contig(contig, args.fragments, index=args.fragments_index):
                if en >= FRAGMENTS_SCAN_SIZE:
                    break
                en += 1
    except Exception:
        martian.exit("fragments index is not in sync with the fragments file")

    # aggr csv checks
    if args.aggregation_csv is not None:
        check_aggr_csv(args.aggregation_csv, args.reference_path, cursory=True)

    # cell barcode checks
    if args.cell_barcodes is not None:
        if not os.path.exists(args.cell_barcodes):
            martian.exit("{} does not exist".format(args.cell_barcodes))
        check_singlecell_format(args.cell_barcodes, species_list, whitelist_barcodes)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
def main(args, outs):
    """Mark exact duplicate reads in the output BAM file while also writing out some summary statistics.
    PCR duplicates have the same read1 start site and read2 start site.
    """
    args.coerce_strings()
    outs.coerce_strings()

    # Chunk output doesn't get indexed
    outs.fragments_index = None
    outs.index = None

    # Pull in prior likelihoods for barcodes
    raw_barcode_abundance = None
    barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
    if args.raw_barcode_counts is not None and barcode_whitelist is not None:
        with open(args.raw_barcode_counts, 'r') as infile:
            raw_counts = json.load(infile)
        raw_barcode_abundance = {
            '{}-{}'.format(barcode, gem_group): count
            for gem_group, subdict in raw_counts.iteritems()
            for barcode, count in zip(barcode_whitelist, subdict['bc_counts'])
        }

    bam_in = create_bam_infile(args.input)
    bam_refs = bam_in.references

    bam_prefix, ext = os.path.splitext(outs.output)
    raw_bam_file = martian.make_path(bam_prefix + '_five_prime_pos_sorted' +
                                     ext)

    frag_prefix, ext = os.path.splitext(outs.fragments)
    raw_frag_file = martian.make_path(frag_prefix + '_raw' + ext)

    # only write CO line for one chunk, so we don't have duplicates after samtools merge
    if args.chunk_num == 0:
        COs = [
            '10x_bam_to_fastq:R1(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:R2(SEQ:QUAL,TR:TQ)',
            '10x_bam_to_fastq:I1(BC:QT)', '10x_bam_to_fastq:I2(CR:CY)',
            '10x_bam_to_fastq_seqnames:R1,R3,I1,R2'
        ]
    else:
        COs = None

    bam_out, _ = tk_bam.create_bam_outfile(
        raw_bam_file,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "mark_duplicates", TENX_PRODUCT_NAME)
        ],
        cos=COs)
    fragments_out = open(raw_frag_file, 'w')
    bam_in.reset()

    # Ensure the summary key indicates what kind of dup marking was actually performed.
    lane_coord_sys = tk_lane.LaneCoordinateSystem.from_dict(args.lane_map)
    reference_manager = ReferenceManager(args.reference_path)
    summarizer = DupSummary(split_bcs=False,
                            lane_coordinate_system=lane_coord_sys,
                            output_bam=bam_out,
                            output_tsv=fragments_out,
                            ref=reference_manager,
                            bam_refs=bam_refs,
                            priors=raw_barcode_abundance)

    # Now broadcast the selected reads to the summarizers
    consumers = [summarizer.read_consumer()]
    source = tk_bam.read_bam_chunk(bam_in, (args.chunk_start, args.chunk_end))
    broadcast(source, consumers)

    # Close outfiles
    bam_out.close()
    fragments_out.close()

    # Feed the chunk barcode_counts data back to join()
    with open(outs.singlecell_mapping, 'w') as outfile:
        pickle.dump(summarizer.bc_counts, outfile)

    # Sort the output bam & tsv files
    sort_bam(raw_bam_file,
             outs.output,
             threads=martian.get_threads_allocation())
    sort_bed(raw_frag_file,
             outs.fragments,
             genome=reference_manager.fasta_index,
             threads=martian.get_threads_allocation(),
             leave_key=True)
Example 3
def generate_cellcalling_fragment_counts_plot(singlecell_df,
                                              cell_parameters,
                                              barcode_whitelist,
                                              excluded_barcodes=None,
                                              species=""):
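    """Build a plotly-style figure dict of the per-barcode fragment count histogram,
    split into cell and non-cell barcodes, overlaid with the fitted noise, signal,
    and joint curves taken from cell_parameters."""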
    if species:
        fragment_counts = np.array(
            singlecell_df['passed_filters_{}'.format(species)].values)
    else:
        fragment_counts = np.array(
            singlecell_df['peak_region_fragments'].values)

    if excluded_barcodes is None:
        valid_barcode_mask = singlecell_df['barcode'] != NO_BARCODE
    else:
        barcodes = singlecell_df['barcode'].values
        valid_barcode_mask = np.array(
            [(bc not in excluded_barcodes[species]) and (bc != NO_BARCODE)
             for bc in barcodes],
            dtype=bool)

    threshold = cell_parameters[species]['cell_threshold']

    cell_mask = (fragment_counts >= threshold) & valid_barcode_mask
    noncell_mask = (fragment_counts < threshold) & valid_barcode_mask

    logbinmax = int(np.ceil(np.log10(fragment_counts.max())))
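    # Bin edges: unit-width bins up to 100 fragments, then 350 log-spaced edges out to the
    # maximum count (rounded up to a power of ten).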
    xbins = list(
        np.hstack([np.arange(100),
                   np.logspace(np.log10(100), logbinmax, 350)]))

    data_subplots = []
    for name, mask in zip(["Non-cells", "{} Cells".format(species)],
                          [noncell_mask, cell_mask]):
        if mask.sum() > 0:
            counts, _ = np.histogram(fragment_counts[mask], xbins)
            data_subplots.append({
                "name": name,
                "x": xbins,
                "y": list(counts),
                "type": "scatter",
                "connectgaps": True,
                "fill": "tozeroy",
            })

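    # Shift counts by the expected background from whitelist contamination (at least
    # MINIMUM_COUNT); only barcodes at or above this shift are counted when scaling the fitted curves.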
    whitelist_length = len(load_barcode_whitelist(barcode_whitelist))
    fragment_depth = sum(singlecell_df['passed_filters'].values)
    count_shift = max(
        MINIMUM_COUNT,
        int(fragment_depth * WHITELIST_CONTAM_RATE / whitelist_length))

    def get_fitted_counts(barcode_total, bins, species, parameters):
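        # Expected per-bin counts under the fitted negative binomial mixture.  scipy's
        # nbinom(n, p) is used with n = 1/dispersion and p = 1/(1 + dispersion * mean),
        # which gives mean `mean` and variance mean + dispersion * mean**2.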
        max_count = max(bins)
        count_values = np.arange(max_count + 1)

        frac_noise = parameters[species]['fraction_noise']
        mean_noise = parameters[species]['noise_mean']
        mean_signal = parameters[species]['signal_mean']
        dispersion_noise = parameters[species]['noise_dispersion']
        dispersion_signal = parameters[species]['signal_dispersion']

        estimated_noise_counts = stats.nbinom.pmf(
            count_values, 1 / dispersion_noise,
            1 / (1 + dispersion_noise * mean_noise))
        estimated_signal_counts = stats.nbinom.pmf(
            count_values, 1 / dispersion_signal,
            1 / (1 + dispersion_signal * mean_signal))
        estimated_noise_counts *= frac_noise * barcode_total
        estimated_signal_counts *= (1 - frac_noise) * barcode_total
        noise_bin_counts = np.array([
            estimated_noise_counts[(count_values >= lower)
                                   & (count_values < upper)].sum()
            for lower, upper in zip(bins[:-1], bins[1:])
        ])
        signal_bin_counts = np.array([
            estimated_signal_counts[(count_values >= lower)
                                    & (count_values < upper)].sum()
            for lower, upper in zip(bins[:-1], bins[1:])
        ])
        noise_bin_counts[noise_bin_counts < 1.0] = 0.0
        signal_bin_counts[signal_bin_counts < 1.0] = 0.0
        return bins[:-1], noise_bin_counts, signal_bin_counts

    xvals, noise, signal = get_fitted_counts(
        (fragment_counts >= count_shift).sum(), xbins, species,
        cell_parameters)
    data_subplots.append({
        "name": "Noise fit",
        "x": list(xvals),
        "y": list(noise),
        "type": "scatter",
        "mode": "lines",
        "line": {
            "color": "grey",
            "width": 1
        },
    })
    data_subplots.append({
        "name": "Signal fit",
        "x": list(xvals),
        "y": list(signal),
        "type": "scatter",
        "mode": "lines",
        "line": {
            "color": "black",
            "width": 1
        },
    })
    data_subplots.append({
        "name": "Joint fit",
        "x": list(xvals),
        "y": list(signal + noise),
        "type": "scatter",
        "mode": "lines",
        "line": {
            "color": "red",
            "width": 1
        },
    })

    return {
        "layout": {
            "xaxis": {
                "type": "log",
                "title": "{} Fragments Per Barcode".format(species),
            },
            "yaxis": {
                "type": "log",
                "title": "Barcodes",
            },
            "title": "{} Fragment Distribution".format(species),
        },
        "data": data_subplots,
    }
def join(args, outs, chunk_defs, chunk_outs):
    outs.coerce_strings()

    # Merge the output bam files with duplicates marked
    hierarchical_merge_bam([c.output for c in chunk_outs],
                           outs.output,
                           tag=None,
                           threads=martian.get_threads_allocation())
    outs.index = index_bam(outs.output, martian.get_threads_allocation())

    # Merge the barcode counts from each chunk and write out our singlecell_mapping file
    barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist,
                                               ordered=True)
    sorted_barcodes = []
    if args.raw_barcode_counts is not None:
        with open(args.raw_barcode_counts, 'r') as infile:
            raw_counts = json.load(infile)
        sorted_barcodes = [
            '{}-{}'.format(barcode, gem_group) for gem_group in raw_counts
            for barcode in sorted(barcode_whitelist)
        ]
    barcode_counts = {}
    for chunk in chunk_outs:
        with open(chunk.singlecell_mapping, 'r') as infile:
            chunk_counts = pickle.load(infile)
        for barcode, count_dict in chunk_counts.iteritems():
            if barcode not in barcode_counts:
                barcode_counts[barcode] = Counter()
            barcode_counts[barcode] += Counter(count_dict)

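    # Write the merged per-barcode counts as a CSV, with the no-barcode row first.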
    with open(outs.singlecell_mapping, 'w') as outfile:
        outfile.write("barcode,")
        outfile.write(",".join(SINGLE_CELL_KEYS))
        outfile.write("\n")
        if None in barcode_counts:
            outfile.write("{},".format(NO_BARCODE))
            outfile.write(",".join(
                [str(barcode_counts[None][key]) for key in SINGLE_CELL_KEYS]))
            outfile.write("\n")
        for barcode in (bc for bc in sorted_barcodes if bc in barcode_counts):
            outfile.write("{},".format(barcode))
            outfile.write(",".join([
                str(barcode_counts[barcode][key]) for key in SINGLE_CELL_KEYS
            ]))
            outfile.write("\n")

    # Merge the fragment file
    base_file, extension = os.path.splitext(outs.fragments)
    if not extension == '.gz':
        raise ValueError('Expecting compressed file output')
    input_tsvs = [str(chunk.fragments) for chunk in chunk_outs]
    merge_keyed_bed(input_tsvs,
                    base_file,
                    threads=martian.get_threads_allocation())
    if os.path.getsize(base_file) == 0:
        outs.fragments = None
        outs.fragments_index = None
        return

    # N.B. tabix_index will automatically compress the input file, adding the .gz suffix
    pysam.tabix_index(base_file, preset='bed', index=outs.fragments_index)
def main(args, outs):
    """ Trim the reads in a series of fastq files """
    chunk = args.chunk
    subsample_rate = chunk['subsample_rate']
    have_barcode = chunk['barcode'] is not None
    have_sample_index = chunk['sample_index'] is not None

    # STEP 1:  We run the R1/R2 reads through cutadapt, writing them to a temporary file with appropriate adapters
    # trimmed, optionally filtering out reads where adapters weren't found
    interleaved = chunk['read2'] is None
    # can't do discard_untrimmed because we're running cutadapt in single-end mode
    if args.trim_def['discard_untrimmed']:
        martian.exit("discard_untrimmed was set in trim_def")
    if interleaved:
        trimmed_reads = martian.make_path("trimmed_reads.fastq")
        trim_info_fn = martian.make_path("trim_info.txt")
        initial_read_pairs, trimmed_read_pairs = run_cutadapt_single_end(
            chunk['read1'], trimmed_reads, trim_info_fn, args.trim_def,
            args.adapters)
    else:
        trimmed_r1 = martian.make_path("trimmed_r1.fastq")
        trimmed_r2 = martian.make_path("trimmed_r2.fastq")
        trim_info_r1_fn = martian.make_path("trim_info_r1.txt")
        trim_info_r2_fn = martian.make_path("trim_info_r2.txt")
        initial1, trimmed1 = run_cutadapt_single_end(chunk['read1'],
                                                     trimmed_r1,
                                                     trim_info_r1_fn,
                                                     args.trim_def,
                                                     args.adapters,
                                                     read_id="R1")
        initial2, trimmed2 = run_cutadapt_single_end(chunk['read2'],
                                                     trimmed_r2,
                                                     trim_info_r2_fn,
                                                     args.trim_def,
                                                     args.adapters,
                                                     read_id="R2")
        initial_read_pairs = initial1 + initial2
        trimmed_read_pairs = trimmed1 + trimmed2
        if initial1 != initial2:
            martian.exit(
                "Input fastq files for R1 and R2 are not the same length")
        if trimmed1 != trimmed2:
            raise ValueError(
                "Cutadapt produced differing numbers of reads for R1 and R2")

    # STEP 2:  We run through the trimmed R1/R2 reads along with sample index and barcode reads, chunking into files of
    # max_read_num reads or less, and skipping sample index/barcode reads that don't match the trimmed & filtered R1/R2
    # reads
    max_read_num = args.max_read_num
    file_number = 1

    # open the available input read files and get the iterator over them
    if interleaved:
        reads_in = open_maybe_gzip(trimmed_reads, 'r')
        read_iter = tk_fasta.read_generator_fastq(reads_in, paired_end=True)
        trim_info = open_maybe_gzip(trim_info_fn, 'r')
        trim_iter = read_generator_trim_info(trim_info, paired_end=True)
    else:
        r1_in = open_maybe_gzip(trimmed_r1, 'r')
        r2_in = open_maybe_gzip(trimmed_r2, 'r')
        read_iter = ((r1[0], r1[1], r1[2], r2[0], r2[1], r2[2])
                     for r1, r2 in itertools.izip_longest(
                         tk_fasta.read_generator_fastq(r1_in),
                         tk_fasta.read_generator_fastq(r2_in)))
        trim_info_r1 = open_maybe_gzip(trim_info_r1_fn, 'r')
        trim_info_r2 = open_maybe_gzip(trim_info_r2_fn, 'r')
        trim_iter = (t1 + t2 for t1, t2 in itertools.izip(
            read_generator_trim_info(trim_info_r1),
            read_generator_trim_info(trim_info_r2)))

    # open output read file, which will be interleaved
    read_name = martian.make_path("read{}.fastq".format(file_number))
    out_readfiles = [read_name]
    out_read_fastq = open(read_name, 'w')

    # open trimmed read file, which will be interleaved
    trim_out_name = martian.make_path("TRIM{}.fastq".format(file_number))
    out_trimfiles = [trim_out_name]
    out_trim_fastq = open(trim_out_name, 'w')

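    # Track per-barcode counts against the whitelist (plus a bad-barcode tally) so that
    # join() can aggregate them into bc_counts and compute a barcode error rate.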
    if args.barcode_whitelist is None:
        outs.bc_counts = None
        barcode_indices = None
    else:
        barcode_whitelist = sorted(
            list(load_barcode_whitelist(args.barcode_whitelist)))
        barcode_indices = {
            bc: idx
            for (idx, bc) in enumerate(barcode_whitelist)
        }
        bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32)
        bad_count = 0

    # open barcode file if there is one
    if have_barcode:
        bc_name = martian.make_path("BC{}.fastq".format(file_number))
        out_bc_fastq = open(bc_name, 'w')
        out_barcodefiles = [bc_name]
        barcode_read = None
        bc_in = open_maybe_gzip(chunk['barcode'], 'r')
        bc_iter = tk_fasta.read_generator_fastq(bc_in)
        # Determine if barcode sequences need to be reverse complemented.
        with open_maybe_gzip(chunk['barcode'], 'r') as bc_in2:
            bc_iter2 = tk_fasta.read_generator_fastq(bc_in2)
            barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)
            barcode_rc = infer_barcode_reverse_complement(
                barcode_whitelist, bc_iter2)
    else:
        out_barcodefiles = [None]
        outs.bc_counts = None

    # open sample_index file if there is one
    if have_sample_index:
        si_name = martian.make_path("SI{}.fastq".format(file_number))
        out_si_fastq = open(si_name, 'w')
        si_in = open_maybe_gzip(chunk['sample_index'], 'r')
        sample_index_read = None
        si_iter = tk_fasta.read_generator_fastq(si_in)
        out_sampleindex_files = [si_name]
    else:
        out_sampleindex_files = [None]

    read_num = 0
    random.seed(0)
    for (read, trim) in itertools.izip(read_iter, trim_iter):
        # Downsample (other than the first read).  Note we've set a fixed seed to make this deterministic.
        if read_num > 0 and random.random() > subsample_rate:
            continue

        # Now we need to step through the barcode and sample index reads to find the matching reads
        if have_barcode:
            try:
                while barcode_read is None or not read_match(
                        read, barcode_read):
                    barcode_read = bc_iter.next()
                # reverse complement if all barcodes are RC-ed
                if barcode_rc:
                    barcode_read = (barcode_read[0],
                                    tk_seq.get_rev_comp(barcode_read[1]),
                                    barcode_read[2][::-1])
            except StopIteration:
                raise ValueError(
                    "Couldn't find barcode read matching {}".format(
                        get_read_name(read)))
        if have_sample_index:
            try:
                while sample_index_read is None or not read_match(
                        read, sample_index_read):
                    sample_index_read = si_iter.next()
            except StopIteration:
                raise ValueError(
                    "Couldn't find sample index read matching {}".format(
                        get_read_name(read)))

        (name1, seq1, qual1, name2, seq2, qual2) = read
        (tr_name1, tr_seq1, tr_qual1, tr_name2, tr_seq2, tr_qual2) = trim

        read_num += 1
        if read_num > max_read_num:
            read_num = 1
            file_number += 1
            read_name = martian.make_path("read{}.fastq".format(file_number))
            out_read_fastq.close()
            out_read_fastq = open(read_name, 'w')
            out_readfiles.append(read_name)

            trim_out_name = martian.make_path(
                "TRIM{}.fastq".format(file_number))
            out_trim_fastq.close()
            out_trim_fastq = open(trim_out_name, 'w')
            out_trimfiles.append(trim_out_name)

            if have_barcode:
                bc_name = martian.make_path("BC{}.fastq".format(file_number))
                out_bc_fastq.close()
                out_bc_fastq = open(bc_name, 'w')
                out_barcodefiles.append(bc_name)
            else:
                out_barcodefiles.append(None)

            if have_sample_index:
                si_name = martian.make_path("SI{}.fastq".format(file_number))
                out_si_fastq.close()
                out_si_fastq = open(si_name, 'w')
                out_sampleindex_files.append(si_name)
            else:
                out_sampleindex_files.append(None)

        if have_barcode:
            barcode_seq = barcode_read[1]
            barcode_qual = barcode_read[2]
            if barcode_indices is not None:
                idx = barcode_indices.get(barcode_seq)
                if idx is not None:
                    bc_counts[idx] += 1
                else:
                    bad_count += 1

            tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0],
                                      barcode_seq, barcode_qual)
        if have_sample_index:
            tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0],
                                      sample_index_read[1],
                                      sample_index_read[2])

        tk_fasta.write_read_fastq(out_read_fastq, name1, seq1, qual1)
        tk_fasta.write_read_fastq(out_read_fastq, name2, seq2, qual2)

        tk_fasta.write_read_fastq(out_trim_fastq, tr_name1, tr_seq1, tr_qual1)
        tk_fasta.write_read_fastq(out_trim_fastq, tr_name2, tr_seq2, tr_qual2)

    if interleaved:
        reads_in.close()
    else:
        r1_in.close()
        r2_in.close()

    if have_barcode:
        out_bc_fastq.close()
        # Only emit BC counts if we had a whitelist
        if outs.bc_counts is not None:
            result = {}
            result['bad_bc_count'] = bad_count
            result['bc_counts'] = list(bc_counts)
            with open(outs.bc_counts, 'w') as bc_counts_out:
                tenkit.safe_json.dump_numpy(result, bc_counts_out)

    with open(outs.read_counts, 'w') as outfile:
        read_counts = {
            'total_read_pairs': initial_read_pairs,
            'filtered_read_pairs': trimmed_read_pairs
        }
        tenkit.safe_json.dump_numpy(read_counts, outfile)

    if have_sample_index:
        out_si_fastq.close()
    out_read_fastq.close()
    out_trim_fastq.close()

    outs.chunks = [
        {
            'read1': r,  # output chunked trimmed read file
            'read2': None,
            'trim': t,  # output chunked trim file
            'barcode': bc,  # output chunked barcode file
            'sample_index': si,  # output chunked sample index file
            'barcode_reverse_complement':
            False,  # we always keep BC in correct orientation
            'reads_interleaved': True,
            'gem_group': chunk['gem_group'],
            'read_group': chunk['read_group']
        } for (r, t, bc, si) in zip(out_readfiles, out_trimfiles,
                                    out_barcodefiles, out_sampleindex_files)
    ]
def join(args, outs, chunk_defs, chunk_outs):
    final_chunks = []

    for cl in chunk_outs:
        final_chunks.extend(cl.chunks)

    outs.chunks = final_chunks
    valid_counts = [c.bc_counts for c in chunk_outs if c.bc_counts is not None]

    # No counts to report if there's no whitelist or no chunk produced counts
    if args.barcode_whitelist is None or len(valid_counts) == 0:
        outs.bc_counts = None
        outs.lot_info = None
        return

    bc_counts = {}
    read_counts = {}
    for (c_out, c_def) in zip(chunk_outs, chunk_defs):
        # Sum up total and trimmed read counts
        with open(c_out.read_counts) as f:
            r = json.load(f)
        for key in ['filtered_read_pairs', 'total_read_pairs']:
            read_counts[key] = read_counts.get(key, 0) + r[key]

        # Sum up barcode counts
        gem_group = c_def.chunk['gem_group']
        if c_out.bc_counts is not None:
            with open(c_out.bc_counts) as f:
                r = json.load(f)
            gg_result = bc_counts.setdefault(gem_group, {
                'bad_bc_count': 0,
                'bc_counts': None
            })
            gg_result['bad_bc_count'] += r['bad_bc_count']
            if gg_result['bc_counts'] is None:
                gg_result['bc_counts'] = np.array(r['bc_counts'],
                                                  dtype=np.int32)
            else:
                gg_result['bc_counts'] += np.array(r['bc_counts'],
                                                   dtype=np.int32)

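    # Compute the barcode error rate per gem group and in aggregate from the merged counts.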
    total_counts = 0
    total_errors = 0
    for gg in bc_counts.keys():
        rgg = bc_counts[gg]
        rgg['bc_error_rate'] = tk_stats.robust_divide(
            float(rgg['bad_bc_count']),
            float(rgg['bad_bc_count'] + rgg['bc_counts'].sum()))
        total_counts += float(rgg['bad_bc_count'] + rgg['bc_counts'].sum())
        total_errors += float(rgg['bad_bc_count'])

    # Hardcoded bail-out if the BC-error rate is extremely high
    bc_error_rate = total_errors / total_counts
    if bc_error_rate > MAX_BARCODE_ERROR_RATE:
        martian.exit(
            "Extremely high rate of incorrect barcodes observed (%.2f %%). "
            "Check that input is 10x Chromium data, "
            "and that there are no missing cycles in first 16 bases of the index read I2."
            % (bc_error_rate * 100.0))

    # possibly do lot detection
    lot_detection = {}
    lot_map = WHITELIST_TO_LOT_MAP.get(args.barcode_whitelist, None)
    if lot_map is not None:
        # get BC counts histogram
        # for now, just sum over all gem groups
        bc_seq = sorted(list(load_barcode_whitelist(args.barcode_whitelist)))
        bc_cts = np.sum([ggr['bc_counts'] for ggr in bc_counts.values()],
                        axis=0)
        bc_hist = {seq: cts for seq, cts in zip(bc_seq, bc_cts)}

        (gelbead_lot, gelbead_lot_confidence,
         gelbead_lot_counts) = identify_gelbead_lot(bc_hist, lot_map)
        # only report on lots with nonzero counts
        gelbead_lot_counts_nonzero = {
            lot: count
            for lot, count in gelbead_lot_counts.items() if count > 0
        }

        lot_detection['gelbead_lot'] = gelbead_lot
        lot_detection['gelbead_lot_confidence'] = gelbead_lot_confidence
        lot_detection['gelbead_lot_counts'] = gelbead_lot_counts_nonzero

        martian.log_info("Gelbead lot detected: %s, reason (if None): %s" %
                         (gelbead_lot, gelbead_lot_confidence))

    if outs.lot_info is not None:
        with open(outs.lot_info, 'w') as f:
            tenkit.safe_json.dump_numpy(lot_detection, f, pretty=True)

    if outs.bc_counts is not None:
        with open(outs.bc_counts, 'w') as f:
            tenkit.safe_json.dump_numpy(bc_counts, f)

    with open(outs.read_counts, 'w') as f:
        tenkit.safe_json.dump_numpy(read_counts, f, pretty=True)
def join(args, outs, chunk_defs, chunk_outs):
    args.coerce_strings()
    outs.coerce_strings()
    if args.fragments is None:
        outs.cell_barcodes = None
        outs.cell_calling_summary = None
        outs.singlecell = None
        return

    if args.excluded_barcodes is not None:
        with open(args.excluded_barcodes, 'r') as infile:
            excluded_barcodes = json.load(infile)
    else:
        excluded_barcodes = None

    # Merge the chunk inputs
    ref = ReferenceManager(args.reference_path)
    species_list = ref.list_species()

    barcode_counts_by_species = {
        species: Counter()
        for species in species_list
    }
    targeted_counts_by_species = {
        species: Counter()
        for species in species_list
    }
    fragment_depth = 0
    for chunk_in, chunk_out in zip(chunk_defs, chunk_outs):
        species = ref.species_from_contig(chunk_in.contig)
        with open(chunk_out.barcode_counts, 'r') as infile:
            barcode_counts_by_species[species] += pickle.load(infile)
        with open(chunk_out.targeted_counts, 'r') as infile:
            targeted_counts_by_species[species] += pickle.load(infile)
        fragment_depth += chunk_out.fragment_depth
    print('Total fragments across all chunks: {}'.format(fragment_depth))

    barcodes = list({
        bc
        for species in species_list
        for bc in barcode_counts_by_species[species]
    })
    non_excluded_barcodes = {
        species: barcodes if excluded_barcodes is None else
        [bc for bc in barcodes if bc not in excluded_barcodes[species]]
        for species in species_list
    }
    print('Total barcodes observed: {}'.format(len(barcodes)))

    retained_counts = {}
    for species in species_list:
        if excluded_barcodes is None:
            retained_counts[species] = np.array(
                [targeted_counts_by_species[species][bc] for bc in barcodes])
        else:
            retained_counts[species] = np.array([
                targeted_counts_by_species[species][bc] for bc in barcodes
                if bc not in excluded_barcodes[species]
            ])
            print('Barcodes excluded for species {}: {}'.format(
                species, len(excluded_barcodes[species])))
            print('Barcodes remaining for species {}: {}'.format(
                species, len(non_excluded_barcodes[species])))

    parameters = {}

    whitelist_length = len(load_barcode_whitelist(args.barcode_whitelist))
    count_shift = max(
        MINIMUM_COUNT,
        int(fragment_depth * WHITELIST_CONTAM_RATE / whitelist_length))
    print('Count shift for whitelist contamination: {}'.format(count_shift))

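    # For each species, fit a noise + signal negative binomial mixture to the shifted
    # per-barcode counts and derive the cell-calling threshold from the fit (unless
    # cells are being forced).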
    for (species, count_data) in retained_counts.iteritems():
        print('Analyzing species {}'.format(species))
        # Subtract count_shift from all counts to remove the effects of whitelist contamination
        shifted_data = count_data[count_data >= count_shift] - count_shift
        print('Number of barcodes analyzed: {}'.format(len(shifted_data)))
        count_dict = Counter(shifted_data)
        parameters[species] = {}

        forced_cell_count = None
        if args.force_cells is not None:
            if species in args.force_cells:
                forced_cell_count = int(args.force_cells[species])
            elif "default" in args.force_cells:
                forced_cell_count = int(args.force_cells["default"])
            if forced_cell_count > MAXIMUM_CELLS_PER_SPECIES:
                martian.log_info(
                    'Attempted to force cells to {}.  Overriding to maximum allowed cells.'
                    .format(forced_cell_count))
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES

        # Initialize parameters to empty
        parameters[species]['noise_mean'] = None
        parameters[species]['noise_dispersion'] = None
        parameters[species]['signal_mean'] = None
        parameters[species]['signal_dispersion'] = None
        parameters[species]['fraction_noise'] = None
        parameters[species]['cell_threshold'] = None
        parameters[species]['goodness_of_fit'] = None
        parameters[species]['estimated_cells_present'] = 0

        # Corner case where FRIP is 0 because the number of peaks is tiny (fuzzer tests)
        if len(count_dict) < 10:
            parameters[species]['cells_detected'] = 0
            forced_cell_count = None
        elif forced_cell_count is None:
            print('Estimating parameters')
            fitted_params = estimate_parameters(count_dict)
            signal_threshold = estimate_threshold(
                fitted_params, CELL_CALLING_THRESHOLD) + count_shift
            print('Primary threshold: {}'.format(signal_threshold))
            parameters[species]['noise_mean'] = fitted_params.mu_noise
            parameters[species]['noise_dispersion'] = fitted_params.alpha_noise
            parameters[species]['signal_mean'] = fitted_params.mu_signal
            parameters[species][
                'signal_dispersion'] = fitted_params.alpha_signal
            parameters[species]['fraction_noise'] = fitted_params.frac_noise
            parameters[species]['cell_threshold'] = signal_threshold
            parameters[species]['goodness_of_fit'] = goodness_of_fit(
                shifted_data, fitted_params)
            called_cell_count = np.sum(count_data >= signal_threshold)
            parameters[species]['cells_detected'] = called_cell_count
            parameters[species]['estimated_cells_present'] = int(
                (1 - fitted_params.frac_noise) * len(shifted_data))
            if called_cell_count > MAXIMUM_CELLS_PER_SPECIES:
                # Abort the model fitting and instead force cells to the maximum
                forced_cell_count = MAXIMUM_CELLS_PER_SPECIES

        if forced_cell_count is not None:
            print('Forcing cells to {}'.format(forced_cell_count))

            if forced_cell_count <= 0:
                raise ValueError("Force cells must be positive")
            else:
                adj_data = shifted_data[shifted_data > 0]
                print('Total barcodes considered for forcing cells: {}'.format(
                    len(adj_data)))
                parameters[species]['cell_threshold'] = min(adj_data) if forced_cell_count >= len(adj_data) else \
                    sorted(adj_data, reverse=True)[forced_cell_count - 1]
                parameters[species]['cell_threshold'] += count_shift
                parameters[species]['cells_detected'] = np.sum(
                    count_data >= parameters[species]['cell_threshold'])

    # For barnyard samples, mask out the noise distribution and re-fit to get cleaner separation
    if len(retained_counts) == 2 and (args.force_cells is None
                                      or not args.force_cells):
        print('Estimating secondary thresholds')
        sp1, sp2 = species_list

        sp1_threshold = -1 if parameters[sp1][
            'cell_threshold'] is None else parameters[sp1]['cell_threshold']
        sp2_threshold = -1 if parameters[sp2][
            'cell_threshold'] is None else parameters[sp2]['cell_threshold']

        if parameters[sp1]['cell_threshold'] is not None:
            sp1_counts = np.array([
                targeted_counts_by_species[sp1][bc]
                for bc in non_excluded_barcodes[sp1]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and (
                    targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp1_params = estimate_parameters(Counter(sp1_counts),
                                             threshold=sp1_threshold)
            if not np.isnan(sp1_params.frac_noise):
                parameters[sp1]['cell_threshold'] = max(
                    sp1_threshold, estimate_threshold(sp1_params, 20))
            parameters[sp1]['cells_detected'] = np.sum(
                sp1_counts >= parameters[sp1]['cell_threshold'])
        else:
            parameters[sp1]['cells_detected'] = 0

        if parameters[sp2]['cell_threshold'] is not None:
            sp2_counts = np.array([
                targeted_counts_by_species[sp2][bc]
                for bc in non_excluded_barcodes[sp2]
                if (targeted_counts_by_species[sp1][bc] > sp1_threshold) and (
                    targeted_counts_by_species[sp2][bc] > sp2_threshold)
            ])
            sp2_params = estimate_parameters(Counter(sp2_counts),
                                             threshold=sp2_threshold)
            if not np.isnan(sp2_params.frac_noise):
                parameters[sp2]['cell_threshold'] = max(
                    sp2_threshold, estimate_threshold(sp2_params, 20))
            parameters[sp2]['cells_detected'] = np.sum(
                sp2_counts >= parameters[sp2]['cell_threshold'])
        else:
            parameters[sp2]['cells_detected'] = 0

        print('Secondary threshold ({}): {}'.format(
            sp1, parameters[sp1]['cell_threshold']))
        print('Secondary threshold ({}): {}'.format(
            sp2, parameters[sp2]['cell_threshold']))

    print('Writing out cell barcodes')
    cell_barcodes = {}
    for (species, count_data) in retained_counts.iteritems():
        threshold = parameters[species]['cell_threshold']
        cell_barcodes[species] = {}
        print('Cell threshold for species {}: {}'.format(species, threshold))
        if threshold is not None:
            for count, barcode in zip(count_data,
                                      non_excluded_barcodes[species]):
                if count >= threshold:
                    print('{} - Total {}, Targeted {}, Count {}, Threshold {}'.
                          format(barcode,
                                 barcode_counts_by_species[species][barcode],
                                 targeted_counts_by_species[species][barcode],
                                 count, threshold))
                    cell_barcodes[species][barcode] = count
        if len(cell_barcodes[species]
               ) != parameters[species]['cells_detected']:
            print(len(cell_barcodes[species]),
                  parameters[species]['cells_detected'])
            raise ValueError(
                'Mismatch in called cells identified - failure in threshold setting'
            )
        print('Selected {} barcodes of species {}'.format(
            len(cell_barcodes[species]), species))

    with open(outs.cell_barcodes, 'w') as outfile:
        # low mem reduce op to merge-sort bcs across species
        for species in cell_barcodes.keys():
            outfile.write(species + ",")
            outfile.write(",".join(cell_barcodes[species]) + "\n")

    cell_index = compute_cell_index(species_list, cell_barcodes)

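    # Write the singlecell CSV: one row per observed barcode (plus a no-barcode row),
    # with per-species cell-call flags and, for multi-species references, per-species counts.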
    with open(outs.singlecell, 'w') as outfile:
        outfile.write("barcode,cell_id,")
        outfile.write(",".join([
            "is_{}_cell_barcode".format(species) for species in species_list
        ]))
        if len(species_list) > 1:
            for species in species_list:
                outfile.write(",passed_filters_{}".format(species))
                outfile.write(",peak_region_fragments_{}".format(species))
        outfile.write("\n")
        for barcode in [NO_BARCODE] + sorted(barcodes):
            outfile.write("{},".format(barcode))
            outfile.write("{},".format(cell_index.get(barcode, "None")))
            values = [
                str(
                    int(species in cell_barcodes
                        and barcode in cell_barcodes[species]))
                for species in species_list
            ]
            outfile.write(",".join(values))
            if len(species_list) > 1:
                for species in species_list:
                    outfile.write(",{:d}".format(
                        barcode_counts_by_species[species][barcode]))
                    outfile.write(",{:d}".format(
                        targeted_counts_by_species[species][barcode]))
            outfile.write("\n")

    # process data into summary metrics
    summary_info = {}
    summary_info.update(
        generate_cell_calling_metrics(parameters, cell_barcodes))
    summary_info.update(generate_gb_metrics(cell_barcodes, excluded_barcodes))

    with open(outs.cell_calling_summary, 'w') as outfile:
        outfile.write(json.dumps(summary_info, indent=4))
def main(args, outs):
    """ Attaches barcodes. Attaches the raw barcode to the RAW_BC tag and filters those to form the set of PROCESSED_BARCODES """

    chunk = args.chunk

    bam_in = create_bam_infile(args.align_chunk)

    bam_out, _ = tk_bam.create_bam_outfile(
        outs.output,
        None,
        None,
        template=bam_in,
        pgs=[
            tk_bam.make_pg_header(martian.get_pipelines_version(),
                                  "attach_bcs", TENX_PRODUCT_NAME)
        ])

    gp_tagger = GlobalFivePrimePosTagger(bam_in)

    if args.barcode_whitelist is None or args.bc_counts is None:
        # If there's no whitelist or no counts, then all high-quality BC reads are allowed.
        barcode_whitelist = None
        wl_idxs = None
        bc_dist = None
    else:
        barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist)

        # Load the bc counts for this GEM group
        counts = json.load(open(args.bc_counts, 'r'))
        counts = counts[str(chunk['gem_group'])]['bc_counts']

        # Prior distribution over barcodes, with pseudo-count
        bc_dist = np.array(counts, dtype=np.float) + 1.0
        bc_dist = bc_dist / bc_dist.sum()
        wl_idxs = {
            bc: idx
            for (idx, bc) in enumerate(sorted(list(barcode_whitelist)))
        }

    # set random seed to get deterministic subsampling
    random.seed(0)

    if chunk['barcode'] is not None:
        processed_barcode_iter = get_raw_processed_barcodes(
            open_maybe_gzip(chunk['barcode']), barcode_whitelist,
            args.bc_confidence_threshold, chunk['gem_group'],
            chunk['barcode_reverse_complement'], wl_idxs, bc_dist)
        require_barcode_for_stringent = True
    else:
        processed_barcode_iter = itertools.repeat(None)
        require_barcode_for_stringent = False

    if chunk['sample_index'] is not None:
        sample_index_iter = tk_fasta.read_generator_fastq(
            open_maybe_gzip(chunk['sample_index']))
    else:
        sample_index_iter = itertools.repeat(None)

    if chunk['trim'] is not None:
        trim_iter = tk_fasta.read_generator_fastq(open_maybe_gzip(
            chunk['trim']),
                                                  paired_end=True)
    else:
        trim_iter = itertools.repeat(None)

    iters = itertools.izip(processed_barcode_iter, sample_index_iter,
                           trim_iter)

    # First read
    try:
        read = bam_in.next()
    except StopIteration:
        read = None

    # Number of perfect reads -- used to compute down-sampling rates in mark_duplicates
    perfect_read_count = 0

    # Due to secondary alignments, we must apply the tags to all
    # reads with the same cluster name.
    for (barcode_info, sample_index_info, trim_info) in iters:
        tags = []
        read_name = None

        if read is None:
            break

        if barcode_info is not None:
            (bc_read_name, raw_bc_seq, processed_bc_seq,
             raw_bc_qual) = barcode_info
            tags.append((RAW_BARCODE_TAG, raw_bc_seq))
            tags.append((RAW_BARCODE_QUAL_TAG, raw_bc_qual))
            if processed_bc_seq is not None:
                tags.append((PROCESSED_BARCODE_TAG, processed_bc_seq))
            read_name = bc_read_name.split()[0]

        if sample_index_info is not None:
            (si_read_name, seq, qual) = sample_index_info
            tags.append((SAMPLE_INDEX_TAG, seq))
            tags.append((SAMPLE_INDEX_QUAL_TAG, qual))

            if read_name is not None:
                if si_read_name.split()[0] != read_name:
                    martian.log_info(
                        "mismatch: si_read_name: %s, bam_read_name: %s" %
                        (si_read_name, read_name))
                assert (si_read_name.split()[0] == read_name)
            else:
                read_name = si_read_name.split()[0]

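        # R1 and R2 share the barcode and sample-index tags; trim tags are added per read below.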
        r1_tags = tags
        r2_tags = list(r1_tags)

        if trim_info is not None:
            (trim1_read_name, trim1_seq, trim1_qual, trim2_read_name,
             trim2_seq, trim2_qual) = trim_info
            if len(trim1_seq) > 0:
                r1_tags.append((TRIM_TAG, trim1_seq))
                r1_tags.append((TRIM_QUAL_TAG, trim1_qual))
            if len(trim2_seq) > 0:
                r2_tags.append((TRIM_TAG, trim2_seq))
                r2_tags.append((TRIM_QUAL_TAG, trim2_qual))

        reads_attached = 0
        reads_to_attach = []

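        # Consume alignments whose query name matches the current barcode/sample-index read
        # (or every alignment when no name is available), grouping by query name so that
        # secondary alignments are tagged and written together with their primaries.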
        while read.query_name == read_name or read_name is None:
            tags = r1_tags if read.is_read1 else r2_tags
            if len(tags) > 0:
                existing_tags = read.tags
                existing_tags.extend(tags)
                read.tags = existing_tags

            if reads_to_attach and (
                    read.query_name != reads_to_attach[0].query_name
                    or reads_to_attach[0].query_name is None):
                gp_tagger.tag_reads(reads_to_attach)
                reads_attached += len(reads_to_attach)
                for r in reads_to_attach:
                    if stringent_read_filter(r, require_barcode_for_stringent):
                        perfect_read_count += 1

                    if args.exclude_non_bc_reads:
                        if not (get_read_barcode(r) is None):
                            bam_out.write(r)
                    else:
                        bam_out.write(r)
                reads_to_attach = []

            reads_to_attach.append(read)

            try:
                read = bam_in.next()

            except StopIteration:
                read = None
                break

        gp_tagger.tag_reads(reads_to_attach)
        reads_attached += len(reads_to_attach)
        for r in reads_to_attach:
            if stringent_read_filter(r, require_barcode_for_stringent):
                perfect_read_count += 1

            if args.exclude_non_bc_reads:
                if not (get_read_barcode(r) is None):
                    bam_out.write(r)
            else:
                bam_out.write(r)
        # We may have more than 2 reads if there was a
        # secondary alignment, but fewer than 2 means
        # something went wrong
        assert (reads_attached >= 2)

    outs.perfect_read_count = perfect_read_count
    bam_out.close()