Ejemplo n.º 1
0
    def build_from_mol_counter(molecule_counter, subsample_rate=1.0,
                               subsample_result=None):
        """ Construct a GeneBCMatrices object from a MoleculeCounter.
            Args: subsample_result (dict) - Return some metrics results into this dict. """

        # Rebuild the barcode sequences used by the original matrices.
        # Fall back to the whitelist entry length if none is recorded.
        whitelist = cr_utils.load_barcode_whitelist(molecule_counter.get_barcode_whitelist())
        bc_length = molecule_counter.get_barcode_length() or len(whitelist[0])

        groups = molecule_counter.get_gem_groups()
        bc_seqs = cr_utils.format_barcode_seqs(whitelist, groups)

        # Rebuild Gene tuples from the molecule info reference columns
        ids_of_genes = molecule_counter.get_ref_column('gene_ids')
        ids_of_genomes = molecule_counter.get_ref_column('genome_ids')
        names_of_genes = molecule_counter.get_ref_column('gene_names')
        gene_tuples = [cr_constants.Gene(gene_id, gene_name, None, None, None)
                       for (gene_id, gene_name) in itertools.izip(ids_of_genes,
                                                                  names_of_genes)]
        genes_by_genome = cr_utils.split_genes_by_genomes(gene_tuples, ids_of_genomes)

        matrices = GeneBCMatrices(ids_of_genomes, genes_by_genome, bc_seqs)

        # Accumulate the number of reads that survive subsampling
        read_total = 0
        for mol in molecule_counter.get_molecule_iter(bc_length,
                                                      subsample_rate=subsample_rate):
            matrices.add(mol.genome, mol.gene_id, mol.barcode)
            read_total += mol.reads

        if subsample_result is not None:
            subsample_result['mapped_reads'] = read_total

        return matrices
Ejemplo n.º 2
0
def main(args, outs):
    """Count confidently-mapped, deduplicated reads into a feature x barcode matrix.

    Iterates the chunk BAM grouped by qname, asks the reporter whether each
    read group represents a confidently-mapped deduped molecule, and
    increments the corresponding (feature, barcode) matrix entry. Saves the
    matrix and the chunked reporter to `outs`.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    libraries = rna_library.get_bam_library_info(in_bam)
    # sorted() accepts any iterable; no need to wrap the set in list()
    distinct_library_types = sorted({x['library_type'] for x in libraries})
    # One metric prefix per library, index-aligned with `libraries`
    library_prefixes = [
        rna_library.get_library_type_metric_prefix(lib['library_type'])
        for lib in libraries
    ]

    chroms = in_bam.references

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Fall back to the detected-barcode list only when there is no whitelist
    barcode_summary = cr_utils.load_barcode_tsv(
        args.barcodes_detected) if not barcode_whitelist else None

    # TODO: this is redundant
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups,
                                  library_types=distinct_library_types)

    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)
    else:
        barcode_seqs = barcode_summary

    matrix = cr_matrix.CountMatrix.empty(feature_ref,
                                         barcode_seqs,
                                         dtype='int32')

    # Hoist the loop-invariant chemistry lookup out of the per-qname loop
    use_umis = cr_chem.has_umis(args.chemistry_def)
    for qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
        is_conf_mapped_deduped, genome, feature_id, bc = reporter.count_genes_bam_cb(
            reads_iter,
            libraries,
            library_prefixes,
            use_umis=use_umis)
        if is_conf_mapped_deduped:
            matrix.add(feature_id, bc)

    in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
Ejemplo n.º 3
0
 def __init__(self, barcode_whitelist, out_counts, gem_groups=None):
     """Track raw read counts per (formatted) whitelist barcode.

     When no whitelist is available, the count array and index stay None.
     """
     self.barcode_counts = None
     self.barcode_index = None
     self.out_counts = out_counts
     self.barcode_seqs = cr_utils.load_barcode_whitelist(barcode_whitelist)
     if self.barcode_seqs:
         # Append gem-group suffixes, then build the count array and the
         # sequence -> array-index lookup table.
         self.barcode_seqs = cr_utils.format_barcode_seqs(self.barcode_seqs, gem_groups)
         self.barcode_counts = np.zeros(len(self.barcode_seqs), dtype=np.uint32)
         self.barcode_index = {seq: i for i, seq in enumerate(self.barcode_seqs)}
Ejemplo n.º 4
0
def split(args):
    """Return the chunk definition with a memory request for assembly summary.

    Memory must cover the umi_info data plus a JSON dict with one key per
    barcode.
    """
    # Factor of 2 head-room on the umi_info load estimate
    umi_info_mem_gb = 2 * int(np.ceil(vdj_umi_info.get_mem_gb(args.umi_info)))

    whitelist_size = len(cr_utils.load_barcode_whitelist(args.barcode_whitelist))
    assemble_summary_mem_gb = tk_stats.robust_divide(whitelist_size,
                                                     DICT_BCS_PER_MEM_GB)

    total_mem_gb = umi_info_mem_gb + assemble_summary_mem_gb
    mem_request = int(np.ceil(max(cr_constants.MIN_MEM_GB, total_mem_gb)))
    return {'chunks': [{'__mem_gb': mem_request}]}
Ejemplo n.º 5
0
def split(args):
    """Build per-chunk and join memory requests for the count-genes stage."""
    chunk_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)

    chunks = [{'chunk_input': chunk_input, '__mem_gb': chunk_mem_gb}
              for chunk_input in args.inputs]

    join_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Account for memory used by reporters (particularly the bc and umi diversity dicts)
    genomes = cr_utils.get_reference_genomes(args.reference_path)

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if barcode_whitelist is None:
        num_barcodes = cr_utils.get_num_barcodes_from_barcode_summary(
            args.barcode_summary)
    else:
        num_barcodes = len(barcode_whitelist) * max(args.gem_groups)

    max_bc_diversity_entries = num_barcodes
    # UMI diversity is bounded by 4^length
    max_umi_diversity_entries = 4 ** cr_chem.get_umi_length(args.chemistry_def)

    # Multiply by 2 to hold the current reporter + accumulating reporter in the merge
    entry_bytes = cr_constants.BYTES_PER_STR_INT_DICT_ENTRY
    per_entry_factor = (len(genomes) + 1) * len(cr_constants.READ_TYPES)
    bc_diversity_mem_gb = (2 * max_bc_diversity_entries * entry_bytes *
                           per_entry_factor) / 1e9
    umi_diversity_mem_gb = (2 * max_umi_diversity_entries * entry_bytes *
                            per_entry_factor) / 1e9

    # Clamp into [MIN_MEM_GB, COUNT_GENES_MAX_MEM_GB]
    join_mem_gb = min(
        cr_constants.COUNT_GENES_MAX_MEM_GB,
        max(cr_constants.MIN_MEM_GB,
            int(join_mem_gb + bc_diversity_mem_gb + umi_diversity_mem_gb)))
    return {'chunks': chunks, 'join': {'__mem_gb': join_mem_gb}}
Ejemplo n.º 6
0
def main(args, outs):
    """Call cell barcodes per genome for one molecule-info chunk.

    Writes the filtered barcodes file and per-gem-group metrics to `outs`.
    """
    random.seed(0)
    np.random.seed(0)

    with cr_mol_counter.MoleculeCounter.open(args.molecule_h5,
                                             'r',
                                             start=int(args.chunk_start),
                                             length=int(
                                                 args.chunk_len)) as ctr_in:
        genome_ids = ctr_in.get_ref_column('genome_ids')
        gene_ids = ctr_in.get_ref_column('gene_ids')
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            ctr_in.get_barcode_whitelist())

        # Whitelist size bounds the per-gem-group barcode diversity
        gg_total_diversity = len(barcode_whitelist)

        bc_counts_per_genome = get_bc_counts(genome_ids, gene_ids, ctr_in)
        top_bcs_per_genome = {}
        total_conf_mapped_cell_reads = 0
        total_cells = 0
        recovered_cells = (args.recovered_cells or
                           cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP)

        for genome, genome_counts in bc_counts_per_genome.iteritems():
            barcodes, umi_counts, read_counts = genome_counts
            # force_cells overrides the order-of-magnitude cutoff heuristic
            if args.force_cells is None:
                top_bc_indices, filter_summary, _ = cr_stats.filter_cellular_barcodes_ordmag(
                    umi_counts, recovered_cells, gg_total_diversity)
            else:
                top_bc_indices, filter_summary, _ = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    umi_counts, args.force_cells)
            top_bcs_per_genome[genome] = barcodes[top_bc_indices]
            total_conf_mapped_cell_reads += read_counts[top_bc_indices].sum()
            total_cells += filter_summary['filtered_bcs']

        write_filtered_barcodes(outs.cell_barcodes, args.gem_group, ctr_in,
                                top_bcs_per_genome)

        outs.gem_group_metrics = {
            'cells': int(total_cells),
            'cmb_reads': int(total_conf_mapped_cell_reads),
        }
Ejemplo n.º 7
0
def main(args, outs):
    """Attach barcodes to one chunk of alignments and save reporter metrics."""
    reference_star_path = cr_utils.get_reference_star_path(args.reference_path)
    star_index = cr_transcriptome.build_star_index(reference_star_path)
    # Chromosome names come from the first element of the STAR index tuple
    chroms = star_index[0][0]
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    reporter = cr_report.Reporter(
        reference_path=args.reference_path,
        high_conf_mapq=cr_constants.STAR_DEFAULT_HIGH_CONF_MAPQ,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=barcode_whitelist,
        barcode_dist=barcode_dist,
        gem_groups=args.gem_groups,
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        umi_min_qual_threshold=args.umi_min_qual_threshold)

    reporter.attach_bcs_init()
    outs.num_alignments = process_alignments(args.chunk_genome_input,
                                             args.chunk_trimmed_input,
                                             outs.output, args.bam_comments,
                                             reporter, gene_index, star_index,
                                             args)
    reporter.attach_bcs_finalize()
    reporter.save(outs.chunked_reporter)
Ejemplo n.º 8
0
def main(args, outs):
    """Count deduped reads from one BAM chunk into gene-barcode matrices."""
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    chroms = in_bam.references

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # The barcode summary is only needed when no whitelist exists
    if barcode_whitelist:
        barcode_summary = None
    else:
        barcode_summary = cr_utils.load_barcode_summary(args.barcode_summary)

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups)

    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)
    else:
        barcode_seqs = barcode_summary

    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(), genomes)
    matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

    # Chemistry lookup is loop-invariant; compute it once
    use_umis = cr_chem.has_umis(args.chemistry_def)
    for read in in_bam:
        is_conf_mapped_deduped, genome, gene_id, bc = reporter.count_genes_bam_cb(
            read, use_umis=use_umis)
        if is_conf_mapped_deduped:
            matrices.add(genome, gene_id, bc)

    in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
Ejemplo n.º 9
0
def main(args, outs):
    """Call VDJ cell barcodes per gem group and write filtering outputs.

    Outputs: cell-barcode JSON, per-barcode support CSV, barcode/UMI summary,
    and the stage summary JSON.
    """
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Union of cell barcodes across gem groups, and summed read-pair support
    cell_barcodes = set()
    bc_support = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    for gem_group in all_gem_groups:
        # Without a whitelist, no barcode calling is possible at all
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, rpu_threshold, umi_threshold, confidence = call_cell_barcodes(
            args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds
        reporter._get_metric_attr('vdj_filter_bcs_rpu_threshold',
                                  gem_group).set_value(rpu_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_umi_threshold',
                                  gem_group).set_value(umi_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_confidence',
                                  gem_group).set_value(confidence)

        if len(gg_bc_support) > 0:
            # force_cells overrides the called set with the top-N by support
            if args.force_cells is not None:
                sorted_bcs = map(
                    lambda kv: kv[0],
                    sorted(gg_bc_support.items(),
                           key=lambda kv: kv[1],
                           reverse=True))
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells
                                              )]

            # Update set of BCs called as cells
            cell_barcodes.update(set(gg_cell_bcs))

            # Sum BC support
            for bc, count in gg_bc_support.iteritems():
                bc_support[bc] += count

        # Load the extract_reads summary to get the total raw reads
        # NOTE(review): this load is loop-invariant; left inside the loop so
        # behavior (incl. the no-whitelist early break) is unchanged.
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')

        # NOTE(review): called once per gem group with the *cumulative*
        # cell_barcodes set — confirm this is intended rather than a
        # per-gem-group callback.
        reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                        total_read_pairs, recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    # Per-barcode support CSV (header + one row per barcode)
    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
Ejemplo n.º 10
0
def main(args, outs):
    """Extract reads from chunked FASTQs: tag, count barcodes, and re-emit.

    Reads the chemistry definition to locate RNA/barcode/SI/UMI sequences,
    optionally extracts feature barcodes, subsamples reads, augments FASTQ
    headers with tags (or writes them to a separate tag file), tracks the
    raw barcode count distribution, and writes chunked output FASTQs plus
    reporter metrics.
    """
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    # Tag names parallel to read_defs: RNA reads carry no tags; barcode, SI
    # and UMI reads each carry a (sequence tag, quality tag) pair.
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    # Scan a bounded prefix of reads to find the max untrimmed R1 length
    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(
                r2_reader.in_iter,
                cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        # No UMIs in this chemistry: use a no-op reader
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length,
                                           r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads,
                                      args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s,
                                          args.reads_per_file,
                                          compression=COMPRESSION)

    # When not augmenting FASTQ headers, tags go to a separate file instead
    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags,
                                        args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # Lock-step iteration over all readers; shorter streams yield None
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Placeholder (header, seq, qual) used when a stream has no entry
    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter,
                                        args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        # Index of this chunk's library within args.library_info
        lib_idx = [
            i for i, x in enumerate(args.library_info)
            if x['library_id'] == args.library_id
        ][0]
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              lib_idx,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        # Feature-barcode extraction results for this read (if any)
        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                  feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                  feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            # Tags live in the R1 header itself
            read1_writer.write(
                (fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            # Tags go to a separate, empty-sequence FASTQ record
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(
                    cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write(
                    (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            # NOTE(review): passes len(outs.tags), not len(outs.reads) —
            # confirm outs.tags holds a meaningful length here (cf. the
            # read2s case above, which uses len(outs.reads)).
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [
            li for li in libraries if li['library_id'] == args.library_id
        ][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
class MoleculeCounter:
    """ Streams a list of tuples w/named elements to or from an h5 file """

    def __init__(self):
        # No HDF5 handle until open() attaches one.
        self.h5 = None
        self.file_version = None
        self.library_info = None
        # Per-molecule and per-reference datasets, keyed by column name.
        self.columns = OrderedDict()
        self.ref_columns = OrderedDict()

    def get_barcode_whitelist(self):
        """Return the barcode whitelist name recorded in this file's metrics."""
        return self.get_metric(BC_WHITELIST_METRIC)

    def get_gem_groups(self):
        """Return the gem groups recorded in the metrics, as a list of ints.

        Uses a list comprehension instead of map(): identical result under
        Python 2, and still a list (not a lazy iterator) under Python 3.
        """
        return [int(gg) for gg in self.get_metric(GEM_GROUPS_METRIC).keys()]

    def is_aggregated(self):
        """Return the 'aggregated' flag from the metrics, defaulting to False."""
        flag = self.get_metric(IS_AGGREGATED_METRIC)
        if flag is None:
            return False
        return flag

    @staticmethod
    def get_column_dtype(k):
        """Return the numpy dtype used for molecule-info column `k`."""
        return np.dtype(MOLECULE_INFO_COLUMNS[k])

    @staticmethod
    def get_record_bytes():
        """Return the bytes needed for one molecule record (sum of all column dtype sizes)."""
        return sum([np.dtype(x).itemsize for x in MOLECULE_INFO_COLUMNS.values()])

    @staticmethod
    def estimate_mem_gb(chunk_len, scale=1.0, cap=True):
        """ Estimate memory usage of this object given a number of records. """
        # How many molecule records fit in one GB.
        entries_per_gb = int(1e9 / MoleculeCounter.get_record_bytes())
        est_gb = round(math.ceil(scale * chunk_len / entries_per_gb))
        if not cap:
            return est_gb
        # Never report less than the pipeline-wide minimum.
        return max(h5_constants.MIN_MEM_GB, est_gb)

    @staticmethod
    def build_barcode_info(filtered_barcodes_by_genome, library_info, barcodes):
        """Generate numpy arrays for per-barcode info
        Args:
          filtered_barcodes_by_genome (dict of str:list(str)): Keys are genomes, values are lists of filtered barcode strings.
          library_info (list of dict): Per-library metadata.
          barcodes (list of str): All barcode sequences (e.g. ['ACGT', ...])
        Returns:
          BarcodeInfo object
        """
        # Replace a genome string with its lexicographical rank
        genome_to_idx = {g: i for i, g in
                         enumerate(sorted(filtered_barcodes_by_genome.keys()))}

        # Group library indices by gem group.
        libraries_for_gem_group = defaultdict(list)
        for lib_idx, lib in enumerate(library_info):
            libraries_for_gem_group[lib['gem_group']].append(lib_idx)

        # Map a barcode sequence to its index into the MoleculeCounter
        #  'barcodes' array
        bc_seq_to_idx = {bc: i for i, bc in enumerate(barcodes)}

        # Populate the "pass filter" array of (barcode_idx, library_idx, genome_idx)
        # tuples. NOTE: items() instead of Python-2-only iteritems() -- identical
        # behavior under Python 2, and also runs under Python 3.
        pf_tuples = []
        for genome, bcs in filtered_barcodes_by_genome.items():
            genome_idx = genome_to_idx[genome]
            for bc_str in bcs:
                seq, gg = cr_utils.split_barcode_seq(bc_str)
                barcode_idx = bc_seq_to_idx[seq]

                # FIXME: Assumes no per-library filtering, just per-gem-group
                library_inds = libraries_for_gem_group[gg]
                for library_idx in library_inds:
                    pf_tuples.append((barcode_idx, library_idx, genome_idx))

        if len(pf_tuples) > 0:
            pass_filter = np.array(pf_tuples, dtype=BARCODE_INFO_DTYPES['pass_filter'])
        else:
            # np.array([]) would not get the right 2-D shape; build it explicitly.
            pass_filter = np.zeros((0, 3), dtype=BARCODE_INFO_DTYPES['pass_filter'])

        assert pass_filter.shape[0] == len(pf_tuples)
        assert pass_filter.shape[1] == 3

        # Sort by barcode index
        pass_filter = pass_filter[np.argsort(pass_filter[:, 0]), :]

        return BarcodeInfo(
            pass_filter,
            genomes=sorted(filtered_barcodes_by_genome.keys()),
        )

    @staticmethod
    def get_filtered_barcodes(barcode_info, library_info, barcodes,
                              genome_idx=None, library_type=None):
        """Get a list of filtered barcode strings e.g. ['ACGT-1',...]
        Args:
          barcode_info (BarcodeInfo): Barcode info object.
          library_info (list of dict): Library info.
          barcodes (np.array): Barcode sequences.
          genome_idx (int): Restrict passing definition to this genome. None for no restriction.
          library_type (str): Restrict passing definition to this library type. None for no restriction.
        Returns:
          list of str
        """

        # Without restrictions, assumes passing filter in a single library or genome is sufficient
        # for a barcode to be passing filter overall.

        pass_filter = barcode_info.pass_filter

        pf_barcode_idx = pass_filter[:, 0]
        pf_library_idx = pass_filter[:, 1]
        pf_genome_idx = pass_filter[:, 2]

        mask = np.ones(pass_filter.shape[0], dtype=bool)
        if genome_idx is not None:
            mask &= pf_genome_idx == genome_idx

        if library_type is not None:
            library_inds = np.array([i for i, lib in enumerate(library_info) if lib['library_type'] == library_type],
                                    dtype=MOLECULE_INFO_COLUMNS['library_idx'])
            mask &= np.isin(pf_library_idx, library_inds)
        inds = np.flatnonzero(mask)

        lib_to_gg = np.array([lib['gem_group'] for lib in library_info], dtype='uint64')

        pf_gem_group = lib_to_gg[pf_library_idx[inds]]

        # Take unique, sorted barcodes (sorted by (gem_group, barcode_idx))
        gg_bcs = np.unique(np.column_stack((pf_gem_group, pf_barcode_idx[inds])), axis=0)

        # Create barcode strings.
        # NOTE: range() instead of Python-2-only xrange(); identical iteration
        # behavior under Python 2, required under Python 3.
        return [cr_utils.format_barcode_seq(barcodes[gg_bcs[i, 1]],
                                            gg_bcs[i, 0]) for i in range(gg_bcs.shape[0])]

    @staticmethod
    def save_barcode_info(bc_info, group):
        """Save barcode info to HDF5.
        Args:
          bc_info (BarcodeInfo): Data.
          group (h5py.Group): Output group.
        """
        # pass_filter is resizable along axis 0 so later writers can append rows.
        group.create_dataset('pass_filter', data=bc_info.pass_filter,
                             maxshape=(None, bc_info.pass_filter.shape[1]),
                             compression=HDF5_COMPRESSION,
                             shuffle=True)
        cr_io.create_hdf5_string_dataset(group, 'genomes', bc_info.genomes,
                                         compression=HDF5_COMPRESSION,
                                         shuffle=True)

    @staticmethod
    def load_barcode_info(group):
        """Load barcode info from an HDF5 group.
        Args:
          group (h5py.Group): Input group.
        Returns:
          BarcodeInfo object
        """
        # Materialize the pass_filter array and decode the genome strings.
        pf = group['pass_filter'][:]
        genome_names = cr_io.read_hdf5_string_dataset(group['genomes'])
        return BarcodeInfo(pass_filter=pf, genomes=genome_names)

    def get_barcode_info(self):
        """Load this file's BarcodeInfo from its HDF5 group."""
        return MoleculeCounter.load_barcode_info(self.h5[BARCODE_INFO_GROUP_NAME])

    @staticmethod
    def open(filename, mode, feature_ref=None, barcodes=None, library_info=None,
             barcode_info=None):
        """Open a molecule info object.

        Args:
          filename (str): Filename to open or create
          mode (str): 'r' for reading, 'w' for writing.
          feature_ref (FeatureReference): Required when mode is 'w'.
          barcodes (list of str): All possible barcode sequences. Required when mode is 'w'.
          library_info (list of dict): Library metadata. Required when mode is 'w'.
          barcode_info (BarcodeInfo): Per-barcode metadata.
        Returns:
          MoleculeCounter: A new object
        Raises:
          ValueError: If a required writing argument is missing, or when reading
            a file whose format version differs from CURR_FILE_VERSION.
        """
        assert mode == 'r' or mode == 'w'

        mc = MoleculeCounter()

        if mode == 'w':
            if feature_ref is None:
                raise ValueError('Feature reference must be specified when opening a molecule info object for writing')
            if barcodes is None:
                raise ValueError('Barcodes must be specified when opening a molecule info object for writing')
            if library_info is None:
                raise ValueError('Library info must be specified when opening a molecule info object for writing')
            if barcode_info is None:
                raise ValueError('Barcode info must be specified when opening a molecule info object for writing')

            mc.h5 = h5py.File(filename, 'w')
            # Fix: the file version attribute was previously written twice;
            # writing it once is sufficient.
            cr_io.set_hdf5_attr(mc.h5, FILE_VERSION_KEY, CURR_FILE_VERSION)
            cr_io.set_hdf5_attr(mc.h5, h5_constants.H5_FILETYPE_KEY, MOLECULE_H5_FILETYPE)

            mc.h5.create_group(METRICS_GROUP_NAME)

            # Write feature reference
            fref_group = mc.h5.create_group(h5_constants.H5_FEATURE_REF_ATTR)
            feature_ref.to_hdf5(fref_group)

            # Write barcodes
            # If there are multiple barcode lengths, use the largest for the numpy dtype.
            # Fix: np.max(map(len, ...)) fails under Python 3 where map() is lazy;
            # max() over a generator behaves identically on both versions.
            max_barcode_len = max(len(bc) for bc in barcodes)
            barcode_dtype = np.dtype('S%d' % max_barcode_len)
            mc.h5.create_dataset('barcodes', data=np.fromiter(barcodes, barcode_dtype, count=len(barcodes)), compression=HDF5_COMPRESSION)

            # Write library info
            lib_info_json = json.dumps(library_info, indent=4, sort_keys=True)
            cr_io.create_hdf5_string_dataset(mc.h5, 'library_info', [lib_info_json])

            # Write barcode info
            g = mc.h5.create_group(BARCODE_INFO_GROUP_NAME)
            MoleculeCounter.save_barcode_info(barcode_info, g)

            # Create empty per-molecule datasets
            # (items() instead of Python-2-only iteritems(); identical behavior)
            for name, col_type in MOLECULE_INFO_COLUMNS.items():
                mc.columns[name] = mc.h5.create_dataset(name, (0,),
                                                        maxshape=(None,),
                                                        dtype=col_type,
                                                        compression=HDF5_COMPRESSION,
                                                        chunks=(HDF5_CHUNK_SIZE,))

        elif mode == 'r':
            mc.h5 = h5py.File(filename, 'r')

            try:
                mc.file_version = mc.h5.attrs[FILE_VERSION_KEY]
            except (KeyError, AttributeError):
                # Fix: h5py raises KeyError for a missing attribute, so the old
                # AttributeError-only handler could never fire.
                mc.file_version = 1  # V1 doesn't have a version field

            if mc.file_version < CURR_FILE_VERSION:
                raise ValueError('The molecule info HDF5 file (format version %d) was produced by an older version of Cell Ranger. Reading these files is unsupported.' % mc.file_version)
            if mc.file_version > CURR_FILE_VERSION:
                raise ValueError('The molecule info HDF5 file (format version %d) was produced by a newer version of Cell Ranger. Reading these files is unsupported.' % mc.file_version)

            for key in mc.h5.keys():
                if key in MOLECULE_INFO_COLUMNS:
                    mc.columns[key] = mc.h5[key]
                elif key in MOLECULE_REF_COLUMNS:
                    mc.ref_columns[key] = mc.h5[key]
                elif key == h5_constants.H5_FEATURE_REF_ATTR:
                    mc.feature_reference = FeatureReference.from_hdf5(mc.h5[key])
                elif key == METRICS_GROUP_NAME \
                     or key == BARCODE_INFO_GROUP_NAME:
                    pass
                else:
                    raise AttributeError("Unrecognized dataset key: %s" % key)

            # Load library info
            mc.library_info = json.loads(cr_io.read_hdf5_string_dataset(mc.h5['library_info'])[0])

        return mc

    def nrows(self):
        """Return the number of molecule records (length of the first per-molecule column).

        Uses next(iter(...)) rather than .keys()[0], which fails under
        Python 3 where dict views are not indexable; identical on Python 2.
        """
        first_col = next(iter(MOLECULE_INFO_COLUMNS))
        return self.get_column_lazy(first_col).shape[0]

    def get_chunk_key(self, idx):
        """Return the tuple of chunk-column values at row `idx` (used to delimit chunk boundaries)."""
        return tuple(self.get_column_lazy(col)[idx] for col in CHUNK_COLUMNS)

    def set_metric(self, key, value):
        """Set a metric. Serialize to Pickle."""
        self.h5[METRICS_GROUP_NAME].attrs[key] = cPickle.dumps(value)

    def get_metric(self, key):
        """Get a metric, or None if it was never set."""
        try:
            raw = self.h5[METRICS_GROUP_NAME].attrs[key]
        except KeyError:
            # Missing attribute -> metric not present.
            return None
        return cPickle.loads(raw)

    def set_all_metrics(self, metrics):
        """Store every (key, value) pair of `metrics` via set_metric."""
        # items() instead of Python-2-only iteritems(); identical behavior on py2.
        for k, v in metrics.items():
            self.set_metric(k, v)

    def get_all_metrics(self):
        """Return a dict of every stored metric, unpickling each value."""
        # items() instead of Python-2-only iteritems(); identical behavior on py2.
        return {k: cPickle.loads(v) for k, v in self.h5[METRICS_GROUP_NAME].attrs.items()}

    def append_column(self, name, values):
        """Append an array of values to a column."""
        ds = self.columns[name]
        start = len(ds)
        end = start + len(values)
        ds.resize((end,))
        ds[start:end] = values

    def get_column_lazy(self, col_name):
        """ Retrieve column. Depending on how the file was opened,
        this may only be a file view instead of a full array. """
        return self.columns[col_name]

    def get_column(self, col_name):
        """Load an entire column of data into memory"""
        # [:] forces the (possibly lazy) dataset into an in-memory array.
        return self.get_column_lazy(col_name)[:]

    def set_ref_column(self, col_name, values):
        """Store a reference array as an HDF5 dataset on this file.

        Fix: self.h5 is an h5py.File (see open()), which has no create_carray
        or .root attribute -- those belong to the PyTables API, so the old
        call would raise AttributeError. Use h5py's create_dataset instead.
        """
        assert col_name in MOLECULE_REF_COLUMNS
        self.ref_columns[col_name] = self.h5.create_dataset(col_name, data=np.array(values))

    def get_ref_column(self, col_name):
        """Load a reference array into memory as a numpy array"""
        return self.get_ref_column_lazy(col_name)[:]

    def get_ref_column_lazy(self, col_name):
        """Get a reference array as a lazy h5py Dataset"""
        return self.ref_columns[col_name]

    def get_feature_ref(self):
        """Deserialize the FeatureReference stored in this file."""
        return FeatureReference.from_hdf5(self.h5[h5_constants.H5_FEATURE_REF_ATTR])

    def get_barcodes(self):
        """Load the full barcode-sequence array into memory."""
        return self.h5['barcodes'][:]

    def get_num_filtered_barcodes_for_library(self, library_idx):
        """Count the number of barcodes passing filter for a library.
        Args:
          library_idx (int): Index of library to count.
        Returns:
          int: Number of filtered barcodes for this library.
        """
        pf = self.h5[BARCODE_INFO_GROUP_NAME]['pass_filter'][:]
        # Column 1 holds the library index; column 0 the barcode index.
        lib_rows = pf[:, 1] == library_idx
        return np.unique(pf[lib_rows, 0]).size

    def get_library_info(self):
        """Parse and return the library-info JSON stored in the file.

        NOTE(review): open() reads this same dataset via
        cr_io.read_hdf5_string_dataset, while this indexes it directly --
        confirm both paths decode the stored string identically.
        """
        return json.loads(self.h5['library_info'][0])

    def __enter__(self):
        """Support `with MoleculeCounter.open(...) as mc:` usage."""
        return self

    def __exit__(self, type, value, traceback):
        # Always release the HDF5 handle, even if the body raised.
        self.close()

    def close(self):
        """Close the underlying HDF5 file handle."""
        self.h5.close()

    def save(self):
        """Finish writing and close the file (same body as close())."""
        self.h5.close()

    @staticmethod
    def merge_barcode_infos(bc_infos):
        """Merge several BarcodeInfo objects into one.
        Args:
          bc_infos (list of BarcodeInfo): Input BarcodeInfos.
        Returns:
          BarcodeInfo"""
        assert len(bc_infos) > 0
        genomes = bc_infos[0].genomes

        # All inputs must agree on the genome list and the tuple width.
        for info in bc_infos:
            assert info.pass_filter.shape[1] == 3
            assert info.genomes == genomes

        merged_pf = np.concatenate([info.pass_filter for info in bc_infos], axis=0)

        # Deduplicate the tuples. Unique throws an error on a zero-row array.
        if merged_pf.shape[0] > 0:
            merged_pf = np.unique(merged_pf, axis=0)

        return BarcodeInfo(
            pass_filter=merged_pf,
            genomes=genomes,
        )

    @staticmethod
    def concatenate(out_filename, in_filenames, metrics=None):
        """Concatenate MoleculeCounter HDF5 files
        Args:
          out_filename (str): Output HDF5 filename
          in_filenames (list of str): Input HDF5 filenames
          metrics (dict): Metrics to write
        """
        # Load reference info from the first file.
        # Fix: use a context manager so the first file's handle is closed;
        # previously it was opened and never closed (file-handle leak).
        with MoleculeCounter.open(in_filenames[0], 'r') as first_mc:
            feature_ref = first_mc.get_feature_ref()
            barcodes = first_mc.get_barcodes()
            library_info = first_mc.get_library_info()

        feature_ids = [f.id for f in feature_ref.feature_defs]

        # Merge the per-barcode info across all inputs.
        bc_infos = []
        for filename in in_filenames:
            with MoleculeCounter.open(filename, 'r') as mc:
                bc_infos.append(mc.get_barcode_info())
        merged_bc_info = MoleculeCounter.merge_barcode_infos(bc_infos)

        # Concatenate the molecule info files.
        out_mc = MoleculeCounter.open(out_filename, mode='w',
                                      feature_ref=feature_ref,
                                      barcodes=barcodes,
                                      library_info=library_info,
                                      barcode_info=merged_bc_info)

        for filename in in_filenames:
            with MoleculeCounter.open(filename, mode='r') as in_mc:
                # Assert that these data are compatible
                assert in_mc.get_library_info() == library_info
                assert np.array_equal(in_mc.get_barcodes(), barcodes)
                fref = in_mc.get_feature_ref()
                assert [f.id for f in fref.feature_defs] == feature_ids

                # if no metrics specified, copy them from the first file
                if metrics is None:
                    metrics = in_mc.get_all_metrics()

                # Concatenate per-molecule datasets
                # (items() instead of Python-2-only iteritems(); identical behavior)
                for name, ds in in_mc.columns.items():
                    out_mc.append_column(name, ds[:])

        out_mc.set_all_metrics(metrics)
        out_mc.save()

    def find_last_occurrence_of_chunk_key(self, from_row):
        num_rows = self.nrows()
        initial_chunk_key = self.get_chunk_key(from_row)
        for i in xrange(from_row, num_rows):
             chunk_key = self.get_chunk_key(i)
             if not chunk_key == initial_chunk_key:
                 return i - 1
        return num_rows - 1

    def bisect(self, query, key_func):
        """Leftmost-insertion binary search over this file's rows (see bisect_static)."""
        return MoleculeCounter.bisect_static(self.nrows(), query, key_func)

    @staticmethod
    def bisect_static(num_rows, query, key_func):
        """ Performs a binary search to find the leftmost insertion point of query.
        Takes a key function, where key_func(i) = the value to compare to at index i."""
        lo = 0
        hi = num_rows
        exists = True
        while True:
            i = (hi + lo) / 2
            curr = key_func(i)
            if curr == query:
                break
            elif hi - lo <= 1:
                # non-matching case
                exists = False
                break
            elif curr < query:
                lo = i
            else:
                hi = i

        if exists:
            # backtrack to first occurrence
            for j in xrange(i, -1, -1):
                 curr = key_func(j)
                 if curr != query:
                     return j + 1
        return 0

    def get_chunks_from_partition(self, values, key_func):
        """Partition this file's rows into chunks keyed by `values` (see the static variant)."""
        return MoleculeCounter.get_chunks_from_partition_static(self.nrows(), values, key_func)

    @staticmethod
    def get_chunks_from_partition_static(num_rows, values, key_func):
        """ Get chunks by partitioning on the specified values."""
        starts = [0] + [MoleculeCounter.bisect_static(num_rows, val, key_func) for val in values[1:]]
        n = len(starts)
        for i in xrange(n):
            chunk_start = starts[i]
            chunk_end = starts[i+1] if i+1 < n else num_rows
            yield (chunk_start, chunk_end - chunk_start)

    def get_chunks(self, target_chunk_len, preserve_boundaries=True):
        """ Get chunks, optionally preserving boundaries defined by get_chunk_key().
            Yields (chunk_start, chunk_len) which are closed intervals """
        num_rows = self.nrows()
        chunk_start, chunk_end = 0, 0
        while chunk_end < (num_rows - 1):
            # Provisional chunk end, capped at the last row.
            target_chunk_end = min(num_rows - 1, chunk_start + target_chunk_len - 1)
            # Optionally extend so a run of rows sharing a chunk key is never split.
            chunk_end = self.find_last_occurrence_of_chunk_key(target_chunk_end) if preserve_boundaries else target_chunk_end
            chunk_len = 1 + chunk_end - chunk_start
            yield (chunk_start, chunk_len)
            chunk_start = 1 + chunk_end

    @staticmethod
    def compress_gem_group(x):
        """Cast a gem group to the molecule-info 'gem_group' column dtype."""
        return MOLECULE_INFO_COLUMNS['gem_group'](x)

    @staticmethod
    def compress_umi_seq(x, umi_bits):
        """Bit-pack a UMI sequence via cr_utils.compress_seq."""
        return cr_utils.compress_seq(x, umi_bits)

    @staticmethod
    def get_metrics_from_summary(summary, libraries, total_recovered_cells=None, total_force_cells=None):
        """ Extract relevant metrics from a summary dict."""
        mol_metrics = {}

        # Pipeline/reference version stamps pass through unchanged.
        for key in ['cellranger_version', 'reference_mkref_version', 'reference_fasta_hash', 'reference_gtf_hash']:
            mol_metrics[key] = summary[key]

        # Copy every chemistry_* metric verbatim.
        for key in summary:
            if key.startswith('chemistry'):
                mol_metrics[key] = summary[key]

        # Per-library values
        lib_metrics = {}
        for lib_idx, lib in enumerate(libraries):
            prefix = rna_library.get_library_type_metric_prefix(lib['library_type'])
            lib_metrics[str(lib_idx)] = {
                TOTAL_READS_METRIC: summary['%s%s_total_read_pairs_per_library' % (prefix, lib_idx)],
            }

        # Per-gem-group values
        gg_metrics = {}
        gem_groups = sorted([lib['gem_group'] for lib in libraries])
        for gg in gem_groups:
            # Distribute the toplevel expected and forced cells parameters
            #   evenly among the gem groups.
            recovered = None if total_recovered_cells is None else total_recovered_cells / len(gem_groups)
            forced = None if total_force_cells is None else total_force_cells / len(gem_groups)
            gg_metrics[str(gg)] = {
                GG_RECOVERED_CELLS_METRIC: recovered,
                GG_FORCE_CELLS_METRIC: forced,
            }

        mol_metrics[LIBRARIES_METRIC] = lib_metrics
        mol_metrics[GEM_GROUPS_METRIC] = gg_metrics
        return mol_metrics

    @staticmethod
    def naive_concatenate_metrics(mol_h5_list):
        """Merge metrics dicts across molecule files; the per-gem-group and
        per-library sub-dicts are unioned (later files win on key collisions)."""
        combined_metrics = None
        gg_metrics = {}
        lib_metrics = {}
        for mol_h5 in mol_h5_list:
            with MoleculeCounter.open(mol_h5, mode='r') as counter:
                single_metrics = counter.get_all_metrics()
                if combined_metrics is None:
                    # First file seeds the combined dict.
                    combined_metrics = single_metrics
                    gg_metrics = counter.get_metric(GEM_GROUPS_METRIC)
                    lib_metrics = counter.get_metric(LIBRARIES_METRIC)
                else:
                    # concatenate new gem groups to the metrics. if it collides with an existing
                    # gem group, the old one will be overwritten.
                    new_gg_metrics = counter.get_metric(GEM_GROUPS_METRIC)
                    new_lib_metrics = counter.get_metric(LIBRARIES_METRIC)
                    gg_metrics.update(new_gg_metrics)
                    lib_metrics.update(new_lib_metrics)

        combined_metrics[GEM_GROUPS_METRIC] = gg_metrics
        combined_metrics[LIBRARIES_METRIC] = lib_metrics
        return combined_metrics

    @staticmethod
    def get_compressed_bc_iter(barcodes):
        """ Yields compressed barcode tuples that can be compared against
            a MoleculeCounter's data. Useful for filtering a MoleculeCounter by barcode.
        Args: barcodes (iterable) - list of barcode strings (e.g., ACGT-1)
        Yields: (compressed_bc, compressed_gem_group) tuples """

        for barcode in barcodes:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            # NOTE(review): compress_barcode_seq is not defined anywhere in this
            # class (only compress_gem_group/compress_umi_seq are) -- this call
            # would raise AttributeError; confirm against the full file.
            compressed_bc = MoleculeCounter.compress_barcode_seq(barcode_seq)
            compressed_gg = MoleculeCounter.compress_gem_group(gem_group)
            yield compressed_bc, compressed_gg

    def get_raw_read_pairs_per_library(self):
        """ Get raw read pairs per library.
        Returns:
          list of int: Order is by library index
        """
        return [self.get_metric(LIBRARIES_METRIC)[str(li)][TOTAL_READS_METRIC] for li,_ in enumerate(self.library_info)]

    def get_usable_read_pairs_per_library(self):
        """ Get usable read pairs per library.
        Returns:
          list of int: Order is by library index
        """
        return [self.get_metric(LIBRARIES_METRIC)[str(li)][USABLE_READS_METRIC] for li,_ in enumerate(self.library_info)]

    @staticmethod
    def _sum_metric(mol_h5_list, metric_name, metric_type):
        """ Combine a library- or gemgroup- level integer metric across multiple h5 files """
        # metric_type must be one of the two per-entity metric dict constants.
        assert metric_type is LIBRARIES_METRIC or \
            metric_type is GEM_GROUPS_METRIC
        combined = defaultdict(int)
        for mol_h5 in mol_h5_list:
            with MoleculeCounter.open(mol_h5, mode='r') as counter:
                for key, metrics in counter.get_metric(metric_type).iteritems():
                    combined[key] += metrics[metric_name]
        return combined

    @staticmethod
    def sum_library_metric(mol_h5_list, metric_name):
        """Sum a per-library metric across multiple molecule info files."""
        return MoleculeCounter._sum_metric(mol_h5_list, metric_name, LIBRARIES_METRIC)

    @staticmethod
    def get_total_conf_mapped_reads_in_cells_chunk(filename, filtered_bcs_set, start, length, queue):
        """Sum reads belonging to filtered (barcode, gem_group) pairs and put the total on `queue`."""
        total_mapped_reads = 0
        # NOTE(review): open()'s signature is (filename, mode, feature_ref=None,
        # barcodes=None, ...), so `start` and `length` bind to feature_ref and
        # barcodes, which are ignored in 'r' mode -- this iterates the WHOLE
        # file rather than the intended [start, start+length) chunk. Confirm intent.
        with MoleculeCounter.open(filename, 'r', start, length) as mc:
            for barcode, gem_group, reads in itertools.izip(mc.get_column('barcode'),
                                                            mc.get_column('gem_group'),
                                                            mc.get_column('reads')):
                if reads < 1:
                    continue
                if (barcode, gem_group) not in filtered_bcs_set:
                    continue
                total_mapped_reads += reads
        queue.put(total_mapped_reads)

    @staticmethod
    def convert_v2_to_v3(v2_mole_info_h5, out_v3_mole_info_h5):
        """
        Given the input v2 molecule info h5 file, convert it into v3 file.
        """
        def get_v2_metrics(h5_file):
            # v2 files were written with PyTables; the metrics live in the
            # attribute set of the /metrics node.
            group = tables.open_file(h5_file, 'r').get_node('/metrics')
            attrset = group._v_attrs
            return {k: attrset[k] for k in attrset._f_list()}

        def decompress_barcode_seq(x, barcode_length, bits=64):
            """Decode a 2-bit-packed barcode integer back into a nucleotide string."""
            x = np.uint64(x)
            assert barcode_length <= (bits/2 - 1)
            # The top bit flags an all-N (unresolvable) barcode.
            if x & (1L << (bits-1)):
                return 'N' * barcode_length
            result = bytearray(barcode_length)
            for i in xrange(barcode_length):
                result[(barcode_length-1)-i] = tk_seq.NUCS[x & np.uint64(0b11)]
                x = x >> np.uint64(2)
            return str(result)

        def build_feature_ref(gene_ids, gene_names, genome_index):
            """Build a v3 FeatureReference from the v2 gene id/name arrays."""
            feature_defs = []
            if len(genome_index) == 1:
                genome = genome_index.keys()[0]
                for idx, (gene_id, gene_name) in enumerate(zip(gene_ids, gene_names)):
                    feature_defs.append(FeatureDef(index=idx,
                                                   id=gene_id,
                                                   name=gene_name,
                                                   feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                                                   tags={'genome': genome}))
            else:
                # Multi-genome: the genome is encoded as a prefix of the gene id.
                for idx, (gene_id, gene_name) in enumerate(zip(gene_ids, gene_names)):
                    genome = gene_id.split('_')[0]
                    feature_defs.append(FeatureDef(index=idx,
                                                   id=gene_id,
                                                   name=gene_name,
                                                   feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                                                   tags={'genome': genome}))

            return FeatureReference(feature_defs, ['genome'])

        def get_chunks_by_gem_group(gem_group_arr):
            """ Return exactly one chunk per gem group."""
            # verify gem groups are sorted
            assert np.all(np.diff(gem_group_arr)>=0)
            num_rows = gem_group_arr.shape[0]
            unique_ggs = np.unique(gem_group_arr)
            gg_key = lambda i: gem_group_arr[i]
            chunk_iter = MoleculeCounter.get_chunks_from_partition_static(num_rows, unique_ggs, gg_key)
            for (gg, chunk) in zip(unique_ggs, chunk_iter):
                yield (gg, chunk[0], chunk[1])

        # Fixed seeds so the conversion is reproducible.
        random.seed(0)
        np.random.seed(0)

        v2_mc_in = h5py.File(v2_mole_info_h5, 'r')
        v2_metrics = get_v2_metrics(v2_mole_info_h5)

        v2_genome_ids = v2_mc_in['genome_ids']
        v2_genome_name_to_index = {g:i for i, g in enumerate(v2_genome_ids)}

        # Feature Ref
        new_feature_ref = build_feature_ref(v2_mc_in['gene_ids'], v2_mc_in['gene_names'], v2_genome_name_to_index)

        # barcode whitelist
        barcode_length = v2_metrics[BC_LENGTH_METRIC]
        barcode_whitelist = cr_utils.load_barcode_whitelist(v2_metrics[BC_WHITELIST_METRIC])
        barcode_to_idx = OrderedDict((k, i) for i,k in enumerate(barcode_whitelist))
        gg_total_diversity = len(barcode_whitelist)

        v2_genomes = np.asarray(v2_mc_in['genome'], dtype=np.uint8)  # <-> genome information goes into feature_idx in v3
        v2_gene = np.asarray(v2_mc_in['gene'], dtype=MOLECULE_INFO_COLUMNS['feature_idx'])# <-> feature_idx in v3
        v2_conf_mapped_reads = np.asarray(v2_mc_in['reads'], dtype=MOLECULE_INFO_COLUMNS['count']) # <-> count in v3
        v2_barcodes = np.asarray(v2_mc_in['barcode'], dtype=np.uint64) # <-> transit into barcode_idx in v3
        v2_umis = np.asarray(v2_mc_in['umi'], dtype=MOLECULE_INFO_COLUMNS['umi']) # <-> umi in v3
        v2_gem_groups = np.asarray(v2_mc_in['gem_group'], dtype=MOLECULE_INFO_COLUMNS['gem_group']) # <-> gem_group in v3

        library_info = []
        barcode_info_genomes, barcode_info_pass_filter = [], []
        barcode_idx_list, feature_idx_list, library_idx_list = [], [], []
        gem_group_list, count_list, umi_list = [], [], []

        v2_metrics[LIBRARIES_METRIC] = {}
        # each gem_group is a library
        for lib_idx, (gem_group, chunk_start, chunk_len) in enumerate(get_chunks_by_gem_group(v2_gem_groups)):
            library_info.append({
                'gem_group': int(gem_group),
                'library_id': str(lib_idx),
                'library_type': lib_constants.GENE_EXPRESSION_LIBRARY_TYPE
            })

            # per library, raw_read_pairs and usable_read_pairs info
            v2_metrics[LIBRARIES_METRIC][str(lib_idx)] = {
                USABLE_READS_METRIC : v2_metrics[GEM_GROUPS_METRIC][gem_group]['conf_mapped_filtered_bc_reads'],
                TOTAL_READS_METRIC : v2_metrics[GEM_GROUPS_METRIC][gem_group]['total_reads']
            }

            recovered_cells = v2_metrics[GEM_GROUPS_METRIC][gem_group].get(GG_RECOVERED_CELLS_METRIC, None)
            force_cells = v2_metrics[GEM_GROUPS_METRIC][gem_group].get(GG_FORCE_CELLS_METRIC, None)

            chunk_end = chunk_start + chunk_len
            genomes_for_gem_group = v2_genomes[chunk_start:chunk_end]
            bcs_for_gem_group = v2_barcodes[chunk_start:chunk_end]
            reads_for_gem_group = v2_conf_mapped_reads[chunk_start:chunk_end]
            gene_for_gem_group = v2_gene[chunk_start:chunk_end]
            umis_for_gem_group = v2_umis[chunk_start:chunk_end]

            for genome_id in v2_genome_ids:
                g_idx = v2_genome_name_to_index[genome_id]
                genome_indices = genomes_for_gem_group == g_idx

                if genome_indices.sum() == 0:
                    # edge case - there's no data for this genome (e.g. empty sample, false barnyard sample, or nothing confidently mapped)
                    continue

                bcs_for_genome = bcs_for_gem_group[genome_indices]
                reads_for_genome = reads_for_gem_group[genome_indices]
                gene_for_genome = gene_for_gem_group[genome_indices]
                umis_for_genome = umis_for_gem_group[genome_indices]

                # only count UMIs with at least one conf mapped read
                umi_conf_mapped_to_genome = reads_for_genome > 0
                # A change in consecutive barcode values marks the start of a
                # new barcode run; the first row is always a run start.
                bc_breaks = bcs_for_genome[1:] - bcs_for_genome[:-1]
                bc_breaks = np.concatenate(([1], bc_breaks)) # first row is always a break
                bc_break_indices = np.nonzero(bc_breaks)[0]
                unique_bcs = bcs_for_genome[bc_break_indices]
                umis_per_bc = np.add.reduceat(umi_conf_mapped_to_genome, bc_break_indices)

                if force_cells is not None:
                    top_bc_indices, _, _ = cr_stats.filter_cellular_barcodes_fixed_cutoff(umis_per_bc, force_cells)
                else:
                    top_bc_indices, _, _ = cr_stats.filter_cellular_barcodes_ordmag(umis_per_bc, recovered_cells, gg_total_diversity)

                # barcode info
                barcode_seq_to_idx = {b:barcode_to_idx[decompress_barcode_seq(b, barcode_length)] for b in unique_bcs}

                barcode_info_genomes.append(genome_id)
                for b in unique_bcs[top_bc_indices]:
                    barcode_info_pass_filter.append((barcode_seq_to_idx[b], lib_idx, g_idx))

                # data
                barcode_idx_list.append(np.vectorize(barcode_seq_to_idx.get)(bcs_for_genome))
                count_list.append(reads_for_genome)
                gem_group_list.append(np.full(reads_for_genome.shape[0], gem_group, dtype=MOLECULE_INFO_COLUMNS['gem_group']))
                library_idx_list.append(np.full(reads_for_genome.shape[0], lib_idx, dtype=MOLECULE_INFO_COLUMNS['library_idx']))
                feature_idx_list.append(gene_for_genome)
                umi_list.append(umis_for_genome)

        new_barcode_info = BarcodeInfo(
            pass_filter=np.array(barcode_info_pass_filter, dtype=BARCODE_INFO_DTYPES['pass_filter']),
            genomes=barcode_info_genomes,
        )

        with MoleculeCounter.open(out_v3_mole_info_h5, 'w',
                                  feature_ref=new_feature_ref,
                                  barcodes=barcode_whitelist,
                                  library_info=library_info,
                                  barcode_info=new_barcode_info,
        ) as out_mc:
            out_mc.append_column('barcode_idx', np.concatenate(barcode_idx_list))
            out_mc.append_column('count', np.concatenate(count_list))
            out_mc.append_column('feature_idx', np.concatenate(feature_idx_list))
            out_mc.append_column('gem_group', np.concatenate(gem_group_list))
            out_mc.append_column('umi', np.concatenate(umi_list))
            # library_idx is the same as gem_group_list
            out_mc.append_column('library_idx', np.concatenate(library_idx_list))
            out_mc.set_all_metrics(v2_metrics)

        return
Ejemplo n.º 12
0
def join(args, outs, chunk_defs, chunk_outs):
    """Decide barcode compatibility across libraries within each GEM group.

    Aggregates the barcodes sampled by the chunk phase, compares each
    non-GEX library's barcode count distribution against the gene-expression
    library (with and without whitelist translation), and exits the pipeline
    if any pair's cosine similarity falls below the cutoff.

    Outs:
        barcode_compatible (bool): False if any library pair is incompatible.
        barcode_compatibility_info (dict): per-(gem_group, library) sampling
            metrics plus pairwise overlap/similarity results.
        skip_translate (dict): {gem_group: {library_type: bool}} — whether
            barcode translation should be skipped for that library.
    """
    outs.barcode_compatible = True
    outs.barcode_compatibility_info = {}  # record sampled barcode info
    outs.skip_translate = {}

    # Nothing sampled -> trivially compatible
    if chunk_outs is None or len(chunk_outs) == 0:
        return

    # Aggregate sampled barcodes from chunks: {gem_group: {library_type: [barcode, ...]}}
    sampled_barcodes = defaultdict(lambda: defaultdict(list))
    for chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        gem_group, lib = chunk_def.gem_group, chunk_def.library_type
        sampled_barcodes[gem_group][lib].extend(chunk_out.sampled_barcodes)

    barcodes_in_whitelist = cr_utils.load_barcode_whitelist(
        args.barcode_whitelist, as_set=True)
    barcode_translate_map = cr_utils.load_barcode_translate_map(
        args.barcode_whitelist)

    # Per-(gem_group, library) barcode -> count, restricted to whitelist barcodes
    sampled_bc_counter_in_wl = defaultdict(
        lambda: defaultdict(lambda: defaultdict(int)))

    for gem_group in sampled_barcodes:
        outs.barcode_compatibility_info[gem_group] = {}
        for lib in sampled_barcodes[gem_group]:
            sampled_bc = sampled_barcodes[gem_group][lib]
            unique_bc = set(sampled_bc)
            unique_bc_in_wl = unique_bc.intersection(barcodes_in_whitelist)

            outs.barcode_compatibility_info[gem_group][lib] = {}
            outs.barcode_compatibility_info[gem_group][lib][
                'num_barcodes_sampled'] = len(sampled_bc)
            outs.barcode_compatibility_info[gem_group][lib][
                'num_barcodes_sampled_unique'] = len(unique_bc)
            outs.barcode_compatibility_info[gem_group][lib][
                'num_barcodes_sampled_unique_in_whitelist'] = len(
                    unique_bc_in_wl)

            # Keep only whitelist barcodes in the count distribution
            sampled_bc_counter_in_wl[gem_group][lib] = {
                k: v
                for (k, v) in Counter(sampled_bc).iteritems()
                if k in unique_bc_in_wl
            }

    barcode_compatibility_cutoff = cr_constants.BARCODE_COMPATIBILITY_CUTOFF if args.barcode_compatibility_cutoff is None else args.barcode_compatibility_cutoff

    pairwise_compatibility = {}
    exit_log_msg = "Barcodes from libraries are not compatible."

    for gem_group in sampled_barcodes:
        outs.skip_translate[gem_group] = {}
        pairwise_compatibility[gem_group] = {}
        library_types = sampled_barcodes[gem_group].keys()

        # A single library type has nothing to be compared against
        if len(library_types) < 2:
            continue

        if GENE_EXPRESSION_LIBRARY_TYPE in library_types:
            base_lib = GENE_EXPRESSION_LIBRARY_TYPE
            library_types.remove(base_lib)
            outs.skip_translate[gem_group][base_lib] = True
        else:
            # TODO: as for CR3.0, we need GEX for cell calling et al
            # at some point, we might support samples without GEX
            martian.exit(
                "Gene expression data not found in the GEM group {}.".format(
                    gem_group))

        base_lib_counter = sampled_bc_counter_in_wl[gem_group][base_lib]
        for lib in library_types:
            pair_key = '{}/{}'.format(base_lib, lib)
            pairwise_compatibility[gem_group][pair_key] = {}
            lib_counter = sampled_bc_counter_in_wl[gem_group][lib]

            # Similarity without barcode translation
            overlap_size = len(
                set(base_lib_counter).intersection(set(lib_counter)))
            cosine_sim = robust_cosine_similarity(base_lib_counter,
                                                  lib_counter)
            outs.skip_translate[gem_group][lib] = True

            # Similarity with barcode translation; prefer translation only
            # when it improves the cosine similarity
            if (lib != GENE_EXPRESSION_LIBRARY_TYPE) and (barcode_translate_map
                                                          is not None):
                translated_counter = {
                    barcode_translate_map.get(k, k): v
                    for (k, v) in lib_counter.iteritems()
                }
                overlap_size_translated = len(
                    set(base_lib_counter).intersection(
                        set(translated_counter)))
                cosine_sim_translated = robust_cosine_similarity(
                    base_lib_counter, translated_counter)

                if cosine_sim_translated > cosine_sim:
                    outs.skip_translate[gem_group][lib] = False
                    overlap_size = overlap_size_translated
                    cosine_sim = cosine_sim_translated

            pairwise_compatibility[gem_group][pair_key][
                'overlap_size'] = overlap_size
            pairwise_compatibility[gem_group][pair_key][
                'cosine_similarity'] = cosine_sim
            if cosine_sim < barcode_compatibility_cutoff:
                outs.barcode_compatible = False
                exit_log_msg += '\n - GEM group {}: Barcodes from [{}] and [{}] have cosine similarity {:.4f}'.format(
                    gem_group, base_lib, lib, cosine_sim)

    outs.barcode_compatibility_info[
        'pairwise_compatibility'] = pairwise_compatibility
    # format warning/error message if incompatible
    if outs.barcode_compatible is False:
        martian.log_info(exit_log_msg)
        martian.exit(exit_log_msg)

    return
Ejemplo n.º 13
0
def main(args, outs):
    """Correct VDJ barcodes for one FASTQ chunk.

    Reads raw barcode/quality tags from the augmented read-1 headers,
    error-corrects barcodes against the whitelist count distribution,
    tallies corrected barcodes, and writes one (possibly empty) corrected
    barcode line per read pair to outs.corrected_bcs.
    """
    # Load barcode whitelist; keep None when no whitelist is configured.
    # BUGFIX: the original left `barcode_whitelist` unbound in the
    # no-whitelist case, raising NameError at the load_barcode_dist call.
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    # Set form of the whitelist for O(1) membership tests
    if barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    # read2 may be absent; use an empty iterable so izip_longest pads with None
    in_read2_fastq = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip_longest(tk_fasta.read_generator_fastq(in_read1_fastq), \
                                            tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        processed_bc = None

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                # Attempt error-correction against the whitelist distribution
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        # One line per read pair; empty if the barcode could not be corrected
        out_file.write('%s\n' %
                       (processed_bc if processed_bc is not None else ''))

    in_read1_fastq.close()
    if in_read2_fastq:
        in_read2_fastq.close()
    out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
Ejemplo n.º 14
0
def main(args, outs):
    """Correct VDJ barcodes for one read-pair chunk, writing tagged FASTQs.

    Error-corrects raw barcodes against the whitelist count distribution,
    tallies corrected barcodes, stamps the processed-barcode tag onto both
    read headers, and writes the reads to outs.corrected_read1s/read2s.
    """
    # Load barcode whitelist; keep None when no whitelist is configured.
    # BUGFIX: the original left `barcode_whitelist` unbound in the
    # no-whitelist case, raising NameError at the load_barcode_dist call.
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
    else:
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    # Set form of the whitelist for O(1) membership tests
    if barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = open(args.read1_chunk)
    in_read2_fastq = open(args.read2_chunk)
    out_read1_fastq = open(outs.corrected_read1s, 'w')
    out_read2_fastq = open(outs.corrected_read2s, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip(tk_fasta.read_generator_fastq(in_read1_fastq), \
                                    tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                # Attempt error-correction against the whitelist distribution
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence and tag both mates
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)
                read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)
                read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        tk_fasta.write_read_fastq(out_read1_fastq, read1_header.to_string(),
                                  read1[1], read1[2])
        tk_fasta.write_read_fastq(out_read2_fastq, read2_header.to_string(),
                                  read2[1], read2[2])

    in_read1_fastq.close()
    in_read2_fastq.close()
    out_read1_fastq.close()
    out_read2_fastq.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
Ejemplo n.º 15
0
def main(args, outs):
    """Build a molecule_info file from a barcode-sorted BAM chunk.

    Iterates BAM reads grouped by (gem_group, barcode), collapses them into
    molecules keyed by (umi, library, feature), and appends per-molecule
    columns to a MoleculeCounter, tracking usable-read counts per library.
    """
    outs.coerce_strings()

    # Load whitelist; map each barcode sequence to its whitelist index
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_to_idx = OrderedDict((k, i) for i, k in enumerate(whitelist))

    # Load feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Load library info from BAM
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Get cell-associated barcodes by genome
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(
        args.filtered_barcodes)

    # Create the barcode info
    barcode_info = MoleculeCounter.build_barcode_info(filtered_bcs_by_genome,
                                                      library_info, whitelist)

    # Create the molecule info file
    mc = MoleculeCounter.open(outs.output,
                              mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    # Initialize per-library metrics
    lib_metrics = {}
    for lib_idx in xrange(len(library_info)):
        lib_metrics[str(lib_idx)] = {}
        lib_metrics[str(lib_idx)][cr_mol_counter.USABLE_READS_METRIC] = 0

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.

    # Trackers used to assert the input BAM's (gem_group, barcode) sort order
    prev_gem_group = None
    prev_barcode_idx = None

    for (gem_group, barcode_seq), reads_iter in \
        itertools.groupby(in_bam, key=cr_utils.barcode_sort_key_no_umi):
        # Skip reads without a barcode
        if barcode_seq is None:
            continue

        barcode_idx = barcode_to_idx[barcode_seq]

        # Assert expected sort order of input BAM
        assert gem_group >= prev_gem_group
        if gem_group == prev_gem_group:
            assert barcode_idx >= prev_barcode_idx

        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode_seq, gem_group) in filtered_bc_union

        # (umi, library, feature) -> read count for this barcode
        counts = defaultdict(int)

        for read in reads_iter:
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or \
               read.is_read2 or \
               cr_utils.is_read_low_support_umi(read) or \
               not cr_utils.is_read_conf_mapped_to_feature(read):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            # Pack the UMI sequence into an integer sized to the 'umi' column
            umi_int = MoleculeCounter.compress_umi_seq(
                umi_seq,
                MoleculeCounter.get_column_dtype('umi').itemsize * 8)

            feature_ids = cr_utils.get_read_gene_ids(read)
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                lib_metrics[str(library_idx)][
                    cr_mol_counter.USABLE_READS_METRIC] += 1

            # NOTE(review): the sort-order trackers only advance for barcodes
            # with at least one usable read — TODO confirm intended
            prev_gem_group = gem_group
            prev_barcode_idx = barcode_idx

        # Record data for this barcode
        gg_int = MoleculeCounter.get_column_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, len(counts)))
        bc_int = MoleculeCounter.get_column_dtype('barcode_idx').type(
            barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, len(counts)))

        feature_ints = np.fromiter(
            (k[2] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('feature_idx'),
            count=len(counts))
        # Sort by feature for fast matrix construction
        order = np.argsort(feature_ints)
        feature_ints = feature_ints[order]
        mc.append_column('feature_idx', feature_ints)
        del feature_ints

        # All remaining columns reuse `order` so rows stay aligned with
        # the feature-sorted ordering above
        li_ints = np.fromiter(
            (k[1] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('library_idx'),
            count=len(counts))[order]
        mc.append_column('library_idx', li_ints)
        del li_ints

        umi_ints = np.fromiter((k[0] for k in counts.iterkeys()),
                               dtype=MoleculeCounter.get_column_dtype('umi'),
                               count=len(counts))[order]
        mc.append_column('umi', umi_ints)
        del umi_ints

        count_ints = np.fromiter(
            counts.itervalues(),
            dtype=MoleculeCounter.get_column_dtype('count'),
            count=len(counts))[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
Ejemplo n.º 16
0
def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def,
                 bc_read_def, si_read_def, umi_read_def]
    read_tags = [None, None,
                 (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
                 (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
                 (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
             ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(umi_length=cr_chem.get_umi_length(args.chemistry_def),
                                  primers=cr_utils.get_primers_from_dicts(args.primers),
                                  gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter)
    bc_check_rc.close()

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def, args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()


    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file, compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file, compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read, args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
Ejemplo n.º 17
0
def _get_barcode_whitelist_set(chemistry):
    """Return the chemistry's barcode whitelist as a set for fast membership tests."""
    whitelist = cr_utils.load_barcode_whitelist(get_barcode_whitelist(chemistry))
    return set(whitelist)
Ejemplo n.º 18
0
def main(args, outs):
    """Call VDJ cell barcodes per GEM group and write barcode summaries.

    For each GEM group: load the raw barcode count distribution, call cell
    barcodes from the UMI summary (optionally overridden by force_cells),
    and report filter metrics. Writes the cell-barcode JSON, a
    barcode-support CSV, a per-barcode UMI summary, and the metrics summary.
    """
    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = {}  # barcode -> read-pair support count

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    for gem_group in all_gem_groups:
        # No whitelist -> cannot call cells; outputs stay empty
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold',
            gem_group).set_value(threshold)

        if len(gg_bc_support) > 0:
            # force_cells overrides calling: take the top-N supported barcodes
            if args.force_cells is not None:
                sorted_bcs = map(
                    lambda kv: kv[0],
                    sorted(gg_bc_support.items(),
                           key=lambda kv: kv[1],
                           reverse=True))
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells
                                              )]

            cell_barcodes.update(set(gg_cell_bcs))
            bc_support.update(gg_bc_support)

        # Load the extract_reads summary to get the total raw reads
        # NOTE(review): this summary load and the reporter callback below run
        # once per gem group even though the summary inputs look
        # loop-invariant — TODO confirm intended
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')

        # Load the assembly metrics summary to get the total assemblable reads
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

        reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                        total_read_pairs,
                                        assemblable_read_pairs,
                                        recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
Ejemplo n.º 19
0
def main(args, outs):
    """Extract reads for one chunk: trim per the chemistry definition, tag
    FASTQ headers with barcode/UMI/sample-index sequence and quality, count
    barcodes against the whitelist, and write the processed reads plus an
    unaligned BAM of trimmed-off sequence.

    Args:
        args: Martian stage args. Reads chemistry_def, read_chunks,
            reads_interleaved, barcode_whitelist, primers, chunks, gem_group,
            read_group, reads_per_file, initial_reads, subsample_rate,
            rna_read_length, skip_metrics.
        outs: Martian stage outs. Writes reads, read2s, barcode_counts,
            trimmed_seqs, bam_comments, gem_groups, read_groups, summary
            artifacts and the chunked reporter.
    """
    # Deterministic subsampling across runs.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    # Tags parallel to read_defs: (seq_tag, qual_tag) pairs, or None for the
    # RNA reads whose sequence goes into the FASTQ record itself.
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained
    trim_defs = compute_trim_defs(
        read_defs, read_tags,
        args.chemistry_def.get('retain_trimmed_suffix_read'))

    outs.bam_comments = sorted(
        set([td.bam_to_fastq for td in trim_defs.itervalues()]))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Determine which read_iters need to retain trimmed sequence
    # (only one per read-type e.g., one per R1, one per R2, etc.)
    read_types_with_trim_def = set()
    rna_read_trim_defs = None
    rna_read2_trim_defs = None
    bc_read_trim_defs = None
    si_read_trim_defs = None
    umi_read_trim_defs = None

    if rna_read_def.read_type not in read_types_with_trim_def:
        rna_read_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read_def.read_type)
    if rna_read2_def.read_type not in read_types_with_trim_def:
        rna_read2_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read2_def.read_type)
    if bc_read_def.read_type not in read_types_with_trim_def:
        bc_read_trim_defs = trim_defs
        read_types_with_trim_def.add(bc_read_def.read_type)
    if si_read_def.read_type not in read_types_with_trim_def:
        si_read_trim_defs = trim_defs
        read_types_with_trim_def.add(si_read_def.read_type)
    if umi_read_def.read_type not in read_types_with_trim_def:
        umi_read_trim_defs = trim_defs
        read_types_with_trim_def.add(umi_read_def.read_type)

    # Setup read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, si_read_trim_defs)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, umi_read_trim_defs)
    else:
        # Empty reader; its in_iter yields nothing and close() is a no-op.
        umi_reads = FastqReader(None, None, False, None)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute trim order of the readers; this is to ensure stability in the ordering
    # in which trimmed sequence is added to the TRIMMED_SEQ tags
    trim_order = list(
        np.argsort([
            reader.read_def.read_type for reader in fastq_readers
            if reader.read_def is not None
        ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # izip_longest so a reader that runs dry yields None instead of truncating
    # the other streams.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Bam file to write auxiliary data to (that won't fit in a fastq hdr / QNAME)
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs,
                                          args.reads_per_file)

    # Placeholder (header, seq, qual) for absent reads.
    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        if args.rna_read_length is not None:
            rna_read = (rna_read[0], rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read-type (I1,I2,R1,R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              args.gem_group,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file
        # Note: We assume that there is only one trimmed sequence per read-pair
        trimmed_seq_data = pysam.AlignedSegment()
        # QNAME is the original header up to the first augmented-tag separator.
        trimmed_seq_data.query_name = fastq_header_str1.split(
            AugmentedFastqHeader.WORD_SEP)[0]
        # FLAG 4 = segment unmapped (SAM spec); these records are unaligned.
        trimmed_seq_data.flag = 4
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write(
                (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    # BUGFIX: rna_read2s is constructed unconditionally above, so close it
    # unconditionally too; previously it leaked open handles for single-end
    # runs. (close() on an empty/exhausted reader is safe — see umi_reads.)
    rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    trimmed_seq_writer.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)
Ejemplo n.º 20
0
def _get_barcode_whitelist_set(chemistry):
    """Return the chemistry's barcode whitelist loaded as a set
    (for O(1) membership tests)."""
    whitelist_name = get_barcode_whitelist(chemistry)
    return cr_utils.load_barcode_whitelist(whitelist_name, as_set=True)