Python Reporter Examples

Programming Language: Python

Namespace/Package Name: cellranger.report

Method/Function: Reporter

Examples at hotexamples.com: 12

Python Reporter - 12 examples found. These are the top-rated real-world Python examples of cellranger.report.Reporter extracted from open-source projects. You can rate examples to help us improve their quality.

Example #1

Show file

def main(args, outs):
    """Build a feature-barcode count matrix for one BAM chunk.

    Opens ``args.chunk_input``, walks it with ``cr_utils.iter_by_qname`` and
    hands every group of alignments to ``Reporter.count_genes_bam_cb``. Each
    group the reporter flags as confidently mapped and deduplicated is added
    to the count matrix.

    Args:
        args: Stage arguments. Reads ``chunk_input``, ``barcode_whitelist``,
            ``barcodes_detected``, ``reference_path``, ``feature_reference``,
            ``align``, ``gem_groups`` and ``chemistry_def``.
        outs: Stage outputs. The matrix is written to ``outs.matrices_h5``
            and the reporter to ``outs.chunked_reporter``.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    libraries = rna_library.get_bam_library_info(in_bam)
    distinct_library_types = sorted(
        set(lib['library_type'] for lib in libraries))
    # Must be a real list: it is passed to the reporter on every iteration of
    # the read loop below, and a lazy map() object (Python 3) would be
    # exhausted after the first pass.
    library_prefixes = [
        rna_library.get_library_type_metric_prefix(lib['library_type'])
        for lib in libraries
    ]

    chroms = in_bam.references

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # The detected-barcode table is only loaded when there is no whitelist.
    barcode_summary = cr_utils.load_barcode_tsv(
        args.barcodes_detected) if not barcode_whitelist else None

    # TODO: this is redundant
    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups,
                                  library_types=distinct_library_types)

    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Matrix columns come from the whitelist when there is one, otherwise
    # from the detected barcodes.
    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)
    else:
        barcode_seqs = barcode_summary

    matrix = cr_matrix.CountMatrix.empty(feature_ref,
                                         barcode_seqs,
                                         dtype='int32')

    # The chemistry does not change while iterating; look it up once instead
    # of once per read group.
    use_umis = cr_chem.has_umis(args.chemistry_def)

    for _qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
        is_conf_mapped_deduped, _genome, feature_id, bc = reporter.count_genes_bam_cb(
            reads_iter,
            libraries,
            library_prefixes,
            use_umis=use_umis)
        if is_conf_mapped_deduped:
            matrix.add(feature_id, bc)

    in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)

Example #2

Show file

File: common.py Project: mosquitoCat/cellranger

def get_constants_for_pipeline(pipeline):
    """Return ``(metrics, alarms, charts, metric_prefixes)`` for a pipeline.

    ``shared_constants.PIPELINE_VDJ`` selects ``ws_vdj_constants`` and the
    prefixes of a default ``VdjReporter``; any other value selects
    ``ws_gex_constants`` and the prefixes of a default ``Reporter``.
    """
    if pipeline == shared_constants.PIPELINE_VDJ:
        constants = ws_vdj_constants
        reporter_cls = vdj_report.VdjReporter
    else:
        constants = ws_gex_constants
        reporter_cls = cr_report.Reporter

    metrics = constants.METRICS
    alarms = constants.METRIC_ALARMS
    charts = constants.CHARTS
    metric_prefixes = reporter_cls().get_all_prefixes()

    return metrics, alarms, charts, metric_prefixes

Example #3

Show file

def main(args, outs):
    """Run duplicate marking over one chunk of the input BAM.

    Reads are grouped with ``itertools.groupby`` on
    ``cr_utils.barcode_sort_key`` (consecutive reads with an equal key form
    one group). Groups lacking a barcode, a gem group or exactly one gene are
    passed straight through to the output BAM; every other group goes through
    three ``mark_dupes`` passes. The reporter is saved to
    ``outs.chunked_reporter``.

    Args:
        args: Stage arguments. Reads ``input``, ``chunk_start``,
            ``chunk_end``, ``reference_path`` and ``align``; ``args`` itself
            is also forwarded to ``mark_dupes``.
        outs: Stage outputs. Uses ``output`` and ``chunked_reporter``.
    """
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.input)
    chunk_bounds = (args.chunk_start, args.chunk_end)
    in_bam_chunk = tk_bam.read_bam_chunk(in_bam, chunk_bounds)
    out_bam, _ = tk_bam.create_bam_outfile(
        outs.output, None, None, template=in_bam)

    reporter = cr_report.Reporter(
        reference_path=args.reference_path,
        high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
        chroms=in_bam.references)

    grouped_reads = itertools.groupby(in_bam_chunk,
                                      key=cr_utils.barcode_sort_key)
    for group_key, group in grouped_reads:
        gem_group, barcode, gene_ids = group_key

        # Only groups with a barcode, a gem group and exactly one gene are
        # deduplicated.
        dedupable = (barcode is not None and gem_group is not None
                     and gene_ids is not None and len(gene_ids) == 1)
        if not dedupable:
            # Everything else is still reported and copied to the output.
            for read in group:
                reporter.mark_dupes_corrected_cb(read)
                out_bam.write(read)
            continue

        # The group is traversed several times below, so materialise it.
        reads = list(group)
        gene_id = gene_ids[0]

        # Pass 1: cDNA PCR duplicates, UMIs as sequenced (uncorrected).
        dupe_key_umi_counts = mark_dupes(
            barcode, gene_id, reads, args,
            cr_constants.CDNA_PCR_UNCORRECTED_DUPE_TYPE,
            cr_utils.cdna_pcr_dupe_func, reporter)

        # Derive the UMI corrections from the pass-1 counts.
        umi_corrections = correct_umis(dupe_key_umi_counts)

        # Pass 2: cDNA PCR duplicates with corrected UMIs; this is the only
        # pass that is given the output BAM.
        mark_dupes(barcode,
                   gene_id,
                   reads,
                   args,
                   cr_constants.CDNA_PCR_DUPE_TYPE,
                   cr_utils.cdna_pcr_dupe_func,
                   reporter,
                   corrected_dupe_keys=umi_corrections,
                   out_bam=out_bam)

        # Pass 3: SI PCR duplicates, uncorrected UMIs.
        mark_dupes(barcode, gene_id, reads, args,
                   cr_constants.SI_PCR_DUPE_TYPE,
                   cr_utils.si_pcr_dupe_func, reporter)

    in_bam.close()
    out_bam.close()
    reporter.save(outs.chunked_reporter)

Example #4

Show file

def main(args, outs):
    """Subsample one molecule-info chunk and save the resulting matrices.

    Does nothing (beyond seeding NumPy's RNG) when
    ``args.subsample_info`` has no ``'subsample_rate'``. Otherwise it builds
    subsampled raw matrices from the molecule counter, filters them to the
    barcodes in ``args.filtered_barcodes``, records the subsampled
    duplication metric on a reporter, and writes the reporter and both
    matrix sets.

    Args:
        args: Stage arguments. Reads ``subsample_info``, ``molecule_info``,
            ``chunk_start``, ``chunk_len`` and ``filtered_barcodes``.
        outs: Stage outputs. Sets ``subsampled_matrices`` (a dict with
            ``'raw_matrices'`` and ``'filtered_matrices'`` paths) and writes
            ``chunked_reporter``.
    """
    np.random.seed(0)

    info = args.subsample_info
    subsample_rate = info.get('subsample_rate')
    if subsample_rate is None:
        return

    chunk_start = int(args.chunk_start)
    chunk_len = int(args.chunk_len)
    mol_counter = MoleculeCounter.open(args.molecule_info,
                                       'r',
                                       start=chunk_start,
                                       length=chunk_len)

    # Raw matrices; build_from_mol_counter is handed this dict and
    # 'mapped_reads' is read back from it further down.
    subsample_result = {}
    subsampled_raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
        mol_counter,
        subsample_rate=subsample_rate,
        subsample_result=subsample_result)

    # Filtered matrices: restrict to the barcodes listed in the CSV.
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    subsampled_filt_mats = subsampled_raw_mats.filter_barcodes(
        filtered_bcs_per_genome)

    # Reporter used for the subsampled duplication rate.
    genome_names = map(str, mol_counter.get_ref_column('genome_ids'))
    reporter = cr_report.Reporter(
        genomes=genome_names,
        subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
        subsample_depths=info['all_target_rpc'])

    reporter.subsampled_duplication_frac_cb(
        subsampled_raw_mats,
        mol_counter,
        info['subsample_rate'],
        info['subsample_type'],
        info['target_rpc'],
        subsample_result['mapped_reads'],
    )

    mol_counter.close()

    reporter.save(outs.chunked_reporter)

    raw_h5 = martian.make_path('raw_matrices.h5')
    filtered_h5 = martian.make_path('filtered_matrices.h5')
    outs.subsampled_matrices = {}
    outs.subsampled_matrices['raw_matrices'] = raw_h5
    outs.subsampled_matrices['filtered_matrices'] = filtered_h5

    subsampled_raw_mats.save_h5(outs.subsampled_matrices['raw_matrices'])
    subsampled_filt_mats.save_h5(outs.subsampled_matrices['filtered_matrices'])

Example #5

Show file

def get_constants_for_pipeline(pipeline, sample_properties):
    """Return ``(metrics, alarms, charts, metric_prefixes)`` for a pipeline.

    For ``shared_constants.PIPELINE_VDJ`` the VDJ constants are used, and
    both the reporter prefixes and the alarms are filtered with
    ``sample_properties``. Any other pipeline gets the unfiltered
    ``ws_gex_constants`` values and the prefixes of a default ``Reporter``.
    """
    if pipeline == shared_constants.PIPELINE_VDJ:
        metrics = ws_vdj_constants.METRICS
        unfiltered_alarms = ws_vdj_constants.METRIC_ALARMS
        charts = ws_vdj_constants.CHARTS

        metric_prefixes = filter_vdj_prefixes(
            vdj_report.VdjReporter().get_all_prefixes(), sample_properties)
        alarms = filter_vdj_alarms(unfiltered_alarms, sample_properties)

        return metrics, alarms, charts, metric_prefixes

    metrics = ws_gex_constants.METRICS
    alarms = ws_gex_constants.METRIC_ALARMS
    charts = ws_gex_constants.CHARTS
    metric_prefixes = cr_report.Reporter().get_all_prefixes()

    return metrics, alarms, charts, metric_prefixes

Example #6

Show file

def main(args, outs):
    """Process one chunk of alignments under a ``Reporter``.

    Loads the STAR index, gene index, barcode whitelist and barcode
    distribution, builds a reporter from them, and runs
    ``process_alignments`` between the reporter's ``attach_bcs_init`` and
    ``attach_bcs_finalize`` calls.

    Args:
        args: Stage arguments. Reads ``reference_path``,
            ``barcode_whitelist``, ``barcode_counts``, ``gem_group``,
            ``gem_groups``, ``chemistry_def``, ``umi_min_qual_threshold``,
            ``chunk_genome_input``, ``chunk_trimmed_input`` and
            ``bam_comments``; ``args`` itself is also forwarded to
            ``process_alignments``.
        outs: Stage outputs. Sets ``num_alignments`` to the return value of
            ``process_alignments`` and writes ``chunked_reporter``; ``output``
            is passed to ``process_alignments``.
    """
    ref_path = args.reference_path

    star_index = cr_transcriptome.build_star_index(
        cr_utils.get_reference_star_path(ref_path))
    chroms = star_index[0][0]

    genes_index_path = cr_utils.get_reference_genes_index(ref_path)
    gene_index = cr_reference.GeneIndex.load_pickle(genes_index_path)

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_dist = cr_utils.load_barcode_dist(
        args.barcode_counts, barcode_whitelist, args.gem_group)

    reporter = cr_report.Reporter(
        reference_path=ref_path,
        high_conf_mapq=cr_constants.STAR_DEFAULT_HIGH_CONF_MAPQ,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=barcode_whitelist,
        barcode_dist=barcode_dist,
        gem_groups=args.gem_groups,
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        umi_min_qual_threshold=args.umi_min_qual_threshold)

    reporter.attach_bcs_init()
    outs.num_alignments = process_alignments(
        args.chunk_genome_input,
        args.chunk_trimmed_input,
        outs.output,
        args.bam_comments,
        reporter,
        gene_index,
        star_index,
        args)
    reporter.attach_bcs_finalize()

    reporter.save(outs.chunked_reporter)

Example #7

Show file

def main(args, outs):
    """Build per-genome gene-barcode matrices for one BAM chunk.

    Every record of ``args.chunk_input`` is passed to
    ``Reporter.count_genes_bam_cb``; records the reporter flags as
    confidently mapped and deduplicated are added to the matrices.

    Args:
        args: Stage arguments. Reads ``chunk_input``, ``barcode_whitelist``,
            ``barcode_summary``, ``reference_path``, ``align``,
            ``gem_groups`` and ``chemistry_def``.
        outs: Stage outputs. The matrices are written to ``outs.matrices_h5``
            and the reporter to ``outs.chunked_reporter``.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    chroms = in_bam.references

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # The barcode summary is only loaded when there is no whitelist.
    barcode_summary = cr_utils.load_barcode_summary(
        args.barcode_summary) if not barcode_whitelist else None

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    reporter = cr_report.Reporter(reference_path=args.reference_path,
                                  high_conf_mapq=cr_utils.get_high_conf_mapq(
                                      args.align),
                                  gene_index=gene_index,
                                  chroms=chroms,
                                  barcode_whitelist=barcode_whitelist,
                                  barcode_summary=barcode_summary,
                                  gem_groups=args.gem_groups)

    # Matrix barcodes come from the whitelist when there is one, otherwise
    # from the barcode summary.
    if barcode_whitelist:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)
    else:
        barcode_seqs = barcode_summary

    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(), genomes)
    matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

    # The chemistry is fixed for the whole chunk; look it up once rather
    # than once per BAM record.
    use_umis = cr_chem.has_umis(args.chemistry_def)

    for read in in_bam:
        is_conf_mapped_deduped, genome, gene_id, bc = reporter.count_genes_bam_cb(
            read, use_umis=use_umis)
        if is_conf_mapped_deduped:
            matrices.add(genome, gene_id, bc)

    in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)

Example #8

Show file

File: __init__.py Project: yu1033704806/cellranger

def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the per-chunk molecule-info files and attach metrics.

    Merges the upstream summary JSONs, adds reference metadata produced by a
    throwaway ``Reporter``, derives the molecule-counter metrics from that
    summary plus the library info read from the first input BAM, folds in the
    per-library usable-reads metric summed over the chunk outputs, and
    finally concatenates the chunk HDF5 files into ``outs.output``.

    Args:
        args: Stage arguments. Reads ``extract_reads_summary``,
            ``attach_bcs_and_umis_summary``, ``mark_duplicates_summary``,
            ``reference_path``, ``inputs``, ``recovered_cells`` and
            ``force_cells``.
        outs: Stage outputs. ``outs.output`` is the concatenation target.
        chunk_defs: Chunk definitions (unused here).
        chunk_outs: Per-chunk outputs; each ``output`` is an input file for
            the concatenation.
    """
    summary = cr_utils.merge_jsons_as_dict([
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ])

    # Hack for getting reference metadata -
    # this used to be computed in prior stages.
    # This is needed for storage in the molecule_info HDF5.
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_reference_metadata(args.reference_path,
                                          cr_constants.REFERENCE_TYPE,
                                          cr_constants.REFERENCE_METRIC_PREFIX)
    ref_metadata = tmp_reporter.report(cr_constants.DEFAULT_REPORT_TYPE)
    summary.update(ref_metadata)

    # Load library info from BAM. The handle is only needed for this lookup,
    # so release it as soon as the info has been read (it was previously
    # left open for the rest of the stage).
    in_bam = tk_bam.create_bam_infile(args.inputs[0])
    try:
        library_info = rna_library.get_bam_library_info(in_bam)
    finally:
        in_bam.close()

    metrics = MoleculeCounter.get_metrics_from_summary(summary, library_info,
                                                       args.recovered_cells,
                                                       args.force_cells)

    input_h5_filenames = [chunk_out.output for chunk_out in chunk_outs]
    # update with metrics that were computed in the chunks
    chunk_metric = cr_mol_counter.USABLE_READS_METRIC
    summed_lib_metrics = MoleculeCounter.sum_library_metric(
        input_h5_filenames, chunk_metric)
    for lib_key, value in summed_lib_metrics.iteritems():
        metrics[cr_mol_counter.LIBRARIES_METRIC][lib_key][chunk_metric] = value

    MoleculeCounter.concatenate(outs.output,
                                input_h5_filenames,
                                metrics=metrics)

Example #9

Show file

def main(args, outs):
    """Extract reads, barcodes, sample indices and UMIs from one FASTQ chunk.

    For each input record (at most ``args.initial_reads`` of them, randomly
    downsampled by ``args.subsample_rate``) this stage:

    * feeds the raw reads to ``Reporter.raw_fastq_cb``,
    * counts the barcode read with a ``BarcodeCounter``,
    * writes the RNA read (and read 2 when the chemistry is paired-end) to
      chunked FASTQ files whose headers carry the sample-index, raw-barcode
      and raw-UMI sequences and qualities as tags,
    * writes the accumulated trimmed sequence to a chunked, unaligned BAM.

    Args:
        args: Stage arguments. Reads ``chemistry_def``, ``chunks``,
            ``primers``, ``read_chunks``, ``reads_interleaved``,
            ``barcode_whitelist``, ``reads_per_file``, ``initial_reads``,
            ``subsample_rate``, ``rna_read_length``, ``gem_group``,
            ``read_group`` and ``skip_metrics``.
        outs: Stage outputs. Sets ``bam_comments``, ``reads``, ``read2s``,
            ``gem_groups``, ``read_groups`` and ``trimmed_seqs``; writes
            ``barcode_counts`` (via the barcode counter) and
            ``chunked_reporter``.
    """
    # Fixed seed so the random downsampling below is reproducible.
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    # read_defs and read_tags are parallel lists: entry i of read_tags is the
    # (sequence tag, quality tag) pair for read_defs[i], or None for the two
    # RNA reads.
    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained
    trim_defs = compute_trim_defs(
        read_defs, read_tags,
        args.chemistry_def.get('retain_trimmed_suffix_read'))

    # De-duplicated, sorted bam_to_fastq values of all trim definitions.
    outs.bam_comments = sorted(
        set([td.bam_to_fastq for td in trim_defs.itervalues()]))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    # A dedicated reader is opened for this check and closed right after.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def,
                              args.reads_interleaved, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                  bc_check_rc.in_iter)
    bc_check_rc.close()

    # Determine which read_iters need to retain trimmed sequence
    # (only one per read-type e.g., one per R1, one per R2, etc.)
    # The first reader seen for a given read_type gets trim_defs; later
    # readers of the same read_type keep None.
    read_types_with_trim_def = set()
    rna_read_trim_defs = None
    rna_read2_trim_defs = None
    bc_read_trim_defs = None
    si_read_trim_defs = None
    umi_read_trim_defs = None

    if rna_read_def.read_type not in read_types_with_trim_def:
        rna_read_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read_def.read_type)
    if rna_read2_def.read_type not in read_types_with_trim_def:
        rna_read2_trim_defs = trim_defs
        read_types_with_trim_def.add(rna_read2_def.read_type)
    if bc_read_def.read_type not in read_types_with_trim_def:
        bc_read_trim_defs = trim_defs
        read_types_with_trim_def.add(bc_read_def.read_type)
    if si_read_def.read_type not in read_types_with_trim_def:
        si_read_trim_defs = trim_defs
        read_types_with_trim_def.add(si_read_def.read_type)
    if umi_read_def.read_type not in read_types_with_trim_def:
        umi_read_trim_defs = trim_defs
        read_types_with_trim_def.add(umi_read_def.read_type)

    # Setup read iterators.
    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, rna_read_trim_defs)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, rna_read2_trim_defs)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, bc_read_trim_defs)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, si_read_trim_defs)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, umi_read_trim_defs)
    else:
        # Placeholder reader with no inputs and no read definition.
        umi_reads = FastqReader(None, None, False, None)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    # Compute trim order of the readers; this is to ensure stability in the ordering
    # in which trimmed sequence is added to the TRIMMED_SEQ tags
    # NOTE(review): argsort runs over the list *after* readers without a
    # read_def are dropped, yet the indices are later used on the unfiltered
    # 5-tuple `extractions`. The two only line up if readers without a
    # read_def sit at the end of fastq_readers (umi_reads is last) - confirm.
    trim_order = list(
        np.argsort([
            reader.read_def.read_type for reader in fastq_readers
            if reader.read_def is not None
        ]))

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file)
    # read2_writer only exists for paired-end chemistries; every later use is
    # guarded by the same `paired_end` check.
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    # izip_longest pads exhausted iterators with None, which is why every
    # extraction is checked against None inside the loop.
    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    # Bam file to write auxiliary data to (that won't fit in a fastq hdr / QNAME)
    trimmed_seq_writer = ChunkedBamWriter(outs.trimmed_seqs,
                                          args.reads_per_file)

    # Stand-in used when a reader produced nothing for this record; the
    # tuple is indexed below as [0] header, [1] sequence, [2] quality.
    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    # Only the first args.initial_reads records are considered.
    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction.read if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction.read if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction.read if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction.read if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction.read if umi_extraction is not None else EMPTY_READ

        # Extra trimming for internal purposes
        # (truncates both the sequence and the quality of the RNA read)
        if args.rna_read_length is not None:
            rna_read = (rna_read[0], rna_read[1][0:args.rna_read_length],
                        rna_read[2][0:args.rna_read_length])

        # Accumulate trimmed sequence; ordering is by read-type (I1,I2,R1,R2)
        # to ensure stability
        trimmed_seq = ''
        trimmed_qual = ''
        for i in trim_order:
            if extractions[i] is None:
                continue
            trimmed_seq += extractions[i].trimmed_seq
            trimmed_qual += extractions[i].trimmed_qual

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            # (the quality string is reversed to stay aligned with the bases)
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              args.gem_group,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        # (sample index, raw barcode and raw UMI, each with its quality)
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        # Write trimmed sequence data to a separate, unaligned BAM file
        # Note: We assume that there is only one trimmed sequence per read-pair
        trimmed_seq_data = pysam.AlignedSegment()
        # The query name is the first word of the augmented header.
        trimmed_seq_data.query_name = fastq_header_str1.split(
            AugmentedFastqHeader.WORD_SEP)[0]
        # SAM flag 0x4: segment unmapped.
        trimmed_seq_data.flag = 4
        trimmed_seq_data.seq = trimmed_seq
        trimmed_seq_data.qual = trimmed_qual
        trimmed_seq_writer.write(trimmed_seq_data)

        if paired_end:
            # Read 2 gets the same tag values as read 1.
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write(
                (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    # NOTE(review): rna_read2s is opened unconditionally above but only
    # closed for paired-end chemistries - confirm this is intended.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    trimmed_seq_writer.close()

    # Set stage output parameters.
    # gem_groups and read_groups get one entry per read-1 output file.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
        outs.trimmed_seqs = trimmed_seq_writer.get_out_paths()
    else:
        # Nothing was written: every list output is empty.
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []
        outs.trimmed_seqs = []

    # Sanity checks: the per-file output lists must be parallel.
    assert len(outs.gem_groups) == len(outs.reads)
    if paired_end:
        assert len(outs.reads) == len(outs.read2s)
    assert len(outs.trimmed_seqs) == len(outs.reads)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)

Example #10

Show file

def join(args, outs, chunk_defs, chunk_outs):
    """Combine the chunk outputs into the stage outputs and a summary JSON.

    Concatenates the per-chunk list outputs, checks that every chunk agrees
    on ``bam_comments``, merges barcode counts by library type, sums the
    per-chunk feature counts, merges the chunk reporters per library type
    (prefixing their JSON keys with the library-type metric prefix) and
    writes the combined summary to ``outs.summary``.

    Args:
        args: Stage arguments. Reads ``barcode_whitelist``, ``align`` and
            ``chemistry_def``.
        outs: Stage outputs, populated in place.
        chunk_defs: Chunk definitions; ``library_type`` is read from each.
        chunk_outs: Per-chunk outputs, parallel to ``chunk_defs``.
    """
    # List-valued outputs that are simply concatenated across chunks, in
    # chunk order.
    list_fields = ('reads', 'read2s', 'tags', 'gem_groups', 'library_types',
                   'library_ids', 'read_groups')
    for field in list_fields:
        setattr(outs, field, [])

    for chunk_out in chunk_outs:
        for field in list_fields:
            merged = getattr(outs, field)
            merged += list(getattr(chunk_out, field))
            setattr(outs, field, merged)

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit(
            "No reads found. Check the input fastqs and/or the chemistry definition"
        )

    # All chunks must carry the same BAM comments; adopt the first chunk's.
    expected_comments = chunk_outs[0].bam_comments
    assert all(co.bam_comments == expected_comments for co in chunk_outs)
    outs.bam_comments = expected_comments

    # Barcode counts, merged by library type
    chunk_barcode_counts = [co.barcode_counts for co in chunk_outs]
    chunk_library_types = [cd.library_type for cd in chunk_defs]
    bc_counters = BarcodeCounter.merge_by(chunk_barcode_counts,
                                          chunk_library_types,
                                          args.barcode_whitelist,
                                          outs.gem_groups)
    with open(outs.barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    # Feature counts: element-wise sum over the chunks
    feature_counts = None
    for _, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as f:
            chunk_counts = np.asarray(json.load(f), dtype=int)
        if feature_counts is None:
            feature_counts = chunk_counts
        else:
            feature_counts += chunk_counts

    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Bucket the chunk reporters by library type; chunks that produced no
    # reads are left out.
    outs.chunked_reporter = None
    reporter_groups = defaultdict(list)
    for _, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        chunk_lib_types = set(chunk_out.library_types)
        # A chunk is expected to contain exactly one library type.
        assert len(chunk_lib_types) == 1
        lib_type = next(iter(chunk_lib_types))
        reporter_groups[lib_type].append(chunk_out.chunked_reporter)

    # One merged reporter per library type, keys prefixed by library type
    summary = {}
    for lib_type, reporters in reporter_groups.iteritems():
        merged_json = cr_report.merge_reporters(reporters).to_json()
        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update(
            {prefix + key: value for key, value in merged_json.iteritems()})

    # Chemistry metadata comes from a throwaway reporter and is not prefixed
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(tmp_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as f:
        tk_safe_json.dump_numpy(summary, f, pretty=True)

Example #11

Show file

def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Build the feature reference
    if args.reference_path:
        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)
    else:
        feature_ref = rna_feature_ref.FeatureReference.empty()

    # Setup feature barcode extraction
    feature_extractor = rna_feature_ref.FeatureExtractor(
        feature_ref, use_feature_types=[args.library_type])

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [
        rna_read_def, rna_read2_def, bc_read_def, si_read_def, umi_read_def
    ]
    read_tags = [
        None,
        None,
        (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
        (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
        (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
    ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    num_libraries = len(args.library_info)
    reporter = cr_report.Reporter(
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        primers=cr_utils.get_primers_from_dicts(args.primers),
        num_libraries=num_libraries)

    # Determine if barcode sequences need to be reverse complemented.
    with FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved,
                     None, None) as bc_check_rc:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist, True)
        barcode_rc = infer_barcode_reverse_complement(barcode_whitelist,
                                                      bc_check_rc.in_iter)

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def,
                            args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter,
                                 cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def,
                                args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(
                r2_reader.in_iter,
                cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()

    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def,
                            args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def,
                             args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def,
                           args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def,
                           args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def,
                                args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    # Record feature counts:
    feature_counts = np.zeros(feature_ref.get_num_features(), dtype=int)

    # If this library type has no feature barcodes, make the reader a NOOP
    if feature_extractor.has_features_to_extract():
        feature_reads = FastqFeatureReader(args.read_chunks, feature_extractor,
                                           args.reads_interleaved, r1_length,
                                           r2_length)
    else:
        feature_reads = FastqReader(None, None, None, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads,
                     feature_reads)

    read1_writer = ChunkedFastqWriter(outs.reads,
                                      args.reads_per_file,
                                      compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s,
                                          args.reads_per_file,
                                          compression=COMPRESSION)

    tag_writer = None
    if not args.augment_fastq:
        tag_writer = ChunkedFastqWriter(outs.tags,
                                        args.reads_per_file,
                                        compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(
        *[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter,
                                        args.chunk_initial_reads):
        # Downsample
        if random.random() > args.chunk_subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction, feature_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]),
                           bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        lib_idx = [
            i for i, x in enumerate(args.library_info)
            if x['library_id'] == args.library_id
        ][0]
        reporter.raw_fastq_cb(rna_read,
                              rna_read2,
                              bc_read,
                              si_read,
                              umi_read,
                              lib_idx,
                              skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        feat_raw_bc = None
        feat_proc_bc = None
        feat_qual = None
        feat_ids = None

        if feature_extraction:
            if feature_extraction.barcode:
                feat_raw_bc = feature_extraction.barcode
                feat_qual = feature_extraction.qual

            if len(feature_extraction.ids) > 0:
                feat_proc_bc = feature_extraction.barcode
                feat_ids = ';'.join(feature_extraction.ids)

                # If hit a single feature ID, count its frequency
                if len(feature_extraction.ids) == 1:
                    feature_counts[feature_extraction.indices[0]] += 1

        if feat_raw_bc:
            fastq_header1.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                  feat_raw_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                  feat_qual)
        if feat_ids:
            fastq_header1.set_tag(cr_constants.PROCESSED_FEATURE_BARCODE_TAG,
                                  feat_proc_bc)
            fastq_header1.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

        if args.augment_fastq:
            read1_writer.write(
                (fastq_header1.to_string(), rna_read[1], rna_read[2]))
        else:
            read1_writer.write((rna_read[0], rna_read[1], rna_read[2]))
            tag_writer.write((fastq_header1.to_string(), '', ''))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG,
                                  si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG,
                                  bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            if feat_raw_bc:
                fastq_header2.set_tag(cr_constants.RAW_FEATURE_BARCODE_TAG,
                                      feat_raw_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_BARCODE_QUAL_TAG,
                                      feat_qual)
            if feat_ids:
                fastq_header2.set_tag(
                    cr_constants.PROCESSED_FEATURE_BARCODE_TAG, feat_proc_bc)
                fastq_header2.set_tag(cr_constants.FEATURE_IDS_TAG, feat_ids)

            if args.augment_fastq:
                read2_writer.write(
                    (fastq_header2.to_string(), rna_read2[1], rna_read2[2]))
            else:
                read2_writer.write((rna_read2[0], rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    if not args.augment_fastq:
        tag_writer.close()
    bc_counter.close()

    # Write feature BC read counts
    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()

        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []

        if args.augment_fastq:
            outs.tags = []
        else:
            outs.tags = tag_writer.get_out_paths(len(outs.tags))

        libraries = args.library_info
        library = [
            li for li in libraries if li['library_id'] == args.library_id
        ][0]

        outs.gem_groups = [library['gem_group']] * len(outs.reads)
        outs.library_types = [library['library_type']] * len(outs.reads)
        outs.library_ids = [library['library_id']] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.tags = []
        outs.gem_groups = []
        outs.library_types = []
        outs.library_ids = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)
    assert args.augment_fastq or len(outs.tags) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)

Example #12

Show file

File: __init__.py Project: GWW/cellranger_211_mirror

def main(args, outs):
    random.seed(0)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Use the chemistry to get the locations of various sequences
    rna_read_def = cr_chem.get_rna_read_def(args.chemistry_def)
    rna_read2_def = cr_chem.get_rna_read2_def(args.chemistry_def)
    bc_read_def = cr_chem.get_barcode_read_def(args.chemistry_def)
    si_read_def = cr_chem.get_si_read_def(args.chemistry_def)
    umi_read_def = cr_chem.get_umi_read_def(args.chemistry_def)

    read_defs = [rna_read_def, rna_read2_def,
                 bc_read_def, si_read_def, umi_read_def]
    read_tags = [None, None,
                 (cr_constants.RAW_BARCODE_TAG, cr_constants.RAW_BARCODE_QUAL_TAG),
                 (tk_constants.SAMPLE_INDEX_TAG, tk_constants.SAMPLE_INDEX_QUAL_TAG),
                 (cr_constants.RAW_UMI_TAG, cr_constants.UMI_QUAL_TAG),
             ]

    # Determine which trimmed sequences need to be retained for bamtofastq
    trim_defs = get_bamtofastq_defs(read_defs, read_tags)
    outs.bam_comments = sorted(set(trim_defs.itervalues()))

    gem_groups = [chunk['gem_group'] for chunk in args.chunks]
    reporter = cr_report.Reporter(umi_length=cr_chem.get_umi_length(args.chemistry_def),
                                  primers=cr_utils.get_primers_from_dicts(args.primers),
                                  gem_groups=gem_groups)

    # Determine if barcode sequences need to be reverse complemented.
    bc_check_rc = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, None, None)
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_rc = infer_barcode_reverse_complement(barcode_whitelist, bc_check_rc.in_iter)
    bc_check_rc.close()

    # Log the untrimmed read lengths to stdout
    r1_read_def = cr_constants.ReadDef(rna_read_def.read_type, 0, None)
    r1_reader = FastqReader(args.read_chunks, r1_read_def, args.reads_interleaved, None, None)

    r1_untrimmed_len = 0
    for read in itertools.islice(r1_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
        r1_untrimmed_len = max(r1_untrimmed_len, len(read[1]))
    print "Read 1 untrimmed length = ", r1_untrimmed_len
    print "Input arg r1_length = ", args.r1_length
    r1_reader.close()

    if paired_end:
        r2_read_def = cr_constants.ReadDef(rna_read2_def.read_type, 0, None)
        r2_reader = FastqReader(args.read_chunks, r2_read_def, args.reads_interleaved, None, None)

        r2_untrimmed_len = 0
        for read in itertools.islice(r2_reader.in_iter, cr_constants.DETECT_CHEMISTRY_INITIAL_READS):
            r2_untrimmed_len = max(r2_untrimmed_len, len(read[1]))
        print "Read 2 untrimmed length = ", r2_untrimmed_len
        print "Input arg r2_length = ", args.r2_length
        r2_reader.close()


    # Setup read iterators.
    r1_length = args.r1_length
    r2_length = args.r2_length

    rna_reads = FastqReader(args.read_chunks, rna_read_def, args.reads_interleaved, r1_length, r2_length)
    rna_read2s = FastqReader(args.read_chunks, rna_read2_def, args.reads_interleaved, r1_length, r2_length)
    bc_reads = FastqReader(args.read_chunks, bc_read_def, args.reads_interleaved, r1_length, r2_length)
    si_reads = FastqReader(args.read_chunks, si_read_def, args.reads_interleaved, r1_length, r2_length)

    if cr_chem.has_umis(args.chemistry_def):
        umi_reads = FastqReader(args.read_chunks, umi_read_def, args.reads_interleaved, r1_length, r2_length)
    else:
        umi_reads = FastqReader(None, None, False, r1_length, r2_length)

    fastq_readers = (rna_reads, rna_read2s, bc_reads, si_reads, umi_reads)

    read1_writer = ChunkedFastqWriter(outs.reads, args.reads_per_file, compression=COMPRESSION)
    if paired_end:
        read2_writer = ChunkedFastqWriter(outs.read2s, args.reads_per_file, compression=COMPRESSION)

    bc_counter = BarcodeCounter(args.barcode_whitelist, outs.barcode_counts)

    all_read_iter = itertools.izip_longest(*[reader.in_iter for reader in fastq_readers])

    EMPTY_READ = (None, '', '')

    reporter.extract_reads_init()

    for extractions in itertools.islice(all_read_iter, args.initial_reads):
        # Downsample
        if random.random() > args.subsample_rate:
            continue

        rna_extraction, rna2_extraction, bc_extraction, si_extraction, umi_extraction = extractions

        rna_read = rna_extraction if rna_extraction is not None else EMPTY_READ
        rna_read2 = rna2_extraction if rna2_extraction is not None else EMPTY_READ
        bc_read = bc_extraction if bc_extraction is not None else EMPTY_READ
        si_read = si_extraction if si_extraction is not None else EMPTY_READ
        umi_read = umi_extraction if umi_extraction is not None else EMPTY_READ

        if (not rna_read[1]) or (paired_end and (not rna_read2[1])):
            # Read 1 is empty or read 2 is empty (if paired_end)
            # Empty reads causes issue with STAR aligner, so eliminate
            # them here
            continue

        if bc_read != EMPTY_READ:
            # Reverse complement the barcode if necessary
            if barcode_rc:
                bc_read = (bc_read[0], tk_seq.get_rev_comp(bc_read[1]), bc_read[2][::-1])
            # Track the barcode count distribution
            bc_counter.count(*bc_read)

        # Calculate metrics on raw sequences
        reporter.raw_fastq_cb(rna_read, rna_read2, bc_read, si_read, umi_read, args.gem_group, skip_metrics=args.skip_metrics)

        # Construct new fastq headers
        fastq_header1 = AugmentedFastqHeader(rna_read[0])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
        fastq_header1.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
        fastq_header1.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
        fastq_header1.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
        fastq_header1.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

        fastq_header_str1 = fastq_header1.to_string()

        read1_writer.write((fastq_header_str1, rna_read[1], rna_read[2]))

        if paired_end:
            fastq_header2 = AugmentedFastqHeader(rna_read2[0])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_TAG, si_read[1])
            fastq_header2.set_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG, si_read[2])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_TAG, bc_read[1])
            fastq_header2.set_tag(cr_constants.RAW_BARCODE_QUAL_TAG, bc_read[2])
            fastq_header2.set_tag(cr_constants.RAW_UMI_TAG, umi_read[1])
            fastq_header2.set_tag(cr_constants.UMI_QUAL_TAG, umi_read[2])

            read2_writer.write((fastq_header2.to_string(), rna_read2[1], rna_read2[2]))

    reporter.extract_reads_finalize()

    # Close input and output files.
    rna_reads.close()
    if paired_end:
        rna_read2s.close()
    bc_reads.close()
    si_reads.close()
    umi_reads.close()

    read1_writer.close()
    if paired_end:
        read2_writer.close()
    bc_counter.close()

    # Set stage output parameters.
    if len(read1_writer.file_paths) > 0:
        outs.reads = read1_writer.get_out_paths()
        if paired_end:
            outs.read2s = read2_writer.get_out_paths(len(outs.reads))
        else:
            outs.read2s = []
        outs.gem_groups = [args.gem_group] * len(outs.reads)
        outs.read_groups = [args.read_group] * len(outs.reads)
    else:
        outs.reads = []
        outs.read2s = []
        outs.gem_groups = []
        outs.read_groups = []

    assert len(outs.gem_groups) == len(outs.reads)

    if paired_end:
        assert len(outs.reads) == len(outs.read2s)

    # this is the first reporter stage, so store the pipeline metadata
    reporter.store_pipeline_metadata(martian.get_pipelines_version())

    reporter.save(outs.chunked_reporter)