Beispiel #1
0
def main(args, outs):
    """Build raw and filtered matrices for one molecule chunk and write them.

    Opens the slice of ``args.raw_molecules`` given by ``args.chunk_start``
    and ``args.chunk_len``, builds the raw matrices from it, filters them to
    the barcodes loaded from ``args.filtered_barcodes`` and saves both sets.
    The filtered molecules are then written to ``outs.filtered_molecules``
    and the summary returned by ``write_filtered_molecules`` is dumped to
    ``outs.summary`` as JSON.

    Args:
        args: stage arguments; reads ``raw_molecules``, ``chunk_start``,
            ``chunk_len`` and ``filtered_barcodes``.
        outs: stage outputs; reads the destination paths
            ``raw_matrices_h5``, ``raw_matrices_mex``, ``barcode_summary_h5``,
            ``filtered_matrices_h5``, ``filtered_matrices_mex``,
            ``filtered_molecules`` and ``summary``.
    """
    # Use the input counter as a context manager (as is already done for the
    # output counter below) so that its handle is released even when one of
    # the save/write steps raises.
    with cr_mol_counter.MoleculeCounter.open(
            args.raw_molecules,
            'r',
            start=int(args.chunk_start),
            length=int(args.chunk_len)) as molecule_counter:

        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)

        raw_matrices = cr_matrix.GeneBCMatrices.build_from_mol_counter(
            molecule_counter)
        filtered_matrices = raw_matrices.filter_barcodes(
            filtered_bcs_per_genome)

        raw_matrices.save_h5(outs.raw_matrices_h5)
        raw_matrices.save_mex(outs.raw_matrices_mex)
        raw_matrices.save_barcode_summary_h5(outs.barcode_summary_h5)

        filtered_matrices.save_h5(outs.filtered_matrices_h5)
        filtered_matrices.save_mex(outs.filtered_matrices_mex)

        genome_ids = molecule_counter.get_ref_column('genome_ids')

        with cr_mol_counter.MoleculeCounter.open(outs.filtered_molecules,
                                                 'w') as ctr_out:
            summary = write_filtered_molecules(molecule_counter, ctr_out,
                                               genome_ids,
                                               filtered_bcs_per_genome)

    with open(outs.summary, 'w') as f:
        tk_json.dump_numpy(summary, f, pretty=True)
Beispiel #2
0
def get_gem_group_index_json(args, outs):
    """Populate ``outs.gem_group_index_json`` and return that output value.

    If ``args.gem_group_index_json`` is set it is copied to the output
    location. Otherwise an index is obtained from
    ``args.filtered_gene_bc_matrices_h5`` and, only when that index is
    truthy, written as JSON under the ``"gem_group_index"`` key.
    """
    # A caller-supplied index takes precedence: copy it and finish.
    if args.gem_group_index_json:
        cr_utils.copy(args.gem_group_index_json, outs.gem_group_index_json)
        return outs.gem_group_index_json

    index = cr_matrix.get_gem_group_index(args.filtered_gene_bc_matrices_h5)
    if index:
        payload = {"gem_group_index": index}
        with open(outs.gem_group_index_json, 'w') as handle:
            tk_json.dump_numpy(payload, handle)

    return outs.gem_group_index_json
Beispiel #3
0
def get_gem_group_index_json(args, outs):
    """Populate ``outs.gem_group_index_json`` and return that output value.

    If ``args.gem_group_index_json`` is set it is copied to the output
    location. Otherwise an index is obtained from
    ``args.feature_barcode_matrix``; when it is not ``None`` it is written
    as JSON under the ``"gem_group_index"`` key, and when it is ``None`` the
    output is reset to ``None``.
    """
    # A caller-supplied index takes precedence: copy it and finish.
    if args.gem_group_index_json:
        cr_io.copy(args.gem_group_index_json, outs.gem_group_index_json)
        return outs.gem_group_index_json

    index = cr_matrix.get_gem_group_index(args.feature_barcode_matrix)
    if index is None:
        # Nothing to write, so clear the output instead.
        outs.gem_group_index_json = None
        return outs.gem_group_index_json

    payload = {"gem_group_index": index}
    with open(outs.gem_group_index_json, 'w') as handle:
        tk_json.dump_numpy(payload, handle)
    return outs.gem_group_index_json
Beispiel #4
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk barcode-correction outputs into the stage outputs.

    Collects each chunk's ``corrected_bcs`` into ``outs.corrected_bcs``,
    merges the chunk barcode counts by library type and writes them to
    ``outs.corrected_barcode_counts``, then merges the chunk reporters and
    emits the summary JSON and barcode summary HDF5.
    """
    outs.corrected_bcs = [chunk_out.corrected_bcs for chunk_out in chunk_outs]

    # Barcode counts, merged by library type.
    chunk_counts = [
        chunk_out.corrected_barcode_counts for chunk_out in chunk_outs
    ]
    chunk_library_types = [chunk_def.library_type for chunk_def in chunk_defs]
    merged_counts = cr_fastq.BarcodeCounter.merge_by(
        chunk_counts, chunk_library_types, args.barcode_whitelist,
        args.gem_groups)
    with open(outs.corrected_barcode_counts, 'w') as handle:
        tk_safe_json.dump_numpy(merged_counts, handle)

    # The chunked reporter is not a join-level output.
    outs.chunked_reporter = None
    chunk_reporters = [chunk_out.chunked_reporter for chunk_out in chunk_outs]
    merged_reporter = cr_report.merge_reporters(chunk_reporters)

    merged_reporter.report_summary_json(outs.summary)
    merged_reporter.report_barcodes_h5(outs.barcode_summary)
Beispiel #5
0
 def close(self):
     """Write ``self.to_json()`` to ``self.out_counts`` if any barcode
     sequences are present; otherwise do nothing."""
     if not self.barcode_seqs:
         return
     with open(self.out_counts, 'w') as handle:
         tk_safe_json.dump_numpy(self.to_json(), handle)
Beispiel #6
0
def join(args, outs, chunk_defs, chunk_outs):
    """Combine per-chunk outputs into the stage-level outputs.

    Concatenates the list-valued chunk outputs, exits via ``martian.exit``
    when no reads were produced, checks that all chunks agree on
    ``bam_comments``, merges barcode counts by library type, sums the
    per-chunk feature counts, and writes a summary JSON whose reporter
    metrics are prefixed per library type.
    """

    def gather(attr):
        # Flatten one list-valued attribute across all chunks, in chunk order.
        return [
            item for chunk_out in chunk_outs
            for item in getattr(chunk_out, attr)
        ]

    outs.reads = gather('reads')
    outs.read2s = gather('read2s')
    outs.tags = gather('tags')
    outs.gem_groups = gather('gem_groups')
    outs.library_types = gather('library_types')
    outs.library_ids = gather('library_ids')
    outs.read_groups = gather('read_groups')

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit(
            "No reads found. Check the input fastqs and/or the chemistry definition"
        )

    # Ensure consistency of BAM comments
    reference_comments = chunk_outs[0].bam_comments
    assert all(chunk_out.bam_comments == reference_comments
               for chunk_out in chunk_outs)
    outs.bam_comments = reference_comments

    # Write barcode counts (merged by library_type)
    chunk_barcode_counts = [chunk_out.barcode_counts for chunk_out in chunk_outs]
    chunk_library_types = [chunk_def.library_type for chunk_def in chunk_defs]
    merged_counts = BarcodeCounter.merge_by(chunk_barcode_counts,
                                            chunk_library_types,
                                            args.barcode_whitelist,
                                            outs.gem_groups)
    with open(outs.barcode_counts, 'w') as handle:
        tk_safe_json.dump_numpy(merged_counts, handle)

    # Write feature counts: element-wise sum of every chunk's counts
    feature_counts = None
    for _, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as handle:
            chunk_counts = np.asarray(json.load(handle), dtype=int)
        if feature_counts is None:
            feature_counts = chunk_counts
        else:
            feature_counts += chunk_counts

    with open(outs.feature_counts, 'w') as handle:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), handle)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Group reporters by library type; chunks without reads are skipped
    outs.chunked_reporter = None
    reporters_by_lib_type = defaultdict(list)
    for _, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        lib_types_in_chunk = set(chunk_out.library_types)
        assert len(lib_types_in_chunk) == 1
        lib_type = list(lib_types_in_chunk)[0]
        reporters_by_lib_type[lib_type].append(chunk_out.chunked_reporter)

    # Merge reporters and prefix JSON keys by library type
    summary = {}
    for lib_type, reporters in reporters_by_lib_type.iteritems():
        metrics = cr_report.merge_reporters(reporters).to_json()
        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update(
            {prefix + key: value for key, value in metrics.iteritems()})

    # Use a temporary reporter to generate the metadata (w/o a prefix)
    metadata_reporter = cr_report.Reporter()
    metadata_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(metadata_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as handle:
        tk_safe_json.dump_numpy(summary, handle, pretty=True)
Beispiel #7
0
def main(args, outs):
    """Optionally downsample one chunk of molecule read counts and summarize.

    Seeds NumPy's global RNG with 0. When ``args.downsample`` is set and
    ``args.downsample_map`` has more than one entry, each molecule's read
    count is replaced by a binomial draw using its gem group's
    ``frac_reads_kept``, the downsampled-reads metric is recorded per gem
    group, and the columns, reference columns and metrics are written to
    ``outs.out_molecules``. Otherwise the read counts are used unchanged and
    nothing is added to the output counter here. In both cases the per-genome
    read totals and the metrics are dumped to ``outs.summary`` as JSON.
    """
    np.random.seed(0)

    with cr_mol_counter.MoleculeCounter.open(
            args.molecules,
            'r',
            start=int(args.chunk_start),
            length=int(args.chunk_len)) as ctr_in, \
            cr_mol_counter.MoleculeCounter.open(outs.out_molecules,
                                                'w') as ctr_out:
        metrics_in = ctr_in.get_all_metrics()
        metrics_out = metrics_in.copy()

        reads = ctr_in.get_column('reads')
        gem_groups = ctr_in.get_column('gem_group')

        if not (args.downsample and len(args.downsample_map) > 1):
            subsampled_reads = reads
        else:

            def kept_fraction(gem_group):
                # The downsample map is keyed by the gem group as a string.
                return args.downsample_map[str(gem_group)]['frac_reads_kept']

            binomial_draw = np.vectorize(
                lambda gem_group, read_count: np.random.binomial(
                    read_count, kept_fraction(gem_group)))

            # downsample metrics
            gg_metrics_in = metrics_in[cr_mol_counter.GEM_GROUPS_METRIC]
            gg_metrics_out = metrics_out[cr_mol_counter.GEM_GROUPS_METRIC]
            for gg in gg_metrics_out:
                frac_reads_kept = kept_fraction(gg)
                total_reads_in = gg_metrics_in[gg][
                    cr_mol_counter.GG_TOTAL_READS_METRIC]
                gg_metrics_out[gg][
                    cr_mol_counter.GG_DOWNSAMPLED_READS_METRIC] = round(
                        frac_reads_kept * total_reads_in)

            ctr_out.set_all_metrics(metrics_out)

            # downsample molecule info; only the 'reads' column is replaced
            subsampled_reads = binomial_draw(gem_groups, reads)
            for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
                data = (subsampled_reads
                        if col == 'reads' else ctr_in.get_column(col))
                ctr_out.add_many(col, data)

            # pass reference info
            for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
                ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))

        # collect summary stats
        genomes = ctr_in.get_ref_column('genome_ids')
        if len(genomes) == 1:
            # Single genome: every molecule counts towards it.
            raw_conf_mapped_per_genome = {genomes[0]: subsampled_reads.sum()}
        else:
            genome_ids = ctr_in.get_column('genome')
            genome_index = cr_reference.get_genome_index(genomes)
            raw_conf_mapped_per_genome = {}
            for genome in genomes:
                genome_id = cr_reference.get_genome_id(genome, genome_index)
                is_this_genome = genome_ids == genome_id
                raw_conf_mapped_per_genome[genome] = subsampled_reads[
                    is_this_genome].sum()

        summary = {
            'raw_conf_mapped_per_genome': raw_conf_mapped_per_genome,
            'mol_counter_metrics': metrics_out
        }

        with open(outs.summary, 'w') as handle:
            tk_json.dump_numpy(summary, handle, pretty=True)
Beispiel #8
0
def main(args, outs):
    """Assign raw clonotype ids to barcodes and write annotated contigs.

    Makes two passes over the per-barcode contig lists loaded from
    ``args.annotations``: the first numbers every CDR3 sequence seen in a
    clonotype tuple, the second derives each barcode's clonotype (the sorted
    set of its sequence ids) and numbers the distinct clonotypes. The
    clonotypes are reported, the contig annotations are reloaded and labelled
    with them, and the results are written as JSON, CSV and pickle along with
    a filtered CSV and the summary JSON.

    Args:
        args: stage arguments; reads ``cell_barcodes``, ``annotations``,
            ``vdj_reference_path``, ``use_non_productive`` and
            ``use_non_full_len``.
        outs: stage outputs; reads the destination paths
            ``clonotype_assignments``, ``contig_annotations``,
            ``contig_annotations_csv``, ``contig_annotations_pickle``,
            ``filtered_contig_annotations_csv`` and ``summary``.
    """
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    require_productive = not args.use_non_productive

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}

    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:

        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=require_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []

            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either based on the contig itself or based on
                # other full-length contigs that had this CDR3), then add this
                # to the clonotype tuple.
                if cl_seq in sequences:
                    # this will rescue contigs that have a chain and CDR3 assigned
                    # but aren't full length
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=require_productive,
                require_full_len=True,
                require_high_conf=True)

        # A clonotype is the sorted, de-duplicated tuple of sequence ids.
        barcode_clonotype = tuple(
            sorted(set(sequences[s] for s in barcode_clonotype_tuple)))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype,
                                                 len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {
        clonotype_id: clonotype_tuple
        for clonotype_tuple, clonotype_id in clonotypes.iteritems()
    }

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw',
                                                 cell_barcodes, clonotype_ids,
                                                 sequence_ids, barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file,
                                pretty=True)

    # Add clonotype assignments to contig annotations
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file,
                                       all_contigs,
                                       write_inferred=False)

    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode; text mode can corrupt the stream on platforms that
    # translate newlines.
    with open(outs.contig_annotations_pickle, 'wb') as out_file:
        cPickle.dump(all_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = [
            contig for contig in all_contigs
            if contig.high_confidence and contig.is_cell
        ]
        vdj_annot.save_contig_list_csv(out_file,
                                       filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)