예제 #1
0
def main(args, outs):
    """Build the feature-barcode count matrix for one BAM chunk.

    Iterates the chunk BAM through ``cr_utils.iter_by_qname``, hands each
    group of reads to ``reporter.count_genes_bam_cb`` and, whenever the
    reporter flags the group as confidently mapped and deduplicated, adds the
    returned (feature, barcode) pair to the count matrix. The matrix and the
    reporter are then saved to the stage outputs.

    Args:
        args: Stage arguments. Reads ``chunk_input``, ``barcode_whitelist``,
            ``barcodes_detected``, ``reference_path``, ``align``,
            ``gem_groups``, ``feature_reference`` and ``chemistry_def``.
        outs: Stage outputs. ``matrices_h5`` and ``chunked_reporter`` are the
            destinations handed to the matrix / reporter save calls.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    # Everything that needs the open BAM lives inside the try so the handle is
    # released even when loading a reference or counting a read raises.
    try:
        libraries = rna_library.get_bam_library_info(in_bam)
        distinct_library_types = sorted(
            set(lib['library_type'] for lib in libraries))
        # A real list, not map(): the prefixes are passed to the reporter on
        # every loop iteration, and a Python 3 map object would be exhausted
        # after the first one.
        library_prefixes = [
            rna_library.get_library_type_metric_prefix(lib['library_type'])
            for lib in libraries
        ]

        chroms = in_bam.references

        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        # The detected-barcode table is only needed when there is no whitelist.
        barcode_summary = cr_utils.load_barcode_tsv(
            args.barcodes_detected) if not barcode_whitelist else None

        # TODO: this is redundant
        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))
        reporter = cr_report.Reporter(
            reference_path=args.reference_path,
            high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
            gene_index=gene_index,
            chroms=chroms,
            barcode_whitelist=barcode_whitelist,
            barcode_summary=barcode_summary,
            gem_groups=args.gem_groups,
            library_types=distinct_library_types)

        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)

        if barcode_whitelist:
            barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                        args.gem_groups)
        else:
            barcode_seqs = barcode_summary

        matrix = cr_matrix.CountMatrix.empty(feature_ref,
                                             barcode_seqs,
                                             dtype='int32')

        # Loop-invariant: depends only on the chemistry definition.
        use_umis = cr_chem.has_umis(args.chemistry_def)

        for _qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
            is_conf_mapped_deduped, _genome, feature_id, bc = \
                reporter.count_genes_bam_cb(reads_iter,
                                            libraries,
                                            library_prefixes,
                                            use_umis=use_umis)
            if is_conf_mapped_deduped:
                matrix.add(feature_id, bc)
    finally:
        in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
예제 #2
0
def from_transcriptome_and_csv(gene_ref_path, feature_def_filename):
    '''Create a FeatureReference.

    Create a FeatureReference from a transcriptome ref and a feature barcode ref.
    Gene features (if any) come first and are indexed from 0; features parsed
    from the CSV are appended after them, with their indices offset by the
    number of gene features.

    Args:
        gene_ref_path (str): Path to transcriptome reference. Can be None, in
            which case no gene-expression features are added and the
            reference is not touched at all.
        feature_def_filename (str): Path to Feature Definition CSV file. Can be None.
    Returns:
        FeatureReference
    '''

    # Load gene info
    feature_defs = []
    all_tag_keys = ['genome']

    if gene_ref_path is not None:
        # The genome list is only needed to tag gene features, so it is looked
        # up here rather than unconditionally: the documented None case must
        # not query a reference that does not exist.
        genomes = cr_utils.get_reference_genomes(gene_ref_path)

        gene_idx_filename = cr_utils.get_reference_genes_index(gene_ref_path)
        gene_index = cr_reference.GeneIndex.load_pickle(gene_idx_filename)

        # Stuff relevant fields of Gene tuple into FeatureDef
        for gene in gene_index.genes:
            genome = cr_utils.get_genome_from_str(gene.id, genomes)
            fd = FeatureDef(
                index=len(feature_defs),
                id=gene.id,
                name=gene.name,
                feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                tags={
                    'genome': genome,
                })
            feature_defs.append(fd)

    # Load feature definition file
    if feature_def_filename is not None:
        csv_feature_defs, csv_tag_keys = parse_feature_def_file(
            feature_def_filename, index_offset=len(feature_defs))

        # check the CRISPR 'target_gene_id' field, if it exists
        # it needs to match a transcriptome entry
        check_crispr_target_gene(csv_feature_defs, feature_defs)

        feature_defs.extend(csv_feature_defs)
        all_tag_keys.extend(csv_tag_keys)

    return FeatureReference(feature_defs, all_tag_keys)
예제 #3
0
def main(args, outs):
    """Run the external ``annotate_reads main`` command for one chunk.

    Converts the pickled gene index of the reference into
    ``outs.gene_index_tab``, assembles the ``annotate_reads`` command line
    from the stage arguments and outputs, logs it to stderr, runs it, and
    finally copies ``num_alignments`` from the JSON metadata file the command
    wrote into ``outs.num_alignments``.

    Args:
        args: Stage arguments (inputs, reference path, chemistry definition,
            barcode / feature / library settings and optional flags).
        outs: Stage outputs; ``gene_index_tab``, ``output``,
            ``chunked_reporter`` and ``chunk_metadata`` are passed to the
            command, ``num_alignments`` is set here.
    """
    genes_index_pickle = cr_utils.get_reference_genes_index(
        args.reference_path)
    convert_pickle_to_rust_index(genes_index_pickle, outs.gene_index_tab)

    # The whitelist argument is either absent, an existing file path, or a
    # name that has to be resolved to a path.
    whitelist_arg = args.barcode_whitelist
    if whitelist_arg is None:
        barcode_whitelist = 'null'
    elif os.path.exists(whitelist_arg):
        barcode_whitelist = whitelist_arg
    else:
        barcode_whitelist = cr_utils.get_barcode_whitelist_path(whitelist_arg)

    # Positional arguments, in the order the command expects them.
    cmd = ['annotate_reads', 'main']
    cmd += [
        args.chunk_genome_input,
        args.chunk_tags,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        barcode_whitelist,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
        args.feature_counts,
        args.library_type or lib_constants.DEFAULT_LIBRARY_TYPE,
        args.library_id,
        args.library_info_json,
    ]
    cmd += ['--bam-comments', args.bam_comments_json]

    # Optional switches.
    is_five_prime = (
        cr_chem.get_endedness(args.chemistry_def) == cr_constants.FIVE_PRIME)
    if is_five_prime:
        cmd += ['--fiveprime']
    if args.skip_translate:
        cmd += ['--skip-translate']
    if args.feature_reference is not None:
        cmd += ['--feature-ref', args.feature_reference]

    # Log the command with every argument wrapped in single quotes.
    quoted_args = ["'%s'" % arg for arg in cmd]
    print >> sys.stderr, 'Running', ' '.join(quoted_args)
    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(outs.chunk_metadata) as metadata_file:
        chunk_metadata = json.load(metadata_file)
    outs.num_alignments = chunk_metadata['num_alignments']
예제 #4
0
def main(args, outs):
    """Process one chunk of alignments with a freshly built reporter.

    Builds the STAR index and loads the gene index, barcode whitelist and
    barcode distribution for the reference, constructs a reporter from them,
    and brackets ``process_alignments`` with the reporter's
    ``attach_bcs_init`` / ``attach_bcs_finalize`` calls. The value returned
    by ``process_alignments`` is stored in ``outs.num_alignments`` and the
    reporter is saved to ``outs.chunked_reporter``.

    Args:
        args: Stage arguments (reference path, barcode inputs, gem group
            settings, chemistry definition, UMI quality threshold and the
            chunk inputs forwarded to ``process_alignments``).
        outs: Stage outputs (``output``, ``num_alignments``,
            ``chunked_reporter``).
    """
    star_path = cr_utils.get_reference_star_path(args.reference_path)
    star_index = cr_transcriptome.build_star_index(star_path)
    # The chromosome list is the first item of the first element of the index.
    chroms = star_index[0][0]

    genes_index_path = cr_utils.get_reference_genes_index(args.reference_path)
    gene_index = cr_reference.GeneIndex.load_pickle(genes_index_path)

    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_dist = cr_utils.load_barcode_dist(
        args.barcode_counts, barcode_whitelist, args.gem_group)

    umi_length = cr_chem.get_umi_length(args.chemistry_def)
    reporter_kwargs = dict(
        reference_path=args.reference_path,
        high_conf_mapq=cr_constants.STAR_DEFAULT_HIGH_CONF_MAPQ,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=barcode_whitelist,
        barcode_dist=barcode_dist,
        gem_groups=args.gem_groups,
        umi_length=umi_length,
        umi_min_qual_threshold=args.umi_min_qual_threshold,
    )
    reporter = cr_report.Reporter(**reporter_kwargs)

    reporter.attach_bcs_init()
    num_alignments = process_alignments(
        args.chunk_genome_input,
        args.chunk_trimmed_input,
        outs.output,
        args.bam_comments,
        reporter,
        gene_index,
        star_index,
        args)
    outs.num_alignments = num_alignments
    reporter.attach_bcs_finalize()
    reporter.save(outs.chunked_reporter)
예제 #5
0
def main(args, outs):
    """Build the per-genome gene-barcode matrices for one BAM chunk.

    Passes every read of the chunk BAM to ``reporter.count_genes_bam_cb``
    and, whenever the reporter flags the read as confidently mapped and
    deduplicated, adds the returned (genome, gene, barcode) triple to the
    matrices. The matrices and the reporter are then saved to the stage
    outputs.

    Args:
        args: Stage arguments. Reads ``chunk_input``, ``barcode_whitelist``,
            ``barcode_summary``, ``reference_path``, ``align``,
            ``gem_groups`` and ``chemistry_def``.
        outs: Stage outputs. ``matrices_h5`` and ``chunked_reporter`` are the
            destinations handed to the matrices / reporter save calls.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    # Everything that needs the open BAM lives inside the try so the handle is
    # released even when loading a reference or counting a read raises.
    try:
        chroms = in_bam.references

        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        # The barcode summary is only needed when there is no whitelist.
        barcode_summary = cr_utils.load_barcode_summary(
            args.barcode_summary) if not barcode_whitelist else None

        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))
        reporter = cr_report.Reporter(
            reference_path=args.reference_path,
            high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
            gene_index=gene_index,
            chroms=chroms,
            barcode_whitelist=barcode_whitelist,
            barcode_summary=barcode_summary,
            gem_groups=args.gem_groups)

        if barcode_whitelist:
            barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                        args.gem_groups)
        else:
            barcode_seqs = barcode_summary

        genomes = cr_utils.get_reference_genomes(args.reference_path)
        genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(),
                                                genomes)
        matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

        # Loop-invariant: depends only on the chemistry definition.
        use_umis = cr_chem.has_umis(args.chemistry_def)

        for read in in_bam:
            is_conf_mapped_deduped, genome, gene_id, bc = \
                reporter.count_genes_bam_cb(read, use_umis=use_umis)
            if is_conf_mapped_deduped:
                matrices.add(genome, gene_id, bc)
    finally:
        in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
예제 #6
0
def main(args, outs):
    """Run the external ``annotate_reads main`` command for one chunk.

    Converts the pickled gene index of the reference into
    ``outs.gene_index_tab``, assembles the ``annotate_reads`` command line,
    logs it to stderr, runs it, and copies ``num_alignments`` from the JSON
    metadata file the command wrote into ``outs.num_alignments``.

    Args:
        args: Stage arguments (chunk input, reference path, barcode counts /
            whitelist, gem group, chemistry definition, BAM comments JSON).
        outs: Stage outputs; ``gene_index_tab``, ``output``,
            ``chunked_reporter`` and ``chunk_metadata`` are passed to the
            command, ``num_alignments`` is set here.
    """
    convert_pickle_to_rust_index(
        cr_utils.get_reference_genes_index(args.reference_path),
        outs.gene_index_tab)

    # The whitelist argument is either absent, an existing file path, or a
    # name that has to be resolved to a path.
    if args.barcode_whitelist is None:
        barcode_whitelist = 'null'
    elif not os.path.exists(args.barcode_whitelist):
        barcode_whitelist = cr_utils.get_barcode_whitelist_path(
            args.barcode_whitelist)
    else:
        barcode_whitelist = args.barcode_whitelist

    cmd = [
        'annotate_reads',
        'main',
        args.chunk_genome_input,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        barcode_whitelist,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
        '--bam-comments',
        args.bam_comments_json,
    ]

    if cr_chem.get_endedness(args.chemistry_def) == cr_constants.FIVE_PRIME:
        cmd.append('--fiveprime')

    # Format each argument with %s before joining: a bare ' '.join(cmd) raises
    # TypeError on any non-string element, so the log line itself would hide
    # the command that was about to run. Quoting also keeps arguments that
    # contain spaces unambiguous in the log.
    print >> sys.stderr, 'Running', ' '.join(["'%s'" % arg for arg in cmd])
    tk_subproc.check_call(cmd, cwd=os.getcwd())
    with open(outs.chunk_metadata) as f:
        metadata = json.load(f)
    outs.num_alignments = metadata['num_alignments']
예제 #7
0
def main(args, outs):
    """Summarise one BAM chunk into per-molecule read counts.

    Reads are grouped by ``cr_utils.barcode_sort_key``; within each group the
    primary, UMI-bearing, non-read2 reads are tallied per (UMI, gene) key and
    one record per key is added to a ``MoleculeCounter`` opened for writing at
    ``outs.output``. A per-gem-group count of confidently mapped reads that
    belong to filtered barcodes is stored as a metric on the counter.

    Args:
        args: Stage arguments. Reads ``chunk_input``, ``reference_path``,
            ``filtered_barcodes`` and ``align``.
        outs: Stage outputs. ``output`` is the MoleculeCounter destination.
    """
    # NOTE(review): presumably normalises string-typed outputs before they are
    # used as paths -- the helper is not visible here, confirm.
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

    # Map each data column name to its slot in the per-molecule count array.
    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    # One past the last real gene: used below as the gene id of reads that are
    # unmapped or not confidently mapped.
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])

    # Union of the barcode collections of every entry in the loaded mapping.
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(bcs)

    # Per-gem-group metrics; every new gem group starts with a zero counter.
    gg_metrics = collections.defaultdict(
        lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

    # itertools.groupby only merges consecutive records, so this presumably
    # relies on the BAM already being ordered by barcode_sort_key -- confirm.
    for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
            in_bam, key=cr_utils.barcode_sort_key):
        if barcode is None or gem_group is None:
            continue
        # Must be evaluated before gem_group is rebound to its compressed form
        # a few lines below.
        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode, gem_group) in filtered_bcs
        # (umi, gene id) -> uint64 array with one counter per data column.
        molecules = collections.defaultdict(
            lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

        compressed_barcode = cr_mol_counter.MoleculeCounter.compress_barcode_seq(
            barcode)
        gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(
            gem_group)

        # (umi, gene id) -> set of (tid, pos) pairs already seen for that key.
        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            # NOTE(review): raw_gg and proc_gg are unpacked but never used.
            raw_umi = cr_utils.get_read_raw_umi(read)
            raw_bc, raw_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, proc_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))

            # NOTE(review): get_high_conf_mapq(args.align) is re-evaluated for
            # every read although it only depends on args -- looks hoistable.
            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key, map_type = (umi, gene_index.gene_id_to_int(
                    gene_ids[0])), 'reads'

                # True only the first time this (tid, pos) is seen for the key.
                read_pos = (read.tid, read.pos)
                uniq_read_pos = read_pos not in read_positions[mol_key]
                read_positions[mol_key].add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1

            elif read.is_unmapped:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'unmapped_reads', False
            else:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'nonconf_mapped_reads', False
            # Bump the column for this read's mapping class, plus the
            # correction and unique-position counters (each adds 0 or 1).
            molecules[mol_key][mol_data_columns[map_type]] += 1
            molecules[mol_key][mol_data_columns['umi_corrected_reads']] += int(
                not raw_umi == umi)
            molecules[mol_key][mol_data_columns[
                'barcode_corrected_reads']] += int(not raw_bc == proc_bc)
            molecules[mol_key][mol_data_columns[
                'conf_mapped_uniq_read_pos']] += int(uniq_read_pos)

        # Emit one record per (umi, gene id) key, in sorted key order.
        for mol_key, molecule in sorted(molecules.items()):
            umi, gene_id = mol_key
            # NOTE(review): for none_gene_id this relies on int_to_gene_id and
            # get_genome_from_str accepting an id past the last gene -- confirm.
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            # Every data column is forwarded as a keyword argument.
            counter.add(
                barcode=compressed_barcode,
                gem_group=gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **{
                    key: molecule[col_idx]
                    for key, col_idx in mol_data_columns.iteritems()
                })

    in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()