Example #1
def get_vdj_feature_iter(reference_path):
    """ Yield vdj features from a vdj reference fasta file """
    if reference_path is None:
        return

    for header, sequence in cr_utils.get_fasta_iter(open(get_vdj_reference_fasta(reference_path))):
        yield parse_fasta_entry(header, sequence)
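Every example on this page consumes cr_utils.get_fasta_iter. Its implementation is not shown here, but from the way the examples unpack its results, it behaves as a generator yielding (header, sequence) tuples with the leading '>' stripped from the header. A minimal sketch under that assumption (the real cr_utils implementation may differ):

def get_fasta_iter(f):
    """ Yield (header, sequence) tuples from an open FASTA file handle.
        Sketch only; not the actual cr_utils implementation. """
    header = None
    seq_chunks = []
    for line in f:
        line = line.rstrip()
        if line.startswith('>'):
            if header is not None:
                yield header, ''.join(seq_chunks)
            header = line[1:]
            seq_chunks = []
        elif line:
            seq_chunks.append(line)
    if header is not None:
        yield header, ''.join(seq_chunks)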
Example #2
def check(input, trimmer_out, cutadapt_out):

    correct_untrimmed = 0  # Trimmed by neither tool
    correct_trimmed = 0  # Trimmed consistently by both tools
    rust_trimmed_only = 0  # Trimmed by rust only
    rust_untrimmed_only = 0  # Left untrimmed by rust only
    inconsistent_trimmed = 0  # Trimmed by both, but inconsistently

    with open(cutadapt_out) as fcut, open(trimmer_out) as frust, open(
            input) as finput:
        cutiter = cr_utils.get_fasta_iter(fcut)
        rustiter = cr_utils.get_fasta_iter(frust)
        inputiter = cr_utils.get_fasta_iter(finput)
        for (c, r, u) in zip(cutiter, rustiter, inputiter):
            if c == u and r == u:
                correct_untrimmed += 1
            elif c == u:
                rust_trimmed_only += 1
            elif r == u:
                rust_untrimmed_only += 1
            elif c == r or abs(len(r[1]) - len(c[1])) <= 2:  # Allow up to two bases of slop
                correct_trimmed += 1
            else:
                inconsistent_trimmed += 1

    # Sensitivity = (Rust & Cutadapt) / Cutadapt
    sensitivity = float(correct_trimmed + inconsistent_trimmed) / float(
        correct_trimmed + inconsistent_trimmed + rust_untrimmed_only)
    # PPV = (Rust & Cutadapt)/Rust
    ppv = float(correct_trimmed + inconsistent_trimmed) / float(
        correct_trimmed + inconsistent_trimmed + rust_trimmed_only)
    # Concordance = (Rust & Cutadapt & Rust==Cutadapt) / (Rust & Cutadapt)
    concordance = float(correct_trimmed) / float(correct_trimmed +
                                                 inconsistent_trimmed)

    return sensitivity, ppv, concordance
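A minimal way to exercise check(): the three FASTA paths below are hypothetical, and the files must list the same reads in the same order, since the three iterators are zipped together record by record.

# Hypothetical file names; all three FASTAs must be in the same read order.
sensitivity, ppv, concordance = check('input_reads.fasta',
                                      'rust_trimmed.fasta',
                                      'cutadapt_trimmed.fasta')
print 'sensitivity=%.4f  ppv=%.4f  concordance=%.4f' % (
    sensitivity, ppv, concordance)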
Example #3
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = collections.defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    for annotation in iter(json.load(open(args.annotations))):
        contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t',
                                     dtype={'component': int, 'num_reads': int,
                                            'num_pairs': int, 'num_umis': int,
                                            'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None, sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)):
        contig_name = contig_hdr.split(' ')[0]
        if filter_summary is not None and not vdj_utils.is_contig_filtered(filter_summary, contig_name):
            continue

        barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
        barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary, annotations, reference)

    reporter.report_summary_json(outs.summary)
Example #4
def build_reference_fasta_from_fasta(fasta_path, reference_path,
                                     reference_name, ref_version,
                                     mkref_version):
    """Create cellranger-compatible vdj reference files from a
       V(D)J segment FASTA file.
    """

    seen_features = set()
    seen_ids = set()
    features = []

    print 'Checking FASTA entries...'

    with open(fasta_path) as f:
        for header, sequence in cr_utils.get_fasta_iter(f):
            feat = parse_fasta_entry(header, sequence)

            # Enforce unique feature IDs
            if feat.feature_id in seen_ids:
                raise ValueError(
                    'Duplicate feature ID found in input FASTA: %d.' %
                    feat.feature_id)
            # Sanity check values
            if ' ' in feat.region_type:
                raise ValueError('Spaces not allowed in region type: "%s"' %
                                 feat.region_type)
            if ' ' in feat.gene_name:
                raise ValueError('Spaces not allowed in gene name: "%s"' %
                                 feat.gene_name)
            if ' ' in feat.record_id:
                raise ValueError('Spaces not allowed in record ID: "%s"' %
                                 feat.record_id)

            key = get_duplicate_feature_key(feat)
            if key in seen_features:
                print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (
                    feat.display_name, feat.region_type, feat.record_id)
                continue

            # Strip Ns from termini
            seq = feat.sequence
            if 'N' in seq:
                print 'Warning: Feature %s contains Ns. Stripping from the ends.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                seq = seq.strip('N')

            if len(seq) == 0:
                print 'Warning: Feature %s is all Ns. Skipping.' % \
                    str((feat.display_name, feat.record_id, feat.region_type))
                continue

            # Warn on features we couldn't classify properly
            if feat.chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
                print 'Warning: Unknown chain type for: %s. Expected name to be in %s. Skipping.' % \
                    (str((feat.display_name, feat.record_id, feat.region_type)),
                     str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
                continue

            seen_ids.add(feat.feature_id)
            seen_features.add(key)

            # Update the sequence since we may have modified it
            feat_dict = feat._asdict()
            feat_dict.update({'sequence': seq})
            new_feat = VdjAnnotationFeature(**feat_dict)
            features.append(new_feat)
    print '...done.\n'

    print 'Writing sequences...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    with open(get_vdj_reference_fasta(reference_path), 'w') as out_fasta:
        for feat in features:
            out_fasta.write(convert_vdj_feature_to_fasta_entry(feat) + '\n')
    print '...done.\n'

    print 'Computing hash of input FASTA file...'
    fasta_hash = cr_utils.compute_hash_of_file(fasta_path)
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: None,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: None,
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    with open(
            os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE),
            'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata),
                  json_file,
                  sort_keys=True,
                  indent=4)
    print '...done.\n'
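A hypothetical invocation; all paths and version strings below are illustrative. Note that the function creates the output directory itself via os.makedirs, so reference_path should not already contain the fasta subdirectory.

# Illustrative call; every argument here is a made-up value.
build_reference_fasta_from_fasta(fasta_path='vdj_segments.fasta',
                                 reference_path='my-vdj-reference',
                                 reference_name='my-vdj-reference',
                                 ref_version='1.0.0',
                                 mkref_version='2.0.0')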
Example #5
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f,
                                                       args.vdj_reference_path)

    contigs.sort(key=lambda c: (c.barcode, c.get_single_chain(),
                                not c.productive, -c.umi_count,
                                -c.read_count, -len(c)))

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(
            contigs, key=lambda c: (c.barcode, c.get_single_chain())):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive,UMI,read,length) contig's CDR3
            #    with a single UMI or low UMIs relative to the first contig, or
            extraneous_cdr3 = first_cdr3 is not None \
               and contig.cdr3_seq != first_cdr3 \
               and (contig.umi_count == 1 or \
                    (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                          chain).add(
                                              1, filter=contig.high_confidence)
            reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                      cr_constants.MULTI_REFS_PREFIX).add(
                                          1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
         open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            # Keep contigs that are high confidence & in cells
            if hdr not in low_confidence_contigs and hdr in cell_contigs:
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
         open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if name not in low_confidence_contigs and name in cell_contigs:
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
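To make the UMI-ratio test in the loop above concrete: the real value of EXTRA_CONTIG_MIN_UMI_RATIO is defined elsewhere in the codebase, so the threshold below is an assumed stand-in.

# Assumed threshold for illustration; the real constant is defined elsewhere.
EXTRA_CONTIG_MIN_UMI_RATIO = 0.1
first_cdr3_umis = 30   # UMIs supporting the top-ranked contig's CDR3
umi_count = 2          # UMIs on a later contig with a different CDR3
# 2 / 30 = 0.067 < 0.1, so this contig would be flagged low confidence.
extraneous_cdr3 = (umi_count == 1 or
                   float(umi_count) / first_cdr3_umis < EXTRA_CONTIG_MIN_UMI_RATIO)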
Example #6
def main(args, outs):
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return
    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Set up feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Set up primer reference sequences
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(args.primers,
                                                                       vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)

    if args.filter_summary:
        try:
            filter_summary = vdj_utils.load_contig_summary_table(open(args.filter_summary))
        except EmptyDataError:
            filter_summary = None
    else:
        filter_summary = None

    if args.contigs_fastq is not None:
        fq_iter = tk_fasta.read_generator_fastq(open(args.contigs_fastq), paired_end=False)

    for header, contig_sequence in cr_utils.get_fasta_iter(open(args.contigs)):
        if args.contigs_fastq is None:
            contig_quals = None
        else:
            header_fq, contig_sequence_fq, contig_quals = fq_iter.next()
            assert contig_sequence_fq == contig_sequence
            assert header_fq == header

        barcode = vdj_utils.get_barcode_from_contig_name(header)
        contig_name = header.split(' ')[0]

        # Only annotate barcodes assigned to this chunk and contigs with enough read support
        if barcode in barcodes_in_chunk:
            if filter_summary is not None:
                filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
            else:
                filtered = True

            contig = vdj_annot.AnnotatedContig(contig_name,
                                               contig_sequence,
                                               quals=contig_quals,
                                               barcode=barcode,
                                               is_cell=barcode in cell_barcodes_set,
                                               filtered=filtered,
                                               read_count=read_counts.get(contig_name),
                                               umi_count=umi_counts.get(contig_name),
                                               )

            contig.annotations = contig.annotate_features(feature_types,
                                                          feature_aligners,
                                                          feature_filters)

            if args.primers:
                contig.primer_annotations = contig.annotate_features_by_group(primer_aligner,
                                                                              alignment_filter=primer_filter)

            contig.annotate_cdr3()

            chunk_contigs.append(contig)

    cPickle.dump(chunk_contigs, open(outs.chunked_annotations, 'wb'),
                 protocol=cPickle.HIGHEST_PROTOCOL)
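The chunk file written by cPickle.dump above is meant to be read back by a downstream stage. A sketch of such a consumer, with a hypothetical path standing in for outs.chunked_annotations:

# Hypothetical downstream consumer of the chunk written above.
import cPickle
with open('chunked_annotations.pickle', 'rb') as f:
    chunk_contigs = cPickle.load(f)
for contig in chunk_contigs:
    print contig.contig_name, contig.read_count, contig.umi_count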
Example #7
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    for annotation in iter(json.load(open(args.annotations))):
        contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0,
                                     index_col=None,
                                     sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0,
                                  index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(
            args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)):
        contig_name = contig_hdr.split(' ')[0]
        if filter_summary is not None and not vdj_utils.is_contig_filtered(
                filter_summary, contig_name):
            continue

        barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
        barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations,
                                       reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        total_read_pairs = cr_utils.get_metric_from_json(
            args.reads_summary, 'total_read_pairs')

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.

    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.itervalues():
        contig = vdj_annotations.AnnotatedContig.from_dict(
            anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    print chain_count

    if len(chain_count) > 0:
        n_contigs = sum(chain_count.itervalues())
        sig_chains = [
            ct
            for ct, count in chain_count.iteritems() if tk_stats.robust_divide(
                count, n_contigs) >= MIN_CHAIN_TYPE_CONTIG_FRAC
        ]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)
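A small numeric illustration of the chain-type autodetection above. The value of MIN_CHAIN_TYPE_CONTIG_FRAC is not shown on this page, so the threshold below is assumed, and plain float division stands in for tk_stats.robust_divide.

# Assumed threshold and counts, for illustration only.
MIN_CHAIN_TYPE_CONTIG_FRAC = 0.05
chain_count = {'TR': 97, 'IG': 3}
n_contigs = 100
# TR: 97/100 = 0.97 >= 0.05 passes; IG: 3/100 = 0.03 < 0.05 does not.
sig_chains = [ct for ct, count in chain_count.items()
              if float(count) / n_contigs >= MIN_CHAIN_TYPE_CONTIG_FRAC]
# Exactly one significant chain, so outs.chain_type would be set to 'TR'.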