Beispiel #1
0
def annotate_consensus_contig(reference_path,
                              min_score_ratios,
                              min_word_sizes,
                              contig_name,
                              clonotype_name,
                              seq,
                              quals,
                              read_count=None,
                              umi_count=None,
                              info_dict=None,
                              primers=None,
                              use_features=None):
    """Build an AnnotatedContig for a consensus sequence and annotate it.

    The contig is always created with ``filtered=True`` and
    ``high_confidence=True``. It is then annotated against the feature
    aligners built from the reference, optionally annotated with primer
    hits, and finally gets its unannotated intervals and CDR3 annotation.

    Args:
        reference_path: forwarded to ``vdj_annot.setup_feature_aligners``.
        min_score_ratios: forwarded to ``vdj_annot.setup_feature_aligners``.
        min_word_sizes: forwarded to ``vdj_annot.setup_feature_aligners``.
        contig_name: name given to the new contig.
        clonotype_name: stored as the contig's ``clonotype``.
        seq: the contig sequence.
        quals: stored as the contig's ``quals``.
        read_count: forwarded to ``AnnotatedContig`` (default None).
        umi_count: forwarded to ``AnnotatedContig`` (default None).
        info_dict: forwarded to ``AnnotatedContig`` (default None).
        primers: when truthy, a primer aligner is built from it and primer
            annotations are attached to the contig.
        use_features: forwarded to ``vdj_annot.setup_feature_aligners``.

    Returns:
        The populated ``vdj_annot.AnnotatedContig``.
    """
    annotated = vdj_annot.AnnotatedContig(
        contig_name,
        seq,
        quals=quals,
        clonotype=clonotype_name,
        read_count=read_count,
        umi_count=umi_count,
        info_dict=info_dict,
        filtered=True,
        high_confidence=True)

    # Feature (V/D/J/C/...) annotation against the reference.
    types, aligners, filters = vdj_annot.setup_feature_aligners(
        reference_path,
        min_score_ratios,
        min_word_sizes,
        use_features=use_features)
    annotated.annotations = annotated.annotate_features(types, aligners, filters)

    # Primer annotation is optional and skipped when no primers are supplied.
    if primers:
        aligner, hit_filter = vdj_annot.setup_primer_aligner(
            primers, VDJ_ANNOTATION_MIN_SCORE_RATIO)
        annotated.primer_annotations = annotated.annotate_features_by_group(
            aligner, alignment_filter=hit_filter)

    annotated.unannotated_intervals = annotated.get_unannotated_intervals()
    annotated.annotate_cdr3()
    return annotated
    def test_setup_feature_aligner(self):
        """Check which feature types produce hits for a short query sequence.

        With the thresholds below, the '5U', 'V' and 'D' aligners are expected
        to yield annotations for the query, while 'J' and 'C' are not.
        """
        min_ratios = {"5U": 0.8, "C": 0.8, "D": 0.5, "J": 0.8, "V": 0.8}
        min_words = {"5U": 2, "C": 3, "D": 3, "J": 5, "V": 6}
        reference = in_path('annotation_setup_test_ref')
        setup = cr_annotations.setup_feature_aligners(
            reference, min_ratios, min_words)
        feature_types, feature_aligners, feature_filters = setup

        # Original author's note: this tests that the lambda constants are not
        # dynamically determined (presumably: thresholds are captured when the
        # aligners/filters are built -- the re-binding check itself is disabled).

        types_with_hits = ('5U', 'V', 'D')
        query = 'TTAAAAAAAATTTTCCCC'
        for feature_type, aligner, hit_filter in zip(feature_types,
                                                     feature_aligners,
                                                     feature_filters):
            hits = cr_annotations.collect_annotations(
                aligner, query, query, hit_filter)
            if feature_type in types_with_hits:
                assert hits
            else:
                # J has no match because at least 5 bases are required;
                # C has no sufficiently good hit.
                assert not hits
    def test_coordinates(self):
        """Test that coordinates for matches are 0-based, half-open.

        Four cases are checked against the 'V' feature aligner: an exact
        match, a single-base mismatch, a 3-base deletion and a 3-base
        insertion placed at the midpoint of a reference sequence.
        """

        def v_aligner(ref_path, score_ratios, word_sizes):
            # Build every feature aligner for the reference and return the
            # (aligner, filter) pair belonging to the 'V' feature type.
            features, aligners, filters = cr_annotations.setup_feature_aligners(
                ref_path, score_ratios, word_sizes)
            return [(al, flt)
                    for (t, al, flt) in zip(features, aligners, filters)
                    if t == 'V'][0]

        def annotate(aligner_pair, seq):
            # The sequence is passed for both sequence arguments; the trailing
            # True is forwarded unchanged (NOTE(review): confirm its meaning
            # against collect_annotations' signature).
            al, flt = aligner_pair
            return cr_annotations.collect_annotations(al, seq, seq, flt, True)

        test_ref = in_path('annotation_setup_test_ref')

        # --- Exact match: the whole query is covered, end coordinate == len.
        score_ratios = {"5U": 0.8, "C": 0.8, "D": 0.5, "J": 0.8, "V": 0.8}
        word_sizes = {"5U": 2, "C": 3, "D": 3, "J": 5, "V": 6}
        seq = 'AAAAAAAA'
        alignments = annotate(v_aligner(test_ref, score_ratios, word_sizes), seq)
        self.assertEqual(len(alignments), 1)
        anno = alignments[0]
        self.assertEqual(len(anno.mismatches), 0)
        self.assertEqual(anno.annotation_match_end, len(seq))
        self.assertEqual(anno.contig_match_end, len(seq))

        # --- Single mismatch: looser V thresholds, used for all cases below.
        score_ratios = {"5U": 0.8, "C": 0.8, "D": 0.5, "J": 0.8, "V": 0.5}
        word_sizes = {"5U": 2, "C": 3, "D": 3, "J": 5, "V": 3}
        seq = 'AAATAAAA'
        anno = annotate(v_aligner(test_ref, score_ratios, word_sizes), seq)[0]

        self.assertEqual(len(anno.mismatches), 1)
        mismatch = {
            'region_type': 'MISMATCH',
            'contig_match_start': 3,
            'contig_match_end': 4
        }
        self.assertEqual(anno.mismatches[0], mismatch)

        # --- Indels are tested against the second reference.
        test_ref = in_path('annotation_setup_test_ref2')
        fasta = vdj_reference.get_vdj_reference_fasta(test_ref)
        aligner = v_aligner(test_ref, score_ratios, word_sizes)

        # Line 2 of the FASTA is used as the true sequence (assumes the first
        # record's sequence sits on a single line right after its header).
        with open(fasta, 'r') as fasta_file:
            true_seq = fasta_file.readlines()[1].strip()

        # Floor division keeps `middle` an int (usable as a slice index) on
        # both Python 2 and Python 3.
        middle = len(true_seq) // 2

        # Test deletion coordinates
        seq = true_seq[0:middle] + true_seq[middle + 3:]
        anno = annotate(aligner, seq)[0]
        mismatch = {
            'region_type': 'D',
            'contig_match_start': middle,
            'contig_match_end': middle + 1,  # convention
            'deletion_length': 3
        }
        self.assertEqual(anno.mismatches[0], mismatch)

        # Test insertion coordinates
        seq = true_seq[0:middle] + 'AAA' + true_seq[middle:]
        anno = annotate(aligner, seq)[0]
        mismatch = {
            'region_type': 'I',
            'contig_match_start': middle,
            'contig_match_end': middle + 3
        }
        self.assertEqual(anno.mismatches[0], mismatch)
Beispiel #4
0
def _load_contig_counts(contig_summary_path):
    """Return ``(read_counts, umi_counts)`` dicts keyed by contig name.

    Both dicts are empty when the path is unset or is not an existing file.
    """
    read_counts = {}
    umi_counts = {}
    if contig_summary_path and os.path.isfile(contig_summary_path):
        contig_summary = pd.read_csv(contig_summary_path, header=0,
                                     index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)
    return read_counts, umi_counts


def _load_filter_summary(filter_summary_path):
    """Return the contig filter table, or None when the path is unset or the file is empty."""
    if not filter_summary_path:
        return None
    try:
        # Close the handle deterministically instead of leaking it.
        with open(filter_summary_path) as summary_file:
            return vdj_utils.load_contig_summary_table(summary_file)
    except EmptyDataError:
        return None


def main(args, outs):
    """Annotate the contigs of this chunk's barcodes and pickle the result.

    Reads from ``args``: ``vdj_reference_path``, ``barcodes``,
    ``cell_barcodes``, ``min_score_ratios``, ``min_word_sizes``, ``primers``,
    ``contig_summary``, ``filter_summary``, ``contigs_fastq`` and ``contigs``.

    When ``args.vdj_reference_path`` is None nothing is annotated and
    ``outs.chunked_annotations`` is set to None. Otherwise the list of
    annotated contigs is pickled to the path ``outs.chunked_annotations``.

    Raises:
        AssertionError: if a FASTQ record does not match the FASTA record at
            the same position (header or sequence differs).
    """
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return

    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    feature_types, feature_aligners, feature_filters = vdj_annot.setup_feature_aligners(
        args.vdj_reference_path,
        args.min_score_ratios,
        args.min_word_sizes)

    # Setup primer reference sequences
    primer_aligner = primer_filter = None
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(
            args.primers, vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    read_counts, umi_counts = _load_contig_counts(args.contig_summary)
    filter_summary = _load_filter_summary(args.filter_summary)

    # The FASTQ is optional; when present it is read in lockstep with the
    # FASTA, so its handle must stay open for the whole loop.
    fastq_file = None
    fq_iter = None
    try:
        if args.contigs_fastq is not None:
            fastq_file = open(args.contigs_fastq)
            fq_iter = tk_fasta.read_generator_fastq(fastq_file, paired_end=False)

        with open(args.contigs) as fasta_file:
            for header, contig_sequence in cr_utils.get_fasta_iter(fasta_file):
                if fq_iter is None:
                    contig_quals = None
                else:
                    # Always consume one FASTQ record per FASTA record, even
                    # for contigs skipped below, to keep both files aligned.
                    header_fq, contig_sequence_fq, contig_quals = next(fq_iter)
                    assert contig_sequence_fq == contig_sequence, \
                        "FASTQ sequence does not match FASTA for %s" % header
                    assert header_fq == header, \
                        "FASTQ header %s does not match FASTA header %s" % (header_fq, header)

                barcode = vdj_utils.get_barcode_from_contig_name(header)
                contig_name = header.split(' ')[0]

                # Only annotate barcodes assigned to this chunk
                if barcode not in barcodes_in_chunk:
                    continue

                # Without a filter table every contig is marked as filtered.
                if filter_summary is not None:
                    filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
                else:
                    filtered = True

                contig = vdj_annot.AnnotatedContig(contig_name,
                                                   contig_sequence,
                                                   quals=contig_quals,
                                                   barcode=barcode,
                                                   is_cell=barcode in cell_barcodes_set,
                                                   filtered=filtered,
                                                   read_count=read_counts.get(contig_name),
                                                   umi_count=umi_counts.get(contig_name),
                                                   )

                contig.annotations = contig.annotate_features(feature_types,
                                                              feature_aligners,
                                                              feature_filters)

                if args.primers:
                    contig.primer_annotations = contig.annotate_features_by_group(
                        primer_aligner, alignment_filter=primer_filter)

                contig.annotate_cdr3()

                chunk_contigs.append(contig)
    finally:
        if fastq_file is not None:
            fastq_file.close()

    # Use a context manager so the pickle is flushed and closed before return.
    with open(outs.chunked_annotations, 'wb') as out_file:
        cPickle.dump(chunk_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)