def annotate_consensus_contig(reference_path, min_score_ratios, min_word_sizes,
                              contig_name, clonotype_name, seq, quals,
                              read_count=None, umi_count=None, info_dict=None,
                              primers=None, use_features=None):
    """Build an AnnotatedContig for a consensus sequence and annotate it.

    The contig is created with ``filtered=True`` and ``high_confidence=True``,
    annotated against the feature aligners built from ``reference_path``,
    optionally annotated against ``primers``, and finally has its unannotated
    intervals and CDR3 computed.

    Args:
        reference_path: forwarded to ``vdj_annot.setup_feature_aligners``.
        min_score_ratios: forwarded to ``vdj_annot.setup_feature_aligners``.
        min_word_sizes: forwarded to ``vdj_annot.setup_feature_aligners``.
        contig_name: name given to the new contig.
        clonotype_name: stored on the contig as ``clonotype``.
        seq: contig sequence.
        quals: contig qualities.
        read_count: optional read count stored on the contig.
        umi_count: optional UMI count stored on the contig.
        info_dict: optional extra info stored on the contig.
        primers: if truthy, primer annotations are computed as well.
        use_features: forwarded to ``vdj_annot.setup_feature_aligners``.

    Returns:
        The populated ``vdj_annot.AnnotatedContig``.
    """
    annotated = vdj_annot.AnnotatedContig(
        contig_name,
        seq,
        quals=quals,
        clonotype=clonotype_name,
        read_count=read_count,
        umi_count=umi_count,
        info_dict=info_dict,
        filtered=True,
        high_confidence=True,
    )

    # Feature (V/D/J/C/...) annotations.
    types, aligners, filters = vdj_annot.setup_feature_aligners(
        reference_path,
        min_score_ratios,
        min_word_sizes,
        use_features=use_features,
    )
    annotated.annotations = annotated.annotate_features(types, aligners, filters)

    # Primer annotations are only computed when primers were supplied.
    if primers:
        aligner, aln_filter = vdj_annot.setup_primer_aligner(
            primers, VDJ_ANNOTATION_MIN_SCORE_RATIO)
        annotated.primer_annotations = annotated.annotate_features_by_group(
            aligner, alignment_filter=aln_filter)

    annotated.unannotated_intervals = annotated.get_unannotated_intervals()
    annotated.annotate_cdr3()
    return annotated
def test_setup_feature_aligner(self):
    """Check which feature types yield alignments for a fixed query sequence.

    Aligners are built from the 'annotation_setup_test_ref' fixture with
    per-feature score ratios and word sizes; the query is expected to hit
    the 5U, V and D features and nothing else.
    """
    ratio_by_feature = {"5U": 0.8, "C": 0.8, "D": 0.5, "J": 0.8, "V": 0.8}
    word_by_feature = {"5U": 2, "C": 3, "D": 3, "J": 5, "V": 6}
    ref_dir = in_path('annotation_setup_test_ref')

    feature_types, feature_aligners, feature_filters = \
        cr_annotations.setup_feature_aligners(ref_dir, ratio_by_feature, word_by_feature)

    # This tests that the lambda constants are not dynamically determined.
    query = 'TTAAAAAAAATTTTCCCC'
    features_with_hits = ('5U', 'V', 'D')

    for feature_type, aligner, aln_filter in zip(feature_types, feature_aligners, feature_filters):
        hits = cr_annotations.collect_annotations(aligner, query, query, aln_filter)
        if feature_type in features_with_hits:
            assert hits
        else:
            # J doesn't have a match because we require at least 5 bases
            # C doesn't have a good enough hit
            assert not hits
def test_coordinates(self):
    """Test that coordinates for matches are 0-based, half-open.

    Four scenarios are checked against the 'V' feature aligner:
      1. a perfect match spanning the whole query,
      2. a single-base mismatch,
      3. a 3-base deletion in the middle of the reference sequence,
      4. a 3-base insertion in the middle of the reference sequence.
    """

    def v_aligner_for(ref_path, ratios, sizes):
        # Build the aligners for ref_path and return the (aligner, filter)
        # pair belonging to the 'V' feature type.
        features, aligners, filters = cr_annotations.setup_feature_aligners(
            ref_path, ratios, sizes)
        return [(al, f) for (t, al, f) in zip(features, aligners, filters)
                if t == 'V'][0]

    def annotate(aligner, seq):
        # Same sequence is passed twice, exactly as in every call of this test.
        return cr_annotations.collect_annotations(
            aligner[0], seq, seq, aligner[1], True)

    # --- 1. Perfect match -------------------------------------------------
    score_ratios = {"5U": 0.8, "C": 0.8, "D": 0.5, "J": 0.8, "V": 0.8}
    word_sizes = {"5U": 2, "C": 3, "D": 3, "J": 5, "V": 6}
    test_ref = in_path('annotation_setup_test_ref')
    aligner = v_aligner_for(test_ref, score_ratios, word_sizes)

    seq = 'AAAAAAAA'
    alignments = annotate(aligner, seq)
    self.assertEqual(len(alignments), 1)
    anno = alignments[0]
    self.assertEqual(len(anno.mismatches), 0)
    self.assertEqual(anno.annotation_match_end, len(seq))
    self.assertEqual(anno.contig_match_end, len(seq))

    # --- 2. Single mismatch (looser V thresholds) -------------------------
    score_ratios = {"5U": 0.8, "C": 0.8, "D": 0.5, "J": 0.8, "V": 0.5}
    word_sizes = {"5U": 2, "C": 3, "D": 3, "J": 5, "V": 3}
    seq = 'AAATAAAA'
    aligner = v_aligner_for(test_ref, score_ratios, word_sizes)
    anno = annotate(aligner, seq)[0]
    self.assertEqual(len(anno.mismatches), 1)
    mismatch = {
        'region_type': 'MISMATCH',
        'contig_match_start': 3,
        'contig_match_end': 4
    }
    self.assertEqual(anno.mismatches[0], mismatch)

    # --- 3./4. Indels against the second reference fixture ----------------
    test_ref = in_path('annotation_setup_test_ref2')
    fasta = vdj_reference.get_vdj_reference_fasta(test_ref)
    aligner = v_aligner_for(test_ref, score_ratios, word_sizes)

    # The second line of the FASTA is used as the true sequence.
    with open(fasta, 'r') as fasta_file:
        true_seq = fasta_file.readlines()[1].strip()

    # Floor division: `middle` is used as a slice index and must be an int
    # regardless of whether true division is in effect.
    middle = len(true_seq) // 2

    # Test deletion coordinates
    seq = true_seq[0:middle] + true_seq[middle + 3:]
    anno = annotate(aligner, seq)[0]
    mismatch = {
        'region_type': 'D',
        'contig_match_start': middle,
        'contig_match_end': middle + 1,  # convention
        'deletion_length': 3
    }
    self.assertEqual(anno.mismatches[0], mismatch)

    # Test insertion coordinates
    seq = true_seq[0:middle] + 'AAA' + true_seq[middle:]
    anno = annotate(aligner, seq)[0]
    mismatch = {
        'region_type': 'I',
        'contig_match_start': middle,
        'contig_match_end': middle + 3
    }
    self.assertEqual(anno.mismatches[0], mismatch)
def main(args, outs):
    """Annotate the contigs of this chunk's barcodes and pickle the results.

    If ``args.vdj_reference_path`` is None, ``outs.chunked_annotations`` is
    set to None and nothing else is done. Otherwise every contig in
    ``args.contigs`` whose barcode is in ``args.barcodes`` is turned into a
    ``vdj_annot.AnnotatedContig``, annotated with features (and primers when
    ``args.primers`` is truthy) and CDR3, and the resulting list is pickled
    to the path in ``outs.chunked_annotations``.

    Args:
        args: stage arguments. Attributes read here: ``vdj_reference_path``,
            ``barcodes``, ``cell_barcodes``, ``min_score_ratios``,
            ``min_word_sizes``, ``primers``, ``contig_summary``,
            ``filter_summary``, ``contigs_fastq``, ``contigs``.
        outs: stage outputs; ``chunked_annotations`` is the pickle path
            (or is set to None when there is no reference).

    Raises:
        AssertionError: if a FASTQ record does not match the FASTA record at
            the same position (header or sequence differs).
    """
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return

    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Setup primer reference sequences
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(
            args.primers, vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    # Per-contig read/UMI counts; left empty when no summary file exists.
    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)

    if args.filter_summary:
        try:
            # Context manager so the handle is closed even if loading fails.
            with open(args.filter_summary) as summary_file:
                filter_summary = vdj_utils.load_contig_summary_table(summary_file)
        except EmptyDataError:
            filter_summary = None
    else:
        filter_summary = None

    # The FASTQ is optional, so it cannot live in a plain `with`; close it
    # explicitly in `finally` instead.
    fastq_file = None
    if args.contigs_fastq is not None:
        fastq_file = open(args.contigs_fastq)

    try:
        fq_iter = None
        if fastq_file is not None:
            fq_iter = tk_fasta.read_generator_fastq(fastq_file, paired_end=False)

        with open(args.contigs) as contigs_file:
            for header, contig_sequence in cr_utils.get_fasta_iter(contigs_file):
                if fq_iter is None:
                    contig_quals = None
                else:
                    # FASTA and FASTQ are consumed in lockstep and must agree.
                    header_fq, contig_sequence_fq, contig_quals = next(fq_iter)
                    assert contig_sequence_fq == contig_sequence, \
                        'FASTQ sequence does not match FASTA sequence'
                    assert header_fq == header, \
                        'FASTQ header does not match FASTA header'

                barcode = vdj_utils.get_barcode_from_contig_name(header)
                contig_name = header.split(' ')[0]

                # Only annotate barcodes assigned to this chunk and contigs with enough read support
                if barcode in barcodes_in_chunk:
                    if filter_summary is not None:
                        filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
                    else:
                        filtered = True

                    contig = vdj_annot.AnnotatedContig(
                        contig_name,
                        contig_sequence,
                        quals=contig_quals,
                        barcode=barcode,
                        is_cell=barcode in cell_barcodes_set,
                        filtered=filtered,
                        read_count=read_counts.get(contig_name),
                        umi_count=umi_counts.get(contig_name),
                    )

                    contig.annotations = contig.annotate_features(
                        feature_types, feature_aligners, feature_filters)
                    if args.primers:
                        contig.primer_annotations = contig.annotate_features_by_group(
                            primer_aligner, alignment_filter=primer_filter)

                    contig.annotate_cdr3()
                    chunk_contigs.append(contig)
    finally:
        if fastq_file is not None:
            fastq_file.close()

    # Close the output deterministically so the pickle is fully flushed.
    with open(outs.chunked_annotations, 'wb') as out_file:
        cPickle.dump(chunk_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)