def annotate_consensus_contig(reference_path, min_score_ratios, min_word_sizes, contig_name, clonotype_name, seq, quals, read_count=None, umi_count=None, info_dict=None, primers=None, use_features=None):
    """Build and fully annotate an AnnotatedContig for a consensus sequence.

    Args:
        reference_path: path to the VDJ reference used to build feature aligners.
        min_score_ratios, min_word_sizes: per-feature alignment thresholds
            forwarded to vdj_annot.setup_feature_aligners.
        contig_name: name assigned to the contig.
        clonotype_name: clonotype this consensus belongs to.
        seq: contig nucleotide sequence.
        quals: per-base qualities (may be None).
        read_count, umi_count: optional support counts stored on the contig.
        info_dict: optional auxiliary metadata dict.
        primers: optional primer sequences; when given, primer hits are
            annotated as well.
        use_features: optional restriction of which reference features to align.

    Returns:
        A populated vdj_annot.AnnotatedContig (features, optional primers,
        unannotated intervals, and CDR3 all annotated). Consensus contigs are
        always marked filtered and high-confidence.
    """
    consensus = vdj_annot.AnnotatedContig(
        contig_name,
        seq,
        quals=quals,
        clonotype=clonotype_name,
        read_count=read_count,
        umi_count=umi_count,
        info_dict=info_dict,
        filtered=True,
        high_confidence=True,
    )

    # Align reference features (V/D/J/C etc.) against the consensus.
    feat_types, feat_aligners, feat_filters = vdj_annot.setup_feature_aligners(
        reference_path, min_score_ratios, min_word_sizes, use_features=use_features)
    consensus.annotations = consensus.annotate_features(
        feat_types, feat_aligners, feat_filters)

    # Optionally annotate primer hits as a separate feature group.
    if primers:
        p_aligner, p_filter = vdj_annot.setup_primer_aligner(
            primers, VDJ_ANNOTATION_MIN_SCORE_RATIO)
        consensus.primer_annotations = consensus.annotate_features_by_group(
            p_aligner, alignment_filter=p_filter)

    consensus.unannotated_intervals = consensus.get_unannotated_intervals()
    consensus.annotate_cdr3()
    return consensus
def main(args, outs):
    """Annotate the contigs assigned to this chunk's barcodes.

    Reads contigs (FASTA, optionally with matching FASTQ for qualities),
    attaches per-contig read/UMI counts and filter status, aligns reference
    features (and optionally primers), annotates CDR3s, and pickles the
    resulting list of AnnotatedContig objects to outs.chunked_annotations.

    Fixes over the previous revision:
      - every opened file handle is now closed (with / try-finally); the
        pickle output in particular is flushed deterministically.
      - `fq_iter.next()` -> builtin `next(fq_iter)` (identical behavior,
        forward-compatible).
      - `not x is None` -> `x is not None`.
    """
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return

    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Setup primer reference sequences
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(
            args.primers, vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    # Per-contig read/UMI support, keyed by contig name.
    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0, index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)

    # Filter status per contig; None means "treat everything as filtered".
    if args.filter_summary:
        try:
            filter_summary_file = open(args.filter_summary)
            try:
                filter_summary = vdj_utils.load_contig_summary_table(filter_summary_file)
            finally:
                filter_summary_file.close()
        except EmptyDataError:
            filter_summary = None
    else:
        filter_summary = None

    # The FASTQ (if any) is assumed to list contigs in the same order as the
    # FASTA; the asserts below verify that assumption per record.
    fq_file = None
    if args.contigs_fastq is not None:
        fq_file = open(args.contigs_fastq)
        fq_iter = tk_fasta.read_generator_fastq(fq_file, paired_end=False)

    contigs_file = open(args.contigs)
    try:
        for header, contig_sequence in cr_utils.get_fasta_iter(contigs_file):
            if args.contigs_fastq is None:
                contig_quals = None
            else:
                header_fq, contig_sequence_fq, contig_quals = next(fq_iter)
                assert(contig_sequence_fq == contig_sequence)
                assert(header_fq == header)

            barcode = vdj_utils.get_barcode_from_contig_name(header)
            contig_name = header.split(' ')[0]

            # Only annotate barcodes assigned to this chunk and contigs with
            # enough read support
            if barcode in barcodes_in_chunk:
                if filter_summary is not None:
                    filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
                else:
                    filtered = True

                contig = vdj_annot.AnnotatedContig(contig_name,
                                                   contig_sequence,
                                                   quals=contig_quals,
                                                   barcode=barcode,
                                                   is_cell=barcode in cell_barcodes_set,
                                                   filtered=filtered,
                                                   read_count=read_counts.get(contig_name),
                                                   umi_count=umi_counts.get(contig_name),
                                                   )
                contig.annotations = contig.annotate_features(feature_types,
                                                              feature_aligners,
                                                              feature_filters)
                if args.primers:
                    contig.primer_annotations = contig.annotate_features_by_group(
                        primer_aligner, alignment_filter=primer_filter)
                contig.annotate_cdr3()
                chunk_contigs.append(contig)
    finally:
        contigs_file.close()
        if fq_file is not None:
            fq_file.close()

    with open(outs.chunked_annotations, 'wb') as out_file:
        cPickle.dump(chunk_contigs, out_file, protocol=cPickle.HIGHEST_PROTOCOL)