def main(args, outs):
    """Annotate this chunk's assembled contigs against the V(D)J reference.

    For each contig assigned to a barcode in this chunk, aligns V(D)J feature
    and (optionally) primer reference sequences to it, annotates the CDR3, and
    pickles the resulting AnnotatedContig list to outs.chunked_annotations.
    Emits no output (outs.chunked_annotations = None) when no reference is given.
    """
    if args.vdj_reference_path is None:
        outs.chunked_annotations = None
        return

    chunk_contigs = []
    barcodes_in_chunk = set(args.barcodes)

    # Set of barcodes that were called as cells
    if args.cell_barcodes:
        cell_barcodes_set = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))
    else:
        cell_barcodes_set = set()

    # Setup feature reference sequences
    res = vdj_annot.setup_feature_aligners(args.vdj_reference_path,
                                           args.min_score_ratios,
                                           args.min_word_sizes)
    feature_types, feature_aligners, feature_filters = res

    # Setup primer reference sequences
    if args.primers:
        primer_aligner, primer_filter = vdj_annot.setup_primer_aligner(
            args.primers,
            vdj_constants.VDJ_ANNOTATION_MIN_SCORE_RATIO)

    # Per-contig read/UMI support, keyed by contig name (empty if no summary)
    read_counts = {}
    umi_counts = {}
    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0, index_col=None, sep='\t')
        for _, row in contig_summary.iterrows():
            read_counts[row.contig_name] = int(row.num_reads)
            umi_counts[row.contig_name] = int(row.num_umis)

    if args.filter_summary:
        try:
            filter_summary = vdj_utils.load_contig_summary_table(open(args.filter_summary))
        except EmptyDataError:
            # An empty filter summary means no contigs were filtered out
            filter_summary = None
    else:
        filter_summary = None

    if args.contigs_fastq is not None:
        # Parallel FASTQ stream providing per-base quals for each FASTA contig
        fq_iter = tk_fasta.read_generator_fastq(open(args.contigs_fastq), paired_end=False)

    for header, contig_sequence in cr_utils.get_fasta_iter(open(args.contigs)):
        if args.contigs_fastq is None:
            contig_quals = None
        else:
            header_fq, contig_sequence_fq, contig_quals = fq_iter.next()
            # FASTA and FASTQ must stay in lockstep, record for record
            assert contig_sequence_fq == contig_sequence
            assert header_fq == header

        barcode = vdj_utils.get_barcode_from_contig_name(header)
        contig_name = header.split(' ')[0]

        # Only annotate barcodes assigned to this chunk and contigs with enough read support
        if barcode in barcodes_in_chunk:
            if filter_summary is not None:
                filtered = vdj_utils.is_contig_filtered(filter_summary, contig_name)
            else:
                filtered = True

            contig = vdj_annot.AnnotatedContig(contig_name,
                                               contig_sequence,
                                               quals=contig_quals,
                                               barcode=barcode,
                                               is_cell=barcode in cell_barcodes_set,
                                               filtered=filtered,
                                               read_count=read_counts.get(contig_name),
                                               umi_count=umi_counts.get(contig_name),
                                               )

            contig.annotations = contig.annotate_features(feature_types,
                                                          feature_aligners,
                                                          feature_filters)
            if args.primers:
                contig.primer_annotations = contig.annotate_features_by_group(
                    primer_aligner,
                    alignment_filter=primer_filter)
            contig.annotate_cdr3()

            chunk_contigs.append(contig)

    # Use a context manager so the pickle file is flushed and closed on exit
    # (the original leaked the handle returned by open()).
    with open(outs.chunked_annotations, 'wb') as annotations_out:
        cPickle.dump(chunk_contigs, annotations_out, protocol=cPickle.HIGHEST_PROTOCOL)
def main(args, outs):
    """Report per-barcode assembly metrics.

    Joins contig annotations, the contig/UMI summary tables, and the filter
    summary by barcode, feeds each barcode's contigs to a VdjReporter, and
    writes the aggregated metrics to outs.summary.
    """
    reporter = vdj_report.VdjReporter()

    barcode_contigs = collections.defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig; close the JSON file when done
    # (the original leaked the handle returned by open()).
    with open(args.annotations) as annotations_file:
        for annotation in json.load(annotations_file):
            contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0, index_col=None, sep='\t',
                                     dtype={'component': int,
                                            'num_reads': int,
                                            'num_pairs': int,
                                            'num_umis': int,
                                            'umi_list': str,
                                            })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0, index_col=None, sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode, skipping contigs the filter rejected
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            contig_name = contig_hdr.split(' ')[0]

            if filter_summary is not None and \
               not vdj_utils.is_contig_filtered(filter_summary, contig_name):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary, annotations, reference)

    reporter.report_summary_json(outs.summary)
def main(args, outs): reporter = vdj_report.VdjReporter() barcode_contigs = defaultdict(list) contig_annotations = {} # Get annotations for each contig for annotation in iter(json.load(open(args.annotations))): contig_annotations[annotation['contig_name']] = annotation if args.contig_summary and os.path.isfile(args.contig_summary): contig_summary = pd.read_csv(args.contig_summary, header=0, index_col=None, sep='\t', dtype={ 'component': int, 'num_reads': int, 'num_pairs': int, 'num_umis': int, 'umi_list': str, }) contig_summary = contig_summary.groupby('barcode') else: contig_summary = None if args.umi_summary and os.path.isfile(args.umi_summary): umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None, sep='\t') umi_summary = umi_summary.groupby('barcode') else: umi_summary = None if args.filter_summary: filter_summary = vdj_utils.load_contig_summary_table( args.filter_summary) else: filter_summary = None # Get contigs for each barcode for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)): contig_name = contig_hdr.split(' ')[0] if not filter_summary is None and not vdj_utils.is_contig_filtered( filter_summary, contig_name): continue barcode = vdj_utils.get_barcode_from_contig_name(contig_name) barcode_contigs[barcode].append((contig_name, contig_seq)) # Compute metrics for each barcode if args.cell_barcodes: barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes) else: # Pass an empty barcode JSON for bulk barcodes = {''} reference = vdj_ref.VdjReference(args.vdj_reference_path) for barcode in barcodes: contigs = barcode_contigs[barcode] annotations = [contig_annotations[contig[0]] for contig in contigs] reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference) if not contig_summary is None and barcode in contig_summary.groups: bc_contig_summary = contig_summary.get_group(barcode) else: bc_contig_summary = None if not umi_summary is None and barcode in umi_summary.groups: bc_umi_summary = 
umi_summary.get_group(barcode) else: bc_umi_summary = None reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary, annotations, reference) ## Compute post-assembly per-cell metrics # Load the assembly metrics summary to get the total assemblable reads if args.assemble_metrics_summary and args.reads_summary: assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json( args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc') assemblable_read_pairs = sum( assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes) total_read_pairs = cr_utils.get_metric_from_json( args.reads_summary, 'total_read_pairs') reporter._get_metric_attr( 'vdj_assemblable_read_pairs_per_filtered_bc').set_value( assemblable_read_pairs, len(barcodes)) reporter._get_metric_attr('vdj_sequencing_efficiency').set_value( assemblable_read_pairs, total_read_pairs) ## Try to autodetect the chain type # Find all chains w/ a significant presence. # If there's exactly one, set the chain type filter to that. # Otherwise, show all chain types. chain_count = defaultdict(int) for anno_dict in contig_annotations.itervalues(): contig = vdj_annotations.AnnotatedContig.from_dict( anno_dict, reference) if contig.is_cell and contig.high_confidence and contig.productive: for anno in contig.annotations: if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES: chain_count[anno.feature.chain_type] += 1 outs.chain_type = vdj_constants.ALL_CHAIN_TYPES print chain_count if len(chain_count) > 0: n_contigs = sum(chain_count.itervalues()) sig_chains = [ ct for ct, count in chain_count.iteritems() if tk_stats.robust_divide( count, n_contigs) >= MIN_CHAIN_TYPE_CONTIG_FRAC ] if len(sig_chains) == 1: outs.chain_type = sig_chains[0] reporter.report_summary_json(outs.summary)