def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(k): v
        for k, v in args.chunks_per_gem_group.iteritems()
    }

    with open(args.read1s_chunk) as f1:
        read1s = [read for read in tk_fasta.read_generator_fastq(f1)]

    with open(args.read2s_chunk) as f2:
        read2s = [read for read in tk_fasta.read_generator_fastq(f2)]

    assert len(read1s) == len(read2s)

    fastqs_out = {}
    buckets = {}
    outs.buckets = {}

    for gem_group, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        fastqs_out[bucket_name] = open(filename, 'w')
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue

        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append(read1)
        buckets[bucket_name].append(read2)

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        fastq_out = fastqs_out[bucket_name]
        for read in bucket:
            tk_fasta.write_read_fastq(fastq_out, *read)
        fastq_out.close()
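# enumerate_bucket_names and get_bucket_name are helpers defined elsewhere in
# this module. A minimal sketch of plausible implementations, assuming buckets
# are named "<gem_group>-<chunk_index>" and barcodes are spread across a gem
# group's chunks by hashing (hypothetical; the real helpers may differ):
def enumerate_bucket_names(chunks_per_gem_group):
    # Yield (gem_group, bucket_name) for every bucket of every gem group
    for gem_group, num_chunks in sorted(chunks_per_gem_group.iteritems()):
        for chunk_idx in xrange(num_chunks):
            yield gem_group, '%d-%d' % (gem_group, chunk_idx)

def get_bucket_name(gem_group, barcode_seq, num_chunks):
    # All reads sharing a barcode land in the same bucket, which is what
    # allows each bucket to be sorted and merged independently downstream
    return '%d-%d' % (gem_group, hash(barcode_seq) % num_chunks)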
def split(args):
    assert args.read1s is not None and args.read2s is not None

    chunks = []

    if cr_chem.get_barcode_whitelist(args.chemistry_def) is not None:
        # Data are barcoded
        for read1_fq, read2_fq, barcodes_json in zip(args.read1s, args.read2s,
                                                     args.chunk_barcodes):
            with open(barcodes_json) as f:
                chunk_barcodes = json.load(f)

            chunks.append({
                'read1_chunk': read1_fq,
                'read2_chunk': read2_fq,
                'barcodes_chunk': chunk_barcodes,
                '__mem_gb': 3.0,
            })
    else:
        # Most stages assume that each chunk has a single barcode.
        # So unfortunately we have to put all reads in the same chunk; otherwise
        # metric computation will break.
        read1_out_filename = martian.make_path('chunk0_1.fastq')
        read2_out_filename = martian.make_path('chunk0_2.fastq')

        with open(read1_out_filename, 'w') as read1_out, \
             open(read2_out_filename, 'w') as read2_out:
            for read1_file, read2_file in zip(args.read1s, args.read2s):
                with open(read1_file) as in1, open(read2_file) as in2:
                    fastq1_iter = tk_fasta.read_generator_fastq(in1, paired_end=False)
                    fastq2_iter = tk_fasta.read_generator_fastq(in2, paired_end=False)

                    for read1_tuple in fastq1_iter:
                        read2_tuple = fastq2_iter.next()
                        tk_fasta.write_read_fastq(read1_out, *read1_tuple)
                        tk_fasta.write_read_fastq(read2_out, *read2_tuple)

        chunks.append({
            'read1_chunk': read1_out_filename,
            'read2_chunk': read2_out_filename,
            'barcodes_chunk': [""],
        })

    # Martian doesn't like empty chunk lists, so create a chunk w/ empty data
    if len(chunks) == 0:
        return get_dummy_chunk()

    return {'chunks': chunks}
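# get_dummy_chunk is assumed to return a single placeholder chunk, since
# Martian rejects empty chunk lists. A hypothetical sketch consistent with
# the comment above:
def get_dummy_chunk():
    read1_out_filename = martian.make_path('chunk0_1.fastq')
    read2_out_filename = martian.make_path('chunk0_2.fastq')
    # Create empty FASTQs so downstream stages still have files to open
    open(read1_out_filename, 'w').close()
    open(read2_out_filename, 'w').close()
    chunks = [{
        'read1_chunk': read1_out_filename,
        'read2_chunk': read2_out_filename,
        'barcodes_chunk': [],
    }]
    return {'chunks': chunks}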
def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file,
                     paired_end):
    barcodes = set()

    # Note: The filehandle cache precludes the use of compressed files
    file_cache = tk_cache.FileHandleCache(mode='r', open_func=open)

    heap = []
    key_func = vdj_utils.fastq_barcode_sort_key

    # Seed the heap with the first read(pair) from each input file
    for filename in in_filenames:
        try:
            fastq = tk_fasta.read_generator_fastq(file_cache.get(filename),
                                                  paired_end=paired_end)
            first_readpair = fastq.next()

            key = key_func(first_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, first_readpair, filename))
        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it.
        key, readpair, in_filename = heapq.heappop(heap)

        fastq = tk_fasta.read_generator_fastq(file_cache.get(in_filename),
                                              paired_end=paired_end)

        tk_fasta.write_read_fastq(r1_out_file, *readpair[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *readpair[3:6])

        # Get the next item from the source file we just wrote from.
        # If that file is out of items, leave it out of the heap.
        try:
            next_readpair = fastq.next()

            key = key_func(next_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, next_readpair, in_filename))
        except StopIteration:
            pass

    json.dump(tk_safe_json.json_sanitize(list(barcodes)), bcs_out_file)
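# merge_by_barcode is a k-way merge: the heap holds at most one pending
# read(pair) per input file, keyed by the barcode sort key, so memory stays
# O(number of files) rather than O(number of reads). Hypothetical usage with
# the bucket files produced by the bucketing stage above:
#
#   with open('merged_1.fastq', 'w') as r1, \
#        open('merged_2.fastq', 'w') as r2, \
#        open('barcodes.json', 'w') as bcs:
#       merge_by_barcode(bucket_filenames, r1, r2, bcs, paired_end=True)
#
# Correctness requires each input file to already be sorted by the same key,
# which the per-bucket sort in the bucketing stage guarantees.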
def write_data(self, data):
    tk_fasta.write_read_fastq(self.curr_file, *data)
def main(args, outs): """ Trim the reads in a series of fasta files """ # Set a fixed random seed to eliminate noise in metrics random.seed(0) chunk = args.chunk interleaved = chunk['reads_interleaved'] have_read2 = chunk['read2'] is not None paired = interleaved or have_read2 read1_trim = args.read1_trim_length read2_trim = args.read2_trim_length subsample_rate = chunk['subsample_rate'] # BC config -- BC come from separate fastq, or are embedded in R1 or R2 have_barcode = False bc_in_read1 = False bc_in_read2 = False bc_in_fastq = False # If we have bc in read, use that & ignore a separate BC read if chunk.get('bc_in_read', None) is not None and chunk.get('bc_length', 0) > 0: have_barcode = True bc_length = chunk['bc_length'] if chunk['bc_in_read'] == 1: bc_in_read1 = True read1_trim += bc_length elif chunk['bc_in_read'] == 2: bc_in_read2 = True read2_trim += bc_length else: martian.exit( "bc_in_read configuration incorrect -- read must be 1 or 2") # Otherwise use the BC file elif chunk['barcode'] is not None: have_barcode = True bc_in_fastq = True have_sample_index = chunk['sample_index'] is not None output_directory = os.path.dirname(os.path.realpath(outs.placeholder)) max_read_num = args.max_read_num # counter for sub-chunked files file_number = 1 # open the available read files and make the appropriate iterators if interleaved: read_in = openfq(chunk['read1']) read_iter = tk_fasta.read_generator_fastq(read_in, paired_end=True) else: if have_read2: read1_in = openfq(chunk['read1']) read1_iter = tk_fasta.read_generator_fastq(read1_in) read2_in = openfq(chunk['read2']) read2_iter = tk_fasta.read_generator_fastq(read2_in) read_iter = itertools.imap( lambda x, y: (x[0], x[1], x[2], y[0], y[1], y[2]), read1_iter, read2_iter) else: read1_in = openfq(chunk['read1']) read_iter = tk_fasta.read_generator_fastq(read1_in) # open read file read_name = output_directory + "/read" + str(file_number) + ".fastq" read_names = [read_name] out_read_fastq = open(read_name, 'w') # Bail out if there's no barcodes or whitelist if args.barcode_whitelist is None: outs.bc_counts = None bc_idx = None else: barcode_whitelist = sorted( list(tk_seq.load_barcode_whitelist(args.barcode_whitelist))) bc_idx = {bc: idx for (idx, bc) in enumerate(barcode_whitelist)} bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32) bad_count = 0 # open barcode file if there is one if have_barcode: bc_name = output_directory + "/BC" + str(file_number) + ".fastq" out_bc_fastq = open(bc_name, 'w') bc_names = [bc_name] if bc_in_fastq: bc_in = openfq(chunk['barcode']) bc_iter = tk_fasta.read_generator_fastq(bc_in) elif bc_in_read1 or bc_in_read2: # BC in read -- have output file but no input file bc_iter = itertools.repeat(None) else: bc_iter = itertools.repeat(None) bc_names = [None] outs.bc_counts = None # open sample_index file if there is one if have_sample_index: si_name = output_directory + "/SI" + str(file_number) + ".fastq" out_si_fastq = open(si_name, 'w') si_in = openfq(chunk['sample_index']) si_iter = tk_fasta.read_generator_fastq(si_in) si_names = [si_name] else: si_iter = itertools.repeat(None) si_names = [None] # loop through reads read_num = 0 for read, barcode_read, sample_index_read in itertools.izip( read_iter, bc_iter, si_iter): if read_num > 0 and random.random() > subsample_rate: continue if paired: (name1, seq1, qual1, name2, seq2, qual2) = read else: (name1, seq1, qual1) = read new_seq1 = seq1[read1_trim:] new_qual1 = qual1[read1_trim:] if paired: new_seq2 = seq2[read2_trim:] new_qual2 = qual2[read2_trim:] # 
Get BC sequence out of the read, for BC-in-read schemes if bc_in_read1: barcode_read = (name1, seq1[:bc_length], qual1[:bc_length]) if bc_in_read2: barcode_read = (name2, seq2[:bc_length], qual2[:bc_length]) read_num += 1 if read_num > max_read_num: read_num = 1 file_number += 1 read_name = output_directory + "/read" + str( file_number) + ".fastq" out_read_fastq.close() out_read_fastq = open(read_name, 'w') read_names.append(read_name) if have_barcode: bc_name = output_directory + "/BC" + str( file_number) + ".fastq" out_bc_fastq.close() out_bc_fastq = open(bc_name, 'w') bc_names.append(bc_name) else: bc_names.append(None) if have_sample_index: si_name = output_directory + "/SI" + str( file_number) + ".fastq" out_si_fastq.close() out_si_fastq = open(si_name, 'w') si_names.append(si_name) else: si_names.append(None) if have_barcode: barcode_seq = barcode_read[1] barcode_qual = barcode_read[2] if chunk['barcode_reverse_complement']: barcode_seq = tk_seq.get_rev_comp(barcode_seq) barcode_qual = barcode_qual[:: -1] # obscure way to reverse string if bc_idx is not None: idx = bc_idx.get(barcode_seq) if idx is not None: bc_counts[idx] += 1 else: bad_count += 1 tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0], barcode_seq, barcode_qual) if have_sample_index: tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0], sample_index_read[1], sample_index_read[2]) tk_fasta.write_read_fastq(out_read_fastq, name1, new_seq1, new_qual1) if paired: tk_fasta.write_read_fastq(out_read_fastq, name2, new_seq2, new_qual2) if have_barcode: out_bc_fastq.close() # Only emit BC counts if we had a whitelist if outs.bc_counts is not None: result = {} result['bad_bc_count'] = bad_count result['bc_counts'] = list(bc_counts) with open(outs.bc_counts, 'w') as bc_counts_out: tenkit.safe_json.dump_numpy(result, bc_counts_out) if have_sample_index: out_si_fastq.close() out_read_fastq.close() chunks = [] for (r, bc, si) in zip(read_names, bc_names, si_names): new_chunk = { 'read1': r, 'read2': None, 'barcode': bc, 'sample_index': si, 'barcode_reverse_complement': False, 'reads_interleaved': have_read2 or interleaved, 'gem_group': chunk['gem_group'], 'read_group': chunk['read_group'] } chunks.append(new_chunk) outs.chunks = chunks
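# openfq is assumed to open a FASTQ that may be gzip-compressed. A minimal
# sketch (hypothetical; the real helper may differ, e.g. by sniffing magic
# bytes instead of the file extension):
import gzip

def openfq(filename):
    # Transparently handle .gz inputs; plain files are opened normally
    if filename.endswith('.gz'):
        return gzip.open(filename, 'r')
    return open(filename, 'r')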
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    contigs.sort(key=lambda c: (c.barcode, c.get_single_chain(),
                                not c.productive, -c.umi_count,
                                -c.read_count, -len(c)))

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(
            contigs, key=lambda c: (c.barcode, c.get_single_chain())):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive,UMI,read,length)
            #    contig's CDR3, with a single UMI or low UMIs relative to the first contig, or
            extraneous_cdr3 = first_cdr3 is not None \
                and contig.cdr3_seq != first_cdr3 \
                and (contig.umi_count == 1 or
                     (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                          chain).add(1, filter=contig.high_confidence)
            reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                      cr_constants.MULTI_REFS_PREFIX).add(
                                          1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
         open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            # Keep contigs that are high confidence & in cells
            if hdr not in low_confidence_contigs and hdr in cell_contigs:
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
         open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if name not in low_confidence_contigs and name in cell_contigs:
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
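# itertools.groupby only groups *adjacent* items, so the sort by
# (barcode, chain, ...) above is what guarantees the groupby visits each
# (barcode, chain) pair exactly once, with the "best" contig first.
# Toy illustration of the adjacency requirement:
import itertools
pairs = [('AAC-1', 'TRA'), ('AAC-1', 'TRA'), ('AAC-1', 'TRB')]
groups = [(key, len(list(grp))) for key, grp in itertools.groupby(pairs)]
assert groups == [(('AAC-1', 'TRA'), 2), (('AAC-1', 'TRB'), 1)]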
def get_consensus_quals(in_bam, clonotype_name, in_fasta, sel_contigs,
                        contig_umis, out_dir):
    """Compute base quality scores of a sequence.

    Args:
    - in_bam: bam file to get the list of reads assigned to UMIs on the selected contigs
    - clonotype_name: Used for naming output files.
    - sel_contigs: Contigs that led to the consensus sequence above
    - contig_umis: from contig name to list of umis assigned to that contig

    Return value:
    String with base qualities (in FASTQ format).
    """

    pref = re.sub('.fasta', '', os.path.basename(in_fasta))
    fastq1 = re.sub('.fasta', '_1.fastq', in_fasta)
    fastq2 = re.sub('.fasta', '_2.fastq', in_fasta)

    sel_reads = {}

    for contig in sel_contigs:
        umi_read_count = Counter()
        barcode = contig.split('_')[0]
        contig_read_count = 0

        # Wrap contig w/ str() because pysam crashes on unicode input
        for read in in_bam.fetch(str(contig)):
            # NOTE: Assembler assumes that any tags are part of the read name,
            # BUT the bam that we feed to this stage has the tags stripped out
            # of the name.
            umi = read.get_tag(PROCESSED_UMI_TAG)

            if umi in contig_umis[contig] and not read.is_secondary:
                umi_read_count[umi] += 1
                if umi_read_count[umi] >= MAX_READS_PER_UMI:
                    continue

                contig_read_count += 1
                if contig_read_count >= MAX_READS_PER_CONTIG:
                    continue

                if not read.qname in sel_reads:
                    sel_reads[read.qname] = [None, None]
                sel_reads[read.qname][read.is_read2] = read

    with open(fastq1, 'w') as f1, open(fastq2, 'w') as f2:
        for read_name, pair in sel_reads.iteritems():
            read1, read2 = pair[0], pair[1]

            # Replace the UMI with <BC>_<UMI>.
            if read1 is None:
                umi = read2.get_tag(PROCESSED_UMI_TAG)
            else:
                umi = read1.get_tag(PROCESSED_UMI_TAG)

            header = cr_fastq.AugmentedFastqHeader(read_name)
            header.set_tag(PROCESSED_UMI_TAG, barcode + '_' + umi)
            header.set_tag(PROCESSED_BARCODE_TAG, barcode)

            if read1 is None:
                out_seq1 = ""
                out_quals1 = ""
            else:
                out_seq1 = tk_seq.get_rev_comp(read1.seq) if read1.is_reverse else read1.seq
                out_quals1 = read1.qual[::-1] if read1.is_reverse else read1.qual
            tk_fasta.write_read_fastq(f1, header.to_string(), out_seq1, out_quals1)

            if read2 is None:
                out_seq2 = ""
                out_quals2 = ""
            else:
                out_seq2 = tk_seq.get_rev_comp(read2.seq) if read2.is_reverse else read2.seq
                out_quals2 = read2.qual[::-1] if read2.is_reverse else read2.qual
            tk_fasta.write_read_fastq(f2, header.to_string(), out_seq2, out_quals2)

    assert len(sel_reads) > 0

    cmd = ['vdj_asm', 'base-quals', re.sub('.fasta', '', in_fasta), out_dir]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(os.path.join(out_dir, pref + '.fastq'), 'r') as f:
        lines = f.readlines()

    return lines[3].strip()
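# The return value above is parsed positionally: given the standard 4-line
# FASTQ layout (@name / sequence / '+' / qualities), lines[3] is the quality
# string of the first record, and the stage assumes `vdj_asm base-quals`
# emits exactly one record for the consensus sequence.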
def get_consensus_seq(clonotype_name, sel_contigs, best_contig, out_dir, args):
    """Build a consensus sequence from a set of contigs.

    Args:
    - clonotype_name: Used to prefix output files.
    - sel_contigs: Names of contigs to use for consensus building.
    - best_contig: Name of "best" contig. Will search for this contig's
      sequence and base qualities.
    - out_dir: dir used for temporary results
    - args: stage args.

    Return value:
    A tuple (best_contig_seq, best_contig_quals, consensus_seq, out_bam_name,
    out_fastq_name, out_fasta_name).
    - best_contig_seq/best_contig_quals: the sequence and quals of the best contig
    - consensus_seq: the consensus sequence, or None if there weren't enough
      reads to build a consensus.
    - out_bam_name: Path of BAM with alignments of contigs to consensus seq.
    - out_fastq_name: FASTQ with contig sequences.
    - out_fasta_name: FASTA with consensus sequence.
    """

    best_contig_seq = None
    best_contig_quals = None

    # Input to base quality computation - we don't really need the
    # base qualities because we will replace them by read-based qualities.
    # But we need to do this to get proper alignments of contigs against
    # the consensus.
    out_fastq_name = martian.make_path(clonotype_name + '_contigs.fastq')

    # Input to assembly
    out_bam_name = martian.make_path(clonotype_name + '_contigs.bam')

    # The reference in the output bam doesn't really matter.
    out_bam, _ = tk_bam.create_bam_outfile(out_bam_name, ['chr1'], [1])

    # Read the entire fastq (all contigs) and write the selected contigs to
    # a bam for the assembler and a fastq for the aligner.
    with open(args.contigs_fastq, 'r') as f, open(out_fastq_name, 'w') as out_fq:
        fq_iter = tk_fasta.read_generator_fastq(f)

        for (name, seq, quals) in fq_iter:
            if name in sel_contigs:
                if name == best_contig:
                    best_contig_seq = seq
                    best_contig_quals = quals

                header = cr_fastq.AugmentedFastqHeader(name)
                # Create a pseudo-UMI for each input contig
                header.set_tag(PROCESSED_UMI_TAG, name)
                # Put all reads on the same "barcode". This is important, so
                # the assembler assembles all of them together.
                header.set_tag(PROCESSED_BARCODE_TAG, clonotype_name)

                record = pysam.AlignedRead()
                record.reference_start = 0
                record.reference_id = 0
                # Wrap with str() or pysam will crash when given unicode
                record.qname = str(header.to_string())
                record.seq = seq
                record.qual = quals
                record.flag = MAPPED_UNPAIRED_FLAG

                out_bam.write(record)

                # Now change the tags. The final bam concatenation code will pull
                # the tags out of the header, so we want these to be meaningful.
                # Put the real barcode in the barcode tag. The alignment-base-qual
                # code will ignore it anyway.
                header.set_tag(PROCESSED_BARCODE_TAG, name.split('_')[0])
                tk_fasta.write_read_fastq(out_fq, header.to_string(), seq, quals)

    out_bam.close()
    assert best_contig_seq is not None

    out_fasta_name = martian.make_path(clonotype_name + '_contigs.fasta')

    # Run the assembler to produce a consensus sequence. Read contig-reads from
    # out_bam_name. The resulting sequences will be in
    # out_dir/<clonotype_name>_contigs.fasta. This is the only output of the
    # assembler we care about.
    if len(sel_contigs) >= MIN_CONTIGS_FOR_CONSENSUS:
        cmd = [
            'vdj_asm', 'asm', out_bam_name, out_dir,
            '--single-end',
            '--cons',  # required so we produce a single output sequence
            '--kmers=0',
            '--min-qual=0',
            '--score-factor=0.0',
        ]
        sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

        tk_subproc.check_call(cmd, cwd=os.getcwd())

        with open(os.path.join(out_dir, clonotype_name + '_contigs.fasta'), 'r') as contig_f:
            lines = contig_f.readlines()
            if lines:
                out_seq = lines[1].strip()
            else:
                # In some rare cases (eg. input contigs have 0 quality), assembly might fail.
                out_seq = None
    else:
        out_seq = None

    # Write the consensus (or, failing that, the best contig sequence) to a new
    # fasta. We need to make sure this has the right contig name because this
    # will be the name written in the bam alignments of the contigs against the
    # consensus.
    with open(out_fasta_name, 'w') as f:
        tk_fasta.write_read_fasta(f, clonotype_name,
                                  out_seq if out_seq else best_contig_seq)

    # Now align the same reads that were used in vdj_asm against the consensus
    # we just got. The output will be in out_dir/<clonotype_name>_contigs.bam.
    cmd = [
        'vdj_asm', 'base-quals',
        martian.make_path(clonotype_name + '_contigs'), out_dir,
        '--single-end',
    ]
    sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    # Move the BAM of the contigs aligned against the consensus out of the outs
    # (will overwrite the bam that was already used as input to assembly).
    cr_io.move(os.path.join(out_dir, clonotype_name + '_contigs.bam'), out_bam_name)

    return (best_contig_seq, best_contig_quals, out_seq, out_bam_name,
            out_fastq_name, out_fasta_name)
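# The module-level constants used by the two functions above are defined
# elsewhere; illustrative values only (hypothetical, not the shipped ones):
MIN_CONTIGS_FOR_CONSENSUS = 2   # below this, skip assembly and keep the best contig
MAX_READS_PER_UMI = 1000        # cap on reads sampled per UMI for base quals
MAX_READS_PER_CONTIG = 10000    # cap on reads sampled per contig
MAPPED_UNPAIRED_FLAG = 0        # SAM flag value for a mapped, unpaired read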
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()

    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None

        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)
    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    # get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert args.metric_prefix in reporter.vdj_clonotype_types

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with the
            # clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and
                    best_contig.umi_count < contig.umi_count):
                    best_contig = contig

            assert best_contig is not None

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac', args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header
            # (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert list(tmp_bam.references) == [consensus_id]
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we
                # have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = map(lambda c: c.contig_name,
                                 sorted(sel_contigs,
                                        key=lambda c: c.umi_count,
                                        reverse=True))
                contig_ids = contig_ids[0:MAX_CELLS_FOR_BASE_QUALS]

                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert len(consensus_seq) == len(consensus_quals)

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id, clonotype_id,
                                               consensus_seq, consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1, filter=contig.cdr3_seq is None or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use the "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id, clonotype_id,
                                                   consensus_seq, consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id, consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert len(consensus_seq) == len(consensus_quals)

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                # to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                # reduce the risk of discordance between the consensus and
                # concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name, clonotype_id,
                    ref_seq, 'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end',
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'), rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([consensus_id + '_contigs.fasta',
                      consensus_id + '_contigs.fastq'])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                assert len(outs.chunked_consensus_bams) == len(outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' % n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' % n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
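# rm_files is a small helper assumed to exist in this module; a minimal
# sketch (hypothetical):
import os

def rm_files(filenames):
    # Delete files, tolerating ones that were already removed
    for fn in filenames:
        if os.path.exists(fn):
            os.remove(fn)

# concatenate_bams(out_bam, in_bams) is assumed to concatenate the chunk BAMs
# while merging their single-sequence headers (each per-consensus BAM carries
# its own reference name), e.g. via samtools cat with a rebuilt header; its
# exact implementation is not shown in this section.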
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v
                                 for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    # Lazy load R1
    r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
    read1s = tk_fasta.read_generator_fastq(r1_file)

    # Lazy load R2
    if paired_end:
        r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
        read2s = tk_fasta.read_generator_fastq(r2_file)
    else:
        read2s = []

    # Lazy load corrected BCs
    bc_file = cr_io.open_maybe_gzip(args.bcs)
    bcs = (line.strip() for line in bc_file)

    buckets = {}
    bucket_filenames = {}

    for gem_group, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        bucket_filenames[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
        # Exclude unbarcoded reads
        if barcode == '':
            continue

        # Exclude short reads
        if len(read1[1]) < MIN_READ_LENGTH or \
           (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
            continue

        # Attach processed barcode to reads
        r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
        r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
        r1_new_qname = r1_hdr.to_string()

        if paired_end:
            r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
            r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r2_new_qname = r2_hdr.to_string()

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
        if paired_end:
            buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # and a chunk sees only a single gem group.
        if len(bucket) == 0:
            continue

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = bucket_filenames[bucket_name]
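# itertools.izip_longest pads exhausted iterables with None, which is what
# makes the single-end path work: read2s is an empty list, so read2 is None
# on every iteration and the length check / bucket append skip it. Toy check:
import itertools
assert list(itertools.izip_longest([1, 2], [])) == [(1, None), (2, None)]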
def main(args, outs): """ Trim the reads in a series of fasta files """ chunk = args.chunk subsample_rate = chunk['subsample_rate'] have_barcode = chunk['barcode'] is not None have_sample_index = chunk['sample_index'] is not None # STEP 1: We run the R1/R2 reads through cutadapt, writing them to a temporary file with appropriate adapters # trimmed, optionally filtering out reads where adapters weren't found interleaved = chunk['read2'] is None # can't do discard_untrimmed because we're running cutadapt in single-end mode if args.trim_def['discard_untrimmed']: martian.exit("discard_untrimmed was set in trim_def") if interleaved: trimmed_reads = martian.make_path("trimmed_reads.fastq") trim_info_fn = martian.make_path("trim_info.txt") initial_read_pairs, trimmed_read_pairs = run_cutadapt_single_end( chunk['read1'], trimmed_reads, trim_info_fn, args.trim_def, args.adapters) else: trimmed_r1 = martian.make_path("trimmed_r1.fastq") trimmed_r2 = martian.make_path("trimmed_r2.fastq") trim_info_r1_fn = martian.make_path("trim_info_r1.txt") trim_info_r2_fn = martian.make_path("trim_info_r2.txt") initial1, trimmed1 = run_cutadapt_single_end(chunk['read1'], trimmed_r1, trim_info_r1_fn, args.trim_def, args.adapters, read_id="R1") initial2, trimmed2 = run_cutadapt_single_end(chunk['read2'], trimmed_r2, trim_info_r2_fn, args.trim_def, args.adapters, read_id="R2") initial_read_pairs = initial1 + initial2 trimmed_read_pairs = trimmed1 + trimmed2 if initial1 != initial2: martian.exit( "Input fastq files for R1 and R2 are not the same length") if trimmed1 != trimmed2: raise ValueError( "Cutadapt produced differing numbers of reads for R1 and R2") # STEP 2: We run through the trimmed R1/R2 reads along with sample index and barcode reads, chunking into files of # max_read_num reads or less, and skipping sample index/barcode reads that don't match the trimmed & filtered R1/R2 # reads max_read_num = args.max_read_num file_number = 1 # open the available input read files and get the iterator over them if interleaved: reads_in = open_maybe_gzip(trimmed_reads, 'r') read_iter = tk_fasta.read_generator_fastq(reads_in, paired_end=True) trim_info = open_maybe_gzip(trim_info_fn, 'r') trim_iter = read_generator_trim_info(trim_info, paired_end=True) else: r1_in = open_maybe_gzip(trimmed_r1, 'r') r2_in = open_maybe_gzip(trimmed_r2, 'r') read_iter = ((r1[0], r1[1], r1[2], r2[0], r2[1], r2[2]) for r1, r2 in itertools.izip_longest( tk_fasta.read_generator_fastq(r1_in), tk_fasta.read_generator_fastq(r2_in))) trim_info_r1 = open_maybe_gzip(trim_info_r1_fn, 'r') trim_info_r2 = open_maybe_gzip(trim_info_r2_fn, 'r') trim_iter = (t1 + t2 for t1, t2 in itertools.izip( read_generator_trim_info(trim_info_r1), read_generator_trim_info(trim_info_r2))) # open output read file, which will be interleaved read_name = martian.make_path("read{}.fastq".format(file_number)) out_readfiles = [read_name] out_read_fastq = open(read_name, 'w') # open trimmed read file, which will be interleaved trim_out_name = martian.make_path("TRIM{}.fastq".format(file_number)) out_trimfiles = [trim_out_name] out_trim_fastq = open(trim_out_name, 'w') if args.barcode_whitelist is None: outs.bc_counts = None barcode_indices = None else: barcode_whitelist = sorted( list(load_barcode_whitelist(args.barcode_whitelist))) barcode_indices = { bc: idx for (idx, bc) in enumerate(barcode_whitelist) } bc_counts = np.zeros(len(barcode_whitelist), dtype=np.int32) bad_count = 0 # open barcode file if there is one if have_barcode: bc_name = 
martian.make_path("BC{}.fastq".format(file_number)) out_bc_fastq = open(bc_name, 'w') out_barcodefiles = [bc_name] barcode_read = None bc_in = open_maybe_gzip(chunk['barcode'], 'r') bc_iter = tk_fasta.read_generator_fastq(bc_in) # Determine if barcode sequences need to be reverse complemented. with open_maybe_gzip(chunk['barcode'], 'r') as bc_in2: bc_iter2 = tk_fasta.read_generator_fastq(bc_in2) barcode_whitelist = load_barcode_whitelist(args.barcode_whitelist) barcode_rc = infer_barcode_reverse_complement( barcode_whitelist, bc_iter2) else: out_barcodefiles = [None] outs.bc_counts = None # open sample_index file if there is one if have_sample_index: si_name = martian.make_path("SI{}.fastq".format(file_number)) out_si_fastq = open(si_name, 'w') si_in = open_maybe_gzip(chunk['sample_index'], 'r') sample_index_read = None si_iter = tk_fasta.read_generator_fastq(si_in) out_sampleindex_files = [si_name] else: out_sampleindex_files = [None] read_num = 0 random.seed(0) for (read, trim) in itertools.izip(read_iter, trim_iter): # Downsample (other than the first read). Note we've set a fixed seed to make this deterministic. if read_num > 0 and random.random() > subsample_rate: continue # Now we need to step through the barcode and sample index reads to find the matching reads if have_barcode: try: while barcode_read is None or not read_match( read, barcode_read): barcode_read = bc_iter.next() # reverse complement if all barcodes are RC-ed if barcode_rc: barcode_read = (barcode_read[0], tk_seq.get_rev_comp(barcode_read[1]), barcode_read[2][::-1]) except StopIteration: raise ValueError( "Couldn't find barcode read matching {}".format( get_read_name(read))) if have_sample_index: try: while sample_index_read is None or not read_match( read, sample_index_read): sample_index_read = si_iter.next() except StopIteration: raise ValueError( "Couldn't find sample index read matching {}".format( get_read_name(read))) (name1, seq1, qual1, name2, seq2, qual2) = read (tr_name1, tr_seq1, tr_qual1, tr_name2, tr_seq2, tr_qual2) = trim read_num += 1 if read_num > max_read_num: read_num = 1 file_number += 1 read_name = martian.make_path("read{}.fastq".format(file_number)) out_read_fastq.close() out_read_fastq = open(read_name, 'w') out_readfiles.append(read_name) trim_out_name = martian.make_path( "TRIM{}.fastq".format(file_number)) out_trim_fastq.close() out_trim_fastq = open(trim_out_name, 'w') out_trimfiles.append(trim_out_name) if have_barcode: bc_name = martian.make_path("BC{}.fastq".format(file_number)) out_bc_fastq.close() out_bc_fastq = open(bc_name, 'w') out_barcodefiles.append(bc_name) else: out_barcodefiles.append(None) if have_sample_index: si_name = martian.make_path("SI{}.fastq".format(file_number)) out_si_fastq.close() out_si_fastq = open(si_name, 'w') out_sampleindex_files.append(si_name) else: out_sampleindex_files.append(None) if have_barcode: barcode_seq = barcode_read[1] barcode_qual = barcode_read[2] if barcode_indices is not None: idx = barcode_indices.get(barcode_seq) if idx is not None: bc_counts[idx] += 1 else: bad_count += 1 tk_fasta.write_read_fastq(out_bc_fastq, barcode_read[0], barcode_seq, barcode_qual) if have_sample_index: tk_fasta.write_read_fastq(out_si_fastq, sample_index_read[0], sample_index_read[1], sample_index_read[2]) tk_fasta.write_read_fastq(out_read_fastq, name1, seq1, qual1) tk_fasta.write_read_fastq(out_read_fastq, name2, seq2, qual2) tk_fasta.write_read_fastq(out_trim_fastq, tr_name1, tr_seq1, tr_qual1) tk_fasta.write_read_fastq(out_trim_fastq, tr_name2, tr_seq2, 
tr_qual2) if interleaved: reads_in.close() else: r1_in.close() r2_in.close() if have_barcode: out_bc_fastq.close() # Only emit BC counts if we had a whitelist if outs.bc_counts is not None: result = {} result['bad_bc_count'] = bad_count result['bc_counts'] = list(bc_counts) with open(outs.bc_counts, 'w') as bc_counts_out: tenkit.safe_json.dump_numpy(result, bc_counts_out) with open(outs.read_counts, 'w') as outfile: read_counts = { 'total_read_pairs': initial_read_pairs, 'filtered_read_pairs': trimmed_read_pairs } tenkit.safe_json.dump_numpy(read_counts, outfile) if have_sample_index: out_si_fastq.close() out_read_fastq.close() out_trim_fastq.close() outs.chunks = [ { 'read1': r, # output chunked trimmed read file 'read2': None, 'trim': t, # output chunked trim file 'barcode': bc, # output chunked barcode file 'sample_index': si, # output chunked sample index file 'barcode_reverse_complement': False, # we always keep BC in correct orientation 'reads_interleaved': True, 'gem_group': chunk['gem_group'], 'read_group': chunk['read_group'] } for (r, t, bc, si) in zip(out_readfiles, out_trimfiles, out_barcodefiles, out_sampleindex_files) ]
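# read_match and get_read_name are assumed helpers that compare FASTQ records
# by the cluster ID, i.e. the header up to the first whitespace, which Illumina
# keeps identical across R1/R2/I1/I2 for one cluster. Minimal sketches under
# that assumption (hypothetical; the real helpers may differ):
def get_read_name(read_tuple):
    # read_tuple is (name, seq, qual, ...) as yielded by read_generator_fastq
    return read_tuple[0].split(' ')[0]

def read_match(read_a, read_b):
    # True if the two records came from the same flowcell cluster
    return get_read_name(read_a) == get_read_name(read_b)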
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()

    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    with open(args.annotations) as f:
        contigs = cPickle.load(f)

    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5] == 'True'
            contig_names = fields[6].split(',')
            if good_umi:
                for c in contig_names:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert args.metric_prefix in reporter.vdj_clonotype_types

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if not clonotype_id in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Get the contigs that should be merged
            sel_contigs = set(consensus['cell_contigs'])

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in contigs:
                if contig.contig_name in sel_contigs:
                    for anno in contig.annotations:
                        feature_annotations[anno.feature.region_type].add(
                            anno.feature.gene_name)

                    # Always choose a productive over a non-productive. Between
                    # contigs with the same productivity, choose the one that had more UMIs.
                    if best_contig is None or (not best_contig.productive and contig.productive) or \
                       (best_contig.productive == contig.productive and
                        len(contig_umis[best_contig.contig_name]) <
                        len(contig_umis[contig.contig_name])):
                        best_contig = contig

            assert best_contig is not None

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max([len(feature_annotations[v]) for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support
            ordered_contigs = list(sorted(sel_contigs,
                                          key=lambda x: len(contig_umis[x]),
                                          reverse=True))
            ordered_contigs = ordered_contigs[0:min(MAX_CELLS_FOR_BASE_QUALS,
                                                    len(sel_contigs))]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac', args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # make sure the bam file has the right header
            # (single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            assert list(tmp_bam.references) == [consensus_id]
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we
                # have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, ordered_contigs,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert len(consensus_seq) == len(consensus_quals)

            total_read_count = np.sum([contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum([contig_umi_counts[c] for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id, clonotype_id,
                                               consensus_seq, consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1, filter=contig.cdr3_seq is None or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use the "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id, clonotype_id,
                                                   consensus_seq, consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id, consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert len(consensus_seq) == len(consensus_quals)

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                # to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                # reduce the risk of discordance between the consensus and
                # concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name, clonotype_id,
                    ref_seq, 'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'), tmp_dir,
                    '--single-end',
                    '--global'  # use global alignment if a good seed isn't found - everything must get aligned
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'), rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    in_bam.close()

    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
def main(args, outs):
    # Load barcode whitelist
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist, args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = open(args.read1_chunk)
    in_read2_fastq = open(args.read2_chunk)
    out_read1_fastq = open(outs.corrected_read1s, 'w')
    out_read2_fastq = open(outs.corrected_read2s, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip(tk_fasta.read_generator_fastq(in_read1_fastq),
                                    tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual, barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(processed_bc,
                                                           gem_group=args.gem_group)
                read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG, processed_bc)
                read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG, processed_bc)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        tk_fasta.write_read_fastq(out_read1_fastq, read1_header.to_string(),
                                  read1[1], read1[2])
        tk_fasta.write_read_fastq(out_read2_fastq, read2_header.to_string(),
                                  read2[1], read2[2])

    in_read1_fastq.close()
    in_read2_fastq.close()
    out_read1_fastq.close()
    out_read2_fastq.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
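# The corrected barcode travels in the FASTQ header itself: AugmentedFastqHeader
# appends tag/value pairs to the read name so downstream stages can recover the
# barcode without a sidecar file. Illustrative round trip (the tag separator is
# an internal detail of cr_fastq):
hdr = cr_fastq.AugmentedFastqHeader('read0')
hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, 'ACGTACGT-1')
assert cr_fastq.AugmentedFastqHeader(hdr.to_string()) \
    .get_tag(cr_constants.PROCESSED_BARCODE_TAG) == 'ACGTACGT-1'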
def write_bam_read_fastq(out, read):
    if read.is_reverse:
        seq, qual = tk_seq.get_rev_comp(read.seq), read.qual[::-1]
    else:
        seq, qual = read.seq, read.qual
    tk_fasta.write_read_fastq(out, read.qname, seq, qual)
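# Example use: dump every record of a BAM back to FASTQ in its original
# (pre-alignment) orientation. File names are illustrative:
in_bam = tk_bam.create_bam_infile('contigs.bam')
with open('contigs.fastq', 'w') as fq:
    for read in in_bam:
        write_bam_read_fastq(fq, read)
in_bam.close()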