# Imports assumed by the excerpts below (module paths follow the 10x
# tenkit/cellranger layout; JsonDictListWriter, IlmnFastqFile, FastqParser,
# DEMULTIPLEX_INVALID_SAMPLE_INDEX and the process_fastq_chunk* helpers are
# defined elsewhere in the package).
import gzip
import heapq
import itertools
import json
import os

import numpy
import pysam

import martian
import tenkit.cache as tk_cache
import tenkit.fasta as tk_fasta
import tenkit.safe_json as tk_safe_json
import cellranger.vdj.utils as vdj_utils

# Dict-style grouping helper: groupby(key_func, seq) -> {key: [items]}.
# toolz.groupby has exactly this signature; the repo may ship its own equivalent.
from toolz import groupby


def __init__(self, filenames):
    """ filenames list(str) - list of filenames to write to """
    self.filenames = filenames
    # Cache the write handles so we can fan out over many files without
    # exhausting the process's open-file-descriptor limit.
    self.cache = tk_cache.FileHandleCache(mode='w')
    self.writers = [JsonDictListWriter(self.cache.get(fn)) for fn in filenames]
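
# ---------------------------------------------------------------------------
# Illustrative sketch (not from this repo): the handle cache used above is
# what lets a writer fan out across hundreds of files without hitting the OS
# open-file limit. TinyHandleCache below is a hypothetical, minimal LRU
# stand-in for tk_cache.FileHandleCache -- the class name, max_open parameter,
# and append-mode-reopen behavior are assumptions for illustration, not
# tenkit's actual API.
# ---------------------------------------------------------------------------
from collections import OrderedDict

class TinyHandleCache:
    """Keep at most max_open handles; files evicted and requested again are
    reopened in append mode so their earlier writes are preserved."""

    def __init__(self, mode='w', open_func=open, max_open=100):
        self.mode = mode
        self.open_func = open_func
        self.max_open = max_open
        self.handles = OrderedDict()  # filename -> open handle, in LRU order
        self.have_opened = set()      # every filename ever opened

    def get(self, filename):
        if filename in self.handles:
            self.handles.move_to_end(filename)  # mark most recently used
            return self.handles[filename]
        if len(self.handles) >= self.max_open:
            _, oldest = self.handles.popitem(last=False)  # evict LRU handle
            oldest.close()
        # Reopen previously written files in append mode, not write mode.
        mode = self.mode.replace('w', 'a') if filename in self.have_opened else self.mode
        self.have_opened.add(filename)
        handle = self.open_func(filename, mode)
        self.handles[filename] = handle
        return handle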
def merge_by_key(bam_filenames, key_func, bam_out):
    file_cache = tk_cache.FileHandleCache(mode='rb', open_func=pysam.Samfile)
    total_reads = 0
    heap = []

    # Monotonic tiebreaker: on equal keys the heap would otherwise compare
    # the read objects themselves, which pysam reads do not support.
    tiebreak = itertools.count()

    # Seed the heap with the first read from each input BAM.
    for bam_filename in bam_filenames:
        try:
            bam = file_cache.get(bam_filename)
            first_read = next(bam)
            heapq.heappush(heap, (key_func(first_read), next(tiebreak),
                                  first_read, bam_filename))
        except StopIteration:
            # Empty input file -- nothing to merge from it.
            pass

    while len(heap) > 0:
        # Get the minimum item and write it to the bam.
        key, _, read, bam_filename = heapq.heappop(heap)
        bam = file_cache.get(bam_filename)
        bam_out.write(read)
        total_reads += 1

        # Get the next read from the source bam we just wrote from.
        # If that BAM file is out of reads, then we leave that one out.
        try:
            next_read = next(bam)
            heapq.heappush(heap, (key_func(next_read), next(tiebreak),
                                  next_read, bam_filename))
        except StopIteration:
            pass

    return total_reads
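
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical filenames): k-way merge of BAM chunks produced
# by parallel stages. merge_by_key only requires that every input already be
# sorted by the same key_func; a (tid, pos) key keeps the merged output
# coordinate-sorted when each chunk is itself coordinate-sorted.
# ---------------------------------------------------------------------------
chunks = ['chunk_0.bam', 'chunk_1.bam', 'chunk_2.bam']

template = pysam.Samfile(chunks[0], 'rb')
bam_out = pysam.Samfile('merged.bam', 'wb', template=template)

total = merge_by_key(chunks, lambda read: (read.tid, read.pos), bam_out)
bam_out.close()
print('merged %d reads' % total)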
def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file,
                     paired_end):
    barcodes = set()

    # Note: The filehandle cache precludes the use of compressed files
    file_cache = tk_cache.FileHandleCache(mode='r', open_func=open)
    heap = []

    key_func = vdj_utils.fastq_barcode_sort_key

    # Seed the heap with the first read (pair) from each input FASTQ.
    for filename in in_filenames:
        try:
            fastq = tk_fasta.read_generator_fastq(file_cache.get(filename),
                                                  paired_end=paired_end)
            first_readpair = next(fastq)
            key = key_func(first_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)
            heapq.heappush(heap, (key, first_readpair, filename))
        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it.
        key, readpair, in_filename = heapq.heappop(heap)

        # Re-wrapping the cached handle in a fresh generator is safe because
        # the underlying file position persists across file_cache.get() calls.
        fastq = tk_fasta.read_generator_fastq(file_cache.get(in_filename),
                                              paired_end=paired_end)

        tk_fasta.write_read_fastq(r1_out_file, *readpair[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *readpair[3:6])

        # Get the next item from the source file we just wrote from.
        # If that file is out of items, then we leave that one out.
        try:
            next_readpair = next(fastq)
            key = key_func(next_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)
            heapq.heappush(heap, (key, next_readpair, in_filename))
        except StopIteration:
            pass

    json.dump(tk_safe_json.json_sanitize(list(barcodes)), bcs_out_file)
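
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical filenames): each input chunk must already be
# sorted by vdj_utils.fastq_barcode_sort_key, and inputs must be uncompressed
# (see the note about the filehandle cache above).
# ---------------------------------------------------------------------------
chunks = ['reads_chunk_0.fastq', 'reads_chunk_1.fastq']

with open('merged_R1.fastq', 'w') as r1_out, \
     open('merged_R2.fastq', 'w') as r2_out, \
     open('barcodes.json', 'w') as bcs_out:
    merge_by_barcode(chunks, r1_out, r2_out, bcs_out, paired_end=True)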
def main_demultiplex(args, outs):
    do_interleave = True

    file_info = [IlmnFastqFile(x) for x in args.input_files]
    # Group files by (sample, lane, group); groupby returns {key: [files]}.
    file_groups = groupby(lambda x: (x.s, x.lane, x.group), file_info).items()

    demultiplex = args.demultiplex
    read_types = args.read_types
    good_bcs = args.common_bcs

    # For no interleaving: identity mapping from input slot to output slot.
    # Must be a list, not a range, because we assign into it below.
    interleave_map = list(range(len(read_types)))
    output_reads = read_types

    if "R1" not in read_types or "R2" not in read_types:
        martian.throw("You requested interleaving, but you don't have R1 and R2 read types")

    # Route R2 into R1's output slot so the pair lands in one interleaved file.
    r1_slot = read_types.index("R1")
    r2_slot = read_types.index("R2")
    interleave_map[r2_slot] = r1_slot
    output_reads = [read_types[idx] for idx in numpy.unique(interleave_map)]

    # Create output path
    os.mkdir(outs.demultiplexed_fastq_path)
    output_path = outs.demultiplexed_fastq_path

    # Counts of each valid barcode and of non-matching barcodes
    summary_counts = {bc: 0 for bc in good_bcs}
    summary_counts[DEMULTIPLEX_INVALID_SAMPLE_INDEX] = 0

    with tk_cache.FileHandleCache(open_func=gzip.open) as file_cache:
        # Iterate over the file groups
        for (k, input_files) in file_groups:
            # original path:
            #   <path>/<prefix>_S0_L001_R1_001.fastq
            # new path:
            #   <outpath>/read-<read_id>_si-xxxxx_lane-<lane>_chunk-<chunk>.fastq
            # input_files should have constant prefix, S, and L

            # Sort input_files to match the read_types
            read_to_file_dict = {x.read: x for x in input_files}
            input_files = [read_to_file_dict[rt] for rt in read_types]
            output_files = [read_to_file_dict[rt] for rt in output_reads]

            def output_file(path, in_file, barcode):
                if do_interleave and in_file.read[0] == "R":
                    read = "RA"
                else:
                    read = in_file.read
                # Chunk over lanes to get some parallelism to speed up alignment
                f = "read-%s_si-%s_lane-%03d-chunk-%03d.fastq.gz" % \
                    (read, barcode, in_file.lane, args.chunk_number)
                return os.path.join(path, f)

            if args.rc_i2_read:
                # For NextSeq we need to RC the I2 read
                input_iters = [FastqParser(f.filename, rc=(f.read == "I2")).read_fastq()
                               for f in input_files]
            else:
                input_iters = [FastqParser(f.filename).read_fastq()
                               for f in input_files]

            martian.log_info("Demultiplexing from: %s" % input_files[0].filename)

            if demultiplex:
                bc_files = {bc: [output_file(output_path, f, bc) for f in output_files]
                            for bc in good_bcs}
                err_files = [output_file(output_path, f, "X") for f in output_files]
                process_fastq_chunk(input_iters, bc_files, err_files, file_cache,
                                    interleave_map, summary_counts)
            else:
                out_files = [output_file(output_path, f, 'X') for f in output_files]
                process_fastq_chunk_no_demult(input_iters, out_files, file_cache,
                                              interleave_map, summary_counts)

        # Every output path the cache actually opened during this chunk.
        output_files = file_cache.have_opened

    # Write out the summary counts to JSON
    with open(outs.demultiplex_summary, "w") as f:
        json.dump(summary_counts, f)
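
# ---------------------------------------------------------------------------
# Worked example (illustrative read types, not from the source): how
# interleave_map routes R2 into R1's output slot while leaving index reads
# alone.
# ---------------------------------------------------------------------------
read_types = ["R1", "R2", "I1"]

interleave_map = list(range(len(read_types)))     # [0, 1, 2]: identity routing
r2_slot = read_types.index("R2")                  # 1
interleave_map[r2_slot] = read_types.index("R1")  # [0, 0, 2]: R2 -> R1's slot

# numpy.unique collapses the duplicated slot, leaving one entry per output file.
output_reads = [read_types[i] for i in numpy.unique(interleave_map)]
print(output_reads)  # ['R1', 'I1'] -- the R1 slot becomes the interleaved "RA" file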