Example #1
def __init__(self, filenames):
    """ filenames list(str) - list of filenames to write to """
    self.filenames = filenames
    self.cache = tk_cache.FileHandleCache(mode='w')
    self.writers = [
        JsonDictListWriter(self.cache.get(fn)) for fn in filenames
    ]
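This writer builds one `JsonDictListWriter` per output file on top of `tk_cache.FileHandleCache(mode='w')`, which manages the underlying handles. The cache implementation itself is not shown, so here is a rough stdlib-only sketch of the general idea behind such a cache (a bounded number of simultaneously open handles, reopened on demand); the class name `BoundedFileCache`, its `max_open` parameter and the append-on-reopen detail are assumptions for illustration, not tk_cache's actual behaviour:

import collections

class BoundedFileCache(object):
    """Keep at most max_open handles open; reopen evicted write handles in append mode."""

    def __init__(self, mode='w', max_open=64, open_func=open):
        self.mode = mode
        self.max_open = max_open
        self.open_func = open_func
        self._handles = collections.OrderedDict()  # filename -> handle, in LRU order
        self._seen = set()                         # files that have been opened before

    def get(self, filename):
        if filename in self._handles:
            handle = self._handles.pop(filename)
            self._handles[filename] = handle       # re-insert as most recently used
            return handle
        if len(self._handles) >= self.max_open:
            _, oldest = self._handles.popitem(last=False)
            oldest.close()                         # evict the least recently used handle
        # A write file that was evicted earlier must be reopened in append mode
        # so its previous contents are not truncated.
        mode = 'a' if (filename in self._seen and 'w' in self.mode) else self.mode
        handle = self.open_func(filename, mode)
        self._handles[filename] = handle
        self._seen.add(filename)
        return handle

    def close_all(self):
        for handle in self._handles.values():
            handle.close()
        self._handles.clear()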
Example #2
def merge_by_key(bam_filenames, key_func, bam_out):
    file_cache = tk_cache.FileHandleCache(mode='rb', open_func=pysam.Samfile)
    total_reads = 0
    heap = []

    for bam_filename in bam_filenames:
        try:
            bam = file_cache.get(bam_filename)
            first_read = next(bam)
            heapq.heappush(heap,
                           (key_func(first_read), first_read, bam_filename))
        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it to the bam.
        key, read, bam_filename = heapq.heappop(heap)
        bam = file_cache.get(bam_filename)
        bam_out.write(read)
        total_reads += 1

        # Get the next read from the source BAM we just wrote from.
        # If that BAM is exhausted, leave it out of the heap.
        try:
            next_read = next(bam)
            heapq.heappush(heap,
                           (key_func(next_read), next_read, bam_filename))
        except StopIteration:
            pass

    return total_reads
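The function above is a k-way merge: it seeds a heap with the first read from each sorted BAM, then repeatedly pops the read with the smallest key, writes it, and refills the heap from the file that read came from until every input is exhausted. Detached from pysam and tk_cache, the same pattern over plain iterators looks roughly like this (`merge_by_key_iter` and the tie-breaking counter are illustrative additions, not part of the original code):

import heapq
import itertools

def merge_by_key_iter(iterators, key_func):
    """Yield items from several key-sorted iterators in global key order."""
    counter = itertools.count()   # tie-breaker so items themselves are never compared
    heap = []

    # Seed the heap with the first item from each non-empty source.
    iterators = [iter(it) for it in iterators]
    for it in iterators:
        try:
            first = next(it)
            heapq.heappush(heap, (key_func(first), next(counter), first, it))
        except StopIteration:
            pass

    while heap:
        # Pop the globally smallest item, then refill from the same source.
        key, _, item, it = heapq.heappop(heap)
        yield item
        try:
            nxt = next(it)
            heapq.heappush(heap, (key_func(nxt), next(counter), nxt, it))
        except StopIteration:
            pass

# Example: merge three lists that are each sorted by absolute value.
merged = list(merge_by_key_iter([[1, 4, 9], [-2, 3, 10], [0, 5]], key_func=abs))
# merged == [0, 1, -2, 3, 4, 5, 9, 10]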
Example #3
def merge_by_barcode(in_filenames, r1_out_file, r2_out_file, bcs_out_file,
                     paired_end):
    barcodes = set()

    # Note: The filehandle cache precludes the use of compressed files
    file_cache = tk_cache.FileHandleCache(mode='r', open_func=open)
    heap = []

    key_func = vdj_utils.fastq_barcode_sort_key

    for filename in in_filenames:
        try:
            fastq = tk_fasta.read_generator_fastq(file_cache.get(filename),
                                                  paired_end=paired_end)
            first_readpair = next(fastq)

            key = key_func(first_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, first_readpair, filename))

        except StopIteration:
            pass

    while len(heap) > 0:
        # Get the minimum item and write it.
        key, readpair, in_filename = heapq.heappop(heap)

        fastq = tk_fasta.read_generator_fastq(file_cache.get(in_filename),
                                              paired_end=paired_end)

        tk_fasta.write_read_fastq(r1_out_file, *readpair[0:3])
        if paired_end:
            tk_fasta.write_read_fastq(r2_out_file, *readpair[3:6])

        # Get the next item from the source file we just wrote from.
        # If that file is exhausted, leave it out of the heap.
        try:
            next_readpair = next(fastq)

            key = key_func(next_readpair[0:3])
            barcode = key[0]
            barcodes.add(barcode)

            heapq.heappush(heap, (key, next_readpair, in_filename))

        except StopIteration:
            pass

    json.dump(tk_safe_json.json_sanitize(list(barcodes)), bcs_out_file)
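Example 3 applies the same heap merge to FASTQ read pairs, but notice that it rebuilds `read_generator_fastq` from the cached handle on every pop. That works because the handle returned by `file_cache.get` keeps its file position, so each fresh generator resumes at the next unread record. A stdlib-only demonstration of that resume-from-position behaviour (the temporary file and the `line_reader` helper are made up for illustration):

import tempfile

def line_reader(handle):
    # Yield lines from an already-open handle without seeking or closing it.
    for line in handle:
        yield line.rstrip('\n')

with tempfile.TemporaryFile('w+') as tmp:
    tmp.write('read1\nread2\nread3\n')
    tmp.seek(0)
    first = next(line_reader(tmp))    # 'read1'
    second = next(line_reader(tmp))   # a brand-new generator, same handle: 'read2'
    print(first, second)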
Example #4
def main_demultiplex(args, outs):

    do_interleave = True
    file_info = [ IlmnFastqFile(x) for x in args.input_files ]
    file_groups = groupby(lambda x: (x.s, x.lane, x.group), file_info).items()

    demultiplex = args.demultiplex
    read_types = args.read_types
    good_bcs = args.common_bcs

    # Defaults for no interleaving: each read type maps to its own output slot
    interleave_map = list(range(len(read_types)))
    output_reads = read_types

    if "R1" not in read_types or "R2" not in read_types:
        martian.throw("You requested interleaving, but you don't have R1 and R2 read types")

    r1_slot = read_types.index("R1")
    r2_slot = read_types.index("R2")
    interleave_map[r2_slot] = r1_slot
    output_reads = [ read_types[idx] for idx in numpy.unique(interleave_map) ]

    # Create output path
    os.mkdir(outs.demultiplexed_fastq_path)
    output_path = outs.demultiplexed_fastq_path

    # counts of each valid barcode and non-matching barcodes
    summary_counts = { bc:0 for bc in good_bcs }
    summary_counts[DEMULTIPLEX_INVALID_SAMPLE_INDEX] = 0

    with tk_cache.FileHandleCache(open_func=gzip.open) as file_cache:
        # Iterate over the file groups
        for (k, input_files) in file_groups:
            # original path:
            # <path>/<prefix>_S0_L001_R1_001.fastq
            # new path:
            # <outpath>/read-<read_id>_si-xxxxx_lane-<lane>_chunk-<chunk>.fastq
            # input_files should have constant prefix, S, and L
            # sort input_files to match the read_types
            read_to_file_dict = { x.read:x for x in input_files }
            input_files = [ read_to_file_dict[rt] for rt in read_types ]
            output_files = [ read_to_file_dict[rt] for rt in output_reads ]

            def output_file(path, in_file, barcode):
                if do_interleave and in_file.read[0] == "R":
                    read = "RA"
                else:
                    read = in_file.read

                # Chunk over lanes to get some parallelism to speed up alignment
                f = "read-%s_si-%s_lane-%03d-chunk-%03d.fastq.gz" % (read, barcode, in_file.lane, args.chunk_number)
                return os.path.join(path, f)

            if args.rc_i2_read:
                # For NextSeq we need to RC the I2 read
                input_iters = [ FastqParser(f.filename, rc=(f.read == "I2")).read_fastq() for f in input_files ]
            else:
                input_iters = [ FastqParser(f.filename).read_fastq() for f in input_files ]

            martian.log_info("Demultiplexing from: %s" % input_files[0].filename)

            if demultiplex:
                bc_files = { bc: [output_file(output_path, f, bc) for f in output_files] for bc in good_bcs }
                err_files = [ output_file(output_path, f, "X") for f in output_files ]
                process_fastq_chunk(input_iters, bc_files, err_files, file_cache, interleave_map, summary_counts)

            else:
                out_files = [ output_file(output_path, f, 'X') for f in output_files ]
                process_fastq_chunk_no_demult(input_iters, out_files, file_cache, interleave_map, summary_counts)

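        # have_opened records every file the cache opened during processing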
        output_files = file_cache.have_opened

    # Write out the summary counts to JSON
    with open(outs.demultiplex_summary, "w") as f:
        json.dump(summary_counts, f)
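Example 4 drives `FileHandleCache` as a context manager with `gzip.open`, so the demultiplexer can spread reads across many barcode/lane output files without tracking open handles itself, and `have_opened` reports which files were actually produced. A minimal sketch of that usage pattern follows; the import path for tk_cache, the record layout and the path scheme are assumptions for illustration, not taken from the stage code:

import gzip
import tenkit.cache as tk_cache   # assumed import path for FileHandleCache

def write_records_by_key(records, output_dir):
    """Fan (key, line) records out to one gzipped file per key through the handle cache."""
    with tk_cache.FileHandleCache(mode='w', open_func=gzip.open) as file_cache:
        for key, line in records:
            path = '%s/part-%s.txt.gz' % (output_dir, key)
            # get() lazily opens (or re-opens) the handle for this path.
            file_cache.get(path).write((line + '\n').encode())
        # Paths the cache actually opened, e.g. for a downstream manifest.
        opened = file_cache.have_opened
    return opened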