def get_cellular_barcodes(data):
    """Return the cellular barcode files to use for a sample.

    Barcodes reported by ``dd.get_cellular_barcodes(data)`` take precedence
    and are returned unchanged. Otherwise, when ``is_supported_transform(data)``
    is true, the candidate files ``<umi_type>-cb1.txt`` and
    ``<umi_type>-cb2.txt`` under ``TRANSFORM_DIR`` are checked with
    ``file_exists`` and the ones that pass are returned.

    Args:
        data: sample dictionary understood by the ``dd`` accessors.

    Returns:
        The configured barcodes as-is, or a list (possibly empty) of the
        candidate barcode files that exist.
    """
    # Look the configured value up once instead of once per use.
    configured = dd.get_cellular_barcodes(data)
    if configured:
        return configured
    if not is_supported_transform(data):
        return []
    stem = dd.get_umi_type(data)
    candidates = [os.path.join(TRANSFORM_DIR, stem + "-cb%d.txt" % index)
                  for index in (1, 2)]
    # Build a real list: on Python 3 ``filter`` returns a lazy iterator that
    # is always truthy and has no len(), unlike the [] of the other branch.
    return [path for path in candidates if file_exists(path)]
def get_cellular_barcodes(data):
    """Return the cellular barcode files to use for a sample.

    Barcodes reported by ``dd.get_cellular_barcodes(data)`` take precedence
    and are returned unchanged. Otherwise, when ``is_supported_transform(data)``
    is true, the candidate files ``<umi_type>-cb1.txt``, ``<umi_type>-cb2.txt``
    and ``<umi_type>-cb3.txt`` under ``TRANSFORM_DIR`` are checked with
    ``file_exists`` and the ones that pass are returned.

    Args:
        data: sample dictionary understood by the ``dd`` accessors.

    Returns:
        The configured barcodes as-is, or a list (possibly empty) of the
        candidate barcode files that exist.
    """
    # Look the configured value up once instead of once per use.
    configured = dd.get_cellular_barcodes(data)
    if configured:
        return configured
    if not is_supported_transform(data):
        return []
    stem = dd.get_umi_type(data)
    candidates = [os.path.join(TRANSFORM_DIR, stem + "-cb%d.txt" % index)
                  for index in (1, 2, 3)]
    # Build a real list: on Python 3 ``filter`` returns a lazy iterator that
    # is always truthy and has no len(), unlike the [] of the other branch.
    return [path for path in candidates if file_exists(path)]
def umi_transform(data):
    """
    Rewrite the sample's reads so that the barcode and UMI identified for
    each read are carried in the read name.

    The transform definition looked up in ``transforms`` is written to
    ``<work_dir>/umis/<umi_type>.json`` and handed to ``umis fastqtransform``;
    the result is stored as ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.
    When that file already exists the command is not run again.

    Returns ``[[data]]`` with ``data["files"]`` replaced by the single
    transformed FASTQ path.
    """
    fq1, fq2 = dd.get_input_sequence_files(data)
    if not fq2:
        # A missing second read becomes an empty command-line argument.
        fq2 = ""
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    transform = dd.get_umi_type(data)
    transform_data = transforms[transform]
    safe_makedir(umi_dir)
    transform_file = write_transform_file(
        transform_data, os.path.join(umi_dir, transform + ".json"))
    out_file = os.path.join(
        umi_dir, dd.get_sample_name(data) + ".umitransformed.fq.gz")

    if not file_exists(out_file):
        if transform_data["dual"]:
            index_option = "--dual_index"
        else:
            index_option = ""
        two_barcodes = len(dd.get_cellular_barcodes(data)) == 2
        split_option = "--separate_cb" if two_barcodes else ""
        umis = config_utils.get_program("umis", data, default="umis")
        cmd = ("{umis} fastqtransform {index_option} {split_option} "
               "{transform_file} "
               "{fq1} {fq2} "
               "| seqtk seq -L 20 - | gzip > {tx_out_file}")
        message = ("Inserting UMI and barcode information into the read "
                   "name of %s" % fq1)
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(umis=umis, index_option=index_option,
                              split_option=split_option,
                              transform_file=transform_file,
                              fq1=fq1, fq2=fq2, tx_out_file=tx_out_file),
                   message)

    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """
    Tag every read with its barcode and UMI by moving that information into
    the read name.

    Runs ``umis fastqtransform`` with the transform registered in
    ``transforms`` under the sample's UMI type, writing
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``. An existing output
    file is reused without running the command.

    Returns ``[[data]]`` whose ``data["files"]`` holds only the transformed
    FASTQ path.
    """
    read1, read2 = dd.get_input_sequence_files(data)
    read2 = read2 or ""
    work_umis = os.path.join(dd.get_work_dir(data), "umis")
    umi_type = dd.get_umi_type(data)
    spec = transforms[umi_type]
    safe_makedir(work_umis)
    spec_file = write_transform_file(
        spec, os.path.join(work_umis, umi_type + ".json"))
    transformed = os.path.join(
        work_umis, dd.get_sample_name(data) + ".umitransformed.fq.gz")

    # Guard clause: nothing to do when the output is already present.
    if file_exists(transformed):
        data["files"] = [transformed]
        return [[data]]

    dual_flag = "--dual_index" if spec["dual"] else ""
    separate_flag = ""
    if len(dd.get_cellular_barcodes(data)) == 2:
        separate_flag = "--separate_cb"
    umis_exe = config_utils.get_program("umis", data, default="umis")
    template = ("{umis} fastqtransform {index_option} {split_option} "
                "{transform_file} "
                "{fq1} {fq2} "
                "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    note = ("Inserting UMI and barcode information into the read name of %s"
            % read1)
    with file_transaction(transformed) as tx_out_file:
        command = template.format(
            umis=umis_exe,
            index_option=dual_flag,
            split_option=separate_flag,
            transform_file=spec_file,
            fq1=read1,
            fq2=read2,
            tx_out_file=tx_out_file)
        do.run(command, note)
    data["files"] = [transformed]
    return [[data]]
def umi_transform(data):
    """
    Transform each read by identifying its barcode and UMI and putting that
    information in the read name.

    Up to four input FASTQ files are taken from ``data["files"]``. The value
    of ``dd.get_umi_type(data)`` is used either as the path of an existing
    transform file or as a key of ``transforms``; in the latter case the
    definition is written to ``<work_dir>/umis/<transform>.json``. The
    transformed reads are written to
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    Returns:
        ``[[data]]`` with ``data["files"]`` set to the transformed FASTQ,
        or to the first input FASTQ when its first line already contains
        ``UMI_`` and no transformation is run.

    Exits the process with status 1 (after logging an error) when the
    transform is neither an existing file nor a key of ``transforms``.
    """
    # Pad a copy: extending data["files"] in place would leave "" placeholders
    # in a list other code may still hold a reference to.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_data = transforms.get(transform, "")
        if not transform_data:
            # The message has a %s placeholder; supply the transform so the
            # user can see which value was rejected.
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio." % transform)
            sys.exit(1)
        transform_file = os.path.join(umi_dir, transform + ".json")
        transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        # next() works on both Python 2 and 3, unlike the .next() method.
        read = next(in_handle)
    if "UMI_" in read:
        # out_file is known not to exist at this point, so pass the
        # already-transformed input on rather than a missing path.
        data["files"] = [fq1]
        return [[data]]
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """
    Transform each read by identifying its barcode and UMI and putting that
    information in the read name.

    Up to four input FASTQ files are taken from ``data["files"]``. The value
    of ``dd.get_umi_type(data)`` is used either as the path of an existing
    transform file or as a key of ``transforms``; in the latter case the
    definition is written to ``<work_dir>/umis/<transform>.json``. The
    transformed reads are written to
    ``<work_dir>/umis/<sample>.umitransformed.fq.gz``.

    Returns:
        ``[[data]]`` with ``data["files"]`` set to the transformed FASTQ,
        or to the first input FASTQ when its first line already contains
        ``UMI_`` and no transformation is run.

    Exits the process with status 1 (after logging an error) when the
    transform is neither an existing file nor a key of ``transforms``.
    """
    # Pad a copy: extending data["files"] in place would leave "" placeholders
    # in a list other code may still hold a reference to.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_data = transforms.get(transform, "")
        if not transform_data:
            # The message has a %s placeholder; supply the transform so the
            # user can see which value was rejected.
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio." % transform)
            sys.exit(1)
        transform_file = os.path.join(umi_dir, transform + ".json")
        transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        # next() works on both Python 2 and 3, unlike the .next() method.
        read = next(in_handle)
    if "UMI_" in read:
        # out_file is known not to exist at this point, so pass the
        # already-transformed input on rather than a missing path.
        data["files"] = [fq1]
        return [[data]]
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
def filter_barcodes(data):
    """Run ``umis cb_filter`` on the sample's first FASTQ file.

    The barcodes come from ``dd.get_cellular_barcodes(data)`` and may be a
    single string or a sequence of one or two entries, passed to the command
    as ``--bc1`` / ``--bc2``. When a first barcode is set and
    ``dd.get_cellular_barcode_correction(data)`` is truthy, it is passed as
    ``--nedit``. Output goes to ``<work_dir>/umis/<sample>.filtered.fq.gz``
    and is reused if it already exists.

    Returns:
        ``[[data]]``; ``data["files"]`` is set to the filtered FASTQ unless
        no barcodes are configured, in which case ``data`` is untouched.
    """
    fq1 = dd.get_input_sequence_files(data)[0]
    correction = dd.get_cellular_barcode_correction(data)
    bc = dd.get_cellular_barcodes(data)
    if not bc:
        return [[data]]

    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring

    bc1 = None
    bc2 = None
    # elif chain: a bare string must not fall through to the length checks,
    # where a one- or two-character string would be indexed like a sequence.
    if isinstance(bc, string_types):
        bc1 = bc
    elif len(bc) == 1:
        bc1 = bc[0]
    elif len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]

    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    # Gzipped input is decompressed on the fly via process substitution.
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"
    # Creating the per-sample directory also creates umi_dir if needed.
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(umis=umis, ncores=ncores, bc1=bc1, bc2=bc2,
                          correction=correction, fq1_cmd=fq1_cmd,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
def filter_barcodes(data):
    """Run ``umis cb_filter`` on the sample's first FASTQ file.

    The barcodes come from ``dd.get_cellular_barcodes(data)`` and may be a
    single string or a sequence of one or two entries, passed to the command
    as ``--bc1`` / ``--bc2``. When a first barcode is set and
    ``dd.get_cellular_barcode_correction(data)`` is truthy, it is passed as
    ``--nedit``. Output goes to ``<work_dir>/umis/<sample>.filtered.fq.gz``
    and is reused if it already exists.

    Returns:
        ``[[data]]``; ``data["files"]`` is set to the filtered FASTQ unless
        no barcodes are configured, in which case ``data`` is untouched.
    """
    fq1 = dd.get_input_sequence_files(data)[0]
    correction = dd.get_cellular_barcode_correction(data)
    bc = dd.get_cellular_barcodes(data)
    if not bc:
        return [[data]]

    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3 has no basestring

    bc1 = None
    bc2 = None
    # elif chain: a bare string must not fall through to the length checks,
    # where a one- or two-character string would be indexed like a sequence.
    if isinstance(bc, string_types):
        bc1 = bc
    elif len(bc) == 1:
        bc1 = bc[0]
    elif len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]

    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    # Gzipped input is decompressed on the fly via process substitution.
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"
    # Creating the per-sample directory also creates umi_dir if needed.
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(umis=umis, ncores=ncores, bc1=bc1, bc2=bc2,
                          correction=correction, fq1_cmd=fq1_cmd,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]