def _detect_fastq_format(in_file, MAX_RECORDS=1000):
    """
    Narrow down which quality encodings are compatible with a FASTQ file.

    Looks at every fourth line of ``in_file`` starting from the fourth one
    (the quality line of each four-line record) and discards every encoding
    whose ASCII range does not cover the characters seen on that line.

    in_file -- FASTQ file, opened through ``open_fastq``.
    MAX_RECORDS -- upper bound on the number of records inspected.

    Returns the set of encoding names that are still possible. Scanning
    stops early as soon as a single candidate remains.
    """
    ranges = {
        "sanger": (33, 126),
        "solexa": (59, 126),
        "illumina_1.3+": (64, 126),
        "illumina_1.5+": (66, 126),
    }
    possible = set(ranges)
    with closing(open_fastq(in_file)) as in_handle:
        quality_lines = itertools.islice(in_handle, 3, None, 4)
        for count, line in enumerate(quality_lines):
            # stop once the answer is unambiguous or the record budget is
            # used up; ``>=`` keeps the scan to exactly MAX_RECORDS records
            if len(possible) == 1 or count >= MAX_RECORDS:
                break
            vals = [ord(c) for c in line.rstrip()]
            # if there is a short sequence, skip it (it still counts
            # towards MAX_RECORDS)
            if len(vals) < 20:
                continue
            lmin = min(vals)
            lmax = max(vals)
            for encoding, (emin, emax) in ranges.items():
                if encoding in possible and (lmin < emin or lmax > emax):
                    possible.remove(encoding)
    return possible
def _detect_fastq_format(in_file, MAX_RECORDS=1000):
    """
    Narrow down which quality encodings are compatible with a FASTQ file.

    Looks at every fourth line of ``in_file`` starting from the fourth one
    (the quality line of each four-line record) and discards every encoding
    whose ASCII range does not cover the characters seen on that line.

    in_file -- FASTQ file, opened through ``open_fastq``.
    MAX_RECORDS -- upper bound on the number of records inspected.

    Returns the set of encoding names that are still possible. Scanning
    stops early as soon as a single candidate remains.
    """
    ranges = {
        "sanger": (33, 126),
        "solexa": (59, 126),
        "illumina_1.3+": (64, 126),
        "illumina_1.5+": (66, 126),
    }
    possible = set(ranges)
    with closing(open_fastq(in_file)) as in_handle:
        quality_lines = itertools.islice(in_handle, 3, None, 4)
        for count, line in enumerate(quality_lines):
            # stop once the answer is unambiguous or the record budget is
            # used up; ``>=`` keeps the scan to exactly MAX_RECORDS records
            if len(possible) == 1 or count >= MAX_RECORDS:
                break
            vals = [ord(c) for c in line.rstrip()]
            # if there is a short sequence, skip it (it still counts
            # towards MAX_RECORDS)
            if len(vals) < 20:
                continue
            lmin = min(vals)
            lmax = max(vals)
            for encoding, (emin, emax) in ranges.items():
                if encoding in possible and (lmin < emin or lmax > emax):
                    possible.remove(encoding)
    return possible
def demultiplex_samples(data):
    """
    Split a fastqtransformed FASTQ file into per-sample-barcode files.

    Exits with status 1 when ``data["files"]`` holds two files, since
    paired-end input is not handled. Returns ``[[data]]`` untouched when the
    first line of the FASTQ carries no "SAMPLE_" tag. Otherwise returns a
    one-element list holding the result of ``split_demultiplexed_sampledata``
    for the files found in the sample's "demultiplexed" directory, running
    ``umis demultiplex_samples`` first if that directory has no "*.fq*" files.
    """
    umis_root = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(umis_root, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")
    fastqs = data["files"]
    if len(fastqs) == 2:
        logger.error(
            "Sample demultiplexing doesn't handle paired-end reads, but "
            "we can add it. Open an issue here https://github.com/bcbio/bcbio-nextgen/issues if you need this and we'll add it."
        )
        sys.exit(1)
    fq1 = fastqs[0]

    # only reads tagged with a sample barcode need demultiplexing
    with open_fastq(fq1) as in_handle:
        first_line = next(in_handle)
    if "SAMPLE_" not in first_line:
        return [[data]]

    bcfile = get_sample_barcodes(dd.get_sample_barcodes(data), sample_dir)
    fq_pattern = os.path.join(demulti_dir, "*.fq*")

    # reuse the output of an earlier run when it is already there
    already_split = glob.glob(fq_pattern)
    if already_split:
        return [split_demultiplexed_sampledata(data, already_split)]

    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(umis=umis, bcfile=bcfile, tx_dir=tx_dir, fq1=fq1),
               msg.format(fq1=fq1))
    return [split_demultiplexed_sampledata(data, glob.glob(fq_pattern))]
def demultiplex_samples(data):
    """
    demultiplex a fastqtransformed FASTQ file into separate sample
    barcode files

    Exits with status 1 when ``data["files"]`` holds two files (paired-end
    input is not handled) or when no sample barcode file is configured.
    Returns ``[[data]]`` untouched when the first line of the FASTQ carries
    no "SAMPLE_" tag. Otherwise returns a one-element list holding the
    result of ``split_demultiplexed_sampledata`` for the files found in the
    sample's "demultiplexed" directory, running ``umis demultiplex_samples``
    first if that directory has no "*.fq*" files.
    """
    files = data["files"]
    if len(files) == 2:
        logger.error("Sample demultiplexing doesn't handle paired-end reads, but "
                     "we can add it. Open an issue here https://github.com/chapmanb/bcbio-nextgen/issues if you need this and we'll add it.")
        sys.exit(1)
    else:
        fq1 = files[0]
    # check if samples need to be demultiplexed
    with open_fastq(fq1) as in_handle:
        # the next() builtin works on Python 2.6+ and 3, whereas the
        # ``.next()`` method only exists on Python 2 iterators
        read = next(in_handle)
        if "SAMPLE_" not in read:
            return [[data]]

    bcfile = dd.get_sample_barcodes(data)
    if not bcfile:
        logger.error("Sample demultiplexing needs a list of known indexes provided "
                     "with via the sample_barcodes option in the algorithm section.")
        sys.exit(1)
    work_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")
    # reuse the output of an earlier run when it is already there
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    if demultiplexed:
        return [split_demultiplexed_sampledata(data, demultiplexed)]
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(umis=umis, bcfile=bcfile, tx_dir=tx_dir, fq1=fq1),
               msg.format(fq1=fq1))
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    return [split_demultiplexed_sampledata(data, demultiplexed)]
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name

    Works on the first file in ``data["files"]``. Returns ``data``, with
    ``data["files"]`` replaced by a one-element list in every case except
    when no transform is configured and the input does not look
    pre-transformed (then ``data`` is returned unchanged). Exits with
    status 1 when the configured transform cannot be resolved to an
    existing file.
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed. Assuming non-umi data." % fq1)
            return data

    # the transform is either a path or the name of a bundled transform
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # reuse the output of an earlier run
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            # out_file was just found not to exist, so hand the already
            # transformed input on instead of a path to a missing file
            data["files"] = [fq1]
            return data
    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, transform_file=transform_file,
                          cores=cores, fq1=fq1, tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return data
def is_transformed(fastq):
    """
    Report whether a FASTQ file looks like it was already processed by umis.

    Scans at most the first 400 lines (100 four-line reads) and returns
    True as soon as one of them contains "UMI_"; returns False otherwise.
    """
    with open_fastq(fastq) as in_handle:
        leading_lines = islice(in_handle, 400)
        return any("UMI_" in line for line in leading_lines)
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name

    Accepts up to four input files in ``data["files"]``. Returns
    ``[[data]]`` with ``data["files"]`` replaced by a one-element list.
    Exits with status 1 when the configured transform cannot be resolved
    to an existing file.
    """
    # pad a copy out to four entries; extending data["files"] itself would
    # leave empty strings in the caller's list
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    # the transform is either a path or the name of a bundled transform
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (transform_file, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # reuse the output of an earlier run
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        # the next() builtin works on Python 2.6+ and 3, whereas the
        # ``.next()`` method only exists on Python 2 iterators
        read = next(in_handle)
        if "UMI_" in read:
            # out_file was just found not to exist, so hand the already
            # transformed input on instead of a path to a missing file
            data["files"] = [fq1]
            return [[data]]
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name

    Accepts up to four input files in ``data["files"]``. Returns
    ``[[data]]`` with ``data["files"]`` replaced by a one-element list.
    Exits with status 1 when the configured transform cannot be resolved
    to an existing file.
    """
    # pad a copy out to four entries; extending data["files"] itself would
    # leave empty strings in the caller's list
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    # the transform is either a path or the name of a bundled transform
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # reuse the output of an earlier run
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        # the next() builtin works on Python 2.6+ and 3, whereas the
        # ``.next()`` method only exists on Python 2 iterators
        read = next(in_handle)
        if "UMI_" in read:
            # out_file was just found not to exist, so hand the already
            # transformed input on instead of a path to a missing file
            data["files"] = [fq1]
            return [[data]]
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name

    Accepts up to four input files in ``data["files"]``. When the
    configured transform is not an existing file it is looked up in
    ``transforms`` and written to a JSON file in the "umis" work directory.
    Returns ``[[data]]`` with ``data["files"]`` replaced by a one-element
    list.
    """
    # pad a copy out to four entries; extending data["files"] itself would
    # leave empty strings in the caller's list
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_data = transforms[transform]
        transform_file = os.path.join(umi_dir, transform + ".json")
        transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # reuse the output of an earlier run
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        # the next() builtin works on Python 2.6+ and 3, whereas the
        # ``.next()`` method only exists on Python 2 iterators
        read = next(in_handle)
        if "UMI_" in read:
            # out_file was just found not to exist, so hand the already
            # transformed input on instead of a path to a missing file
            data["files"] = [fq1]
            return [[data]]
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name

    Accepts up to four input files in ``data["files"]``. Returns
    ``[[data]]`` with ``data["files"]`` replaced by a one-element list.
    Exits with status 1 when no transform is configured and the input does
    not look pre-transformed, or when the configured transform cannot be
    resolved to an existing file.
    """
    # pad a copy out to four entries; extending data["files"] itself would
    # leave empty strings in the caller's list
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error(
                "No UMI transform was specified, but %s does not look "
                "pre-transformed." % fq1)
            sys.exit(1)
    # the transform is either a path or the name of a bundled transform
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    # reuse the output of an earlier run
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    # demultiplexed input overrides the split option
    if dd.get_demultiplexed(data):
        demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data)
        split_option = ""
    else:
        demuxed_option = ""
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            # out_file was just found not to exist, so hand the already
            # transformed input on instead of a path to a missing file
            data["files"] = [fq1]
            return [[data]]
    # NOTE(review): locale_export is not referenced by the command template
    # below; the call is kept because utils.locale_export is not visible
    # here -- confirm it has no side effects and drop it
    locale_export = utils.locale_export()
    umis = _umis_cmd(data)
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} {demuxed_option} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          demuxed_option=demuxed_option,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]