def test_5_find_fastq_pairs(self):
    """Check that fastq.combine_pairs groups read 1 / read 2 files together.

    Exercises lane-split names whose mates live in different directories,
    followed by a plain ``_1``/``_2`` pair with a ``.txt`` extension.
    """
    lane2_r1 = "/path/to/input/D1HJVACXX_2_AAGAGATC_1.fastq"
    lane4_r1 = "/path/to/input/D1HJVACXX_4_AAGAGATC_1.fastq"
    lane2_r2 = "/path/2/input/D1HJVACXX_2_AAGAGATC_2.fastq"
    lane4_r2 = "/path/2/input/D1HJVACXX_4_AAGAGATC_2.fastq"
    paired = fastq.combine_pairs([lane2_r1, lane4_r1, lane2_r2, lane4_r2])
    assert paired[0] == [lane2_r1, lane2_r2], paired[0]
    assert paired[1] == [lane4_r1, lane4_r2], paired[1]

    # A single pair should come back unchanged as the first group.
    simple_pair = ["/path/to/input/Tester_1_fastq.txt",
                   "/path/to/input/Tester_2_fastq.txt"]
    paired = fastq.combine_pairs(simple_pair)
    assert paired[0] == simple_pair, paired[0]
def run_autopair(args):
    """Group input fastqs into (r1, r2, r3) triples and process them in parallel.

    Files from ``args.files`` are grouped with ``fastq.combine_pairs``.
    Groups of two become read pairs; the third member of a group of three,
    and any singleton, is kept as a candidate third read (presumably the
    UMI read -- confirm against ``add_umis_to_fastq_parallel``).  Each pair
    is matched to the first candidate sharing the pair's common prefix.

    Args:
        args: namespace providing ``outdir`` and ``files``.

    Raises:
        AssertionError: if a group has an unexpected size or a pair has no
            matching third read.
    """
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        elif len(fnames) == 3:
            r1, r2, r3 = sorted(fnames)
            to_run.append([r1, r2])
            extras.append(r3)
        else:
            assert len(fnames) == 1, fnames
            extras.append(fnames[0])
    ready_to_run = []
    for r1, r2 in to_run:
        target = os.path.commonprefix([r1, r2])
        r3 = None
        for test_r3 in extras:
            if (os.path.commonprefix([r1, test_r3]) == target
                    and os.path.commonprefix([r2, test_r3]) == target):
                r3 = test_r3
                break
        assert r3, (r1, r2, extras)
        # Use only the file-name part of the shared prefix: os.path.join
        # drops `outdir` entirely when the second component is absolute,
        # which would write results next to the inputs.
        prefix = os.path.basename(os.path.commonprefix([r1, r2, r3]))
        # NOTE: rstrip removes any trailing run of "_" / "R" characters,
        # not just the literal "_R" suffix.
        base_name = os.path.join(outdir, prefix.rstrip("_R"))
        ready_to_run.append([base_name, r1, r3, r2,
                             {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": len(ready_to_run), "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run,
                  {"algorithm": {}}, parallel)
def _sanity_check_files(item, files): """Ensure input files correspond with supported approaches. Handles BAM, fastqs, plus split fastqs. """ msg = None file_types = set([("bam" if x.endswith(".bam") else "fastq") for x in files if x]) if len(file_types) > 1: msg = "Found multiple file types (BAM and fastq)" file_type = file_types.pop() if file_type == "bam": if len(files) != 1: msg = "Expect a single BAM file input as input" elif file_type == "fastq": if len(files) not in [1, 2 ] and item["analysis"].lower() != "scrna-seq": pair_types = set([len(xs) for xs in fastq.combine_pairs(files)]) if len(pair_types) != 1 or pair_types.pop() not in [1, 2]: msg = "Expect either 1 (single end) or 2 (paired end) fastq inputs" if len(files) == 2 and files[0] == files[1]: msg = "Expect both fastq files to not be the same" if msg: raise ValueError("%s for %s: %s" % (msg, item.get("description", ""), files))
def run_autopair(args):
    """Group input fastqs into (r1, r2, r3) triples and process them in parallel.

    Files from ``args.files`` are grouped with ``fastq.combine_pairs``.
    Groups of two are treated as read pairs and singletons as candidate
    third reads (presumably the UMI read -- confirm against
    ``add_umis_to_fastq_parallel``).  Each pair is matched to the first
    candidate sharing the pair's common prefix.

    Args:
        args: namespace providing ``outdir`` and ``files``.

    Raises:
        AssertionError: if a group is neither a pair nor a singleton, or a
            pair has no matching third read.
    """
    outdir = utils.safe_makedir(args.outdir)
    to_run = []
    extras = []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) == 2:
            to_run.append(fnames)
        else:
            assert len(fnames) == 1
            extras.append(fnames[0])
    ready_to_run = []
    for r1, r2 in to_run:
        target = os.path.commonprefix([r1, r2])
        r3 = None
        for test_r3 in extras:
            if (os.path.commonprefix([r1, test_r3]) == target
                    and os.path.commonprefix([r2, test_r3]) == target):
                r3 = test_r3
                break
        assert r3, (r1, r2, extras)
        # Use only the file-name part of the shared prefix: os.path.join
        # drops `outdir` entirely when the second component is absolute,
        # which would write results next to the inputs.
        prefix = os.path.basename(os.path.commonprefix([r1, r2, r3]))
        # NOTE: rstrip removes any trailing run of "_" / "R" characters,
        # not just the literal "_R" suffix.
        base_name = os.path.join(outdir, prefix.rstrip("_R"))
        ready_to_run.append([base_name, r1, r3, r2,
                             {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": len(ready_to_run), "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run,
                  {"algorithm": {}}, parallel)
def query_gsm(gsm, out_file, config=None):
    """Retrieve the FASTQ data behind a GEO sample and merge it into out_file.

    Queries NCBI eutils (``db=gds``) via ``curl`` for the accession, takes
    the FTP links from the ``extrelations`` of the last returned id,
    downloads and converts each one, then merges the resulting files.

    Args:
        gsm: sequence whose first element is the accession to look up.
        out_file: target path for the merged output; its directory receives
            the intermediate downloads.
        config: optional configuration passed through to ``fastq.merge``.

    Returns:
        The value returned by ``fastq.merge`` when FASTQ files were
        produced, otherwise None.
    """
    if config is None:
        config = {}
    gsm = gsm[0]
    out_dir = os.path.dirname(out_file)
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
           "?db=gds&term={0}&retmode=json").format(gsm)
    # Argument-list form: no shell, so "&" needs no escaping and the
    # accession cannot inject shell syntax.
    process = subprocess.Popen(["curl", url], stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes and reaps the child process.
    out, _ = process.communicate()
    data = json.loads(out)
    ids = data.get("esearchresult", {}).get("idlist", [])
    logger.debug("Get id sample for %s" % gsm)
    if not ids:
        return None
    gsm_info = _query_info("gds", ids[-1])
    srxlist = gsm_info.get("result", {}).get(ids[-1], {}).get("extrelations", {})
    srxall = [srxe["targetftplink"] for srxe in srxlist
              if srxe.get("targetftplink", None)]
    logger.debug("Get FTP link for %s : %s" % (ids[-1], srxall))
    outs = []
    for srx in srxall:
        srafiles = _download_srx(gsm, srx, out_dir)
        if not srafiles:
            # Nothing downloaded for this link; joining None would crash.
            continue
        logger.debug("Get SRA for %s: %s" % (gsm, " ".join(srafiles)))
        for sra in srafiles:
            fastq_fn = _convert_fastq(sra, out_dir)
            if fastq_fn:
                outs.extend(fastq_fn)
    logger.debug("Get FASTQ for %s: %s" % (gsm, " ".join(outs)))
    if outs:
        files = combine_pairs(outs)
        return fastq.merge(files, out_file, config)
    return None
def prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs.

    A single BAM or CRAM is converted directly; one or two fastqs that are
    already ready are symlinked.  Anything else is bgzipped in parallel,
    one job per read end; more than two files are first grouped into pairs
    and reorganized into per-read lists.
    """
    n_inputs = len(in_files)
    if n_inputs == 1 and _is_bam_input(in_files):
        return _bgzip_from_bam(in_files[0], data["dirs"], data)
    if n_inputs == 1 and _is_cram_input(in_files):
        return _bgzip_from_cram(in_files[0], data["dirs"], data)
    if n_inputs in [1, 2] and _ready_gzip_fastq(in_files, data):
        return _symlink_in_files(in_files, data)

    if n_inputs > 2:
        pairs = fastq.combine_pairs(in_files)
        # All groups must be the same size (all single or all paired).
        assert len(set([len(p) for p in pairs])) == 1
        pairs.sort(key=lambda p: os.path.basename(p[0]))
        in_files = [[p[0] for p in pairs]]
        if len(pairs[0]) > 1:
            in_files.append([p[1] for p in pairs])

    n_jobs = len(in_files)
    total_cores = data["config"]["algorithm"]["num_cores"]
    parallel = {"type": "local",
                "num_jobs": n_jobs,
                "cores_per_job": max(1, total_cores // n_jobs)}
    jobs = []
    for read_num, in_file in enumerate(in_files):
        if not in_file:
            continue
        jobs.append([{"in_file": in_file,
                      "read_num": read_num,
                      "dirs": data["dirs"],
                      "config": data["config"],
                      "is_cwl": "cwl_keys" in data,
                      "rgnames": data["rgnames"]}])
    return run_multicore(_bgzip_from_fastq_parallel, jobs, data["config"], parallel)
def _prep_items_from_base(base, in_files, metadata, separators, force_single=False):
    """Build one configuration item per BAM file or fastq group.

    Inputs are expanded (directories, wildcards), bucketed by the file type
    looked up in KNOWN_EXTS, and converted to items derived from ``base``.
    Unrecognized types are reported on stdout and skipped.
    """
    def _file_type(fname):
        return KNOWN_EXTS.get(utils.splitext_plus(fname)[-1].lower())

    in_files = _expand_wildcards(_expand_dirs(in_files, KNOWN_EXTS))
    # groupby only merges adjacent entries, so collect runs per type.
    by_type = collections.defaultdict(list)
    for ftype, run in itertools.groupby(in_files, _file_type):
        by_type[ftype].extend(run)

    details = []
    for ftype, fnames in by_type.items():
        if ftype == "bam":
            details.extend([_prep_bam_input(fname, base) for fname in fnames])
        elif ftype in ["fastq", "fq", "fasta"]:
            fnames, glob_files = _find_glob_matches(fnames, metadata)
            details.extend([_prep_fastq_input(fs, base) for fs in glob_files])
            grouped = fastq.combine_pairs(fnames, force_single, separators=separators)
            details.extend([_prep_fastq_input(fs, base) for fs in grouped])
        else:
            print("Ignoring unexpected input file types %s: %s" % (ftype, list(fnames)))
    return details
def _check_paired(files, force_single): """check if files are fastq(.gz) and paired""" if files[0].endswith(".bam"): return files elif is_gsm(files[0]): return files return combine_pairs(files, force_single)
def run_autopair(args):
    """Pair fastq inputs, find their UMI source, and process them in parallel.

    With both ``args.tag1`` and ``args.tag2`` set, pairs are processed with
    those tags and no separate UMI file.  Otherwise each pair is matched to
    a third file sharing its common prefix and ``_find_umi`` decides which
    of the three is the UMI read.
    """
    outdir = utils.safe_makedir(args.outdir)
    pairs = []
    candidates = []
    for group in fastq.combine_pairs(sorted(args.files)):
        if len(group) == 2:
            pairs.append(group)
        elif len(group) == 3:
            first, second, third = sorted(group)
            pairs.append([first, second])
            candidates.append(third)
        else:
            assert len(group) == 1, group
            candidates.append(group[0])

    tags = None
    if args.tag1 and args.tag2:
        tags = [args.tag1, args.tag2]

    jobs = []
    for r1, r2 in pairs:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            # First candidate sharing the pair's common prefix wins.
            r3 = next((cand for cand in candidates
                       if _commonprefix([r1, cand]) == target
                       and _commonprefix([r2, cand]) == target), None)
            assert r3, (r1, r2, candidates)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        jobs.append([base_name, r1, r2, umi, tags, {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": args.cores, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, jobs, {"algorithm": {}}, parallel)
def query_gsm(gsm, out_file, config=None):
    """Retrieve the FASTQ data behind a GEO sample and merge it into out_file.

    Queries NCBI eutils (``db=sra``) via ``curl`` for the accession, builds
    a download link for every entry of the info returned for the last id,
    downloads and converts each one, then merges the resulting files.

    Args:
        gsm: sequence whose first element is the accession to look up.
        out_file: target path for the merged output; downloads go into a
            sub-directory named after it.
        config: optional configuration passed through to ``fastq.merge``.

    Returns:
        The value returned by ``fastq.merge`` when FASTQ files were
        produced, otherwise None.
    """
    if config is None:
        config = {}
    gsm = gsm[0]
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
           "?db=sra&term={0}&retmode=json").format(gsm)
    # Argument-list form: no shell, so "&" needs no escaping and the
    # accession cannot inject shell syntax.
    process = subprocess.Popen(["curl", url], stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes and reaps the child process.
    out, _ = process.communicate()
    data = json.loads(out)
    ids = data.get("esearchresult", {}).get("idlist", [])
    logger.debug("Get id sample for %s" % gsm)
    if not ids:
        return None
    gsm_info = _query_info("sra", ids[-1])
    # Debug logging instead of a stray print to stdout.
    logger.debug("gsm_info:%s" % gsm_info)
    srrall = [_create_link(srr) for srr in gsm_info]
    logger.debug("Get FTP link for %s : %s" % (ids[-1], srrall))
    outs = []
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(gsm, srx, sra_dir)
        if not srafiles:
            # Nothing downloaded for this link; joining None would crash.
            continue
        logger.debug("Get SRA for %s: %s" % (gsm, " ".join(srafiles)))
        for sra in srafiles:
            fastq_fn = _convert_fastq(sra, out_dir)
            if fastq_fn:
                outs.extend(fastq_fn)
    logger.debug("Get FASTQ for %s: %s" % (gsm, " ".join(outs)))
    if outs:
        files = combine_pairs(outs)
        return fastq.merge(files, out_file, config)
    return None
def _check_paired(files): """check if files are fastq(.gz) and paired""" if files[0].endswith(".bam"): return files elif is_gsm(files[0]): return files return combine_pairs(files)
def query_gsm(gsm, out_file, config=None):
    """Retrieve the FASTQ data behind a GEO sample and merge it into out_file.

    Queries NCBI eutils (``db=sra``) via ``curl`` for the accession, builds
    a download link for every entry of the info returned for the last id,
    downloads and converts each one, then merges the resulting files.

    Args:
        gsm: sequence whose first element is the accession to look up.
        out_file: target path for the merged output; downloads go into a
            sub-directory named after it.
        config: optional configuration passed through to ``fastq.merge``.

    Returns:
        The value returned by ``fastq.merge`` when FASTQ files were
        produced, otherwise None.
    """
    if config is None:
        config = {}
    gsm = gsm[0]
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
           "?db=sra&term={0}&retmode=json").format(gsm)
    # Argument-list form: no shell, so "&" needs no escaping and the
    # accession cannot inject shell syntax.
    process = subprocess.Popen(["curl", url], stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    # communicate() drains both pipes and reaps the child process.
    out, _ = process.communicate()
    data = json.loads(out)
    ids = data.get("esearchresult", {}).get("idlist", [])
    logger.debug("Get id sample for %s" % gsm)
    if not ids:
        return None
    gsm_info = _query_info("sra", ids[-1])
    logger.debug("gsm_info:%s" % gsm_info)
    srrall = [_create_link(srr) for srr in gsm_info]
    logger.debug("Get FTP link for %s : %s" % (ids[-1], srrall))
    outs = []
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(srx, sra_dir)
        if srafiles:
            logger.debug("Get SRA for %s: %s" % (gsm, " ".join(srafiles)))
            for sra in srafiles:
                fastq_fn = _convert_fastq(sra, out_dir)
                if fastq_fn:
                    outs.extend(fastq_fn)
    logger.debug("Get FASTQ for %s: %s" % (gsm, " ".join(outs)))
    if outs:
        files = combine_pairs(outs)
        return fastq.merge(files, out_file, config)
    return None
def run_dragen(args):
    """Rewrite the header lines of paired fastqs into gzipped copies in outdir.

    Each pair found by ``fastq.combine_pairs`` is read in lockstep; every
    fourth line (the fastq header) is passed through ``_add_umi_str``
    (presumably moving the UMI into the read name -- see that helper) and
    all other lines are copied unchanged.

    Args:
        args: namespace providing ``outdir`` and ``files``.

    Raises:
        ValueError: if the second file of a pair ends before the first.
    """
    outdir = utils.safe_makedir(args.outdir)
    to_run = list(fastq.combine_pairs(sorted(args.files)))
    for r1, r2 in to_run:
        # Write under outdir using the file name only: joining an absolute
        # input path would resolve to the input itself and truncate the
        # file while it is being read.
        out1_fq = os.path.join(outdir, os.path.basename(r1))
        out2_fq = os.path.join(outdir, os.path.basename(r2))
        with utils.open_gzipsafe(r1) as r1_handle, \
                utils.open_gzipsafe(r2) as r2_handle, \
                gzip.open(out1_fq, "wb") as out1_handle, \
                gzip.open(out2_fq, "wb") as out2_handle:
            lines = itertools.zip_longest(r1_handle, r2_handle)
            for n, (line1, line2) in enumerate(lines):
                if line1 is None:
                    # First file exhausted; trailing lines of the second
                    # file are ignored, as before.
                    continue
                if line2 is None:
                    raise ValueError("Paired fastq %s ends before %s" % (r2, r1))
                if n % 4 == 0:
                    # parse header line
                    new_header1 = _add_umi_str(line1) + "\n"
                    new_header2 = _add_umi_str(line2) + "\n"
                    out1_handle.write(new_header1.encode())
                    out2_handle.write(new_header2.encode())
                else:
                    out1_handle.write(line1.encode())
                    out2_handle.write(line2.encode())
def _prep_items_from_base(base, in_files):
    """Prepare a set of configuration items for input files.

    Inputs are expanded (directories, wildcards) and consecutive files of
    the same known type are grouped.  BAM/CRAM files yield one item each;
    fastq files are grouped with ``fastq.combine_pairs`` first.

    Args:
        base: template passed to the per-input preparation helpers.
        in_files: input files, directories or wildcard patterns.

    Returns:
        List of prepared items.

    Raises:
        ValueError: if a file has an extension that is not recognized.
    """
    details = []
    known_exts = {
        ".bam": "bam",
        ".cram": "bam",
        ".fq": "fastq",
        ".fastq": "fastq",
        ".txt": "fastq",
        ".fastq.gz": "fastq",
        ".fq.gz": "fastq",
        ".txt.gz": "fastq",
        ".gz": "fastq",
    }
    in_files = _expand_dirs(in_files, known_exts)
    in_files = _expand_wildcards(in_files)

    def _file_type(fname):
        return known_exts.get(utils.splitext_plus(fname)[-1].lower())

    # groupby only merges adjacent files; `i` is the index of each run.
    for i, (ext, files) in enumerate(itertools.groupby(in_files, _file_type)):
        if ext == "bam":
            for f in files:
                details.append(_prep_bam_input(f, i, base))
        elif ext == "fastq":
            for fs in fastq.combine_pairs(list(files)):
                details.append(_prep_fastq_input(fs, base))
        else:
            # Materialize the group: formatting the groupby iterator
            # itself only prints an object repr, not the file names.
            raise ValueError("Unexpected input file types: %s" % str(list(files)))
    return details
def _prep_items_from_base(base, in_files, metadata, separators, force_single=False):
    """Build one configuration item per BAM, VCF, or fastq group.

    Inputs are expanded (directories, wildcards), bucketed by the file type
    looked up in KNOWN_EXTS, and converted to items derived from ``base``.
    Unrecognized types are reported on stdout and skipped.
    """
    def _file_type(fname):
        return KNOWN_EXTS.get(utils.splitext_plus(fname)[-1].lower())

    in_files = _expand_wildcards(_expand_dirs(in_files, KNOWN_EXTS))
    # groupby only merges adjacent entries, so collect runs per type.
    by_type = collections.defaultdict(list)
    for ftype, run in itertools.groupby(in_files, _file_type):
        by_type[ftype].extend(run)

    details = []
    for ftype, fnames in by_type.items():
        if ftype == "bam":
            details.extend([_prep_bam_input(fname, base) for fname in fnames])
        elif ftype in ["fastq", "fq", "fasta"]:
            fnames, glob_files = _find_glob_matches(fnames, metadata)
            details.extend([_prep_fastq_input(fs, base) for fs in glob_files])
            grouped = fastq.combine_pairs(fnames, force_single, separators=separators)
            details.extend([_prep_fastq_input(fs, base) for fs in grouped])
        elif ftype in ["vcf"]:
            details.extend([_prep_vcf_input(fname, base) for fname in fnames])
        else:
            print("Ignoring unexpected input file types %s: %s" % (ftype, list(fnames)))
    return details
def _check_paired(files, force_single, separators):
    """Return files untouched for BAM/GSM inputs, otherwise group fastqs.

    ``_check_stems`` is always evaluated first; its result is forwarded to
    ``combine_pairs`` when the inputs are fastq files.
    """
    full_name = _check_stems(files)
    first = files[0]
    if first.endswith(".bam") or is_gsm(first):
        return files
    return combine_pairs(files, force_single, full_name, separators)
def test_5_find_fastq_pairs(self):
    """Check that fastq.combine_pairs groups read 1 / read 2 files together.

    Exercises lane-split names whose mates live in different directories,
    followed by a plain ``_1``/``_2`` pair with a ``.txt`` extension.
    """
    lane2_r1 = "/path/to/input/D1HJVACXX_2_AAGAGATC_1.fastq"
    lane3_r1 = "/path/to/input/D1HJVACXX_3_AAGAGATC_1.fastq"
    lane2_r2 = "/path/2/input/D1HJVACXX_2_AAGAGATC_2.fastq"
    lane3_r2 = "/path/2/input/D1HJVACXX_3_AAGAGATC_2.fastq"
    paired = fastq.combine_pairs([lane2_r1, lane3_r1, lane2_r2, lane3_r2])
    assert paired[0] == [lane2_r1, lane2_r2], paired[0]
    assert paired[1] == [lane3_r1, lane3_r2], paired[1]

    # A single pair should come back unchanged as the first group.
    simple_pair = ["/path/to/input/Tester_1_fastq.txt",
                   "/path/to/input/Tester_2_fastq.txt"]
    paired = fastq.combine_pairs(simple_pair)
    assert paired[0] == simple_pair, paired[0]
def run_autopair(args):
    """Pair fastq inputs, find their UMI source, and process them in parallel.

    With both ``args.tag1`` and ``args.tag2`` set, each pair is one
    single-core job.  Otherwise each pair is matched to a third file
    sharing its common prefix, ``_find_umi`` picks the UMI read, and the
    two read ends are submitted as separate multi-core jobs.
    """
    outdir = utils.safe_makedir(args.outdir)
    pairs = []
    candidates = []
    for group in fastq.combine_pairs(sorted(args.files)):
        if len(group) == 2:
            pairs.append(group)
        elif len(group) == 3:
            first, second, third = sorted(group)
            pairs.append([first, second])
            candidates.append(third)
        else:
            assert len(group) == 1, group
            candidates.append(group[0])

    tags = None
    if args.tag1 and args.tag2:
        tags = [args.tag1, args.tag2]

    if tags:
        process_cores = 1
        overall_processes = args.cores
    else:
        # Aim for 2 or 3 simultaneous processes, each with multiple cores
        target_processes = 2
        whole, leftover = divmod(args.cores, target_processes)
        process_cores = max(1, whole + leftover)
        overall_processes = max(1, int(math.ceil(args.cores / float(process_cores))))

    jobs = []
    for r1, r2 in pairs:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            # First candidate sharing the pair's common prefix wins.
            r3 = next((cand for cand in candidates
                       if _commonprefix([r1, cand]) == target
                       and _commonprefix([r2, cand]) == target), None)
            assert r3, (r1, r2, candidates)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        # fastp handles a single pair of reads so we split processing to run on each
        if umi and not tags:
            for read1, read2 in ((r1, None), (None, r2)):
                jobs.append([base_name, read1, read2, umi, None, process_cores,
                             {"algorithm": {}, "resources": {}}])
        else:
            jobs.append([base_name, r1, r2, umi, tags, process_cores,
                         {"algorithm": {}, "resources": {}}])
    parallel = {"type": "local", "cores": overall_processes, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, jobs, {"algorithm": {}}, parallel)
def _prep_items_from_base(base, in_files): """Prepare a set of configuration items for input files. """ details = [] fq_exts = [".fq", ".fastq", ".txt", ".gz"] gz_exts = tuple(["%s.gz" % ext for ext in fq_exts if ext != ".gz"]) for ext, files in itertools.groupby(in_files, lambda x: os.path.splitext(x)[-1].lower()): if ext == ".bam": for f in files: details.append(_prep_bam_input(f, base)) elif ext in fq_exts: files = list(files) if ext == ".gz": assert all(f.endswith(gz_exts) for f in files), (files, gz_exts) for fs in fastq.combine_pairs(files): details.append(_prep_fastq_input(fs, base)) else: raise ValueError("File type not yet implemented: %s" % ext) return details
def _prep_items_from_base(base, in_files, force_single=False):
    """Build one configuration item per BAM file or fastq group.

    Inputs are expanded (directories, wildcards); consecutive files of the
    same type, as looked up in KNOWN_EXTS, are processed together.
    Unrecognized types are reported on stdout and skipped.
    """
    def _file_type(fname):
        return KNOWN_EXTS.get(utils.splitext_plus(fname)[-1].lower())

    in_files = _expand_wildcards(_expand_dirs(in_files, KNOWN_EXTS))
    details = []
    # The run index is forwarded to the BAM preparation helper.
    for idx, (ftype, run) in enumerate(itertools.groupby(in_files, _file_type)):
        if ftype == "bam":
            details.extend([_prep_bam_input(fname, idx, base) for fname in run])
        elif ftype in ["fastq", "fq", "fasta"]:
            grouped = fastq.combine_pairs(list(run), force_single)
            details.extend([_prep_fastq_input(fs, base) for fs in grouped])
        else:
            print("Ignoring unexpected input file types %s: %s" % (ftype, list(run)))
    return details
def _prep_items_from_base(base, in_files): """Prepare a set of configuration items for input files. """ details = [] fq_exts = [".fq", ".fastq", ".txt", ".gz"] gz_exts = tuple(["%s.gz" % ext for ext in fq_exts if ext != ".gz"]) for ext, files in itertools.groupby( in_files, lambda x: os.path.splitext(x)[-1].lower()): if ext == ".bam": for f in files: details.append(_prep_bam_input(f, base)) elif ext in fq_exts: files = list(files) if ext == ".gz": assert all(f.endswith(gz_exts) for f in files), (files, gz_exts) for fs in fastq.combine_pairs(files): details.append(_prep_fastq_input(fs, base)) else: raise ValueError("File type not yet implemented: %s" % ext) return details
def prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs.

    A single BAM or CRAM is converted directly and one or two ready fastqs
    are symlinked.  Everything else is bgzipped through ``run_multicore``
    with one job per read end; more than two files are first grouped into
    pairs and regrouped into per-read lists.
    """
    single = len(in_files) == 1
    if single and _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], data["dirs"], data)
    elif single and _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], data["dirs"], data)
    elif len(in_files) in [1, 2] and _ready_gzip_fastq(in_files, data):
        out = _symlink_in_files(in_files, data)
    else:
        if len(in_files) > 2:
            groups = fastq.combine_pairs(in_files)
            group_sizes = set([len(g) for g in groups])
            # Mixing single and paired groups is not supported.
            assert len(group_sizes) == 1
            groups.sort(key=lambda g: os.path.basename(g[0]))
            per_read = [[g[0] for g in groups]]
            if len(groups[0]) > 1:
                per_read.append([g[1] for g in groups])
            in_files = per_read
        config = data["config"]
        parallel = {
            "type": "local",
            "num_jobs": len(in_files),
            "cores_per_job": max(1, config["algorithm"]["num_cores"] // len(in_files)),
        }
        job_args = []
        for read_num, in_file in enumerate(in_files):
            if in_file:
                job_args.append([{
                    "in_file": in_file,
                    "read_num": read_num,
                    "dirs": data["dirs"],
                    "config": data["config"],
                    "is_cwl": "cwl_keys" in data,
                    "rgnames": data["rgnames"],
                }])
        out = run_multicore(_bgzip_from_fastq_parallel, job_args,
                            data["config"], parallel)
    return out
def query_srr(sra, out_file, config=None):
    """Download the given SRA entries, convert to FASTQ, and merge into out_file.

    Args:
        sra: sequence whose first element is iterated to build download
            links via ``_create_link`` (presumably a collection of run
            accessions -- confirm against the caller).
        out_file: target path for the merged output; downloads go into a
            sub-directory named after it.
        config: optional configuration passed through to ``fastq.merge``.

    Returns:
        The value returned by ``fastq.merge`` when FASTQ files were
        produced, otherwise None.
    """
    if config is None:
        config = {}
    sra = sra[0]
    outs = []
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    srrall = [_create_link(srr) for srr in sra]
    logger.debug("Get FTP link for %s : %s" % (name, srrall))
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(srx, sra_dir)
        if srafiles:
            logger.debug("Get SRA for %s: %s" % (sra, " ".join(srafiles)))
            # Separate loop name so `sra` keeps referring to the requested
            # entries in the log messages instead of the last local file.
            for sra_file in srafiles:
                fastq_fn = _convert_fastq(sra_file, out_dir)
                if fastq_fn:
                    outs.extend(fastq_fn)
    logger.debug("Get FASTQ for %s: %s" % (sra, " ".join(outs)))
    if outs:
        files = combine_pairs(outs)
        return fastq.merge(files, out_file, config)
    return None
def query_srr(sra, out_file, config=None):
    """Download the given SRA entries, convert to FASTQ, and merge into out_file.

    Args:
        sra: sequence whose first element is iterated to build download
            links via ``_create_link`` (presumably a collection of run
            accessions -- confirm against the caller).
        out_file: target path for the merged output; downloads go into a
            sub-directory named after it.
        config: optional configuration passed through to ``fastq.merge``.

    Returns:
        The value returned by ``fastq.merge`` when FASTQ files were
        produced, otherwise None.
    """
    if config is None:
        config = {}
    sra = sra[0]
    outs = []
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    srrall = [_create_link(srr) for srr in sra]
    logger.debug("Get FTP link for %s : %s" % (name, srrall))
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(srx, sra_dir)
        if srafiles:
            logger.debug("Get SRA for %s: %s" % (sra, " ".join(srafiles)))
            # Separate loop name so `sra` keeps referring to the requested
            # entries in the log messages instead of the last local file.
            for sra_file in srafiles:
                fastq_fn = _convert_fastq(sra_file, out_dir)
                if fastq_fn:
                    outs.extend(fastq_fn)
    logger.debug("Get FASTQ for %s: %s" % (sra, " ".join(outs)))
    if outs:
        files = combine_pairs(outs)
        return fastq.merge(files, out_file, config)
    return None
def _prep_items_from_base(base, in_files):
    """Prepare a set of configuration items for input files.

    Inputs are expanded (directories, wildcards) and consecutive files of
    the same known type are grouped.  BAM files yield one item each; fastq
    files are grouped with ``fastq.combine_pairs`` first.

    Args:
        base: template passed to the per-input preparation helpers.
        in_files: input files, directories or wildcard patterns.

    Returns:
        List of prepared items.

    Raises:
        ValueError: if a file has an extension that is not recognized.
    """
    details = []
    known_exts = {
        ".bam": "bam",
        ".fq": "fastq",
        ".fastq": "fastq",
        ".txt": "fastq",
        ".fastq.gz": "fastq",
        ".fq.gz": "fastq",
        ".txt.gz": "fastq",
        ".gz": "fastq",
    }
    in_files = _expand_dirs(in_files, known_exts)
    in_files = _expand_wildcards(in_files)

    def _file_type(fname):
        return known_exts.get(utils.splitext_plus(fname)[-1].lower())

    # groupby only merges adjacent files; `i` is the index of each run.
    for i, (ext, files) in enumerate(itertools.groupby(in_files, _file_type)):
        if ext == "bam":
            for f in files:
                details.append(_prep_bam_input(f, i, base))
        elif ext == "fastq":
            for fs in fastq.combine_pairs(list(files)):
                details.append(_prep_fastq_input(fs, base))
        else:
            # Materialize the group: formatting the groupby iterator
            # itself only prints an object repr, not the file names.
            raise ValueError("Unexpected input file types: %s" % str(list(files)))
    return details
def _check_paired(files): """check if files are fastq(.gz) and paired""" if files[0].endswith(".bam"): return files return combine_pairs(files)