Example #1
0
    def test_5_find_fastq_pairs(self):
        """Verify that combine_pairs groups read 1 and read 2 fastq files.

        Covers lane-numbered names spread over two directories as well as
        ``_1_fastq.txt`` / ``_2_fastq.txt`` style names.
        """
        lane2_r1 = "/path/to/input/D1HJVACXX_2_AAGAGATC_1.fastq"
        lane4_r1 = "/path/to/input/D1HJVACXX_4_AAGAGATC_1.fastq"
        lane2_r2 = "/path/2/input/D1HJVACXX_2_AAGAGATC_2.fastq"
        lane4_r2 = "/path/2/input/D1HJVACXX_4_AAGAGATC_2.fastq"
        paired = fastq.combine_pairs([lane2_r1, lane4_r1, lane2_r2, lane4_r2])
        # Reads of the same lane must end up together, read 1 first.
        assert paired[0] == [lane2_r1, lane2_r2], paired[0]
        assert paired[1] == [lane4_r1, lane4_r2], paired[1]

        txt_inputs = [
            "/path/to/input/Tester_1_fastq.txt",
            "/path/to/input/Tester_2_fastq.txt",
        ]
        paired = fastq.combine_pairs(txt_inputs)
        assert paired[0] == txt_inputs, paired[0]
def run_autopair(args):
    """Group the input files into pairs plus a third file and process them.

    Files from ``args.files`` are grouped with ``fastq.combine_pairs``.
    Groups of two are treated as read pairs; from groups of three the first
    two (in sorted order) form the pair and the last becomes a candidate
    third file; single files are candidate third files as well. Every pair
    must have a candidate sharing its common prefix, otherwise an assertion
    fails. The resulting jobs are handed to ``add_umis_to_fastq_parallel``
    through ``run_multicore``, one core per job.
    """
    outdir = utils.safe_makedir(args.outdir)
    pairs = []
    unpaired = []
    for group in fastq.combine_pairs(sorted(args.files)):
        n_files = len(group)
        if n_files == 2:
            pairs.append(group)
        elif n_files == 3:
            first, second, third = sorted(group)
            pairs.append([first, second])
            unpaired.append(third)
        else:
            assert n_files == 1, group
            unpaired.append(group[0])

    jobs = []
    for r1, r2 in pairs:
        shared = os.path.commonprefix([r1, r2])
        # First candidate whose common prefix with both reads is the pair's own prefix.
        r3 = next((cand for cand in unpaired
                   if os.path.commonprefix([r1, cand]) == shared
                   and os.path.commonprefix([r2, cand]) == shared), None)
        assert r3, (r1, r2, unpaired)
        prefix = os.path.commonprefix([r1, r2, r3]).rstrip("_R")
        # Note the argument order: the third file sits between r1 and r2.
        jobs.append([os.path.join(outdir, prefix), r1, r3, r2,
                     {"algorithm": {}, "resources": {}}])

    parallel = {"type": "local", "cores": len(jobs), "progs": []}
    run_multicore(add_umis_to_fastq_parallel, jobs, {"algorithm": {}}, parallel)
Example #3
0
def _sanity_check_files(item, files):
    """Ensure input files correspond with supported approaches.

    Handles BAM, fastqs, plus split fastqs.
    """
    msg = None
    file_types = set([("bam" if x.endswith(".bam") else "fastq") for x in files
                      if x])
    if len(file_types) > 1:
        msg = "Found multiple file types (BAM and fastq)"
    file_type = file_types.pop()
    if file_type == "bam":
        if len(files) != 1:
            msg = "Expect a single BAM file input as input"
    elif file_type == "fastq":
        if len(files) not in [1, 2
                              ] and item["analysis"].lower() != "scrna-seq":
            pair_types = set([len(xs) for xs in fastq.combine_pairs(files)])
            if len(pair_types) != 1 or pair_types.pop() not in [1, 2]:
                msg = "Expect either 1 (single end) or 2 (paired end) fastq inputs"
        if len(files) == 2 and files[0] == files[1]:
            msg = "Expect both fastq files to not be the same"
    if msg:
        raise ValueError("%s for %s: %s" %
                         (msg, item.get("description", ""), files))
def run_autopair(args):
    """Match every read pair with a third file and process the triples.

    ``fastq.combine_pairs`` groups the sorted ``args.files``. Groups of two
    are read pairs, everything else must be a single file and is kept as a
    candidate third file. For each pair the first candidate sharing the
    pair's common prefix is selected (an assertion fails if there is none)
    and the triple is passed to ``add_umis_to_fastq_parallel`` through
    ``run_multicore``.
    """
    outdir = utils.safe_makedir(args.outdir)
    to_run, extras = [], []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        if len(fnames) != 2:
            assert len(fnames) == 1
            extras.append(fnames[0])
        else:
            to_run.append(fnames)

    ready_to_run = []
    for r1, r2 in to_run:
        target = os.path.commonprefix([r1, r2])
        matches = [extra for extra in extras
                   if os.path.commonprefix([r1, extra]) == target
                   and os.path.commonprefix([r2, extra]) == target]
        r3 = matches[0] if matches else None
        assert r3, (r1, r2, extras)
        shared_name = os.path.commonprefix([r1, r2, r3]).rstrip("_R")
        base_name = os.path.join(outdir, shared_name)
        empty_config = {"algorithm": {}, "resources": {}}
        # The third file is passed between r1 and r2.
        ready_to_run.append([base_name, r1, r3, r2, empty_config])

    parallel = {"type": "local", "cores": len(ready_to_run), "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}},
                  parallel)
Example #5
0
def query_gsm(gsm, out_file, config=None):
    """Search NCBI for a sample accession and merge its reads into ``out_file``.

    Queries the NCBI E-utilities ``esearch`` endpoint (``db=gds``) with
    ``curl``, looks up the last returned id with ``_query_info``, collects
    the ``targetftplink`` entries of its ``extrelations`` and passes each
    link to ``_download_srx`` and the resulting files to ``_convert_fastq``.
    The collected fastq files are grouped with ``combine_pairs`` and merged
    with ``fastq.merge``.

    Args:
        gsm: Sequence whose first element is the accession to search for.
        out_file: Output path handed to ``fastq.merge``; its directory is
            used for the intermediate files.
        config: Configuration passed through to ``fastq.merge``. Defaults to
            an empty dict.

    Returns:
        The value returned by ``fastq.merge``, or ``None`` when the search
        returned no ids or no fastq files were produced.
    """
    if config is None:
        config = {}
    gsm = gsm[0]
    out_dir = os.path.dirname(out_file)
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
           "?db=gds&term={0}&retmode=json").format(gsm)
    # Run curl with an argument list instead of a shell string so the
    # accession is never interpreted by a shell and '&' needs no escaping.
    process = subprocess.Popen(["curl", url], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for curl to finish.
    out, _ = process.communicate()
    data = json.loads(out)
    ids = data.get("esearchresult", {}).get("idlist", [])
    logger.debug("Get id sample for %s" % gsm)
    if not ids:
        return None
    last_id = ids[-1]
    gsm_info = _query_info("gds", last_id)
    srxlist = gsm_info.get("result", {}).get(last_id, {}).get("extrelations", {})
    srxall = [srxe["targetftplink"] for srxe in srxlist if srxe.get("targetftplink", None)]
    logger.debug("Get FTP link for %s : %s" % (last_id, srxall))
    outs = []
    for srx in srxall:
        srafiles = _download_srx(gsm, srx, out_dir)
        if srafiles:
            # Log only after the check: joining an empty/None result would fail.
            logger.debug("Get SRA for %s: %s" % (gsm, " ".join(srafiles)))
            for sra in srafiles:
                outs.extend(_convert_fastq(sra, out_dir) or [])
        logger.debug("Get FASTQ for %s: %s" % (gsm, " ".join(outs)))
    if not outs:
        return None
    files = combine_pairs(outs)
    return fastq.merge(files, out_file, config)
Example #6
0
def prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs.

    A single BAM or CRAM input is converted with ``_bgzip_from_bam`` /
    ``_bgzip_from_cram``; one or two inputs accepted by ``_ready_gzip_fastq``
    are symlinked. Everything else is run through
    ``_bgzip_from_fastq_parallel``; more than two files are first grouped
    with ``fastq.combine_pairs`` into one list per read.
    """
    n_inputs = len(in_files)
    if n_inputs == 1 and _is_bam_input(in_files):
        return _bgzip_from_bam(in_files[0], data["dirs"], data)
    if n_inputs == 1 and _is_cram_input(in_files):
        return _bgzip_from_cram(in_files[0], data["dirs"], data)
    if n_inputs in [1, 2] and _ready_gzip_fastq(in_files, data):
        return _symlink_in_files(in_files, data)

    if n_inputs > 2:
        fpairs = fastq.combine_pairs(in_files)
        # All groups must have the same size (all single or all paired).
        assert len({len(xs) for xs in fpairs}) == 1
        fpairs.sort(key=lambda x: os.path.basename(x[0]))
        by_read = [[xs[0] for xs in fpairs]]
        if len(fpairs[0]) > 1:
            by_read.append([xs[1] for xs in fpairs])
        in_files = by_read

    n_jobs = len(in_files)
    parallel = {"type": "local", "num_jobs": n_jobs,
                "cores_per_job": max(1, data["config"]["algorithm"]["num_cores"] // n_jobs)}
    inputs = []
    for read_num, in_file in enumerate(in_files):
        if not in_file:
            continue
        inputs.append({"in_file": in_file, "read_num": read_num, "dirs": data["dirs"],
                       "config": data["config"],
                       "is_cwl": "cwl_keys" in data,
                       "rgnames": data["rgnames"]})
    return run_multicore(_bgzip_from_fastq_parallel, [[d] for d in inputs], data["config"], parallel)
Example #7
0
def _prep_items_from_base(base, in_files, metadata, separators, force_single=False):
    """Build configuration items for the given input files.

    Directories and wildcards in ``in_files`` are expanded, the files are
    bucketed by the type ``KNOWN_EXTS`` assigns to their extension and one
    item is prepared per BAM file and per fastq group. Files of any other
    type are reported on stdout and skipped.
    """
    in_files = _expand_wildcards(_expand_dirs(in_files, KNOWN_EXTS))

    by_type = collections.defaultdict(list)
    for fname in in_files:
        ftype = KNOWN_EXTS.get(utils.splitext_plus(fname)[-1].lower())
        by_type[ftype].append(fname)

    details = []
    for ftype, fnames in by_type.items():
        if ftype == "bam":
            details.extend(_prep_bam_input(f, base) for f in fnames)
        elif ftype in ["fastq", "fq", "fasta"]:
            fnames, glob_files = _find_glob_matches(fnames, metadata)
            details.extend(_prep_fastq_input(fs, base) for fs in glob_files)
            paired = fastq.combine_pairs(fnames, force_single, separators=separators)
            details.extend(_prep_fastq_input(fs, base) for fs in paired)
        else:
            print("Ignoring unexpected input file types %s: %s" %
                  (ftype, list(fnames)))
    return details
Example #8
0
def _check_paired(files, force_single):
    """check if files are fastq(.gz) and paired"""
    if files[0].endswith(".bam"):
        return files
    elif is_gsm(files[0]):
        return files
    return combine_pairs(files, force_single)
Example #9
0
def run_autopair(args):
    """Group the input files into read pairs and run UMI processing on them.

    Files are grouped with ``fastq.combine_pairs``. When both ``args.tag1``
    and ``args.tag2`` are set, each pair is processed with those tags and no
    third file. Otherwise each pair needs a third file sharing its common
    prefix (an assertion fails if none exists) and ``_find_umi`` decides
    which of the three files is which. Jobs run through ``run_multicore``
    with ``args.cores`` cores.
    """
    outdir = utils.safe_makedir(args.outdir)
    pairs, leftovers = [], []
    for group in fastq.combine_pairs(sorted(args.files)):
        size = len(group)
        if size == 2:
            pairs.append(group)
        elif size == 3:
            first, second, third = sorted(group)
            pairs.append([first, second])
            leftovers.append(third)
        else:
            assert size == 1, group
            leftovers.append(group[0])

    jobs = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    for r1, r2 in pairs:
        target = _commonprefix([r1, r2])
        if not tags:
            # First leftover whose prefix with both reads equals the pair's prefix.
            r3 = next((cand for cand in leftovers
                       if _commonprefix([r1, cand]) == target
                       and _commonprefix([r2, cand]) == target), None)
            assert r3, (r1, r2, leftovers)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        else:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        jobs.append([base_name, r1, r2, umi, tags, {"algorithm": {}, "resources": {}}])

    parallel = {"type": "local", "cores": args.cores, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, jobs, {"algorithm": {}}, parallel)
Example #10
0
def query_gsm(gsm, out_file, config=None):
    """Search NCBI SRA for a sample accession and merge its reads.

    Queries the NCBI E-utilities ``esearch`` endpoint (``db=sra``) with
    ``curl``, looks up the last returned id with ``_query_info``, turns each
    returned entry into a link with ``_create_link`` and passes the links to
    ``_download_srx`` and the resulting files to ``_convert_fastq``. The
    collected fastq files are grouped with ``combine_pairs`` and merged with
    ``fastq.merge``.

    Args:
        gsm: Sequence whose first element is the accession to search for.
        out_file: Output path handed to ``fastq.merge``. Intermediate files
            go next to it, inside a directory named after the output file.
        config: Configuration passed through to ``fastq.merge``. Defaults to
            an empty dict.

    Returns:
        The value returned by ``fastq.merge``, or ``None`` when the search
        returned no ids or no fastq files were produced.
    """
    if config is None:
        config = {}
    gsm = gsm[0]
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
           "?db=sra&term={0}&retmode=json").format(gsm)
    # Run curl with an argument list instead of a shell string so the
    # accession is never interpreted by a shell and '&' needs no escaping.
    process = subprocess.Popen(["curl", url], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for curl to finish.
    out, _ = process.communicate()
    data = json.loads(out)
    ids = data.get("esearchresult", {}).get("idlist", [])
    logger.debug("Get id sample for %s" % gsm)
    if not ids:
        return None
    gsm_info = _query_info("sra", ids[-1])
    # Diagnostic output goes to the logger rather than stdout.
    logger.debug("gsm_info:%s" % gsm_info)
    srrall = [_create_link(srr) for srr in gsm_info]
    logger.debug("Get FTP link for %s : %s" % (ids[-1], srrall))
    outs = []
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(gsm, srx, sra_dir)
        if srafiles:
            # Log only after the check: joining an empty/None result would fail.
            logger.debug("Get SRA for %s: %s" % (gsm, " ".join(srafiles)))
            for sra in srafiles:
                outs.extend(_convert_fastq(sra, out_dir) or [])
        logger.debug("Get FASTQ for %s: %s" % (gsm, " ".join(outs)))
    if not outs:
        return None
    files = combine_pairs(outs)
    return fastq.merge(files, out_file, config)
def _check_paired(files):
    """check if files are fastq(.gz) and paired"""
    if files[0].endswith(".bam"):
        return files
    elif is_gsm(files[0]):
        return files
    return combine_pairs(files)
Example #12
0
def query_gsm(gsm, out_file, config=None):
    """Search NCBI SRA for a sample accession and merge its reads.

    Queries the NCBI E-utilities ``esearch`` endpoint (``db=sra``) with
    ``curl``, looks up the last returned id with ``_query_info``, turns each
    returned entry into a link with ``_create_link`` and passes the links to
    ``_download_srx`` and the resulting files to ``_convert_fastq``. The
    collected fastq files are grouped with ``combine_pairs`` and merged with
    ``fastq.merge``.

    Args:
        gsm: Sequence whose first element is the accession to search for.
        out_file: Output path handed to ``fastq.merge``. Intermediate files
            go next to it, inside a directory named after the output file.
        config: Configuration passed through to ``fastq.merge``. Defaults to
            an empty dict.

    Returns:
        The value returned by ``fastq.merge``, or ``None`` when the search
        returned no ids or no fastq files were produced.
    """
    if config is None:
        config = {}
    gsm = gsm[0]
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    url = ("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
           "?db=sra&term={0}&retmode=json").format(gsm)
    # Run curl with an argument list instead of a shell string so the
    # accession is never interpreted by a shell and '&' needs no escaping.
    process = subprocess.Popen(["curl", url], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() drains both pipes and waits for curl to finish.
    out, _ = process.communicate()
    data = json.loads(out)
    ids = data.get("esearchresult", {}).get("idlist", [])
    logger.debug("Get id sample for %s" % gsm)
    if not ids:
        return None
    gsm_info = _query_info("sra", ids[-1])
    logger.debug("gsm_info:%s" % gsm_info)
    srrall = [_create_link(srr) for srr in gsm_info]
    logger.debug("Get FTP link for %s : %s" % (ids[-1], srrall))
    outs = []
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(srx, sra_dir)
        if srafiles:
            logger.debug("Get SRA for %s: %s" % (gsm, " ".join(srafiles)))
            for sra in srafiles:
                fastq_fn = _convert_fastq(sra, out_dir)
                if fastq_fn:
                    outs.extend(fastq_fn)
        logger.debug("Get FASTQ for %s: %s" % (gsm, " ".join(outs)))
    if not outs:
        return None
    files = combine_pairs(outs)
    return fastq.merge(files, out_file, config)
def run_dragen(args):
    """Rewrite the header lines of paired fastq files into ``args.outdir``.

    The files in ``args.files`` are grouped into pairs with
    ``fastq.combine_pairs``. Both files of a pair are read in lockstep; every
    fourth line starting with the first (the record header) is replaced by
    the result of ``_add_umi_str`` plus a newline, all other lines are copied
    unchanged. Outputs are gzip-compressed and named after the input files.

    Raises:
        ValueError: If the second file of a pair has fewer lines than the
            first.
    """
    outdir = utils.safe_makedir(args.outdir)
    to_run = list(fastq.combine_pairs(sorted(args.files)))
    for r1, r2 in to_run:
        # Join on the file name only: os.path.join drops ``outdir`` when the
        # second component is absolute, which made the output path equal to
        # the input file that is still being read.
        out1_fq = os.path.join(outdir, os.path.basename(r1))
        out2_fq = os.path.join(outdir, os.path.basename(r2))
        with utils.open_gzipsafe(r1) as r1_handle, \
             utils.open_gzipsafe(r2) as r2_handle, \
             gzip.open(out1_fq, "wb") as out1_handle, \
             gzip.open(out2_fq, "wb") as out2_handle:
            for n, (line1, line2) in enumerate(itertools.zip_longest(r1_handle, r2_handle)):
                if line1 is None:
                    # First file exhausted; trailing lines of the second file
                    # are ignored, as before.
                    break
                if line2 is None:
                    raise ValueError("%s has fewer lines than %s" % (r2, r1))
                if n % 4 == 0:
                    # parse header line
                    new_header1 = _add_umi_str(line1) + "\n"
                    new_header2 = _add_umi_str(line2) + "\n"
                    out1_handle.write(new_header1.encode())
                    out2_handle.write(new_header2.encode())
                else:
                    out1_handle.write(line1.encode())
                    out2_handle.write(line2.encode())
Example #14
0
def _prep_items_from_base(base, in_files):
    """Prepare a set of configuration items for input files.

    Directories and wildcards in ``in_files`` are expanded, then consecutive
    files of the same type (looked up from their extension) are processed
    together: one item per BAM/CRAM file and one item per fastq group found
    by ``fastq.combine_pairs``.

    Raises:
        ValueError: If a file has an extension that is not a known BAM or
            fastq extension.
    """
    details = []
    known_exts = {
        ".bam": "bam",
        ".cram": "bam",
        ".fq": "fastq",
        ".fastq": "fastq",
        ".txt": "fastq",
        ".fastq.gz": "fastq",
        ".fq.gz": "fastq",
        ".txt.gz": "fastq",
        ".gz": "fastq"
    }
    in_files = _expand_dirs(in_files, known_exts)
    in_files = _expand_wildcards(in_files)

    # groupby only merges consecutive files of the same type; ``i`` is the
    # index of that run and is passed on to the BAM preparation.
    for i, (ext, files) in enumerate(
            itertools.groupby(
                in_files,
                lambda x: known_exts.get(utils.splitext_plus(x)[-1].lower()))):
        if ext == "bam":
            for f in files:
                details.append(_prep_bam_input(f, i, base))
        elif ext == "fastq":
            for fs in fastq.combine_pairs(list(files)):
                details.append(_prep_fastq_input(fs, base))
        else:
            # ``files`` is a lazy group iterator; expand it so the message
            # lists the offending file names instead of the iterator object.
            raise ValueError("Unexpected input file types: %s" % str(list(files)))
    return details
Example #15
0
def _prep_items_from_base(base, in_files, metadata, separators, force_single=False):
    """Build configuration items for BAM, fastq and VCF input files.

    Directories and wildcards are expanded first. Files are then bucketed by
    the type ``KNOWN_EXTS`` assigns to their extension; one item is created
    per BAM file, per VCF file and per fastq group. Files of any other type
    are reported on stdout and skipped.
    """
    expanded = _expand_wildcards(_expand_dirs(in_files, KNOWN_EXTS))

    def _file_type(fname):
        return KNOWN_EXTS.get(utils.splitext_plus(fname)[-1].lower())

    ext_groups = collections.defaultdict(list)
    for fname in expanded:
        ext_groups[_file_type(fname)].append(fname)

    details = []
    for ext, files in ext_groups.items():
        if ext == "bam":
            details += [_prep_bam_input(f, base) for f in files]
        elif ext in ["fastq", "fq", "fasta"]:
            files, glob_files = _find_glob_matches(files, metadata)
            details += [_prep_fastq_input(fs, base) for fs in glob_files]
            details += [_prep_fastq_input(fs, base)
                        for fs in fastq.combine_pairs(files, force_single, separators=separators)]
        elif ext in ["vcf"]:
            details += [_prep_vcf_input(f, base) for f in files]
        else:
            print("Ignoring unexpected input file types %s: %s" % (ext, list(files)))
    return details
def _check_paired(files, force_single, separators):
    """Pair the input files unless the first one is a BAM file or a GSM id.

    ``_check_stems`` always runs first; its result is forwarded to
    ``combine_pairs`` when pairing is needed.
    """
    full_name = _check_stems(files)
    first = files[0]
    if first.endswith(".bam") or is_gsm(first):
        return files
    return combine_pairs(files, force_single, full_name, separators)
    def test_5_find_fastq_pairs(self):
        """Check the pairing of fastq files done by combine_pairs.

        Uses lane-numbered names split over two directories and
        ``_1_fastq.txt`` / ``_2_fastq.txt`` style names.
        """
        read1_dir = "/path/to/input/"
        read2_dir = "/path/2/input/"
        lanes = ["D1HJVACXX_2_AAGAGATC", "D1HJVACXX_3_AAGAGATC"]
        first_reads = [read1_dir + lane + "_1.fastq" for lane in lanes]
        second_reads = [read2_dir + lane + "_2.fastq" for lane in lanes]
        out = fastq.combine_pairs(first_reads + second_reads)
        # One pair per lane, read 1 before read 2.
        assert out[0] == [first_reads[0], second_reads[0]], out[0]
        assert out[1] == [first_reads[1], second_reads[1]], out[1]

        txt_pair = ["/path/to/input/Tester_1_fastq.txt",
                    "/path/to/input/Tester_2_fastq.txt"]
        out = fastq.combine_pairs(txt_pair)
        assert out[0] == txt_pair, out[0]
def _check_paired(files, force_single, separators):
    """Return BAM or GSM inputs as given, otherwise the paired fastq files.

    The stem check from ``_check_stems`` is performed for every input, even
    when its result is not needed afterwards.
    """
    full_name = _check_stems(files)
    lead = files[0]
    passthrough = lead.endswith(".bam") or is_gsm(lead)
    if passthrough:
        return files
    return combine_pairs(files, force_single, full_name, separators)
def run_autopair(args):
    """Group the input files into read pairs and run UMI processing on them.

    Files are grouped with ``fastq.combine_pairs``. With both ``args.tag1``
    and ``args.tag2`` set, every pair becomes one single-core job and
    ``args.cores`` jobs run at once. Otherwise each pair needs a third file
    sharing its common prefix (an assertion fails if none exists),
    ``_find_umi`` decides which of the three files is which, and each read is
    processed as a separate multi-core job.
    """
    outdir = utils.safe_makedir(args.outdir)
    to_run, extras = [], []
    for fnames in fastq.combine_pairs(sorted(args.files)):
        count = len(fnames)
        if count == 2:
            to_run.append(fnames)
        elif count == 3:
            first, second, third = sorted(fnames)
            to_run.append([first, second])
            extras.append(third)
        else:
            assert count == 1, fnames
            extras.append(fnames[0])

    ready_to_run = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    if tags:
        process_cores = 1
        overall_processes = args.cores
    else:
        # Aim for 2 or 3 simultaneous processes, each with multiple cores
        target_processes = 2
        whole, remainder = divmod(args.cores, target_processes)
        process_cores = max(1, whole + remainder)
        overall_processes = max(1, int(math.ceil(args.cores / float(process_cores))))

    for r1, r2 in to_run:
        target = _commonprefix([r1, r2])
        if tags:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        else:
            # First extra file whose prefix with both reads equals the pair's prefix.
            r3 = next((extra for extra in extras
                       if _commonprefix([r1, extra]) == target
                       and _commonprefix([r2, extra]) == target), None)
            assert r3, (r1, r2, extras)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        # fastp handles a single pair of reads so we split processing to run on each
        if umi and not tags:
            for read1, read2 in ((r1, None), (None, r2)):
                ready_to_run.append([base_name, read1, read2, umi, None, process_cores,
                                     {"algorithm": {}, "resources": {}}])
        else:
            ready_to_run.append([base_name, r1, r2, umi, tags, process_cores,
                                 {"algorithm": {}, "resources": {}}])

    parallel = {"type": "local", "cores": overall_processes, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, ready_to_run, {"algorithm": {}}, parallel)
Example #20
0
def run_autopair(args):
    """Pair up the input fastq files and schedule UMI processing jobs.

    ``fastq.combine_pairs`` groups the sorted ``args.files``; groups of two
    are read pairs, the last file of a sorted group of three and any single
    file are candidate third files. If both ``args.tag1`` and ``args.tag2``
    are given the pairs are processed with those tags, one core per job.
    Otherwise a third file sharing the pair's common prefix is required (an
    assertion fails without one) and each read is processed in its own job.
    """
    outdir = utils.safe_makedir(args.outdir)
    pairs = []
    third_files = []
    for group in fastq.combine_pairs(sorted(args.files)):
        if len(group) == 2:
            pairs.append(group)
            continue
        if len(group) == 3:
            ordered = sorted(group)
            pairs.append(ordered[:2])
            third_files.append(ordered[2])
            continue
        assert len(group) == 1, group
        third_files.append(group[0])

    jobs = []
    tags = [args.tag1, args.tag2] if args.tag1 and args.tag2 else None
    if not tags:
        # Aim for 2 or 3 simultaneous processes, each with multiple cores
        target_processes = 2
        per_process = (args.cores // target_processes) + (args.cores % target_processes)
        process_cores = max(1, per_process)
        overall_processes = max(1, int(math.ceil(args.cores / float(process_cores))))
    else:
        process_cores = 1
        overall_processes = args.cores

    for r1, r2 in pairs:
        target = _commonprefix([r1, r2])
        if not tags:
            r3 = None
            for candidate in third_files:
                if _commonprefix([r1, candidate]) != target:
                    continue
                if _commonprefix([r2, candidate]) == target:
                    r3 = candidate
                    break
            assert r3, (r1, r2, third_files)
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2, r3])))
            r1, r2, umi = _find_umi([r1, r2, r3])
        else:
            base_name = os.path.join(outdir, os.path.basename(_commonprefix([r1, r2])))
            umi = None
        # fastp handles a single pair of reads so we split processing to run on each
        if umi and not tags:
            jobs.append([base_name, r1, None, umi, None, process_cores,
                         {"algorithm": {}, "resources": {}}])
            jobs.append([base_name, None, r2, umi, None, process_cores,
                         {"algorithm": {}, "resources": {}}])
        else:
            jobs.append([base_name, r1, r2, umi, tags, process_cores,
                         {"algorithm": {}, "resources": {}}])

    parallel = {"type": "local", "cores": overall_processes, "progs": []}
    run_multicore(add_umis_to_fastq_parallel, jobs, {"algorithm": {}}, parallel)
Example #21
0
def _prep_items_from_base(base, in_files):
    """Prepare a set of configuration items for input files.
    """
    details = []
    fq_exts = [".fq", ".fastq", ".txt", ".gz"]
    gz_exts = tuple(["%s.gz" % ext for ext in fq_exts if ext != ".gz"])
    for ext, files in itertools.groupby(in_files, lambda x: os.path.splitext(x)[-1].lower()):
        if ext == ".bam":
            for f in files:
                details.append(_prep_bam_input(f, base))
        elif ext in fq_exts:
            files = list(files)
            if ext == ".gz": assert all(f.endswith(gz_exts) for f in files), (files, gz_exts)
            for fs in fastq.combine_pairs(files):
                details.append(_prep_fastq_input(fs, base))
        else:
            raise ValueError("File type not yet implemented: %s" % ext)
    return details
Example #22
0
def _prep_items_from_base(base, in_files, force_single=False):
    """Build configuration items for the given input files.

    After expanding directories and wildcards, consecutive files of the same
    type (as assigned by ``KNOWN_EXTS``) are processed together: one item per
    BAM file and one item per fastq group. Other file types are reported on
    stdout and skipped.
    """
    in_files = _expand_wildcards(_expand_dirs(in_files, KNOWN_EXTS))

    def _file_type(fname):
        return KNOWN_EXTS.get(utils.splitext_plus(fname)[-1].lower())

    details = []
    # ``i`` is the index of the run of same-type files, passed on for BAMs.
    for i, (ext, group) in enumerate(itertools.groupby(in_files, _file_type)):
        if ext == "bam":
            details.extend(_prep_bam_input(f, i, base) for f in group)
        elif ext in ["fastq", "fq", "fasta"]:
            paired = fastq.combine_pairs(list(group), force_single)
            details.extend(_prep_fastq_input(fs, base) for fs in paired)
        else:
            print("Ignoring unexpected input file types %s: %s" % (ext, list(group)))
    return details
Example #23
0
def _prep_items_from_base(base, in_files):
    """Prepare a set of configuration items for input files.
    """
    details = []
    fq_exts = [".fq", ".fastq", ".txt", ".gz"]
    gz_exts = tuple(["%s.gz" % ext for ext in fq_exts if ext != ".gz"])
    for ext, files in itertools.groupby(
            in_files, lambda x: os.path.splitext(x)[-1].lower()):
        if ext == ".bam":
            for f in files:
                details.append(_prep_bam_input(f, base))
        elif ext in fq_exts:
            files = list(files)
            if ext == ".gz":
                assert all(f.endswith(gz_exts)
                           for f in files), (files, gz_exts)
            for fs in fastq.combine_pairs(files):
                details.append(_prep_fastq_input(fs, base))
        else:
            raise ValueError("File type not yet implemented: %s" % ext)
    return details
Example #24
0
def prep_fastq_inputs(in_files, data):
    """Prepare bgzipped fastq inputs.

    Single BAM/CRAM inputs are converted, one or two inputs accepted by
    ``_ready_gzip_fastq`` are symlinked, and all remaining inputs are passed
    to ``_bgzip_from_fastq_parallel``. More than two files are first grouped
    with ``fastq.combine_pairs`` and reorganized into one list per read.
    """
    if len(in_files) == 1 and _is_bam_input(in_files):
        out = _bgzip_from_bam(in_files[0], data["dirs"], data)
    elif len(in_files) == 1 and _is_cram_input(in_files):
        out = _bgzip_from_cram(in_files[0], data["dirs"], data)
    elif len(in_files) in [1, 2] and _ready_gzip_fastq(in_files, data):
        out = _symlink_in_files(in_files, data)
    else:
        if len(in_files) > 2:
            groups = fastq.combine_pairs(in_files)
            group_sizes = {len(group) for group in groups}
            # Mixing single and paired groups is not supported.
            assert len(group_sizes) == 1
            groups.sort(key=lambda group: os.path.basename(group[0]))
            first_reads = [group[0] for group in groups]
            in_files = [first_reads]
            if len(groups[0]) > 1:
                second_reads = [group[1] for group in groups]
                in_files.append(second_reads)
        num_jobs = len(in_files)
        total_cores = data["config"]["algorithm"]["num_cores"]
        parallel = {
            "type": "local",
            "num_jobs": num_jobs,
            "cores_per_job": max(1, total_cores // num_jobs),
        }
        job_args = []
        for read_num, in_file in enumerate(in_files):
            if in_file:
                job_args.append([{
                    "in_file": in_file,
                    "read_num": read_num,
                    "dirs": data["dirs"],
                    "config": data["config"],
                    "is_cwl": "cwl_keys" in data,
                    "rgnames": data["rgnames"],
                }])
        out = run_multicore(_bgzip_from_fastq_parallel, job_args,
                            data["config"], parallel)
    return out
Example #25
0
def query_srr(sra, out_file, config=None):
    """Fetch the given SRA entries and merge their reads into ``out_file``.

    Every entry of ``sra[0]`` is turned into a link with ``_create_link``;
    each link is passed to ``_download_srx`` and the resulting files to
    ``_convert_fastq``. The collected fastq files are grouped with
    ``combine_pairs`` and merged with ``fastq.merge``.

    Args:
        sra: Sequence whose first element is the iterable of entries to fetch.
        out_file: Output path handed to ``fastq.merge``. Intermediate files
            go next to it, inside a directory named after the output file.
        config: Configuration passed through to ``fastq.merge``. Defaults to
            an empty dict.

    Returns:
        The value returned by ``fastq.merge``, or ``None`` when no fastq
        files were produced.
    """
    if config is None:
        config = {}
    sra = sra[0]
    outs = []
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    srrall = [_create_link(srr) for srr in sra]
    logger.debug("Get FTP link for %s : %s" % (name, srrall))
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(srx, sra_dir)
        if srafiles:
            logger.debug("Get SRA for %s: %s" % (sra, " ".join(srafiles)))
            # Use a separate loop name so ``sra`` keeps referring to the
            # requested entries in the log messages.
            for sra_file in srafiles:
                fastq_fn = _convert_fastq(sra_file, out_dir)
                if fastq_fn:
                    outs.extend(fastq_fn)
        logger.debug("Get FASTQ for %s: %s" % (sra, " ".join(outs)))
    if not outs:
        return None
    files = combine_pairs(outs)
    return fastq.merge(files, out_file, config)
Example #26
0
def query_srr(sra, out_file, config=None):
    """Fetch the given SRA entries and merge their reads into ``out_file``.

    Every entry of ``sra[0]`` is turned into a link with ``_create_link``;
    each link is passed to ``_download_srx`` and the resulting files to
    ``_convert_fastq``. The collected fastq files are grouped with
    ``combine_pairs`` and merged with ``fastq.merge``.

    Args:
        sra: Sequence whose first element is the iterable of entries to fetch.
        out_file: Output path handed to ``fastq.merge``. Intermediate files
            go next to it, inside a directory named after the output file.
        config: Configuration passed through to ``fastq.merge``. Defaults to
            an empty dict.

    Returns:
        The value returned by ``fastq.merge``, or ``None`` when no fastq
        files were produced.
    """
    if config is None:
        config = {}
    sra = sra[0]
    outs = []
    out_dir = os.path.dirname(os.path.abspath(out_file))
    name = utils.splitext_plus(os.path.basename(out_file))[0]
    srrall = [_create_link(srr) for srr in sra]
    logger.debug("Get FTP link for %s : %s" % (name, srrall))
    for srx in srrall:
        sra_dir = utils.safe_makedir(os.path.join(out_dir, name))
        srafiles = _download_srx(srx, sra_dir)
        if srafiles:
            logger.debug("Get SRA for %s: %s" % (sra, " ".join(srafiles)))
            # Use a separate loop name so ``sra`` keeps referring to the
            # requested entries in the log messages.
            for sra_file in srafiles:
                fastq_fn = _convert_fastq(sra_file, out_dir)
                if fastq_fn:
                    outs.extend(fastq_fn)
        logger.debug("Get FASTQ for %s: %s" % (sra, " ".join(outs)))
    if not outs:
        return None
    files = combine_pairs(outs)
    return fastq.merge(files, out_file, config)
Example #27
0
def _prep_items_from_base(base, in_files):
    """Prepare a set of configuration items for input files.

    Directories and wildcards in ``in_files`` are expanded, then consecutive
    files of the same type (looked up from their extension) are processed
    together: one item per BAM file and one item per fastq group found by
    ``fastq.combine_pairs``.

    Raises:
        ValueError: If a file has an extension that is not a known BAM or
            fastq extension.
    """
    details = []
    known_exts = {".bam": "bam", ".fq": "fastq",
                  ".fastq": "fastq", ".txt": "fastq",
                  ".fastq.gz": "fastq", ".fq.gz": "fastq",
                  ".txt.gz": "fastq", ".gz": "fastq"}
    in_files = _expand_dirs(in_files, known_exts)
    in_files = _expand_wildcards(in_files)

    # groupby only merges consecutive files of the same type; ``i`` is the
    # index of that run and is passed on to the BAM preparation.
    for i, (ext, files) in enumerate(itertools.groupby(
            in_files, lambda x: known_exts.get(utils.splitext_plus(x)[-1].lower()))):
        if ext == "bam":
            for f in files:
                details.append(_prep_bam_input(f, i, base))
        elif ext == "fastq":
            for fs in fastq.combine_pairs(list(files)):
                details.append(_prep_fastq_input(fs, base))
        else:
            # ``files`` is a lazy group iterator; expand it so the message
            # lists the offending file names instead of the iterator object.
            raise ValueError("Unexpected input file types: %s" % str(list(files)))
    return details
def _check_paired(files):
    """check if files are fastq(.gz) and paired"""
    if files[0].endswith(".bam"):
        return files
    return combine_pairs(files)