Example #1
0
def _detect_fastq_format(in_file, MAX_RECORDS=1000):
    ranges = {
        "sanger": (33, 126),
        "solexa": (59, 126),
        "illumina_1.3+": (64, 126),
        "illumina_1.5+": (66, 126)
    }

    gmin, gmax = 99, 0
    possible = set(ranges.keys())

    with closing(open_fastq(in_file)) as in_handle:
        four = itertools.islice(in_handle, 3, None, 4)
        count = 0
        for line in four:
            if len(possible) == 1:
                return possible
            if count > MAX_RECORDS:
                break
            count += 1
            vals = [ord(c) for c in line.rstrip()]
            # if there is a short sequence, skip it
            if len(vals) < 20:
                continue
            lmin = min(vals)
            lmax = max(vals)
            for encoding, (emin, emax) in ranges.items():
                if encoding in possible:
                    if lmin < emin or lmax > emax:
                        possible.remove(encoding)

    return possible
Example #2
0
def _detect_fastq_format(in_file, MAX_RECORDS=1000):
    ranges = {"sanger": (33, 126),
              "solexa": (59, 126),
              "illumina_1.3+": (64, 126),
              "illumina_1.5+": (66, 126)}

    gmin, gmax = 99, 0
    possible = set(ranges.keys())

    with closing(open_fastq(in_file)) as in_handle:
        four = itertools.islice(in_handle, 3, None, 4)
        count = 0
        for line in four:
            if len(possible) == 1:
                return possible
            if count > MAX_RECORDS:
                break
            count += 1
            vals = [ord(c) for c in line.rstrip()]
            # if there is a short sequence, skip it
            if len(vals) < 20:
                continue
            lmin = min(vals)
            lmax = max(vals)
            for encoding, (emin, emax) in ranges.items():
                if encoding in possible:
                    if lmin < emin or lmax > emax:
                        possible.remove(encoding)

    return possible
Example #3
0
def demultiplex_samples(data):
    """
    demultiplex a fastqtransformed FASTQ file into separate sample barcode files
    """
    work_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")

    files = data["files"]
    if len(files) == 2:
        logger.error(
            "Sample demultiplexing doesn't handle paired-end reads, but "
            "we can add it. Open an issue here https://github.com/bcbio/bcbio-nextgen/issues if you need this and we'll add it."
        )
        sys.exit(1)
    else:
        fq1 = files[0]
    # check if samples need to be demultiplexed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "SAMPLE_" not in read:
            return [[data]]

    bcfile = get_sample_barcodes(dd.get_sample_barcodes(data), sample_dir)
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    if demultiplexed:
        return [split_demultiplexed_sampledata(data, demultiplexed)]
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(**locals()), msg.format(**locals()))
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    return [split_demultiplexed_sampledata(data, demultiplexed)]
Example #4
0
def demultiplex_samples(data):
    """
    demultiplex a fastqtransformed FASTQ file into separate sample barcode files
    """
    files = data["files"]
    if len(files) == 2:
        logger.error("Sample demultiplexing doesn't handle paired-end reads, but "
            "we can add it. Open an issue here https://github.com/chapmanb/bcbio-nextgen/issues if you need this and we'll add it.")
        sys.exit(1)
    else:
        fq1 = files[0]
    # check if samples need to be demultiplexed
    with open_fastq(fq1) as in_handle:
        read = in_handle.next()
        if "SAMPLE_" not in read:
            return [[data]]
    bcfile = dd.get_sample_barcodes(data)
    if not bcfile:
        logger.error("Sample demultiplexing needs a list of known indexes provided "
                     "with via the sample_barcodes option in the algorithm section.")
        sys.exit(1)
    work_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    if demultiplexed:
        return [split_demultiplexed_sampledata(data, demultiplexed)]
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(**locals()), msg.format(**locals()))
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    return [split_demultiplexed_sampledata(data, demultiplexed)]
Example #5
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed. Assuming non-umi data." % fq1)
            return data

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return data

    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
Example #6
0
def is_transformed(fastq):
    """
    check the first 100 reads to see if a FASTQ file has already been transformed
    by umis
    """

    with open_fastq(fastq) as in_handle:
        for line in islice(in_handle, 400):
            if "UMI_" in line:
                return True
    return False
Example #7
0
def is_transformed(fastq):
    """
    check the first 100 reads to see if a FASTQ file has already been transformed
    by umis
    """

    with open_fastq(fastq) as in_handle:
        for line in islice(in_handle, 400):
            if "UMI_" in line:
                return True
    return False
Example #8
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (transform_file, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = in_handle.next()
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #9
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4-len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                %(dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = in_handle.next()
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #10
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4-len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_data = transforms[transform]
        transform_file = os.path.join(umi_dir, transform + ".json")
        transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = in_handle.next()
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #11
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if not transform:
        logger.info(
            "No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info(
                "%s detected as pre-transformed, passing it on unchanged." %
                fq1)
            data["files"] = [fq1]
            return [[data]]
        else:
            logger.error(
                "No UMI transform was specified, but %s does not look "
                "pre-transformed." % fq1)
            sys.exit(1)

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    if dd.get_demultiplexed(data):
        demuxed_option = "--demuxed_cb %s" % dd.get_sample_name(data)
        split_option = ""
    else:
        demuxed_option = ""
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]
    locale_export = utils.locale_export()
    umis = _umis_cmd(data)
    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} {demuxed_option} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]