Beispiel #1
0
def get_cellular_barcodes(data):
    """
    Return the cellular barcodes to use for this sample.

    Whatever dd.get_cellular_barcodes(data) returns is handed back unchanged
    when it is truthy. Otherwise, if is_supported_transform(data) holds, the
    paths "<umi_type>-cb1.txt" and "<umi_type>-cb2.txt" under TRANSFORM_DIR
    are checked and the ones that exist are returned as a list. In every
    other case an empty list is returned.
    """
    configured = dd.get_cellular_barcodes(data)
    if configured:
        return configured
    if not is_supported_transform(data):
        return []
    stem = dd.get_umi_type(data)
    candidates = [os.path.join(TRANSFORM_DIR, stem + suffix)
                  for suffix in ("-cb1.txt", "-cb2.txt")]
    # Build a real list rather than returning filter(): on Python 3 filter()
    # is a lazy iterator that is always truthy and does not support len().
    return [path for path in candidates if file_exists(path)]
Beispiel #2
0
def get_cellular_barcodes(data):
    """
    Return the cellular barcodes to use for this sample.

    Whatever dd.get_cellular_barcodes(data) returns is handed back unchanged
    when it is truthy. Otherwise, if is_supported_transform(data) holds, the
    paths "<umi_type>-cb1.txt", "<umi_type>-cb2.txt" and "<umi_type>-cb3.txt"
    under TRANSFORM_DIR are checked and the ones that exist are returned as a
    list. In every other case an empty list is returned.
    """
    configured = dd.get_cellular_barcodes(data)
    if configured:
        return configured
    if not is_supported_transform(data):
        return []
    stem = dd.get_umi_type(data)
    candidates = [os.path.join(TRANSFORM_DIR, stem + suffix)
                  for suffix in ("-cb1.txt", "-cb2.txt", "-cb3.txt")]
    # Build a real list rather than returning filter(): on Python 3 filter()
    # is a lazy iterator that is always truthy and does not support len().
    return [path for path in candidates if file_exists(path)]
Beispiel #3
0
def umi_transform(data):
    """
    Move barcode and UMI information into the read names of a sample.

    The transform definition is looked up in `transforms` by the sample's
    UMI type and passed to write_transform_file with a ".json" path under
    <work_dir>/umis. `umis fastqtransform` is then piped through
    `seqtk seq -L 20` and gzip into <sample_name>.umitransformed.fq.gz in
    the same directory. The command is skipped when that file already exists.

    Returns [[data]] with data["files"] replaced by the transformed file.
    """
    fq1, fq2 = dd.get_input_sequence_files(data)
    if not fq2:
        # a missing second read file becomes an empty command-line argument
        fq2 = ""
    work_umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    umi_type = dd.get_umi_type(data)
    spec = transforms[umi_type]
    safe_makedir(work_umi_dir)
    transform_file = write_transform_file(
        spec, os.path.join(work_umi_dir, umi_type + ".json"))
    out_file = os.path.join(
        work_umi_dir, dd.get_sample_name(data) + ".umitransformed.fq.gz")

    if not file_exists(out_file):
        index_option = "--dual_index" if spec["dual"] else ""
        has_two_barcodes = len(dd.get_cellular_barcodes(data)) == 2
        split_option = "--separate_cb" if has_two_barcodes else ""
        umis = config_utils.get_program("umis", data, default="umis")
        template = (
            "{umis} fastqtransform {index_option} {split_option} {transform_file} "
            "{fq1} {fq2} "
            "| seqtk seq -L 20 - | gzip > {tx_out_file}")
        message = (
            "Inserting UMI and barcode information into the read name of %s" % fq1)
        with file_transaction(out_file) as tx_out_file:
            command = template.format(
                umis=umis, index_option=index_option,
                split_option=split_option, transform_file=transform_file,
                fq1=fq1, fq2=fq2, tx_out_file=tx_out_file)
            do.run(command, message)

    data["files"] = [out_file]
    return [[data]]
Beispiel #4
0
def umi_transform(data):
    """
    Rewrite a sample's reads so each read name carries its barcode and UMI.

    Looks up the transform for the sample's UMI type in `transforms`, passes
    it to write_transform_file with a ".json" path under <work_dir>/umis and
    runs `umis fastqtransform` piped through `seqtk seq -L 20` and gzip. The
    result is <sample_name>.umitransformed.fq.gz in that directory; if it
    already exists no command is run.

    Returns [[data]] with data["files"] pointing at the transformed file.
    """
    fq1, fq2 = dd.get_input_sequence_files(data)
    fq2 = fq2 or ""
    target_dir = os.path.join(dd.get_work_dir(data), "umis")
    transform_name = dd.get_umi_type(data)
    definition = transforms[transform_name]
    safe_makedir(target_dir)
    json_path = os.path.join(target_dir, transform_name + ".json")
    json_path = write_transform_file(definition, json_path)
    out_file = os.path.join(
        target_dir, dd.get_sample_name(data) + ".umitransformed.fq.gz")
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    # Values substituted into the command template below, keyed by the
    # placeholder names it uses.
    fields = {"fq1": fq1, "fq2": fq2, "transform_file": json_path}
    fields["index_option"] = "--dual_index" if definition["dual"] else ""
    if len(dd.get_cellular_barcodes(data)) == 2:
        fields["split_option"] = "--separate_cb"
    else:
        fields["split_option"] = ""
    fields["umis"] = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} fastqtransform {index_option} {split_option} {transform_file} "
           "{fq1} {fq2} "
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        fields["tx_out_file"] = tx_out_file
        do.run(cmd.format(**fields), message)
    data["files"] = [out_file]
    return [[data]]
Beispiel #5
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name

    Up to four input files are taken from data["files"]; missing ones are
    passed to the command as empty strings. The UMI type may be either a
    path to an existing transform file or a key of `transforms`; if it is
    neither, an error is logged and the process exits with status 1.

    The output is <work_dir>/umis/<sample_name>.umitransformed.fq.gz and the
    command is skipped if that file already exists.

    Returns [[data]] with data["files"] replaced by the output path.
    """
    # Pad a copy so the list object held in data["files"] is not mutated.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_data = transforms.get(transform, "")
        if not transform_data:
            # Fill the %s placeholder so the message names the bad transform.
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio." % (transform,))
            sys.exit(1)
        transform_file = os.path.join(umi_dir, transform + ".json")
        transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        # next() works for both Python 2 and Python 3 iterators
        read = next(in_handle)
        if "UMI_" in read:
            # NOTE(review): out_file was just found not to exist above, so
            # this hands back a path that has not been written -- confirm
            # whether fq1 was intended here. Behavior left unchanged.
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
Beispiel #6
0
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name

    Up to four input files are taken from data["files"]; missing ones are
    passed to the command as empty strings. The UMI type may be either a
    path to an existing transform file or a key of `transforms`; if it is
    neither, an error is logged and the process exits with status 1.

    The output is <work_dir>/umis/<sample_name>.umitransformed.fq.gz and the
    command is skipped if that file already exists.

    Returns [[data]] with data["files"] replaced by the output path.
    """
    # Pad a copy so the list object held in data["files"] is not mutated.
    fqfiles = list(data["files"])
    fqfiles.extend([""] * (4 - len(fqfiles)))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_data = transforms.get(transform, "")
        if not transform_data:
            # Fill the %s placeholder so the message names the bad transform.
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio." % (transform,))
            sys.exit(1)
        transform_file = os.path.join(umi_dir, transform + ".json")
        transform_file = write_transform_file(transform_data, transform_file)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    if len(dd.get_cellular_barcodes(data)) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        # next() works for both Python 2 and Python 3 iterators
        read = next(in_handle)
        if "UMI_" in read:
            # NOTE(review): out_file was just found not to exist above, so
            # this hands back a path that has not been written -- confirm
            # whether fq1 was intended here. Behavior left unchanged.
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(umis=umis, split_option=split_option,
                          transform_file=transform_file, cores=cores,
                          fq1=fq1, fq2=fq2, fq3=fq3, fq4=fq4,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
Beispiel #7
0
def filter_barcodes(data):
    """
    Filter the first input FASTQ by cellular barcode with `umis cb_filter`.

    The barcodes come from dd.get_cellular_barcodes(data): a single string,
    or a sequence of one or two entries, mapped to --bc1 / --bc2. When no
    barcodes are available the sample is returned untouched. If a first
    barcode and a correction value are both present, --nedit is added.

    The output is <work_dir>/umis/<sample_name>.filtered.fq.gz; the command
    is skipped when that file already exists.

    Returns [[data]], with data["files"] replaced by the filtered file
    whenever barcodes were available.
    """
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = dd.get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3
    bc1 = None
    bc2 = None
    # Mutually exclusive cases: a bare string must not fall through to the
    # length checks, which would otherwise split it into single characters.
    if isinstance(bc, string_types):
        bc1 = bc
    elif len(bc) == 1:
        bc1 = bc[0]
    elif len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "

    # gzipped input is decompressed on the fly via shell process substitution
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    # NOTE(review): this per-sample directory is created but the output is
    # written directly under umi_dir -- kept as in the original.
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(umis=umis, ncores=ncores, bc1=bc1, bc2=bc2,
                          correction=correction, fq1_cmd=fq1_cmd,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]
Beispiel #8
0
def filter_barcodes(data):
    """
    Filter the first input FASTQ by cellular barcode with `umis cb_filter`.

    The barcodes come from dd.get_cellular_barcodes(data): a single string,
    or a sequence of one or two entries, mapped to --bc1 / --bc2. When no
    barcodes are available the sample is returned untouched. If a first
    barcode and a correction value are both present, --nedit is added.

    The output is <work_dir>/umis/<sample_name>.filtered.fq.gz; the command
    is skipped when that file already exists.

    Returns [[data]], with data["files"] replaced by the filtered file
    whenever barcodes were available.
    """
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = dd.get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3
    bc1 = None
    bc2 = None
    # Mutually exclusive cases: a bare string must not fall through to the
    # length checks, which would otherwise split it into single characters.
    if isinstance(bc, string_types):
        bc1 = bc
    elif len(bc) == 1:
        bc1 = bc[0]
    elif len(bc) == 2:
        bc1 = bc[0]
        bc2 = bc[1]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "

    # gzipped input is decompressed on the fly via shell process substitution
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    # NOTE(review): this per-sample directory is created but the output is
    # written directly under umi_dir -- kept as in the original.
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(umis=umis, ncores=ncores, bc1=bc1, bc2=bc2,
                          correction=correction, fq1_cmd=fq1_cmd,
                          tx_out_file=tx_out_file),
               message)
    data["files"] = [out_file]
    return [[data]]