Exemple #1
0
def organize_samples(dirs, fc_name, fc_date, run_items):
    """Group sorted BAM files and their FASTQ inputs by sample.

    Every lane in ``run_items`` contributes sample keys: ``(name,
    description)`` for a plain lane or, for each entry of a non-empty
    ``"multiplex"`` list, ``(name, description, barcode name)``.  A sample
    is only recorded when its ``*-sort.bam`` file exists in
    ``dirs["align"]``.

    Returns a 3-tuple: the sorted ``(key, [BAM paths])`` pairs, a dict of
    key -> list of ``get_fastq_files`` results, and a dict of key -> lane
    information.
    """
    sample_bams = collections.defaultdict(list)
    sample_fastqs = collections.defaultdict(list)
    lane_by_sample = {}

    for lane in run_items:
        barcodes = lane.get("multiplex", None)
        if not barcodes:
            # Plain lane: one BAM named after lane, date and flowcell.
            key = (lane.get("name", ""), lane["description"])
            bam_path = os.path.join(
                dirs["align"],
                "%s_%s_%s-sort.bam" % (lane["lane"], fc_date, fc_name))
            if os.path.exists(bam_path):
                sample_bams[key].append(bam_path)
                lane_by_sample[key] = lane
                fastqs = get_fastq_files(dirs["fastq"], lane, fc_name)
                sample_fastqs[key].append(fastqs)
            continue

        # Multiplexed lane: one BAM per barcode, FASTQs from the barcode dir.
        barcode_dir = os.path.join(
            dirs["work"],
            "%s_%s_%s_barcode" % (lane["lane"], fc_date, fc_name))
        for barcode in barcodes:
            key = (lane.get("name", ""), lane["description"], barcode["name"])
            bam_path = os.path.join(
                dirs["align"],
                "%s_%s_%s_%s-sort.bam" % (lane["lane"], fc_date, fc_name,
                                          barcode["barcode_id"]))
            if not os.path.exists(bam_path):
                continue
            sample_bams[key].append(bam_path)
            lane_by_sample[key] = lane
            fastqs = get_fastq_files(barcode_dir, lane, fc_name,
                                     barcode["barcode_id"])
            sample_fastqs[key].append(fastqs)

    return sorted(sample_bams.items()), dict(sample_fastqs), lane_by_sample
Exemple #2
0
def organize_samples(dirs, fc_name, fc_date, run_items, align_items=None):
    """Organize BAM output files by sample name, handling multiplexing.

    For each sample the BAM is taken from ``dirs["align"]`` when the
    expected ``<base>-sort.bam`` file exists; otherwise ``<base>`` is looked
    up in the mapping built from ``align_items``.

    Args:
        dirs: mapping providing the "work", "align" and "fastq" directories.
        fc_name: flowcell name, used to build file names.
        fc_date: flowcell date, used to build file names.
        run_items: lane dictionaries with "lane" and "description", plus
            optional "name" and "multiplex" (entries with "name" and
            "barcode_id").
        align_items: optional input handed to ``_organize_bam_by_lane`` to
            build the fallback ``<base>`` -> BAM mapping.

    Returns:
        A 3-tuple: sorted ``(sample key, [BAM entries])`` pairs, a dict of
        sample key -> list of ``get_fastq_files`` results, and a dict of
        sample key -> lane information.

    Raises:
        ValueError: a non-multiplexed lane has no BAM in either location.
    """
    if align_items is not None:
        bams_by_lane = _organize_bam_by_lane(align_items)
    else:
        bams_by_lane = {}

    bams_by_sample = collections.defaultdict(list)
    sample_info = {}
    fastq_by_sample = collections.defaultdict(list)
    for lane_info in run_items:
        multiplex = lane_info.get("multiplex", None)

        if multiplex:
            mfastq_dir = os.path.join(dirs["work"], "%s_%s_%s_barcode" %
                                      (lane_info["lane"], fc_date, fc_name))
            for multi in multiplex:
                name = (lane_info.get("name", ""), lane_info["description"],
                        multi["name"])
                base = "%s_%s_%s_%s" % (lane_info["lane"], fc_date, fc_name,
                                        multi["barcode_id"])
                fname = os.path.join(dirs["align"], "%s-sort.bam" % base)
                if os.path.exists(fname):
                    bams_by_sample[name].append(fname)
                elif base in bams_by_lane:
                    bams_by_sample[name].append(bams_by_lane[base])
                else:
                    # Not all barcodes may exist; a missing one is skipped
                    # rather than treated as an error.
                    continue
                sample_info[name] = lane_info
                fastq_by_sample[name].append(
                    get_fastq_files(mfastq_dir, lane_info, fc_name,
                                    multi["barcode_id"]))
        else:
            name = (lane_info.get("name", ""), lane_info["description"])
            base = "%s_%s_%s" % (lane_info["lane"], fc_date, fc_name)
            fname = os.path.join(dirs["align"], "%s-sort.bam" % base)
            if os.path.exists(fname):
                bams_by_sample[name].append(fname)
            elif base in bams_by_lane:
                bams_by_sample[name].append(bams_by_lane[base])
            else:
                raise ValueError("Did not find BAM files for %s" % lane_info)
            sample_info[name] = lane_info
            fastq_by_sample[name].append(get_fastq_files(dirs["fastq"],
                                                         lane_info, fc_name))
    return sorted(bams_by_sample.items()), dict(fastq_by_sample), sample_info
Exemple #3
0
def prepare_sample(data):
    """Attach the sample's input files to its data dictionary.

    Logs the sample name found at ``data["rgnames"]["sample"]``, stores the
    result of ``get_fastq_files(data)`` under ``data["files"]`` and returns
    the same dictionary wrapped as ``[[data]]``.
    """
    sample_name = data["rgnames"]["sample"]
    logger.debug("Preparing %s" % sample_name)
    input_files = get_fastq_files(data)
    data["files"] = input_files
    return [[data]]
Exemple #4
0
def prepare_sample(data):
    """Resolve and record the input files for one sample.

    The sample name (``data["rgnames"]["sample"]``) is logged at debug
    level, ``data["files"]`` is set to whatever ``get_fastq_files(data)``
    returns, and the updated dictionary is returned as ``[[data]]``.
    """
    sample_name = data["rgnames"]["sample"]
    logger.debug("Preparing %s" % sample_name)
    resolved = get_fastq_files(data)
    data["files"] = resolved
    wrapped = [[data]]
    return wrapped
Exemple #5
0
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare the per-barcode FASTQ inputs of one lane.

    The lane FASTQ files are located using the first item of
    ``lane_items``, split by barcode, and passed through
    ``_prep_fastq_files`` for every item whose barcode has split files.

    Returns a list of ``(fastq1, fastq2, item, lane name, lane description,
    dirs, config)`` tuples.
    """
    first_item = lane_items[0]
    lane_name = "%s_%s_%s" % (first_item['lane'], fc_date, fc_name)
    logger.info("Preparing %s" % lane_name)
    full_fastq1, full_fastq2 = get_fastq_files(
        dirs["fastq"], dirs["work"], first_item, fc_name, dirs=dirs,
        config=shared.update_config_w_custom(config, first_item))
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)

    prepared = []
    for item in lane_items:
        # NOTE(review): ``config`` is rebound here, so each item's update is
        # applied to the result of the previous item's update -- confirm
        # that this accumulation is intended.
        config = shared.update_config_w_custom(config, item)
        # Every barcode may be listed even when no sequences were found for
        # it; such items are skipped.
        if item["barcode_id"] not in bc_files:
            continue
        fastq_parts = _prep_fastq_files(item, bc_files, dirs, config)
        for fastq1, fastq2, lane_ext in fastq_parts:
            item_lane_name = lane_name
            description = item["description"]
            use_short_name = (item.get("name", "") and
                              config["algorithm"].get("include_short_name",
                                                      True))
            if use_short_name:
                description = "%s : %s" % (item["name"], description)
            if item["barcode_id"] is not None:
                item_lane_name += "_%s" % (item["barcode_id"])
            if lane_ext is not None:
                item_lane_name += "_s{0}".format(lane_ext)
            prepared.append((fastq1, fastq2, item, item_lane_name,
                             description, dirs, config))
    return prepared
Exemple #6
0
def calling(data):
    """Run the caller selected by ``data["rmats_fn"]`` for one sample.

    The caller receives the sample name, the work BAM, the replicate and
    input BAMs from ``data``, the GTF file, an output directory under the
    work directory, a binned read length and whether the reads are
    "paired" or "single".  Its return value is stored in
    ``data["rmats_file"]``.

    Returns:
        ``[[data]]`` with ``"rmats_file"`` set.
    """
    chip_bam = dd.get_work_bam(data)
    rep_bam = data["work_bam_rep"]
    input_bam = data["work_bam_input"]
    caller_fn = get_callers()[data["rmats_fn"]]
    name = dd.get_sample_name(data)
    fastq_file = fastq.get_fastq_files(data)

    # Round the estimated read length up to the nearest supported bin.
    # The boundaries are inclusive so that reads of exactly 50 or 75 bases
    # stay in their own bin instead of falling through to 100.
    read_len = bam.fastq.estimate_read_length(fastq_file[0])
    if read_len <= 50:
        read_len = 50
    elif read_len <= 75:
        read_len = 75
    else:
        read_len = 100

    # More than one FASTQ file means paired-end input.
    read_pair = "paired" if len(fastq_file) > 1 else "single"
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), data["rmats_fn"], name))
    out_file = caller_fn(name, chip_bam, rep_bam, input_bam,
                         dd.get_gtf_file(data), out_dir, read_len, read_pair,
                         data["config"])
    data["rmats_file"] = out_file
    return [[data]]
Exemple #7
0
def process_lane(info, fc_name, fc_date, dirs, config):
    """Prepare one lane, producing an entry for every barcode split.

    The configuration is first customized for ``info``.  The sample name is
    the lane description, prefixed with the short name when the
    ``include_short_name`` algorithm option (default ``True``) is on and a
    short name is present.

    Returns a list of ``(fastq1, fastq2, genome build, lane name, sample
    name, dirs, config)`` tuples, one per item produced by
    ``split_by_barcode``.
    """
    config = _update_config_w_custom(config, info)

    description = info.get("description", "")
    wants_short_name = config["algorithm"].get("include_short_name", True)
    if wants_short_name and info.get("name", ""):
        sample_name = "%s---%s" % (info.get("name", ""), description)
    else:
        sample_name = description
    genome_build = info.get("genome_build", None)
    multiplex = info.get("multiplex", None)

    summary = (sample_name, info["lane"], genome_build,
               info.get("researcher", ""), info.get("analysis", ""))
    log.info("Processing sample: %s; lane %s; reference genome %s; "
             "researcher %s; analysis method %s" % summary)
    if multiplex:
        log.debug("Sample %s multiplexed as: %s" % (sample_name, multiplex))

    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], info, fc_name)
    lane_name = "%s_%s_%s" % (info['lane'], fc_date, fc_name)
    barcode_splits = split_by_barcode(full_fastq1, full_fastq2, multiplex,
                                      lane_name, dirs, config)
    lane_items = []
    for mname, msample, fastq1, fastq2 in barcode_splits:
        if mname:
            mlane_name = "%s_%s" % (lane_name, mname)
        else:
            mlane_name = lane_name
        # Fall back to a name derived from the lane sample and the barcode.
        if msample is None:
            msample = "%s---%s" % (sample_name, mname)
        entry = (fastq1, fastq2, genome_build, mlane_name, msample,
                 dirs, config)
        lane_items.append(entry)
    return lane_items
Exemple #8
0
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.

    The lane FASTQ files are located from the first item of ``lane_items``
    and split by barcode.  Each item whose barcode has split files is then
    passed through ``_prep_fastq_files`` and, when the ``trim_reads``
    algorithm option is set, through ``brun_trim_fastq``.

    Returns:
        A list of ``(fastq1, fastq2, item, lane name, lane description,
        dirs, config)`` tuples.
    """
    lane_name = "%s_%s_%s" % (lane_items[0]['lane'], fc_date, fc_name)
    logger.info("Demulitplexing %s" % lane_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lane_items[0], fc_name,
                                               config=config)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)
    out = []
    for item in lane_items:
        # NOTE(review): ``config`` is rebound per item, so updates build on
        # the previous item's result -- confirm this is intended.
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        # ``in`` replaces dict.has_key, which no longer exists in Python 3.
        if item["barcode_id"] not in bc_files:
            continue
        for fastq1, fastq2, lane_ext in _prep_fastq_files(item, bc_files,
                                                          dirs, config):
            cur_lane_name = lane_name
            cur_lane_desc = item["description"]
            if item.get("name", "") and config["algorithm"].get(
                    "include_short_name", True):
                cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
            if item["barcode_id"] is not None:
                cur_lane_name += "_%s" % (item["barcode_id"])
            if lane_ext is not None:
                cur_lane_name += "_s{0}".format(lane_ext)
            if config["algorithm"].get("trim_reads", False):
                # Only files that are present are handed to the trimmer;
                # results come back in the same order.
                trim_info = brun_trim_fastq(
                    [x for x in [fastq1, fastq2] if x is not None],
                    dirs, config)
                fastq1 = trim_info[0]
                if fastq2 is not None:
                    fastq2 = trim_info[1]
            out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                        dirs, config))
    return out
Exemple #9
0
def _get_fastq_size(item, fastq_dir, fc_name):
    """Return the length of the first read in the item's first FASTQ file.

    The file is the first element returned by ``get_fastq_files``.

    Returns:
        The sequence length of the first record, or 0 when the file
        contains no records.
    """
    (fastq1, _) = get_fastq_files(fastq_dir, None, item, fc_name)
    with open(fastq1) as in_handle:
        # The builtin next() works on Python 2.6+ and Python 3; the
        # iterator's .next() method only exists on Python 2.
        first_record = next(SeqIO.parse(in_handle, "fastq"), None)
    if first_record is None:
        return 0
    return len(first_record.seq)
Exemple #10
0
def prepare_sample(data):
    """Attach input files to a sample and normalize its quality format.

    ``data`` is first unwrapped with ``utils.to_single_data``.  The result
    of ``get_fastq_files(data)`` is stored under ``data["files"]``.  Unless
    read trimming is enabled, the quality format is set to "standard".

    Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    sample_name = data["rgnames"]["sample"]
    logger.debug("Preparing %s" % sample_name)
    data["files"] = get_fastq_files(data)
    # Per the original note: get_fastq_files swaps quality scores over to
    # standard, unless trimming is requested.
    trimming = dd.get_trim_reads(data)
    if trimming:
        return [[data]]
    return [[dd.set_quality_format(data, "standard")]]
Exemple #11
0
def prepare_sample(data):
    """Resolve a sample's input files and record its quality format.

    After unwrapping with ``utils.to_single_data``, ``data["files"]`` is
    set from ``get_fastq_files(data)``.  When trimming is not enabled the
    quality format is marked as "standard".

    Returns ``[[data]]``.
    """
    data = utils.to_single_data(data)
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    resolved_files = get_fastq_files(data)
    data["files"] = resolved_files
    # Per the original note: get_fastq_files swaps quality scores over to
    # standard, unless trimming is requested.
    keeps_original_quality = dd.get_trim_reads(data)
    if not keeps_original_quality:
        data = dd.set_quality_format(data, "standard")
    result = [[data]]
    return result
Exemple #12
0
def _get_fastq_size(item, fastq_dir, fc_name):
    """Return the read length of the first record in the first FASTQ file.

    The file path is the first element returned by ``get_fastq_files``.

    Returns:
        ``len(record.seq)`` for the first FASTQ record, or 0 if the file
        has no records.
    """
    (fastq1, _) = get_fastq_files(fastq_dir, None, item, fc_name)
    with open(fastq1) as in_handle:
        records = SeqIO.parse(in_handle, "fastq")
        try:
            # Builtin next() instead of the Python 2-only .next() method.
            rec = next(records)
        except StopIteration:
            # No records in the file.
            size = 0
        else:
            size = len(rec.seq)
    return size
Exemple #13
0
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare the per-barcode FASTQ inputs of one lane.

    Steps, in order: locate the lane FASTQ files; when the first item's
    customized configuration enables ``filter_phix`` and the data is not
    marked ``demultiplexed``, screen them with ``remove_contaminants``;
    split by barcode; and trim reads when ``trim_reads`` is set.

    Returns a list of ``(fastq1, fastq2, item, lane name, lane description,
    dirs, config)`` tuples, one per item whose barcode has split files.
    """
    first_item = lane_items[0]
    lane_name = "%s_%s_%s" % (first_item['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(
        dirs["fastq"], dirs["work"], first_item, fc_name, config=config)

    # Filter phiX
    custom_config = _update_config_w_custom(config, first_item)
    if custom_config["algorithm"].get("filter_phix", False):
        if not custom_config["algorithm"].get("demultiplexed", False):
            logger.info("Filtering phiX from %s" % lane_name)
            info = {"genomes_filter_out": "spiked_phix",
                    "description": lane_name}
            processed = remove_contaminants(
                full_fastq1, full_fastq2, info, lane_name,
                info["description"], dirs, custom_config)
            # The screening step hands back new files and a new lane name.
            full_fastq1, full_fastq2, _, lane_name = processed[0][0:4]
        else:
            # Already demultiplexed input is not screened lane-wise;
            # screening is left to the per-sample settings.
            logger.warn("Will not filter phix lane-wise on already demultiplexed files. "
                "You will have to specify genomes_filter_out option for each sample")

    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)

    prepared = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Every barcode may be listed even when no sequences exist for it;
        # such items are skipped.
        if item["barcode_id"] not in bc_files:
            continue
        fastq1, fastq2 = bc_files[item["barcode_id"]]

        item_lane_name = lane_name
        description = item["description"]
        if item.get("name", "") and config["algorithm"].get("include_short_name", True):
            description = "%s : %s" % (item["name"], description)
        if item["barcode_id"] is not None:
            item_lane_name += "_%s" % (item["barcode_id"])

        if config["algorithm"].get("trim_reads", False):
            # Only files that are present go to the trimmer.
            to_trim = [fq for fq in (fastq1, fastq2) if fq is not None]
            trimmed = brun_trim_fastq(to_trim, dirs, config)
            fastq1 = trimmed[0]
            if fastq2 is not None:
                fastq2 = trimmed[1]

        prepared.append((fastq1, fastq2, item, item_lane_name, description,
                         dirs, config))

    return prepared
Exemple #14
0
def process_lane(item):
    """Resolve a lane's input files, downsampling them for test runs.

    When ``item["test_run"]`` is truthy the first file is downsampled to
    10000 reads: with ``bam.downsample`` if it is a BAM file, otherwise
    both files go through ``fastq.downsample``.  The file pair is stored as
    a tuple under ``item["files"]``.

    Returns ``[item]``.
    """
    max_reads = 10000
    lane_name = item["rgnames"]["lane"]
    logger.debug("Preparing %s" % lane_name)
    file1, file2 = get_fastq_files(item)
    is_test_run = item.get("test_run", False)
    if is_test_run and bam.is_bam(file1):
        file1 = bam.downsample(file1, item, max_reads)
    elif is_test_run:
        file1, file2 = fastq.downsample(file1, file2, item, max_reads,
                                        quick=True)
    item["files"] = (file1, file2)
    return [item]
Exemple #15
0
def _get_fastq_size(item, fastq_dir, fc_name):
    """Return the length of the first read in the item's first FASTQ file.

    The path is the first element returned by ``get_fastq_files`` (called
    with ``unpack=False``).  Files ending in ".gz" are read through
    ``gzip.GzipFile``.

    Returns:
        The sequence length of the first record, or 0 when the file holds
        no records.
    """
    (fastq1, _) = get_fastq_files(fastq_dir, None, item, fc_name, unpack=False)
    with open(fastq1) as in_handle:
        # NOTE(review): the handle is opened in the default (text) mode and
        # then wrapped by GzipFile, which is documented to expect a binary
        # file object -- confirm this path on Python 3.
        reader = in_handle
        if fastq1.endswith(".gz"):
            reader = gzip.GzipFile(fileobj=in_handle)
        try:
            # Builtin next() instead of the Python 2-only .next() method.
            rec = next(SeqIO.parse(reader, "fastq"))
            size = len(rec.seq)
        except StopIteration:
            size = 0
    return size
Exemple #16
0
def _get_fastq_size(item, fastq_dir, fc_name):
    """Return the read length of the first record in the first FASTQ file.

    ``get_fastq_files`` is called with ``unpack=False``; a path ending in
    ".gz" is read through ``gzip.GzipFile``.

    Returns:
        ``len(record.seq)`` for the first FASTQ record, or 0 if there are
        no records.
    """
    (fastq1, _) = get_fastq_files(fastq_dir, None, item, fc_name, unpack=False)
    with open(fastq1) as in_handle:
        try:
            # NOTE(review): in_handle is a text-mode handle; GzipFile is
            # documented to expect a binary file object -- confirm this
            # path on Python 3.
            if fastq1.endswith(".gz"):
                in_handle = gzip.GzipFile(fileobj=in_handle)

            # next(iterator) works on Python 2.6+ and 3; iterator.next()
            # is Python 2 only.
            rec = next(SeqIO.parse(in_handle, "fastq"))
            size = len(rec.seq)
        except StopIteration:
            # The file holds no records.
            size = 0
    return size
Exemple #17
0
def prepare_sample(data):
    """Resolve a sample's input files, downsampling them for test runs.

    When ``data["test_run"]`` is truthy the input is reduced to 10000
    reads: a BAM first file goes through ``bam.downsample`` (and the second
    file is dropped), anything else goes through ``fastq.downsample``.
    The file pair is stored as a list under ``data["files"]``.

    Returns ``[[data]]``.
    """
    max_reads = 10000
    sample_name = data["rgnames"]["sample"]
    logger.debug("Preparing %s" % sample_name)
    file1, file2 = get_fastq_files(data)
    is_test_run = data.get("test_run", False)
    if is_test_run and bam.is_bam(file1):
        file1, file2 = bam.downsample(file1, data, max_reads), None
    elif is_test_run:
        file1, file2 = fastq.downsample(file1, file2, data, max_reads,
                                        quick=True)
    data["files"] = [file1, file2]
    return [[data]]
Exemple #18
0
def process_lane(item):
    """Resolve a lane's input files, downsampling them for test runs.

    For a truthy ``item["test_run"]`` the input is cut to 10000 reads: a
    BAM first file is downsampled with ``bam.downsample`` and the second
    file set to ``None``; otherwise both files go through
    ``fastq.downsample``.  ``item["files"]`` receives the pair as a list.

    Returns ``[[item]]``.
    """
    read_limit = 10000
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    first, second = get_fastq_files(item)
    if item.get("test_run", False):
        if not bam.is_bam(first):
            first, second = fastq.downsample(first, second, item,
                                             read_limit, quick=True)
        else:
            first = bam.downsample(first, item, read_limit)
            second = None
    item["files"] = [first, second]
    return [[item]]
Exemple #19
0
def prepare_sample(data):
    """Record a sample's input files, reduced to 10000 reads on test runs.

    With a truthy ``data["test_run"]``, a BAM first file is downsampled by
    ``bam.downsample`` and the second file becomes ``None``; other inputs
    are downsampled together by ``fastq.downsample``.  The pair is stored
    as a list in ``data["files"]``.

    Returns ``[[data]]``.
    """
    read_limit = 10000
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    first, second = get_fastq_files(data)
    if data.get("test_run", False):
        if not bam.is_bam(first):
            first, second = fastq.downsample(first, second, data, read_limit,
                                             quick=True)
        else:
            first = bam.downsample(first, data, read_limit)
            second = None
    data["files"] = [first, second]
    return [[data]]
Exemple #20
0
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare the per-barcode FASTQ inputs of one lane.

    The lane FASTQ files are located from the first item, split by barcode
    and, for each item whose barcode has split files, passed through
    ``_prep_fastq_files``.  The item's ``rgnames["lane"]`` is extended in
    place with the barcode id and the lane extension.

    Returns a list of ``(fastq1, fastq2, item, dirs, config)`` tuples.
    """
    first_item = lane_items[0]
    full_fastq1, full_fastq2 = get_fastq_files(
        dirs["fastq"], dirs["work"], first_item, fc_name, dirs=dirs,
        config=config_utils.update_w_custom(config, first_item))
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                first_item["rgnames"]["lane"], dirs, config)

    prepared = []
    for item in lane_items:
        logger.debug("Preparing %s" % item["rgnames"]["lane"])
        config = config_utils.update_w_custom(config, item)
        # Every barcode may be listed even when no sequences exist for it;
        # such items are skipped.
        if item["barcode_id"] not in bc_files:
            continue
        fastq_parts = _prep_fastq_files(item, bc_files, dirs, config)
        for fastq1, fastq2, lane_ext in fastq_parts:
            # NOTE(review): the lane name is modified on the shared ``item``
            # for every yielded part, so suffixes pile up if more than one
            # part is produced -- confirm that is intended.
            if item["barcode_id"] is not None:
                item["rgnames"]["lane"] += "_%s" % (item["barcode_id"])
            if lane_ext is not None:
                item["rgnames"]["lane"] += "_s{0}".format(lane_ext)
            entry = (fastq1, fastq2, item, dirs, config)
            prepared.append(entry)
    return prepared
Exemple #21
0
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Prepare lanes, potentially splitting based on barcodes.

    The lane FASTQ files are located from the first item of ``lane_items``
    and split by barcode; every item whose barcode has split files yields
    one output entry.

    Returns:
        A list of ``(fastq1, fastq2, item, lane name, lane description,
        dirs, config)`` tuples.
    """
    lane_name = "%s_%s_%s" % (lane_items[0]["lane"], fc_date, fc_name)
    log.debug("Demulitplexing %s" % lane_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], lane_items[0],
                                               fc_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)
    out = []
    for item in lane_items:
        # NOTE(review): ``config`` is rebound per item, so updates build on
        # the previous item's result -- confirm this is intended.
        config = _update_config_w_custom(config, item)
        # Can specify all barcodes but might not have actual sequences
        # Would be nice to have a good way to check this is okay here.
        # ``in`` replaces dict.has_key, which no longer exists in Python 3.
        if item["barcode_id"] not in bc_files:
            continue
        fastq1, fastq2 = bc_files[item["barcode_id"]]
        cur_lane_name = lane_name
        cur_lane_desc = item["description"]
        if item.get("name", ""):
            cur_lane_desc = "%s : %s" % (item["name"], cur_lane_desc)
        if item["barcode_id"] is not None:
            cur_lane_name += "_%s" % (item["barcode_id"])
        out.append((fastq1, fastq2, item, cur_lane_name, cur_lane_desc,
                    dirs, config))
    return out
Exemple #22
0
def process_lane(item):
    """Attach the lane's input files to its item dictionary.

    Logs ``item["rgnames"]["lane"]``, stores the result of
    ``get_fastq_files(item)`` under ``item["files"]`` and returns
    ``[item]``.
    """
    lane_name = item["rgnames"]["lane"]
    logger.debug("Preparing %s" % lane_name)
    lane_files = get_fastq_files(item)
    item["files"] = lane_files
    return [item]
Exemple #23
0
def process_lane(lane_items, fc_name, fc_date, dirs, config):
    """Locate, optionally screen, split and trim the reads of one lane.

    phiX screening via ``remove_contaminants`` runs only when the first
    item's customized configuration sets ``filter_phix`` and does not set
    ``demultiplexed``.  Items whose barcode has no split files are left
    out of the result.

    Returns a list of ``(fastq1, fastq2, item, lane name, lane description,
    dirs, config)`` tuples.
    """
    lead_item = lane_items[0]
    lane_name = "%s_%s_%s" % (lead_item['lane'], fc_date, fc_name)
    full_fastq1, full_fastq2 = get_fastq_files(dirs["fastq"], dirs["work"],
                                               lead_item, fc_name,
                                               config=config)

    # Filter phiX
    custom_config = _update_config_w_custom(config, lead_item)
    wants_phix_filter = custom_config["algorithm"].get("filter_phix", False)
    if wants_phix_filter:
        already_split = custom_config["algorithm"].get("demultiplexed", False)
        if already_split:
            # Demultiplexed material is not screened lane-wise; screening
            # is left to the per-sample settings.
            logger.warn("Will not filter phix lane-wise on already demultiplexed files. "
                "You will have to specify genomes_filter_out option for each sample")
        else:
            logger.info("Filtering phiX from %s" % lane_name)
            info = {"genomes_filter_out": "spiked_phix",
                    "description": lane_name}
            processed = remove_contaminants(full_fastq1, full_fastq2, info,
                                            lane_name, info["description"],
                                            dirs, custom_config)
            first_result = processed[0]
            (full_fastq1, full_fastq2, _, lane_name) = first_result[0:4]

    logger.info("Demultiplexing %s" % lane_name)
    bc_files = split_by_barcode(full_fastq1, full_fastq2, lane_items,
                                lane_name, dirs, config)

    results = []
    for item in lane_items:
        config = _update_config_w_custom(config, item)
        # Every barcode may be listed even when no sequences exist for it.
        if item["barcode_id"] not in bc_files:
            continue

        fastq1, fastq2 = bc_files[item["barcode_id"]]
        item_lane_name = lane_name
        item_lane_desc = item["description"]
        if item.get("name", "") and config["algorithm"].get("include_short_name", True):
            item_lane_desc = "%s : %s" % (item["name"], item_lane_desc)

        if item["barcode_id"] is not None:
            item_lane_name += "_%s" % (item["barcode_id"])

        if config["algorithm"].get("trim_reads", False):
            # Only the files that are present are trimmed; results come
            # back in the same order.
            available = [fq for fq in [fastq1, fastq2] if fq is not None]
            trim_info = brun_trim_fastq(available, dirs, config)
            fastq1 = trim_info[0]
            if fastq2 is not None:
                fastq2 = trim_info[1]

        results.append((fastq1, fastq2, item, item_lane_name, item_lane_desc,
                        dirs, config))

    return results
Exemple #24
0
def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a fastq file into multiplex pieces using barcode details.

    Args:
        fastq1: first FASTQ file of the lane.
        fastq2: second FASTQ file, or a falsy value when there is none.
        multiplex: list of dictionaries, each with a "barcode_id".
        base_name: prefix used for the barcode directory, the split file
            names and the metrics file.
        dirs: mapping providing the "work" and "fastq" directories.
        config: configuration with an "algorithm" section and, when
            splitting is run, a "program" section naming the barcode tool.

    Returns:
        A dict mapping barcode id to ``(fastq1, fastq2)`` for every barcode
        whose first file exists on disk.  A single entry whose barcode id
        is ``None`` short-circuits to ``{None: (fastq1, fastq2)}``.
    """
    unmatched_str = "unmatched"
    demultiplexed = config["algorithm"].get("demultiplexed", False)
    if len(multiplex) == 1 and multiplex[0]["barcode_id"] is None:
        return {None: (fastq1, fastq2)}

    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    metrics_file = "%s.bc_metrics" % base_name

    def _barcode_fastq(barcode_id, read):
        # Path of the split file for one barcode and read number.
        return os.path.join(
            bc_dir, "%s_%s_%s_fastq.txt" % (base_name, barcode_id, read))

    out_files = []
    for info in multiplex:
        if demultiplexed:
            out_tuple = [info["barcode_id"]]
            # If the data is already demultiplexed, the sequence files must
            # have been specified in the config
            out_tuple.extend(
                get_fastq_files(dirs["fastq"],
                                dirs["work"],
                                info,
                                "",
                                config=config))
            out_files.append(tuple(out_tuple))
            continue

        bc_file1 = _barcode_fastq(info["barcode_id"], "1")
        bc_file2 = _barcode_fastq(info["barcode_id"], "2") if fastq2 else None
        out_files.append((info["barcode_id"], bc_file1, bc_file2))

    if not utils.file_exists(bc_dir) and not demultiplexed:
        # Run the external barcode splitter inside a transactional directory.
        with file_transaction(bc_dir) as tx_bc_dir:
            with utils.chdir(tx_bc_dir):
                tag_file, need_trim = _make_tag_file(multiplex, unmatched_str,
                                                     config)
                cl = [
                    config["program"]["barcode"], tag_file,
                    "%s_--b--_--r--_fastq.txt" % base_name, fastq1
                ]
                if fastq2:
                    cl.append(fastq2)

                cl.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
                cl.append("--metrics=%s" % metrics_file)
                if int(config["algorithm"]["bc_read"]) > 1:
                    cl.append("--read=%s" % config["algorithm"]["bc_read"])

                if int(config["algorithm"]["bc_position"]) == 5:
                    cl.append("--five")

                if config["algorithm"].get("bc_allow_indels", True) is False:
                    cl.append("--noindel")

                if "bc_offset" in config["algorithm"]:
                    cl.append("--bc_offset=%s" %
                              config["algorithm"]["bc_offset"])

                subprocess.check_call(cl)

    else:
        # No splitting needed; the tag file is only built (in a temporary
        # directory) to obtain the trimming information.
        with utils.curdir_tmpdir() as tmp_dir:
            with utils.chdir(tmp_dir):
                _, need_trim = _make_tag_file(multiplex, unmatched_str, config)

    out = {}
    for b, f1, f2 in out_files:
        if os.path.exists(f1):
            if b in need_trim:
                f1, f2 = _basic_trim(f1, f2, need_trim[b], config)

            out[b] = (f1, f2)

    if not demultiplexed:
        return out

    # Already demultiplexed data: write barcode metrics from the
    # Demultiplex_Stats.htm file, or placeholder values if it is missing.
    casava_stats = _find_demultiplex_stats_htm(base_name, config)
    if not casava_stats:
        logger2.warn("Demultiplex_Stats.htm not found! " \
                     "Barcode stats will be meaningless.")
        bc_metrics = {int(multiplex[0]["lane"]): \
                        {None: {
                             "read_count": 0,
                             "name": None,
                             "barcode_id": None}}
                             }
    else:
        bc_metrics = _parse_demultiplex_stats_htm(casava_stats)

    _write_demultiplex_metrics(multiplex, bc_metrics, metrics_file)

    return out
Exemple #25
0
def process_lane(item):
    """Resolve and record the input files for one lane.

    The lane name (``item["rgnames"]["lane"]``) is logged at debug level,
    ``item["files"]`` is set to whatever ``get_fastq_files(item)`` returns,
    and the updated dictionary is returned as ``[item]``.
    """
    logger.debug("Preparing %s" % item["rgnames"]["lane"])
    resolved = get_fastq_files(item)
    item["files"] = resolved
    result = [item]
    return result
Exemple #26
0
def split_by_barcode(fastq1, fastq2, multiplex, base_name, dirs, config):
    """Split a fastq file into multiplex pieces using barcode details.

    Args:
        fastq1: first FASTQ file of the lane.
        fastq2: second FASTQ file, or a falsy value when there is none.
        multiplex: list of dictionaries, each with a "barcode_id".
        base_name: prefix for the barcode directory, split file names and
            the metrics file.
        dirs: mapping providing the "work" and "fastq" directories.
        config: configuration with an "algorithm" section and, when
            splitting is run, a "program" section naming the barcode tool.

    Returns:
        A dict of barcode id -> ``(fastq1, fastq2)`` for each barcode whose
        first file exists on disk.  When ``multiplex`` has a single entry
        with barcode id ``None`` the inputs are returned unsplit as
        ``{None: (fastq1, fastq2)}``.
    """
    unmatched_str = "unmatched"
    demultiplexed = config["algorithm"].get("demultiplexed", False)
    if len(multiplex) == 1 and multiplex[0]["barcode_id"] is None:
        return {None: (fastq1, fastq2)}

    bc_dir = os.path.join(dirs["work"], "%s_barcode" % base_name)
    metrics_file = "%s.bc_metrics" % base_name
    out_files = []
    for info in multiplex:
        barcode_id = info["barcode_id"]
        if demultiplexed:
            # If the data is already demultiplexed, the sequence files must
            # have been specified in the config
            out_tuple = [barcode_id]
            out_tuple.extend(get_fastq_files(dirs["fastq"], dirs["work"],
                                             info, "", config=config))
            out_files.append(tuple(out_tuple))
            continue

        # Expected split file names for this barcode, one per read.
        bc_file1 = os.path.join(
            bc_dir, "%s_%s_%s_fastq.txt" % (base_name, barcode_id, "1"))
        bc_file2 = None
        if fastq2:
            bc_file2 = os.path.join(
                bc_dir, "%s_%s_%s_fastq.txt" % (base_name, barcode_id, "2"))
        out_files.append((barcode_id, bc_file1, bc_file2))

    if not utils.file_exists(bc_dir) and not demultiplexed:
        # Run the external barcode splitter inside a transactional directory.
        with file_transaction(bc_dir) as tx_bc_dir:
            with utils.chdir(tx_bc_dir):
                tag_file, need_trim = _make_tag_file(multiplex, unmatched_str,
                                                     config)
                cl = [config["program"]["barcode"], tag_file,
                      "%s_--b--_--r--_fastq.txt" % base_name, fastq1]
                if fastq2:
                    cl.append(fastq2)

                cl.append("--mismatch=%s" % config["algorithm"]["bc_mismatch"])
                cl.append("--metrics=%s" % metrics_file)
                if int(config["algorithm"]["bc_read"]) > 1:
                    cl.append("--read=%s" % config["algorithm"]["bc_read"])

                if int(config["algorithm"]["bc_position"]) == 5:
                    cl.append("--five")

                if config["algorithm"].get("bc_allow_indels", True) is False:
                    cl.append("--noindel")

                if "bc_offset" in config["algorithm"]:
                    cl.append("--bc_offset=%s" %
                              config["algorithm"]["bc_offset"])

                subprocess.check_call(cl)

    else:
        # No splitting needed; the tag file is only built (in a temporary
        # directory) to obtain the trimming information.
        with utils.curdir_tmpdir() as tmp_dir:
            with utils.chdir(tmp_dir):
                _, need_trim = _make_tag_file(multiplex, unmatched_str, config)

    out = {}
    for b, f1, f2 in out_files:
        if os.path.exists(f1):
            if b in need_trim:
                f1, f2 = _basic_trim(f1, f2, need_trim[b], config)

            out[b] = (f1, f2)

    if not demultiplexed:
        return out

    # Already demultiplexed data: write barcode metrics from the
    # Demultiplex_Stats.htm file, or placeholder values if it is missing.
    casava_stats = _find_demultiplex_stats_htm(base_name, config)
    if not casava_stats:
        logger2.warn("Demultiplex_Stats.htm not found! " \
                     "Barcode stats will be meaningless.")
        bc_metrics = {int(multiplex[0]["lane"]): \
                        {None: {
                             "read_count": 0,
                             "name": None,
                             "barcode_id": None}}
                             }
    else:
        bc_metrics = _parse_demultiplex_stats_htm(casava_stats)

    _write_demultiplex_metrics(multiplex, bc_metrics, metrics_file)

    return out