Example #1
0
def _trim_adapters(fastq_files, out_dir, data):
    """Trim adapter read-through from the given FASTQ files.

    For small insert sizes, the read length can be longer than the insert,
    resulting in the reverse complement of the 3' adapter being sequenced.
    This takes the adapter sequences and trims only the reverse complement
    of the adapter:

    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)

    The sequences to trim come from ``_get_sequences_to_trim``. Trimming is
    run with ``_fastp_trim`` when ``dd.get_trim_reads(data)`` is "fastp" and
    with ``_atropos_trim`` for any other value.

    Returns the trimmed output files; the report file produced by the
    trimming tool is not returned.
    """
    adapters = _get_sequences_to_trim(data["config"], SUPPORTED_ADAPTERS)
    use_fastp = dd.get_trim_reads(data) == "fastp"
    run_trimmer = _fastp_trim if use_fastp else _atropos_trim
    # Both trimmers hand back (output files, report file); only the files are used.
    trimmed_files, _report_file = run_trimmer(fastq_files, adapters, out_dir, data)
    return trimmed_files
Example #2
0
def get_fastq_files(data):
    """Retrieve the input files for the given lane, ready to process.

    Each entry of ``data["files"]`` is handled by kind:

    - ``.bam`` files are converted with ``convert_bam_to_fastq`` when
      ``_pipeline_needs_fastq`` says so, otherwise kept as the BAM itself.
      Either way the result replaces anything collected so far.
    - remote files (per ``objectstore.is_remote``) are kept untouched.
    - other files are passed through ``fastq.groom`` when trimming is off
      and the quality format is not "standard"; otherwise kept untouched.

    ``None`` entries are dropped, and the files are passed through
    ``_gzip_fastq`` unless ``data["reference"]`` has a "bowtie" key.

    Returns the list of prepared files. Asserts that ``data`` has a
    "files" key and that every non-remote result exists on disk.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = "bowtie" not in data["reference"].keys()

    ready_files = []
    for fname in data["files"]:
        if fname.endswith(".bam"):
            # BAM input overwrites the list rather than extending it.
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
            continue
        # Trimming does quality conversion, so an explicit conversion is only
        # needed for local files when not trimming and not already standard.
        keep_as_is = (objectstore.is_remote(fname)
                      or dd.get_trim_reads(data)
                      or dd.get_quality_format(data) == "standard")
        if keep_as_is:
            ready_files.append(fname)
            continue
        convert_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "fastq_convert"))
        ready_files.append(fastq.groom(fname, data, out_dir=convert_dir))

    ready_files = [candidate for candidate in ready_files if candidate is not None]
    if should_gzip:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(candidate, gzip_dir) for candidate in ready_files]
    for in_file in ready_files:
        if objectstore.is_remote(in_file):
            continue
        assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
Example #3
0
def get_fastq_files(data):
    """Gather the files listed in ``data["files"]`` in a processable form.

    BAM inputs are either converted through ``convert_bam_to_fastq`` (when
    ``_pipeline_needs_fastq`` is true) or kept as the BAM; in both cases
    they replace the files collected so far. Remote inputs are kept as
    given. Remaining inputs go through ``fastq.groom`` only when no trimming
    is configured and the quality format differs from "standard".

    Afterwards ``None`` entries are removed and, unless the reference has a
    "bowtie" entry, each file is passed through ``_gzip_fastq``.

    Returns the resulting list of files. Asserts that "files" is present in
    ``data`` and that each non-remote result exists.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"

    collected = []
    # Bowtie does not accept gzipped fastq
    should_gzip = not ("bowtie" in data["reference"].keys())

    for fname in data["files"]:
        if fname.endswith(".bam"):
            if not _pipeline_needs_fastq(data["config"], data):
                collected = [fname]
            else:
                collected = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                 data, data["dirs"], data["config"])
            continue
        if objectstore.is_remote(fname):
            collected.append(fname)
            continue
        # Trimming does quality conversion, so if not doing that, do an
        # explicit conversion.
        needs_groom = (not dd.get_trim_reads(data)
                       and dd.get_quality_format(data) != "standard")
        if not needs_groom:
            collected.append(fname)
            continue
        groom_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "fastq_convert"))
        groomed = fastq.groom(fname, data, out_dir=groom_dir)
        collected.append(groomed)

    collected = [item for item in collected if item is not None]
    if should_gzip:
        fastq_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        collected = [_gzip_fastq(item, fastq_dir) for item in collected]
    for in_file in collected:
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    return collected
Example #4
0
def prepare_sample(data):
    """Prepare a sample to be run.

    Reduces the input to a single sample with ``utils.to_single_data``,
    replaces its "files" entry with the result of ``get_fastq_files`` and,
    when no trimming is configured, marks the quality format as "standard".

    Returns the sample wrapped as ``[[data]]``.
    """
    sample = utils.to_single_data(data)
    sample_name = sample["rgnames"]["sample"]
    logger.debug("Preparing %s" % sample_name)
    sample["files"] = get_fastq_files(sample)
    if dd.get_trim_reads(sample):
        return [[sample]]
    # get_fastq_files swaps over quality scores to standard, unless trimming
    sample = dd.set_quality_format(sample, "standard")
    return [[sample]]
Example #5
0
def prepare_sample(data):
    """Get a single sample ready for processing.

    The input is collapsed with ``utils.to_single_data`` and its "files"
    entry is set to what ``get_fastq_files`` returns. If trimming is not
    configured, the quality format is recorded as "standard".

    Returns ``[[data]]`` with the updated sample.
    """
    prepared = utils.to_single_data(data)
    logger.debug("Preparing %s" % prepared["rgnames"]["sample"])
    ready_files = get_fastq_files(prepared)
    prepared["files"] = ready_files
    trimming_enabled = bool(dd.get_trim_reads(prepared))
    # get_fastq_files swaps over quality scores to standard, unless trimming
    if not trimming_enabled:
        prepared = dd.set_quality_format(prepared, "standard")
    return [[prepared]]
Example #6
0
def trim_sample(data):
    """Trim a sample's reads with the configured trimming method.

    When ``dd.get_trim_reads`` is falsy, trimming is skipped and the sample
    is returned unchanged. Otherwise ``skewer.trim_adapters`` is used if
    "skewer" is in ``dd.get_tools_on(data)`` or is the trim_reads value, and
    ``trim.trim_adapters`` in all other cases; the trimmed files replace
    ``data["files"]``.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = utils.to_single_data(data)
    trim_reads = dd.get_trim_reads(data)
    if not trim_reads:
        logger.info("Skipping trimming of %s." % dd.get_sample_name(data))
        return [[data]]
    wants_skewer = "skewer" in dd.get_tools_on(data) or trim_reads == "skewer"
    run_trimming = skewer.trim_adapters if wants_skewer else trim.trim_adapters
    trimmed_files = run_trimming(data)
    data["files"] = trimmed_files
    return [[data]]
Example #7
0
def trim_sample(data):
    """Run adapter trimming on a sample if trimming is configured.

    With a falsy ``dd.get_trim_reads`` value, an info message is logged and
    nothing is trimmed. Otherwise skewer is used when it is listed in
    ``dd.get_tools_on(data)`` or named as the trim_reads value, and the
    ``trim`` module's ``trim_adapters`` is used for everything else. The
    files it returns are stored under ``data["files"]``.

    Returns ``[[data]]``.
    """
    sample = utils.to_single_data(data)
    method = dd.get_trim_reads(sample)
    if method:
        if "skewer" in dd.get_tools_on(sample) or method == "skewer":
            trimmer = skewer.trim_adapters
        else:
            trimmer = trim.trim_adapters
        sample["files"] = trimmer(sample)
    else:
        logger.info("Skipping trimming of %s." % dd.get_sample_name(sample))
    return [[sample]]
Example #8
0
def _trim_adapters(fastq_files, out_dir, data):
    """Remove adapter read-through from FASTQ files.

    For small insert sizes, the read length can be longer than the insert,
    so the reverse complement of the 3' adapter gets sequenced. Only that
    reverse complement of the adapter is trimmed:

    MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)

    The adapter sequences are looked up with ``_get_sequences_to_trim``.
    ``_fastp_trim`` does the work when the trim_reads setting is "fastp";
    ``_atropos_trim`` handles every other setting.

    Returns the trimmed files only; the accompanying report file is dropped.
    """
    to_trim = _get_sequences_to_trim(data["config"], SUPPORTED_ADAPTERS)
    if dd.get_trim_reads(data) == "fastp":
        trimmed, _report = _fastp_trim(fastq_files, to_trim, out_dir, data)
        return trimmed
    trimmed, _report = _atropos_trim(fastq_files, to_trim, out_dir, data)
    return trimmed