def _trim_adapters(fastq_files, out_dir, data):
    """Trim the reverse complement of 3' adapters from reads.

    For small insert sizes, the read length can be longer than the insert,
    resulting in the reverse complement of the 3' adapter being sequenced.
    This takes adapter sequences and trims only the reverse complement of
    the adapter:

        MYSEQUENCEAAAARETPADA -> MYSEQUENCEAAAA (no polyA trim)

    Returns the list of trimmed fastq files; the trimmer's report file is
    discarded here.
    """
    to_trim = _get_sequences_to_trim(data["config"], SUPPORTED_ADAPTERS)
    # Dispatch on the configured trimming tool; fastp and atropos share the
    # same (files, report) return shape.
    if dd.get_trim_reads(data) == "fastp":
        out_files, report_file = _fastp_trim(fastq_files, to_trim, out_dir, data)
    else:
        out_files, report_file = _atropos_trim(fastq_files, to_trim, out_dir, data)
    # NOTE: removed a large block of commented-out legacy cutadapt code that
    # was superseded by the fastp/atropos paths above.
    return out_files
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.

    Converts BAM inputs to fastq when the pipeline needs fastq, grooms
    quality scores to standard encoding when trimming will not do it, and
    gzips local outputs unless the aligner cannot read gzipped fastq.
    Remote (objectstore) files are passed through untouched.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    ready_files = []
    # Bowtie does not accept gzipped fastq
    # (idiom fix: membership test on the dict directly instead of `.keys()`)
    should_gzip = "bowtie" not in data["reference"]
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        elif objectstore.is_remote(fname):
            ready_files.append(fname)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not dd.get_trim_reads(data) and dd.get_quality_format(data) != "standard":
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(fname, data, out_dir=out_dir))
        else:
            ready_files.append(fname)
    # Drop any conversion steps that produced nothing.
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(x, out_dir) for x in ready_files]
    # Sanity check: every local file we are about to hand downstream must exist.
    for in_file in ready_files:
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
def prepare_sample(data):
    """Prepare a sample to be run, potentially converting from BAM to FASTQ
    and/or downsampling the number of reads for a test run.

    Returns the sample wrapped in the [[data]] nesting expected by the
    pipeline's parallel runner.
    """
    # BUGFIX: this function was defined twice in the file with identical
    # bodies; the duplicate silently shadowed the first definition and has
    # been removed.
    data = utils.to_single_data(data)
    logger.debug("Preparing %s" % data["rgnames"]["sample"])
    data["files"] = get_fastq_files(data)
    # get_fastq_files swaps over quality scores to standard, unless trimming
    if not dd.get_trim_reads(data):
        data = dd.set_quality_format(data, "standard")
    return [[data]]
def trim_sample(data):
    """Trim from a sample with the provided trimming method.
    Support methods: read_through.

    When no trimming method is configured the sample passes through
    unchanged (this keeps legacy configuration files working).
    """
    data = utils.to_single_data(data)
    method = dd.get_trim_reads(data)
    if method:
        # Choose the trimmer implementation: skewer when explicitly requested
        # via tools_on or as the trim method, otherwise the default trimmer.
        use_skewer = "skewer" in dd.get_tools_on(data) or method == "skewer"
        trimmer = skewer.trim_adapters if use_skewer else trim.trim_adapters
        data["files"] = trimmer(data)
    else:
        logger.info("Skipping trimming of %s." % dd.get_sample_name(data))
    return [[data]]