def clean_reads(self, dataPath, name, sampleType):
    """Trim adapter sequences from the extracted reads and store the cleaned reads.

    Cutadapt is run to trim the adapter sequences from the sequence reads to
    remove any 'noise' from the assembly process. The cleaned reads output
    from cutadapt are then reprocessed to determine if the softclipped
    sequences were trimmed off or not to further filter out reads. The
    softclipped sequences that remain are stored and a new fastq file is
    written.

    Args:
        dataPath (str): The path to the data files for this target.
        name (str): The target name.
        sampleType (str): A string indicating a tumor ('sv') or normal
            ('norm') sample being processed.

    Returns:
        check: The result of ``self.continue_analysis_check(sampleType)``,
            indicating whether there are any reads left after cleaning is
            complete.
    """
    cutadapt = self.params.get_param('cutadapt')  # Cutadapt binary
    cutadaptConfigFn = self.params.get_param('cutadapt_config_file')
    utils.log(self.loggingName, 'info',
              'Cleaning reads using %s with configuration file %s' % (cutadapt, cutadaptConfigFn))

    # The same key addresses the cleaned fastq path throughout this method.
    cleanedKey = '%s_cleaned_fq' % sampleType
    self.files[cleanedKey] = os.path.join(
        dataPath, name + '_%s_reads_cleaned.fastq' % sampleType)
    utils.log(self.loggingName, 'info',
              'Writing clean reads to %s' % self.files[cleanedKey])

    # The values returned by run_cutadapt were never used by this method,
    # so they are intentionally not bound to local names.
    utils.run_cutadapt(cutadapt, cutadaptConfigFn,
                       self.files['%s_fq' % sampleType],
                       self.files[cleanedKey],
                       self.loggingName)

    # Prepare the per-sample storage before it is assigned below.
    self.setup_cleaned_reads(sampleType)

    # get_fastq_reads hands back the fastq path to use from now on together
    # with the cleaned read records for this sample.
    cleanedFq, cleanedRecs = utils.get_fastq_reads(
        self.files[cleanedKey], self.get_sv_reads(sampleType))
    self.files[cleanedKey] = cleanedFq
    self.cleaned_read_recs[sampleType] = cleanedRecs

    # The extracted reads are no longer needed once the cleaned set exists.
    self.clear_sv_reads(sampleType)

    check = self.continue_analysis_check(sampleType)
    utils.log(self.loggingName, 'info', 'Clean reads exist %s' % check)
    return check
def clean_reads(self, sample_type):
    """Run cutadapt on the extracted reads and load the reads that remain.

    The cutadapt script is executed with the current Python interpreter
    through a shell command whose standard output is redirected into a
    ``<name>_<sample_type>_reads_cleaned.fastq`` file in the data directory.
    The resulting file is then passed to ``utils.get_fastq_reads`` together
    with the extracted reads for this sample; the fastq path and the cleaned
    read records it returns are stored on the instance, and the extracted
    reads for the sample are released.

    Args:
        sample_type (str): Key identifying which sample's reads to clean;
            it is used to build the ``'<sample_type>_fq'`` and
            ``'<sample_type>_cleaned_fq'`` entries of ``self.files``.

    Returns:
        bool: True if at least one cleaned read record remains, else False.
    """
    # Run cleaning program
    cutadapt = self.params.get_param('cutadapt')
    cutadapt_config = self.params.get_param('cutadapt_config_file')
    utils.log(self.logging_name, 'info',
              'Cleaning reads using %s with configuration file %s' % (cutadapt, cutadapt_config))

    cleaned_key = '%s_cleaned_fq' % sample_type
    self.files[cleaned_key] = os.path.join(
        self.paths['data'], self.name + "_%s_reads_cleaned.fastq" % sample_type)
    utils.log(self.logging_name, 'info',
              'Writing clean reads to %s' % self.files[cleaned_key])

    cutadapt_parameters = utils.stringify(cutadapt_config)
    # NOTE(review): the command is a shell string (needed for the '>'
    # redirect) and none of the paths are quoted, so paths containing
    # spaces or shell metacharacters will break or be misinterpreted.
    cutadapt_cmd = '%s %s %s %s > %s' % (sys.executable,
                                         cutadapt,
                                         cutadapt_parameters,
                                         self.files['%s_fq' % sample_type],
                                         self.files[cleaned_key])
    utils.log(self.logging_name, 'debug', 'Cutadapt system command %s' % cutadapt_cmd)
    cutadapt_proc = subprocess.Popen(cutadapt_cmd,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     shell=True)
    output, errors = cutadapt_proc.communicate()
    utils.log(self.logging_name, 'debug', 'Clean reads output %s' % output)
    utils.log(self.logging_name, 'debug', 'Clean reads errors %s' % errors)
    # Previously the exit status was dropped; surface failures in the log
    # without changing control flow for callers.
    if cutadapt_proc.returncode != 0:
        utils.log(self.logging_name, 'info',
                  'Cutadapt exited with non-zero status %s' % cutadapt_proc.returncode)

    # Use these for pulling out reads after finding sample-only kmers.
    # Filter the cleaned reads to make sure soft clips were not adapters, re-write fastq
    if not self.cleaned_read_recs:
        self.cleaned_read_recs = {}
    # Placeholder so the key exists even if reading the fastq fails below.
    self.cleaned_read_recs[sample_type] = None
    cleaned_fq, cleaned_recs = utils.get_fastq_reads(self.files[cleaned_key],
                                                     self.sv_reads[sample_type])
    self.files[cleaned_key] = cleaned_fq
    self.cleaned_read_recs[sample_type] = cleaned_recs
    # Release the extracted reads; only the cleaned records are kept.
    self.sv_reads[sample_type] = None

    check = len(self.cleaned_read_recs[sample_type]) != 0
    utils.log(self.logging_name, 'info', 'Check there are cleaned reads %r' % check)
    return check
def clean_reads(self, dataPath, name, sampleType):
    """Trim adapter sequences from the extracted reads, then format and organize
    the cleaned reads into new files.

    Cutadapt is run to trim the adapter sequences from the sequence reads to
    remove any 'noise' from the assembly process. The cleaned reads output
    from cutadapt are then reprocessed to determine if the softclipped
    sequences were trimmed off or not to further filter out reads. The
    softclipped sequences that remain are stored and a new fastq file is
    written.

    Args:
        dataPath (str): The path to the data files for this target.
        name (str): The target name.
        sampleType (str): A string indicating a tumor ('sv') or normal
            ('norm') sample being processed.

    Returns:
        check: Whatever ``self.continue_analysis_check(sampleType)`` reports,
            i.e. whether there are any reads left after cleaning is complete.
    """
    cutadapt = self.params.get_param('cutadapt')  # Cutadapt binary
    cutadaptConfigFn = self.params.get_param('cutadapt_config_file')
    utils.log(
        self.loggingName,
        'info',
        'Cleaning reads using %s with configuration file %s' % (cutadapt, cutadaptConfigFn),
    )

    # Build the self.files key for the cleaned output once and reuse it.
    cleanedFqKey = '%s_cleaned_fq' % sampleType
    cleanedFqName = name + '_%s_reads_cleaned.fastq' % sampleType
    self.files[cleanedFqKey] = os.path.join(dataPath, cleanedFqName)
    utils.log(
        self.loggingName,
        'info',
        'Writing clean reads to %s' % self.files[cleanedFqKey],
    )

    # The return value of run_cutadapt was previously unpacked into unused
    # locals; nothing in this method consumes it, so it is discarded.
    utils.run_cutadapt(
        cutadapt,
        cutadaptConfigFn,
        self.files['%s_fq' % sampleType],
        self.files[cleanedFqKey],
        self.loggingName,
    )

    self.setup_cleaned_reads(sampleType)

    # Re-read the cutadapt output against the extracted reads; the helper
    # returns the fastq path to keep and the cleaned read records.
    fastqPath, readRecords = utils.get_fastq_reads(
        self.files[cleanedFqKey],
        self.get_sv_reads(sampleType),
    )
    self.files[cleanedFqKey] = fastqPath
    self.cleaned_read_recs[sampleType] = readRecords

    # Drop the extracted reads now that the cleaned records are stored.
    self.clear_sv_reads(sampleType)

    check = self.continue_analysis_check(sampleType)
    utils.log(self.loggingName, 'info', 'Clean reads exist %s' % check)
    return check