class rDnaPipeline( object ):
    """
    A tool for running a community analysis pipeline on PacBio data.

    Every analysis step follows the same resume-friendly pattern:
    process_setup() names the expected output file(s), output_files_exist()
    lets an already-finished step be skipped on a re-run, the external tool
    is executed, and process_cleanup() verifies the output was produced.
    """

    def __init__(self):
        # parse_args() is presumably expected to populate a module-level
        # 'args' namespace -- TODO confirm; its attributes become the
        # instance's settings via __dict__.update().
        parse_args()
        self.__dict__.update( vars(args) )
        self.validate_settings()
        self.initialize_output()
        # Logging can only start after initialize_output(), which sets
        # self.log_file inside the (new) output directory.
        if self.debug:
            initialize_logger( self.log_file, logging.DEBUG )
        else:
            initialize_logger( self.log_file, logging.INFO )

    def validate_settings(self):
        """Infer the input data type from the file extension and
        normalize the run options accordingly."""
        # Validate the input file by extension
        root, ext = split_root_from_ext( self.input_file )
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
            # FASTA input carries no quality values, so the quality-based
            # masking and consensus steps cannot be run
            self.enable_masking = False
            self.enable_consensus = False
        else:
            raise TypeError('Sequence file must be a bas.h5 file, a ' + \
                            'fasta file, or a fofn of multiple such files')
        # If Clustering was disabled, also disable the consensus process
        if not self.enable_clustering:
            self.enable_consensus = False
        # If Consensus is enabled, initialize the appropriate tool
        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Search for the Mothur executable, and set the Mothur process counter
        self.mothur = validate_executable( self.mothur )
        self.processCount = 0

    def initialize_output(self):
        """Create the output directory, link the input data into it,
        chdir into it, and set up the log file locations."""
        # Create the Output directory
        create_directory( self.output_dir )
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename( self.input_file )
        symlinkPath = os.path.join( self.output_dir, baseName )
        if not os.path.exists( symlinkPath ):
            absPath = os.path.abspath( self.input_file )
            os.symlink( absPath, symlinkPath )
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir( self.output_dir )
        create_directory( 'log' )
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner( self.mothur, self.nproc,
                                     stdoutLog, stderrLog )

    def getProcessLogFile(self, process, isMothurProcess=False):
        """Return the log-file path for the given process at the current
        process counter value."""
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount,
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def process_setup(self, inputFile, processName, suffix=None,
                      suffixList=None):
        """
        Bump the process counter and return the expected output file
        (when 'suffix' is given) or the list of expected output files
        (when 'suffixList' is given) for the next pipeline step.
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            return get_output_name(inputFile, suffix)
        elif suffixList:
            return [get_output_name( inputFile, s ) for s in suffixList]

    def output_files_exist( self, outputFile=None, outputList=None ):
        """Return True if the expected output(s) already exist, allowing
        the associated step to be skipped on a resumed run."""
        if outputFile:
            if file_exists( outputFile ):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False
        elif outputList:
            if all_files_exist( outputList ):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False

    def checkOutputFile( self, outputFile ):
        """Raise an IOError if an expected output file was not created."""
        if file_exists( outputFile ):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            # Log at ERROR level (was log.info) -- we are about to raise
            log.error( msg )
            raise IOError( msg )

    def process_cleanup(self, outputFile=None, outputList=None):
        """
        Log if the process successfully created its output, and raise an
        error message if not
        """
        if outputFile:
            self.checkOutputFile( outputFile )
        elif outputList:
            for outputFile in outputList:
                self.checkOutputFile( outputFile )
        log.info('All expected output files found - process successful!\n')

    def write_dummy_file(self, dummyFile):
        """Write a placeholder file used to mark a step as finished."""
        with open(dummyFile, 'w') as handle:
            handle.write('DONE')
        return dummyFile

    def extract_raw_ccs(self, inputFile):
        """Extract CCS reads from a bas.h5 file (or fofn) as FASTQ."""
        outputFile = self.process_setup( inputFile, 'extractCcsFromBasH5',
                                         suffix='fastq' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        elif file_has_ccs( inputFile ):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error( msg )
            raise ValueError( msg )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def filter_fastq(self, fastqFile):
        """Filter low-quality reads out of a FASTQ file."""
        outputFile = self.process_setup( fastqFile, 'FilterQuality',
                                         suffix='filter.fastq' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        quality_filter( fastqFile, outputFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def separate_fastq(self, fastqFile):
        """Split a FASTQ file into FASTA and QUAL files via Mothur."""
        outputList = self.process_setup( fastqFile, 'Fastq.Info',
                                         suffixList=['fasta', 'qual'] )
        if self.output_files_exist( outputList=outputList ):
            return outputList
        mothurArgs = {'fastq': fastqFile, 'fasta': 'T', 'qfile': 'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup( outputList=outputList )
        return outputList

    def align_sequences(self, fastaFile):
        """Align sequences against the reference MSA via Mothur."""
        outputFile = self.process_setup( fastaFile, 'Align.Seqs',
                                         suffix='align' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta': fastaFile,
                      'reference': self.alignment_reference,
                      'flip': 't'}
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def screen_sequences(self, alignFile, start=None, end=None,
                         min_length=None):
        """Screen out sequences falling outside the allowed alignment
        window or below the minimum length."""
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        else:
            # Was a silent NameError on 'outputExt'; fail explicitly
            raise ValueError('Expected ".align" or ".fasta": "%s"' % alignFile)
        outputFile = self.process_setup( alignFile, 'Screen.Seqs',
                                         suffix=outputExt )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta': alignFile,
                      'start': start,
                      'end': end,
                      'minlength': min_length}
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def summarize_sequences(self, fastaFile):
        """Generate an alignment-position summary via Mothur."""
        outputFile = self.process_setup( fastaFile, 'Summary.Seqs',
                                         suffix='summary' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta': fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def parse_summary_file(self, summaryFile):
        """Parse the summary file and return the (maxStart, minEnd)
        alignment positions allowed for full-length sequences."""
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        """Identify chimeric reads with UCHIME; returns an accnos file
        of the chimeric sequence ids."""
        outputFile = self.process_setup( alignFile, 'UCHIME',
                                         suffix='uchime.accnos' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta': alignFile,
                      'reference': self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        """Remove the sequences listed in 'idFile' from the alignment."""
        outputFile = self.process_setup( alignFile, 'Remove.Seqs',
                                         suffix='pick.align' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'accnos': idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def filter_sequences(self, alignFile):
        """Drop all-gap (vertical) columns from the alignment."""
        outputFile = self.process_setup( alignFile, 'Filter.Seqs',
                                         suffix='filter.fasta' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'vertical': 'T'}
        logFile = self.getProcessLogFile( 'filter.seqs', True )
        self.factory.runJob( 'filter.seqs', mothurArgs, logFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        """Combine the FASTQ quality data with the alignment into an
        aligned FASTQ file."""
        outputFile = self.process_setup( alignFile, 'QualityAligner',
                                         suffix='fastq' )
        # BUGFIX: previously referenced an undefined name 'output' here,
        # which raised NameError whenever masking was enabled.
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        aligner = QualityAligner( fastqFile, alignFile, outputFile )
        aligner.run()
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        """Mask low-quality bases (below self.minQv) in the FASTQ."""
        outputFile = self.process_setup( fastqFile, 'QualityMasker',
                                         suffix='masked.fastq' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def unique_sequences( self, alignFile ):
        """Collapse identical sequences; returns [uniqueFile, nameFile]."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        else:
            # Was a silent NameError on 'outputSuffixes'; fail explicitly
            raise ValueError('Expected ".align" or ".fasta": "%s"' % alignFile)
        outputList = self.process_setup( alignFile, 'Unique.Seqs',
                                         suffixList=outputSuffixes )
        if self.output_files_exist( outputList=outputList ):
            return outputList
        mothurArgs = {'fasta': alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup( outputList=outputList )
        return outputList

    def precluster_sequences( self, alignFile, nameFile ):
        """Merge near-identical sequences (within precluster_diffs);
        returns [preclusteredFile, nameFile]."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        else:
            # Was a silent NameError on 'outputSuffixes'; fail explicitly
            raise ValueError('Expected ".align" or ".fasta": "%s"' % alignFile)
        outputList = self.process_setup( alignFile, 'Pre.Cluster',
                                         suffixList=outputSuffixes )
        if self.output_files_exist( outputList=outputList ):
            return outputList
        mothurArgs = { 'fasta': alignFile,
                       'name': nameFile,
                       'diffs': self.precluster_diffs }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup( outputList=outputList )
        return outputList

    def calculate_distance_matrix( self, alignFile ):
        """Compute a lower-triangle PHYLIP distance matrix via Mothur."""
        outputFile = self.process_setup( alignFile, 'Dist.Seqs',
                                         suffix='phylip.dist' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = { 'fasta': alignFile,
                       'calc': 'onegap',
                       'countends': 'F',
                       'output': 'lt' }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def cluster_sequences(self, distanceMatrix):
        """Cluster sequences from the distance matrix using the
        configured clustering method."""
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        else:
            # Was a silent NameError on 'outputSuffix'; fail explicitly
            raise ValueError('Unrecognized clustering method: "%s"' %
                             self.clusteringMethod)
        outputFile = self.process_setup( distanceMatrix, 'Cluster',
                                         suffix=outputSuffix )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'phylip': distanceMatrix,
                      'method': self.clusteringMethod}
        logFile = self.getProcessLogFile( 'cluster', True )
        self.factory.runJob( 'cluster', mothurArgs, logFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile):
        """Split the clustered reads into per-cluster sequence files."""
        outputFile = self.process_setup( listFile, 'ClusterSeparator',
                                         suffix='list.clusters')
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        separator = ClusterSeparator( listFile, sequenceFile, outputFile,
                                      self.distance, self.min_cluster_size )
        separator()
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def generate_consensus_sequences(self, clusterListFile):
        """Run the consensus tool on each cluster and write a TSV of
        (reference, consensus) file pairs."""
        outputFile = self.process_setup( clusterListFile,
                                         'ClusterResequencer',
                                         suffix='consensus')
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        consensusFiles = []
        with open( clusterListFile ) as handle:
            for line in handle:
                sequenceFile, referenceFile, count = line.strip().split()
                if referenceFile.endswith('None'):
                    # NOTE(review): the first element is 'sequenceFile' here
                    # but 'referenceFile' in the other branch -- looks
                    # intentional (no reference exists) but confirm.
                    consensusFiles.append( (sequenceFile, 'None') )
                else:
                    # (removed unused local 'root_name')
                    consensus = self.consensusTool( sequenceFile,
                                                    referenceFile )
                    consensusFiles.append( (referenceFile, consensus) )
        with open( outputFile, 'w' ) as handle:
            for filenamePair in consensusFiles:
                handle.write('%s\t%s\n' % filenamePair)
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def cleanup_consensus_folder( self, consensusFile ):
        """Delete intermediate resequencing files from the 'reseq' dir
        and write a dummy marker file for the resume logic."""
        outputFile = self.process_setup( consensusFile, 'ConsensusCleanup',
                                         suffix='consensus.cleanup' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        reseqPath = os.path.join( os.getcwd(), 'reseq' )
        for filename in os.listdir( reseqPath ):
            filePath = os.path.join( reseqPath, filename )
            # Remove intermediate alignment inputs and outputs
            if filePath.endswith(('_input.fa', '_input.fa.aln',
                                  '_input.fa.aln_unsorted')):
                os.remove( filePath )
        self.write_dummy_file( outputFile )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def select_final_sequences( self, consensusFile ):
        """For each cluster pick the consensus (when it is a single
        sequence) or fall back to the reference, and list the picks."""
        outputFile = self.process_setup( consensusFile,
                                         'SequenceSelector',
                                         suffix='consensus.selected' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        selectedFiles = []
        with open( consensusFile ) as handle:
            for line in handle:
                # renamed local to avoid shadowing the 'consensusFile' arg
                referenceFile, consensus = line.strip().split()
                if consensus.endswith('None'):
                    pass
                elif fasta_count( consensus ) == 1:
                    selectedFiles.append( consensus )
                else:
                    selectedFiles.append( referenceFile )
        with open( outputFile, 'w' ) as handle:
            for filename in selectedFiles:
                handle.write(filename + '\n')
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def output_final_sequences( self, finalSequenceList ):
        """Copy all of the selected sequences into one final FASTA."""
        outputFile = self.process_setup( finalSequenceList,
                                         'SequenceWriter',
                                         suffix='fasta' )
        if self.output_files_exist( outputFile=outputFile ):
            return outputFile
        with FastaWriter( outputFile ) as writer:
            with open( finalSequenceList ) as handle:
                for line in handle:
                    copy_fasta_sequences( line.strip(), writer )
        self.process_cleanup( outputFile=outputFile )
        return outputFile

    def run(self):
        """Execute the full pipeline on the configured input file."""
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs( self.sequenceFile )
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile
        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq( fastqFile )
            fastaFile, qualFile = self.separate_fastq( filteredFastq )
        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences( fastaFile )
        summaryFile = self.summarize_sequences( alignedFile )
        maxStart, minEnd = self.parse_summary_file( summaryFile )
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)
        # Identify and remove chimeric reads
        chimeraIds = self.find_chimeras( screenedFile )
        noChimeraFile = self.remove_sequences( screenedFile, chimeraIds )
        # Filter out un-used columns to speed up re-alignment and clustering
        filteredFile = self.filter_sequences( noChimeraFile )
        # If masking is enabled, create an aligned FASTQ, mask the
        # low-quality bases and remove over-masked reads
        if self.enable_masking:
            alignedFastqFile = self.add_quality_to_alignment( fastqFile,
                                                              filteredFile )
            maskedFastq = self.mask_fastq_sequences( alignedFastqFile )
            # NOTE(review): convert_fastq_to_fasta is not defined on this
            # class in this file -- the masking path will fail; confirm it
            # exists elsewhere or is inherited.
            maskedFasta = self.convert_fastq_to_fasta( maskedFastq )
            screenedFasta = self.screen_sequences( maskedFasta,
                                                   min_length=self.min_length )
            fileForClustering = screenedFasta
        # Otherwise if masking is disabled, we'll use unique-ify and
        # pre-cluster our sequences
        else:
            uniqueFile, nameFile = self.unique_sequences( filteredFile )
            preclusteredFile, nameFile = self.precluster_sequences( uniqueFile,
                                                                    nameFile )
            fileForClustering = preclusteredFile
        # If enabled, calculate sequence distances and cluster
        if self.enable_clustering:
            distanceMatrix = self.calculate_distance_matrix( fileForClustering )
            listFile = self.cluster_sequences( distanceMatrix )
        # If enabled, generate a consensus for each cluster from above
        if self.enable_consensus:
            clusterListFile = self.separate_cluster_sequences( listFile,
                                                               fastqFile )
            consensusFile = self.generate_consensus_sequences( clusterListFile )
            self.cleanup_consensus_folder( consensusFile )
            selectedFile = self.select_final_sequences( consensusFile )
            finalFile = self.output_final_sequences( selectedFile )
class rDnaPipeline(object):
    """
    A tool for running a community analysis pipeline on PacBio data,
    with optional iterative clustering at a series of distances.

    Every analysis step follows the same resume-friendly pattern:
    process_setup() names the expected output file(s), output_files_exist()
    lets an already-finished step be skipped on a re-run, the external tool
    is executed, and process_cleanup() verifies the output was produced.
    """

    def __init__(self):
        # parse_args() is presumably expected to populate a module-level
        # 'args' namespace -- TODO confirm; its attributes become the
        # instance's settings via __dict__.update().
        parse_args()
        self.__dict__.update(vars(args))
        self.validate_settings()
        self.initialize_output()
        # Logging can only start after initialize_output(), which sets
        # self.log_file inside the (new) output directory.
        initialize_logger(log, log_file=self.log_file, debug=self.debug)

    def validate_settings(self):
        """Infer the input data type from the file extension and
        normalize the run options accordingly."""
        # Validate the input file by extension
        root, ext = split_root_from_ext(self.input_file)
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
        else:
            raise TypeError('Sequence file must be a bas.h5 file, a ' + \
                            'fasta file, or a fofn of multiple such files')
        # Pre-compute the sequence of clustering distances to iterate over
        self.step_list = self.calculate_steps()
        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Search for the Mothur executable, and set the Mothur process counter
        self.mothur = validate_executable(self.mothur)
        self.processCount = 0

    def initialize_output(self):
        """Create the output directory, link the input data into it,
        chdir into it, and set up the log file locations."""
        # Create the Output directory
        create_directory(self.output_dir)
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename(self.input_file)
        symlinkPath = os.path.join(self.output_dir, baseName)
        if not os.path.exists(symlinkPath):
            absPath = os.path.abspath(self.input_file)
            os.symlink(absPath, symlinkPath)
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir(self.output_dir)
        create_directory('log')
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner(self.mothur, self.nproc,
                                    stdoutLog, stderrLog)

    def calculate_steps(self):
        """Return the list of clustering distances to iterate over,
        always ending at the final target distance."""
        if self.enable_iteration:
            # Intermediate distances are whole multiples of self.step
            # strictly below self.distance
            count = int(self.distance / self.step) - 1
            step_list = [i * self.step for i in range(1, count + 1)]
        else:
            step_list = []
        return step_list + [self.distance]

    def getProcessLogFile(self, process, isMothurProcess=False):
        """Return the log-file path for the given process at the current
        process counter value."""
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount,
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def process_setup(self, inputFile, processName, suffix=None,
                      suffixList=None):
        """
        Bump the process counter and return the expected output file
        (when 'suffix' is given) or the list of expected output files
        (when 'suffixList' is given) for the next pipeline step.
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            return get_output_name(inputFile, suffix)
        elif suffixList:
            return [get_output_name(inputFile, s) for s in suffixList]

    def output_files_exist(self, output_file=None, output_list=None):
        """Return True if the expected output(s) already exist, allowing
        the associated step to be skipped on a resumed run."""
        if output_file:
            if file_exists(output_file):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False
        elif output_list:
            if all_files_exist(output_list):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False

    def check_output_file(self, outputFile):
        """Raise an IOError if an expected output file was not created."""
        if os.path.exists(outputFile):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            log.error(msg)
            raise IOError(msg)

    def process_cleanup(self, output_file=None, output_list=None):
        """
        Log if the process successfully created its output, and raise an
        error message if not
        """
        if output_file:
            self.check_output_file(output_file)
        elif output_list:
            for output_file in output_list:
                self.check_output_file(output_file)
        log.info('All expected output files found - process successful!\n')

    def extract_raw_ccs(self, inputFile):
        """Extract CCS reads from a bas.h5 file (or fofn) as FASTQ."""
        outputFile = self.process_setup(inputFile, 'extractCcsFromBasH5',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        elif file_has_ccs(inputFile):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error(msg)
            raise ValueError(msg)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_fastq(self, fastqFile):
        """Filter reads below the configured minimum accuracy."""
        outputFile = self.process_setup(fastqFile, 'FilterQuality',
                                        suffix='filter.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        quality_filter(fastqFile, outputFile, min_accuracy=self.min_accuracy)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_fastq(self, fastqFile):
        """Split a FASTQ file into FASTA and QUAL files via Mothur."""
        outputList = self.process_setup(fastqFile, 'Fastq.Info',
                                        suffixList=['fasta', 'qual'])
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fastq': fastqFile, 'fasta': 'T', 'qfile': 'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def align_sequences(self, fastaFile):
        """Align sequences against the reference MSA via Mothur."""
        outputFile = self.process_setup(fastaFile, 'Align.Seqs',
                                        suffix='align')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': fastaFile,
            'reference': self.alignment_reference,
            'flip': 't'
        }
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def screen_sequences(self, alignFile, start=None, end=None,
                         min_length=None):
        """Screen out sequences falling outside the allowed alignment
        window or below the minimum length."""
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        else:
            # Was a silent NameError on 'outputExt'; fail explicitly
            raise ValueError('Expected ".align" or ".fasta": "%s"' % alignFile)
        outputFile = self.process_setup(alignFile, 'Screen.Seqs',
                                        suffix=outputExt)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': alignFile,
            'start': start,
            'end': end,
            'minlength': min_length
        }
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def summarize_sequences(self, fastaFile):
        """Generate an alignment-position summary via Mothur."""
        outputFile = self.process_setup(fastaFile, 'Summary.Seqs',
                                        suffix='summary')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def parse_summary_file(self, summaryFile):
        """Parse the summary file and return the (maxStart, minEnd)
        alignment positions allowed for full-length sequences."""
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        """Identify chimeric reads with UCHIME; returns an accnos file
        of the chimeric sequence ids."""
        outputFile = self.process_setup(alignFile, 'UCHIME',
                                        suffix='uchime.accnos')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'reference': self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        """Remove the sequences listed in 'idFile' from the alignment."""
        outputFile = self.process_setup(alignFile, 'Remove.Seqs',
                                        suffix='pick.align')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'accnos': idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_sequences(self, alignFile, trump=None):
        """Drop all-gap (vertical) columns from the alignment; 'trump'
        additionally removes columns containing that character."""
        outputFile = self.process_setup(alignFile, 'Filter.Seqs',
                                        suffix='filter.fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'vertical': 'T', 'trump': trump}
        logFile = self.getProcessLogFile('filter.seqs', True)
        self.factory.runJob('filter.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        """Combine the FASTQ quality data with the alignment into an
        aligned FASTQ file."""
        outputFile = self.process_setup(alignFile, 'QualityAligner',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        aligner = QualityAligner(fastqFile, alignFile, outputFile)
        aligner.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        """Mask low-quality bases (below self.minQv) in the FASTQ."""
        outputFile = self.process_setup(fastqFile, 'QualityMasker',
                                        suffix='masked.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def unique_sequences(self, alignFile):
        """Collapse identical sequences; returns [uniqueFile, nameFile]."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        else:
            # Was a silent NameError on 'outputSuffixes'; fail explicitly
            raise ValueError('Expected ".align" or ".fasta": "%s"' % alignFile)
        outputList = self.process_setup(alignFile, 'Unique.Seqs',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fasta': alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def precluster_sequences(self, alignFile, nameFile):
        """Merge near-identical sequences (within precluster_diffs);
        returns [preclusteredFile, nameFile]."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        else:
            # Was a silent NameError on 'outputSuffixes'; fail explicitly
            raise ValueError('Expected ".align" or ".fasta": "%s"' % alignFile)
        outputList = self.process_setup(alignFile, 'Pre.Cluster',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {
            'fasta': alignFile,
            'name': nameFile,
            'diffs': self.precluster_diffs
        }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def calculate_distance_matrix(self, alignFile):
        """Compute a lower-triangle PHYLIP distance matrix via Mothur."""
        outputFile = self.process_setup(alignFile, 'Dist.Seqs',
                                        suffix='phylip.dist')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': alignFile,
            'calc': 'onegap',
            'countends': 'F',
            'output': 'lt'
        }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cluster_sequences(self, distanceMatrix, nameFile):
        """Cluster sequences from the distance matrix using the
        configured clustering method."""
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        else:
            # Was a silent NameError on 'outputSuffix'; fail explicitly
            raise ValueError('Unrecognized clustering method: "%s"' %
                             self.clusteringMethod)
        outputFile = self.process_setup(distanceMatrix, 'Cluster',
                                        suffix=outputSuffix)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'phylip': distanceMatrix,
            'name': nameFile,
            'method': self.clusteringMethod
        }
        logFile = self.getProcessLogFile('cluster', True)
        self.factory.runJob('cluster', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile, distance,
                                   min_cluster_size):
        """Split the clustered reads at 'distance' into per-cluster
        sequence files under a Dist_<distance> directory."""
        outputFile = self.process_setup(listFile, 'ClusterSeparator',
                                        suffix='clusters')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        outputDir = 'Dist_%s' % distance
        separator = ClusterSeparator(listFile, sequenceFile, outputFile,
                                     outputDir, distance, min_cluster_size)
        separator()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def generate_consensus_sequences(self, cluster_list_file, distance):
        """Generate GCON consensus sequences for each cluster."""
        output_file = self.process_setup(cluster_list_file,
                                         'ClusterResequencer',
                                         suffix='consensus')
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_consensus_files(cluster_list_file, self.consensusTool,
                                 output_file)
        self.process_cleanup(output_file=output_file)
        return output_file

    def generate_ref_sequences(self, cluster_list_file, distance):
        """Generate per-cluster reference sequences (no consensus call)."""
        output_file = self.process_setup(cluster_list_file,
                                         'ClusterResequencer',
                                         suffix='consensus')
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_reference_files(cluster_list_file, output_file)
        self.process_cleanup(output_file=output_file)
        return output_file

    def cleanup_uchime_output(self, screenedFile):
        """Delete UCHIME's intermediate '_formatted' files and write a
        dummy marker file for the resume logic."""
        outputFile = self.process_setup(screenedFile, 'UchimeCleanup',
                                        suffix='uchime.cleanup')
        # Resume guard added for consistency with every other step
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        uchimePath = os.getcwd()
        for filename in os.listdir(uchimePath):
            if filename.endswith('_formatted'):
                file_path = os.path.join(uchimePath, filename)
                os.remove(file_path)
        write_dummy_file(outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_consensus_folder(self, consensusFile, distance):
        """Delete intermediate resequencing files from the per-distance
        output directory."""
        outputFile = self.process_setup(consensusFile, 'ConsensusCleanup',
                                        suffix='consensus.cleanup')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        reseqPath = os.path.join(os.getcwd(), 'Dist_%s' % distance)
        clean_consensus_outputs(reseqPath, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_sequences(self, consensusFile):
        """Select the best consensus output for each cluster."""
        outputFile = self.process_setup(consensusFile, 'SequenceSelector',
                                        suffix='consensus.selected')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_consensus_files(consensusFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_ref_sequences(self, consensusFile):
        """Select the best reference read for each cluster."""
        outputFile = self.process_setup(consensusFile, 'SequenceSelector',
                                        suffix='consensus.selected')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_reference_files(consensusFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def output_selected_sequences(self, selectedSequences):
        """Copy all of the selected sequences into one FASTA file."""
        outputFile = self.process_setup(selectedSequences, 'SequenceWriter',
                                        suffix='fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        copy_fasta_list(selectedSequences, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def write_name_file(self, consensusFile, selectedFile, outputRoot=None):
        """Write the Mothur name-file mapping for the selected sequences."""
        if outputRoot is None:
            outputRoot = 'Final_Output.fasta'
        outputFile = self.process_setup(outputRoot, 'CreateNameFile',
                                        suffix='names')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        create_name_file(consensusFile, selectedFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def run(self):
        """Execute the full (optionally iterative) clustering pipeline."""
        # Was a leftover debug 'print' statement; route through the log
        log.info('Clustering distances to be used: %s' % self.step_list)
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile
        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq(fastqFile)
            fastaFile, qualFile = self.separate_fastq(filteredFastq)
        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences(fastaFile)
        summaryFile = self.summarize_sequences(alignedFile)
        maxStart, minEnd = self.parse_summary_file(summaryFile)
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)
        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras(screenedFile)
        self.cleanup_uchime_output(screenedFile)
        if file_exists(chimera_ids):
            no_chimera_file = self.remove_sequences(screenedFile, chimera_ids)
        else:
            no_chimera_file = screenedFile
        # Filter out unused columns, then de-duplicate and pre-cluster
        filteredFile = self.filter_sequences(no_chimera_file, trump='.')
        uniqueFile, nameFile = self.unique_sequences(filteredFile)
        preclusteredFile, nameFile = self.precluster_sequences(uniqueFile,
                                                               nameFile)
        fileToCluster = preclusteredFile
        clusterFileRoot = '.'.join(fileToCluster.split('.')[:-1])
        for i, step in enumerate(self.step_list):
            log.info("Beginning iteration #%s - %s" % (i + 1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile(fileToCluster, iterationInput)
            distanceMatrix = self.calculate_distance_matrix(iterationInput)
            listFile = self.cluster_sequences(distanceMatrix, nameFile)
            # Include all clusters during intermediate stages; only the
            # final stage enforces min_cluster_size
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, self.min_cluster_size)
            else:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, 1)
            # Generate the representative sequences for the next round
            if step == self.distance and self.enable_consensus:
                # Final round with consensus enabled: run GCON consensus
                log.info(
                    "Generating consensus sequences for iteration #%s - %s"
                    % (i + 1, step))
                consensusFile = self.generate_consensus_sequences(
                    clusterListFile, step)
                self.cleanup_consensus_folder(consensusFile, step)
                selectedFile = self.select_sequences(consensusFile)
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile)
            else:
                # Otherwise pick high-QV reads as reference sequences
                log.info(
                    "Selecting reference sequences for iteration #%s - %s"
                    % (i + 1, step))
                consensusFile = self.generate_ref_sequences(
                    clusterListFile, step)
                selectedFile = self.select_ref_sequences(consensusFile)
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile)
            # Whichever method was used, update the nameFile accordingly
            nameFile = self.write_name_file(consensusFile, selectedFile,
                                            selectedSequenceFile)
            # If this isn't the last round, re-align and re-filter the
            # selected sequences for the next iteration
            if step != self.distance:
                log.info(
                    "Iterative clustering not finished, preparing sequences for next iteration"
                )
                alignedFile = self.align_sequences(selectedSequenceFile)
                fileToCluster = self.filter_sequences(alignedFile, trump='.')
            log.info("Finished iteration #%s - %s" % (i + 1, step))
        # Expose the final outputs under stable names; the links may
        # already exist from a previous run. (Was bare 'except:' clauses,
        # which also swallowed KeyboardInterrupt/SystemExit.)
        try:
            os.symlink(selectedSequenceFile, "Final_Output.fasta")
        except OSError:
            pass
        try:
            os.symlink(nameFile, "Final_Output.names")
        except OSError:
            pass
class rDnaPipeline( object ):
    """
    A tool for running a community analysis pipeline on PacBio data.

    Converts raw bas.h5 CCS data (or an existing Fastq/Fasta file) into
    aligned, chimera-free, clustered rRNA sequences, optionally polishing
    one consensus sequence per cluster with GCON.
    """

    ##########################
    # Initialization Methods #
    ##########################

    def __init__(self, sequenceFile=None):
        if sequenceFile is None:
            self.initializeFromArgs()
        else:
            # NOTE(review): initializeFromCall is not defined in this
            # revision of the class -- confirm it exists elsewhere
            self.initializeFromCall( sequenceFile )
        self.validateSettings()
        self.initializeOutput()
        self.initializeLogger()

    def initializeFromArgs(self):
        """Parse the command-line arguments and copy them onto the instance."""
        import argparse
        desc = 'A pipeline tool for analyzing rRNA amplicons'
        parser = argparse.ArgumentParser(description=desc)
        parser.add_argument('sequenceFile', metavar='FILE',
                            help="File of rRNA sequencing data to use")
        parser.add_argument('-a', '--minimum_accuracy', type=float,
                            metavar='FLOAT', dest='minAccuracy',
                            default=MIN_ACCURACY,
                            help='Minimum predicted sequence accuracy')
        parser.add_argument('-d', '--distance', metavar='FLOAT',
                            type=float, default=0.03,
                            help="Distance at which to cluster sequences")
        parser.add_argument('-n', '--num_processes', metavar='INT',
                            default=1, dest='numProc', type=int,
                            help="Number of processors to use")
        parser.add_argument('-f', '--fraction', metavar='FLOAT',
                            type=float, default=DEFAULT_FRAC,
                            help='Fraction of full-length to require of each read')
        parser.add_argument('-c', '--clustering_method', metavar='METHOD',
                            dest='clusteringMethod', default=DEFAULT_METHOD,
                            choices=CLUSTER_METHODS,
                            help="Distance algorithm to use in clustering")
        parser.add_argument('-o', '--output', dest='outputDir',
                            metavar='DIR', default='rna_pipeline_run',
                            help="Specify the output folder")
        parser.add_argument('-q', '--minimum_qv', type=int, metavar='INT',
                            dest='minQv', default=MIN_QV,
                            help='Minimum QV to allow after sequence masking')
        parser.add_argument('-l', '--minimum_length', type=int, metavar='INT',
                            dest='minLength', default=MIN_LENGTH,
                            help='Minimum length sequence to allow after masking')
        parser.add_argument('--precluster_diffs', type=int, metavar='INT',
                            dest='preclusterDiffs', default=PRECLUSTER_DIFFS,
                            help='Maximum number of differences to allow in pre-clustering')
        parser.add_argument('-r', '--minimum_ratio', type=float,
                            metavar='FLOAT', dest='minRatio',
                            default=MIN_RATIO,
                            help='Minimum ratio of retained bases to allow after masking')
        parser.add_argument('-A', '--alignment_reference', metavar='REF',
                            default='silva.both.align', dest='alignmentRef',
                            help="Reference MSA for aligning query sequences")
        parser.add_argument('-C', '--chimera_reference', metavar='REF',
                            default='silva.gold.align', dest='chimeraRef',
                            help="Reference MSA for Chimera detection")
        # BUGFIX: this flag *enables* masking (store_true), but the help
        # text previously said "Turn off"
        parser.add_argument('--enable_masking', action='store_true',
                            dest='enableMasking',
                            help="Turn on the low-quality Masking step")
        parser.add_argument('--disable_clustering', action='store_false',
                            dest='enableClustering',
                            help="Turn off the Clustering and Resequencing steps")
        parser.add_argument('--disable_consensus', action='store_false',
                            dest='enableConsensus',
                            help="Turn off the Consensus step")
        parser.add_argument('--blasr', metavar='BLASR_PATH',
                            help="Specify the path to the Blasr executable")
        parser.add_argument('--mothur', metavar='MOTHUR_PATH',
                            default='mothur',
                            help="Specify the path to the Mothur executable")
        parser.add_argument('--debug', action='store_true',
                            help="Turn on DEBUG message logging")
        args = parser.parse_args()
        self.__dict__.update( vars(args) )

    def validateSettings(self):
        """Classify the input file type and validate numeric parameters."""
        # Validate the input file
        root, ext = self.splitRootFromExt( self.sequenceFile )
        if ext in ['.bas.h5', '.fofn']:
            self.dataType = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.dataType = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            # Plain Fasta carries no quality data, so masking and
            # consensus-polishing cannot run
            self.dataType = 'fasta'
            self.enableMasking = False
            self.enableConsensus = False
        else:
            raise TypeError('Sequence file must be a bas.h5 file, a ' + \
                            'fasta file, or a fofn of multiple such files')
        # If Clustering was disabled, also disable the consensus process
        if not self.enableClustering:
            self.enableConsensus = False
        # If Consensus is enabled, initialize the appropriate tool
        if self.enableConsensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Search for the Mothur executable, and set the Mothur process counter
        self.mothur = validateExecutable( self.mothur )
        self.processCount = 0
        # Validate numerical parameters
        validateInt( self.numProc, minValue=0 )
        validateFloat( self.distance, minValue=MIN_DIST, maxValue=MAX_DIST )

    def initializeOutput(self):
        """Create the output directory tree and chdir into it."""
        # Create the Output directory
        createDirectory( self.outputDir )
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename( self.sequenceFile )
        symlinkPath = os.path.join( self.outputDir, baseName )
        if not os.path.exists( symlinkPath ):
            absPath = os.path.abspath( self.sequenceFile )
            os.symlink( absPath, symlinkPath )
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir( self.outputDir )
        createDirectory( 'log' )
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.logFile = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner( self.mothur, self.numProc,
                                     stdoutLog, stderrLog )

    def initializeLogger(self):
        """Attach file and STDOUT handlers to the root logger."""
        dateFormat = "%Y-%m-%d %I:%M:%S"
        self.log = logging.getLogger()
        if self.debug:
            self.log.setLevel( logging.DEBUG )
        else:
            self.log.setLevel( logging.INFO )
        # Initialize the LogHandler for the master log file
        logHandler = logging.FileHandler( self.logFile )
        lineFormat = "%(asctime)s %(levelname)s %(processName)s " + \
                     "%(funcName)s %(lineno)d %(message)s"
        logFormatter = logging.Formatter(fmt=lineFormat, datefmt=dateFormat)
        logHandler.setFormatter( logFormatter )
        self.log.addHandler( logHandler )
        # Initialize a LogHandler for STDOUT
        outHandler = logging.StreamHandler( stream=sys.stdout )
        outLineFormat = "%(asctime)s %(message)s"
        outFormatter = logging.Formatter(fmt=outLineFormat,
                                         datefmt=dateFormat)
        outHandler.setFormatter( outFormatter )
        self.log.addHandler( outHandler )
        # Record the initialization of the pipeline
        self.log.info("INFO logger initialized")
        self.log.debug("DEBUG logger initialized")
        self.log.info("Initializing RnaPipeline v%s" % __version__)
        self.log.debug("Using the following parameters:")
        # items() instead of Py2-only iteritems() -- works on both 2 and 3
        for param, value in self.__dict__.items():
            self.log.debug("\t%s = %s" % (param, value))
        self.log.info("Initialization of RnaPipeline completed\n")

    def getProcessLogFile(self, process, isMothurProcess=False):
        """Return a per-process log file path under the 'log' directory."""
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount,
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def processSetup(self, inputFile, processName, suffix=None,
                     suffixList=None):
        """
        Log the start of a process and bump the process counter.

        Returns the predicted output filename when `suffix` is given, or a
        list of predicted filenames when `suffixList` is given.
        """
        self.log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            return self.predictOutputFile( inputFile, suffix )
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFiles.append( self.predictOutputFile(inputFile, suffix) )
            return outputFiles

    def outputFilesExist(self, outputFile=None, outputList=None):
        """Return True when the expected output file(s) already exist."""
        if outputFile:
            if fileExists( outputFile ):
                self.log.info('Output files detected, skipping process...\n')
                return True
            self.log.info('Output files not found, running process...')
            return False
        elif outputList:
            if allFilesExist( outputList ):
                self.log.info('Output files detected, skipping process...\n')
                return True
            self.log.info('Output files not found, running process...')
            return False

    def checkOutputFile(self, outputFile):
        """Raise IOError if an expected output file was not produced."""
        if fileExists( outputFile ):
            self.log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            # BUGFIX: a fatal condition was previously logged at INFO level
            self.log.error( msg )
            raise IOError( msg )

    def processCleanup(self, outputFile=None, outputList=None):
        """
        Log whether the process successfully created its output, and
        raise an error if it did not.
        """
        if outputFile:
            self.checkOutputFile( outputFile )
        elif outputList:
            for outputFile in outputList:
                self.checkOutputFile( outputFile )
        self.log.info('All expected output files found - process successful!\n')

    def writeDummyFile(self, dummyFile):
        """Create a sentinel file marking a no-output step as completed."""
        with open(dummyFile, 'w') as handle:
            handle.write('DONE')
        return dummyFile

    def extractCcsFromBasH5(self, inputFile):
        """Extract CCS reads from a bas.h5 file into a Fastq file."""
        outputFile = self.processSetup( inputFile, 'extractCcsFromBasH5',
                                        suffix='fastq' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        extractor = BasH5Extractor( inputFile, outputFile )
        extractor.outputCcsFastq()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def filterFastqFile(self, fastqFile):
        """Filter out low-accuracy reads from a Fastq file."""
        # BUGFIX: the setup result was previously bound to 'outputList',
        # leaving 'outputFile' undefined in the statements below
        outputFile = self.processSetup( fastqFile, 'FilterQuality',
                                        suffix='filter.fastq' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        # NOTE(review): the accuracy-cutoff argument was truncated in the
        # original source; 'minAccuracy' matches the parser dest -- confirm
        qualityFilter = QualityFilter( fastqFile, outputFile,
                                       self.minAccuracy )
        qualityFilter()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def separateFastqFile(self, fastqFile):
        """Split a Fastq file into separate Fasta and Qual files."""
        outputList = self.processSetup( fastqFile, 'Fastq.Info',
                                        suffixList=['fasta', 'qual'] )
        if self.outputFilesExist( outputList=outputList ):
            return outputList
        mothurArgs = {'fastq':fastqFile, 'fasta':'T', 'qfile':'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.processCleanup( outputList=outputList )
        return outputList

    def alignSequences(self, fastaFile):
        """Align query sequences against the reference MSA."""
        outputFile = self.processSetup( fastaFile, 'Align.Seqs',
                                        suffix='align' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':fastaFile,
                      'reference':self.alignmentRef,
                      'flip':'t'}
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def screenSequences(self, alignFile, start=None, end=None,
                        minLength=None):
        """Drop sequences falling outside the allowed alignment window."""
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        outputFile = self.processSetup( alignFile, 'Screen.Seqs',
                                        suffix=outputExt )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'start':start,
                      'end':end,
                      'minlength':minLength}
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def summarizeSequences(self, fastaFile):
        """Summarize alignment positions with Mothur's Summary.Seqs."""
        outputFile = self.processSetup( fastaFile, 'Summary.Seqs',
                                        suffix='summary' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def parseSummaryFile(self, summaryFile):
        """Return (maxStart, minEnd) alignment positions for screening."""
        self.log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        self.log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        self.log.info('Full-length start is NAST Alignment position %s' % start)
        self.log.info('Full-length end is NAST Alignment position %s' % end)
        self.log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        self.log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        self.log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def findChimeras(self, alignFile):
        """Identify likely chimeric reads with UCHIME."""
        outputFile = self.processSetup( alignFile, 'UCHIME',
                                        suffix='uchime.accnos' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile, 'reference':self.chimeraRef}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def removeSequences(self, alignFile, idFile):
        """Remove the sequences listed in idFile from the alignment."""
        outputFile = self.processSetup( alignFile, 'Remove.Seqs',
                                        suffix='pick.align' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile, 'accnos':idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def filterSequences(self, alignFile):
        """Remove all-gap columns from the alignment."""
        outputFile = self.processSetup( alignFile, 'Filter.Seqs',
                                        suffix='filter.fasta' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile, 'vertical':'T'}
        logFile = self.getProcessLogFile( 'filter.seqs', True )
        self.factory.runJob( 'filter.seqs', mothurArgs, logFile )
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def addQualityToAlignment(self, fastqFile, alignFile):
        """Combine alignment and quality data back into a Fastq file."""
        outputFile = self.processSetup( alignFile, 'QualityAligner',
                                        suffix='fastq' )
        # BUGFIX: previously tested and returned an undefined name 'output'
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        aligner = QualityAligner( fastqFile, alignFile, outputFile )
        aligner.run()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def maskFastqSequences(self, fastqFile):
        """Mask low-QV positions in a Fastq file."""
        outputFile = self.processSetup( fastqFile, 'QualityMasker',
                                        suffix='masked.fastq' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def uniqueSequences(self, alignFile):
        """Collapse identical sequences, producing a Names file."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        outputList = self.processSetup( alignFile, 'Unique.Seqs',
                                        suffixList=outputSuffixes )
        if self.outputFilesExist( outputList=outputList ):
            return outputList
        mothurArgs = {'fasta':alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.processCleanup( outputList=outputList )
        return outputList

    def preclusterSequences(self, alignFile, nameFile):
        """Merge near-identical sequences before full clustering."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        outputList = self.processSetup( alignFile, 'Pre.Cluster',
                                        suffixList=outputSuffixes )
        if self.outputFilesExist( outputList=outputList ):
            return outputList
        mothurArgs = {'fasta':alignFile,
                      'name':nameFile,
                      'diffs':self.preclusterDiffs}
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.processCleanup( outputList=outputList )
        return outputList

    def calculateDistanceMatrix(self, alignFile):
        """Compute a Phylip-format distance matrix for the alignment."""
        outputFile = self.processSetup( alignFile, 'Dist.Seqs',
                                        suffix='phylip.dist' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'fasta':alignFile, 'calc':'nogaps', 'output':'lt'}
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def clusterSequences(self, distanceMatrix):
        """Cluster sequences from a distance matrix."""
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        outputFile = self.processSetup( distanceMatrix, 'Cluster',
                                        suffix=outputSuffix )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        mothurArgs = {'phylip':distanceMatrix,
                      'method':self.clusteringMethod}
        logFile = self.getProcessLogFile( 'cluster', True )
        self.factory.runJob( 'cluster', mothurArgs, logFile )
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def separateClusterSequences(self, listFile, sequenceFile):
        """Split the cluster list file into per-cluster sequence files."""
        outputFile = self.processSetup( listFile, 'ClusterSeparator',
                                        suffix='list.clusters' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        separator = ClusterSeparator( listFile, sequenceFile,
                                      self.distance, outputFile )
        separator()
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def generateConsensusSequences(self, clusterListFile):
        """Run the consensus tool on each cluster's sequences."""
        outputFile = self.processSetup( clusterListFile,
                                        'ClusterResequencer',
                                        suffix='consensus' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        consensusFiles = []
        with open( clusterListFile ) as handle:
            for line in handle:
                sequenceFile, referenceFile = line.strip().split()
                if referenceFile.endswith('None'):
                    # NOTE(review): clusters without a reference record the
                    # raw sequence file in the first column, while others
                    # record the reference -- confirm downstream readers
                    # expect this asymmetry
                    consensusFiles.append( (sequenceFile, 'None') )
                else:
                    consensus = self.consensusTool( sequenceFile,
                                                    referenceFile )
                    consensusFiles.append( (referenceFile, consensus) )
        with open( outputFile, 'w' ) as handle:
            for filenamePair in consensusFiles:
                handle.write('%s\t%s\n' % filenamePair)
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def cleanupConsensusFolder(self, consensusFile):
        """Delete intermediate files left behind by the consensus tool."""
        outputFile = self.processSetup( consensusFile, 'ConsensusCleanup',
                                        suffix='consensus.cleanup' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        reseqPath = os.path.join( os.getcwd(), 'reseq' )
        for filename in os.listdir( reseqPath ):
            filePath = os.path.join( reseqPath, filename )
            if filePath.endswith(('_input.fa',
                                  '_input.fa.aln',
                                  '_input.fa.aln_unsorted')):
                os.remove( filePath )
        self.writeDummyFile( outputFile )
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def selectFinalSequences(self, consensusFile):
        """Choose the best representative file for each cluster."""
        outputFile = self.processSetup( consensusFile, 'SequenceSelector',
                                        suffix='consensus.selected' )
        if self.outputFilesExist( outputFile=outputFile ):
            return outputFile
        selectedFiles = []
        with open( consensusFile ) as handle:
            for line in handle:
                # Renamed loop variable: it previously shadowed the
                # 'consensusFile' parameter
                referenceFile, clusterConsensus = line.strip().split()
                if clusterConsensus.endswith('None'):
                    selectedFiles.append( referenceFile )
                elif fasta_count( clusterConsensus ) == 1:
                    selectedFiles.append( clusterConsensus )
                else:
                    selectedFiles.append( referenceFile )
        with open( outputFile, 'w' ) as handle:
            for filename in selectedFiles:
                handle.write(filename + '\n')
        self.processCleanup( outputFile=outputFile )
        return outputFile

    def __call__(self):
        # Convert the input data into a Fastq file where possible
        if self.dataType == 'bash5':
            fastqFile = self.extractCcsFromBasH5( self.sequenceFile )
        elif self.dataType == 'fastq':
            fastqFile = self.sequenceFile
        elif self.dataType == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile
        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filterFastqFile( fastqFile )
            # BUGFIX: the *filtered* Fastq must be split, not the raw input
            # (otherwise the quality filter's output is discarded)
            fastaFile, qualFile = self.separateFastqFile( filteredFastq )
        # NOTE(review): this method is truncated in this revision of the
        # file -- the remainder of the pipeline is not shown here
class rDnaPipeline( object ):
    """
    A tool for running a community analysis pipeline on PacBio data.

    Converts raw CCS data (or an existing Fastq/Fasta file) into aligned,
    chimera-free, iteratively clustered rRNA sequences, optionally
    polishing a GCON consensus per cluster in the final round.
    """

    def __init__(self):
        # NOTE(review): parse_args() appears to populate a module-level
        # 'args' namespace -- confirm against this module's argument parser
        parse_args()
        self.__dict__.update( vars(args) )
        self.validate_settings()
        self.initialize_output()
        initialize_logger( log, log_file=self.log_file, debug=self.debug )

    def validate_settings(self):
        """Classify the input file type and set up the pipeline tools."""
        # Validate the input file
        root, ext = split_root_from_ext( self.input_file )
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
        else:
            raise TypeError('Sequence file must be a bas.h5 file, a ' + \
                            'fasta file, or a fofn of multiple such files')
        # Pre-compute the list of clustering distances to iterate over
        self.step_list = self.calculate_steps()
        # If Consensus is enabled, initialize the appropriate tool
        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Search for the Mothur executable, and set the Mothur process counter
        self.mothur = validate_executable( self.mothur )
        self.processCount = 0

    def initialize_output(self):
        """Create the output directory tree and chdir into it."""
        # Create the Output directory
        create_directory( self.output_dir )
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename( self.input_file )
        symlinkPath = os.path.join( self.output_dir, baseName )
        if not os.path.exists( symlinkPath ):
            absPath = os.path.abspath( self.input_file )
            os.symlink( absPath, symlinkPath )
        self.sequenceFile = baseName
        # Move into the Output directory and create Log directory and files
        os.chdir( self.output_dir )
        create_directory( 'log' )
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner( self.mothur, self.nproc,
                                     stdoutLog, stderrLog )

    def calculate_steps(self):
        """
        Return the list of clustering distances to iterate over, always
        ending with the final target distance.
        """
        if self.enable_iteration:
            # NOTE(review): float division here may round down a step
            # (e.g. 0.03/0.01 < 3.0) -- confirm the intended step count
            count = int(self.distance / self.step) - 1
            step_list = [i * self.step for i in range(1, count + 1)]
        else:
            step_list = []
        return step_list + [self.distance]

    def getProcessLogFile(self, process, isMothurProcess=False):
        """Return a per-process log file path under the 'log' directory."""
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount,
                                                         process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def process_setup(self, inputFile, processName, suffix=None,
                      suffixList=None):
        """
        Log the start of a process and bump the process counter.

        Returns the predicted output filename when `suffix` is given, or a
        list of predicted filenames when `suffixList` is given.
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            return get_output_name( inputFile, suffix )
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFiles.append( get_output_name(inputFile, suffix) )
            return outputFiles

    def output_files_exist(self, output_file=None, output_list=None):
        """Return True when the expected output file(s) already exist."""
        if output_file:
            if file_exists( output_file ):
                log.info('Output files detected, skipping process...\n')
                return True
            log.info('Output files not found, running process...')
            return False
        elif output_list:
            if all_files_exist( output_list ):
                log.info('Output files detected, skipping process...\n')
                return True
            log.info('Output files not found, running process...')
            return False

    def check_output_file(self, outputFile):
        """Raise IOError if an expected output file was not produced."""
        if os.path.exists( outputFile ):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            log.error( msg )
            raise IOError( msg )

    def process_cleanup(self, output_file=None, output_list=None):
        """
        Log whether the process successfully created its output, and
        raise an error if it did not.
        """
        if output_file:
            self.check_output_file( output_file )
        elif output_list:
            for output_file in output_list:
                self.check_output_file( output_file )
        log.info('All expected output files found - process successful!\n')

    def extract_raw_ccs(self, inputFile):
        """Extract CCS reads from raw data into a Fastq file."""
        outputFile = self.process_setup( inputFile, 'extractCcsFromBasH5',
                                         suffix='fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        elif file_has_ccs( inputFile ):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error( msg )
            raise ValueError( msg )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_fastq(self, fastqFile):
        """Filter out low-accuracy reads from a Fastq file."""
        outputFile = self.process_setup( fastqFile, 'FilterQuality',
                                         suffix='filter.fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        quality_filter( fastqFile, outputFile,
                        min_accuracy=self.min_accuracy )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_fastq(self, fastqFile):
        """Split a Fastq file into separate Fasta and Qual files."""
        outputList = self.process_setup( fastqFile, 'Fastq.Info',
                                         suffixList=['fasta', 'qual'] )
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fastq':fastqFile, 'fasta':'T', 'qfile':'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def align_sequences(self, fastaFile):
        """Align query sequences against the reference MSA."""
        outputFile = self.process_setup( fastaFile, 'Align.Seqs',
                                         suffix='align' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':fastaFile,
                      'reference':self.alignment_reference,
                      'flip':'t'}
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def screen_sequences(self, alignFile, start=None, end=None,
                         min_length=None):
        """Drop sequences falling outside the allowed alignment window."""
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        outputFile = self.process_setup( alignFile, 'Screen.Seqs',
                                         suffix=outputExt )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'start':start,
                      'end':end,
                      'minlength':min_length}
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def summarize_sequences(self, fastaFile):
        """Summarize alignment positions with Mothur's Summary.Seqs."""
        outputFile = self.process_setup( fastaFile, 'Summary.Seqs',
                                         suffix='summary' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def parse_summary_file(self, summaryFile):
        """Return (maxStart, minEnd) alignment positions for screening."""
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        """Identify likely chimeric reads with UCHIME."""
        outputFile = self.process_setup( alignFile, 'UCHIME',
                                         suffix='uchime.accnos' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'reference':self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        """Remove the sequences listed in idFile from the alignment."""
        outputFile = self.process_setup( alignFile, 'Remove.Seqs',
                                         suffix='pick.align' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':alignFile, 'accnos':idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_sequences(self, alignFile, trump=None):
        """Remove all-gap (and optionally trumped) alignment columns."""
        outputFile = self.process_setup( alignFile, 'Filter.Seqs',
                                         suffix='filter.fasta' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile,
                      'vertical': 'T',
                      'trump': trump}
        logFile = self.getProcessLogFile( 'filter.seqs', True )
        self.factory.runJob( 'filter.seqs', mothurArgs, logFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        """Combine alignment and quality data back into a Fastq file."""
        outputFile = self.process_setup( alignFile, 'QualityAligner',
                                         suffix='fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        aligner = QualityAligner( fastqFile, alignFile, outputFile )
        aligner.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        """Mask low-QV positions in a Fastq file."""
        outputFile = self.process_setup( fastqFile, 'QualityMasker',
                                         suffix='masked.fastq' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        # NOTE(review): 'minQv' is camelCase while this revision is
        # snake_case -- confirm the attribute name set by the parser
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def unique_sequences(self, alignFile):
        """Collapse identical sequences, producing a Names file."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        outputList = self.process_setup( alignFile, 'Unique.Seqs',
                                         suffixList=outputSuffixes )
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fasta':alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def precluster_sequences(self, alignFile, nameFile):
        """Merge near-identical sequences before full clustering."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        outputList = self.process_setup( alignFile, 'Pre.Cluster',
                                         suffixList=outputSuffixes )
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fasta':alignFile,
                      'name':nameFile,
                      'diffs':self.precluster_diffs}
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def calculate_distance_matrix(self, alignFile):
        """Compute a Phylip-format distance matrix for the alignment."""
        outputFile = self.process_setup( alignFile, 'Dist.Seqs',
                                         suffix='phylip.dist' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta':alignFile,
                      'calc':'onegap',
                      'countends':'F',
                      'output':'lt'}
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cluster_sequences(self, distanceMatrix, nameFile):
        """Cluster sequences from a distance matrix and names file."""
        # NOTE(review): 'clusteringMethod' is camelCase in this snake_case
        # revision -- confirm the attribute name set by the parser
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        outputFile = self.process_setup( distanceMatrix, 'Cluster',
                                         suffix=outputSuffix )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'phylip':distanceMatrix,
                      'name':nameFile,
                      'method':self.clusteringMethod}
        logFile = self.getProcessLogFile( 'cluster', True )
        self.factory.runJob( 'cluster', mothurArgs, logFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile, distance,
                                   min_cluster_size):
        """Split the cluster list into per-cluster files for one distance."""
        outputFile = self.process_setup( listFile, 'ClusterSeparator',
                                         suffix='clusters' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        outputDir = 'Dist_%s' % distance
        separator = ClusterSeparator( listFile, sequenceFile,
                                      outputFile, outputDir,
                                      distance, min_cluster_size )
        separator()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def generate_consensus_sequences(self, cluster_list_file, distance):
        """Generate a polished consensus for each cluster."""
        output_file = self.process_setup( cluster_list_file,
                                          'ClusterResequencer',
                                          suffix='consensus' )
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_consensus_files( cluster_list_file, self.consensusTool,
                                  output_file )
        self.process_cleanup(output_file=output_file)
        return output_file

    def generate_ref_sequences(self, cluster_list_file, distance):
        """Pick a reference read for each cluster (no polishing)."""
        output_file = self.process_setup( cluster_list_file,
                                          'ClusterResequencer',
                                          suffix='consensus' )
        if self.output_files_exist(output_file=output_file):
            return output_file
        generate_reference_files( cluster_list_file, output_file )
        self.process_cleanup(output_file=output_file)
        return output_file

    def cleanup_uchime_output(self, screenedFile):
        """Delete UCHIME's '_formatted' intermediates from the CWD."""
        outputFile = self.process_setup( screenedFile, 'UchimeCleanup',
                                         suffix='uchime.cleanup' )
        # CONSISTENCY FIX: this was the only process method without the
        # already-done check, so reruns needlessly repeated the cleanup
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        uchimePath = os.getcwd()
        for filename in os.listdir( uchimePath ):
            if filename.endswith('_formatted'):
                file_path = os.path.join( uchimePath, filename )
                os.remove( file_path )
        write_dummy_file( outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_consensus_folder(self, consensusFile, distance):
        """Delete intermediate files left behind by the consensus tool."""
        outputFile = self.process_setup( consensusFile, 'ConsensusCleanup',
                                         suffix='consensus.cleanup' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        reseqPath = os.path.join( os.getcwd(), 'Dist_%s' % distance )
        clean_consensus_outputs( reseqPath, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_sequences(self, consensusFile):
        """Choose the best consensus representative for each cluster."""
        outputFile = self.process_setup( consensusFile, 'SequenceSelector',
                                         suffix='consensus.selected' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_consensus_files( consensusFile, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_ref_sequences(self, consensusFile):
        """Choose the best reference representative for each cluster."""
        outputFile = self.process_setup( consensusFile, 'SequenceSelector',
                                         suffix='consensus.selected' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        select_reference_files( consensusFile, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def output_selected_sequences(self, selectedSequences):
        """Concatenate the selected per-cluster files into one Fasta."""
        outputFile = self.process_setup( selectedSequences,
                                         'SequenceWriter',
                                         suffix='fasta' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        copy_fasta_list( selectedSequences, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def write_name_file(self, consensusFile, selectedFile, outputRoot=None):
        """Write an updated Names file for the selected sequences."""
        if outputRoot is None:
            outputRoot = 'Final_Output.fasta'
        outputFile = self.process_setup( outputRoot, 'CreateNameFile',
                                         suffix='names' )
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        create_name_file( consensusFile, selectedFile, outputFile )
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def run(self):
        """Execute the full analysis pipeline end-to-end."""
        # BUGFIX: replaced a stray Python-2 'print' debug statement with a
        # proper log call
        log.debug('Clustering steps: %s' % self.step_list)
        # Convert the input data into a Fastq file where possible
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs( self.sequenceFile )
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile
        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq( fastqFile )
            fastaFile, qualFile = self.separate_fastq( filteredFastq )
        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences( fastaFile )
        summaryFile = self.summarize_sequences( alignedFile )
        maxStart, minEnd = self.parse_summary_file( summaryFile )
        screenedFile = self.screen_sequences( alignedFile,
                                              start=maxStart,
                                              end=minEnd )
        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras( screenedFile )
        self.cleanup_uchime_output( screenedFile )
        if file_exists( chimera_ids ):
            no_chimera_file = self.remove_sequences( screenedFile,
                                                     chimera_ids )
        else:
            no_chimera_file = screenedFile
        # Collapse the remaining sequences before clustering
        filteredFile = self.filter_sequences( no_chimera_file, trump='.' )
        uniqueFile, nameFile = self.unique_sequences( filteredFile )
        preclusteredFile, nameFile = self.precluster_sequences( uniqueFile,
                                                                nameFile )
        fileToCluster = preclusteredFile
        clusterFileRoot = '.'.join( fileToCluster.split('.')[:-1] )
        for i, step in enumerate( self.step_list ):
            log.info("Beginning iteration #%s - %s" % (i+1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile( fileToCluster, iterationInput )
            distanceMatrix = self.calculate_distance_matrix( iterationInput )
            listFile = self.cluster_sequences( distanceMatrix, nameFile )
            # Intermediate stages keep every cluster; only the final stage
            # applies the min_cluster_size cutoff
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, self.min_cluster_size )
            else:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, 1 )
            # Generate the representative sequences for the next round
            if step == self.distance and self.enable_consensus:
                # Final round with consensus enabled: polish a GCON consensus
                log.info("Generating consensus sequences for iteration #%s - %s" % (i+1, step))
                consensusFile = self.generate_consensus_sequences(
                    clusterListFile, step )
                self.cleanup_consensus_folder( consensusFile, step )
                selectedFile = self.select_sequences( consensusFile )
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile )
            else:
                # Otherwise pick a high-QV read as each cluster's reference
                log.info("Selecting reference sequences for iteration #%s - %s" % (i+1, step))
                consensusFile = self.generate_ref_sequences(
                    clusterListFile, step )
                selectedFile = self.select_ref_sequences( consensusFile )
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile )
            # Whichever method was used, update the nameFile accordingly
            nameFile = self.write_name_file( consensusFile, selectedFile,
                                             selectedSequenceFile )
            # If this isn't the last round, re-align and re-filter the new
            # representative sequences for the next iteration
            if step != self.distance:
                log.info("Iterative clustering not finished, preparing sequences for next iteration")
                alignedFile = self.align_sequences( selectedSequenceFile )
                fileToCluster = self.filter_sequences( alignedFile,
                                                       trump='.' )
            log.info("Finished iteration #%s - %s" % (i+1, step))
        # Expose the final outputs under stable names; tolerate reruns
        # where the links already exist.
        # BUGFIX: replaced bare 'except:' clauses (which also swallowed
        # KeyboardInterrupt/SystemExit) with 'except OSError'
        try:
            os.symlink( selectedSequenceFile, "Final_Output.fasta")
        except OSError:
            pass
        try:
            os.symlink( nameFile, "Final_Output.names")
        except OSError:
            pass
class rDnaPipeline(object):
    """A tool for running a community analysis pipeline on PacBio data."""

    def __init__(self):
        # parse_args() populates the module-level 'args' namespace, which is
        # then copied wholesale onto this instance as attributes.
        parse_args()
        self.__dict__.update(vars(args))
        self.validate_settings()
        self.initialize_output()
        initialize_logger(log, log_file=self.log_file, debug=self.debug)

    def validate_settings(self):
        """Identify the input data type and normalize the pipeline options."""
        root, ext = split_root_from_ext(self.input_file)
        if ext in ['.bas.h5', '.fofn']:
            self.data_type = 'bash5'
        elif ext in ['.fq', '.fastq']:
            self.data_type = 'fastq'
        elif ext in ['.fa', '.fsa', '.fasta']:
            self.data_type = 'fasta'
            # FASTA input carries no quality values, so masking and
            # consensus generation are impossible
            self.enable_masking = False
            self.enable_consensus = False
        else:
            raise TypeError('Sequence file must be a bas.h5 file, a '
                            'fasta file, or a fofn of multiple such files')
        # If clustering was disabled, also disable the consensus process
        if not self.enable_clustering:
            self.enable_consensus = False
        # If consensus is enabled, initialize the appropriate tool
        if self.enable_consensus:
            self.consensusTool = DagConRunner('gcon.py', 'r')
        # Search for the Mothur executable and reset the process counter
        self.mothur = validate_executable(self.mothur)
        self.processCount = 0

    def initialize_output(self):
        """Create the output directory, link the input into it, and cd there."""
        create_directory(self.output_dir)
        # Create a symbolic link from the data file to the output dir
        baseName = os.path.basename(self.input_file)
        symlinkPath = os.path.join(self.output_dir, baseName)
        if not os.path.exists(symlinkPath):
            absPath = os.path.abspath(self.input_file)
            os.symlink(absPath, symlinkPath)
        self.sequenceFile = baseName
        # Move into the output directory and create the log directory/files
        os.chdir(self.output_dir)
        create_directory('log')
        stdoutLog = os.path.join('log', 'mothur_stdout.log')
        stderrLog = os.path.join('log', 'mothur_stderr.log')
        self.log_file = os.path.join('log', 'rna_pipeline.log')
        # Instantiate the MothurRunner object
        self.factory = MothurRunner(self.mothur, self.nproc, stdoutLog, stderrLog)

    def getProcessLogFile(self, process, isMothurProcess=False):
        """Return the per-process log path for the current process counter."""
        if isMothurProcess:
            logFile = 'process%02d.mothur.%s.logfile' % (self.processCount, process)
        else:
            logFile = 'process%02d.%s.logfile' % (self.processCount, process)
        return os.path.join('log', logFile)

    def process_setup(self, inputFile, processName, suffix=None, suffixList=None):
        """
        Log the start of a process and increment the process counter, then
        return the expected output filename (when 'suffix' is given) or the
        list of expected output filenames (when 'suffixList' is given).
        """
        log.info('Preparing to run %s on "%s"' % (processName, inputFile))
        self.processCount += 1
        if suffix:
            outputFile = get_output_name(inputFile, suffix)
            return outputFile
        elif suffixList:
            outputFiles = []
            for suffix in suffixList:
                outputFile = get_output_name(inputFile, suffix)
                outputFiles.append(outputFile)
            return outputFiles

    def output_files_exist(self, output_file=None, output_list=None):
        """Return True when the expected output(s) already exist (skip rerun)."""
        if output_file:
            if file_exists(output_file):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False
        elif output_list:
            if all_files_exist(output_list):
                log.info('Output files detected, skipping process...\n')
                return True
            else:
                log.info('Output files not found, running process...')
                return False

    def check_output_file(self, outputFile):
        """Raise IOError if an expected output file was not produced."""
        if os.path.exists(outputFile):
            log.info('Expected output "%s" found' % outputFile)
        else:
            msg = 'Expected output "%s" not found!' % outputFile
            log.error(msg)
            raise IOError(msg)

    def process_cleanup(self, output_file=None, output_list=None):
        """
        Log if the process successfully created its output, and raise an
        error message if not
        """
        if output_file:
            self.check_output_file(output_file)
        elif output_list:
            for output_file in output_list:
                self.check_output_file(output_file)
        log.info('All expected output files found - process successful!\n')

    def write_dummy_file(self, dummyFile):
        """Write a sentinel file marking a no-output step as complete."""
        with open(dummyFile, 'w') as handle:
            handle.write('DONE')
        return dummyFile

    def extract_raw_ccs(self, inputFile):
        """Extract CCS reads from a bas.h5 input into a FASTQ file."""
        outputFile = self.process_setup(inputFile, 'extractCcsFromBasH5',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        elif file_has_ccs(inputFile):
            extract_ccs(inputFile, outputFile, self.raw_data)
        else:
            msg = 'Raw data file has no CCS data!'
            log.error(msg)
            raise ValueError(msg)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_fastq(self, fastqFile):
        """Filter low-quality reads out of a FASTQ file."""
        outputFile = self.process_setup(fastqFile, 'FilterQuality',
                                        suffix='filter.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        quality_filter(fastqFile, outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_fastq(self, fastqFile):
        """Split a FASTQ into FASTA + QUAL files via Mothur's fastq.info."""
        outputList = self.process_setup(fastqFile, 'Fastq.Info',
                                        suffixList=['fasta', 'qual'])
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fastq': fastqFile, 'fasta': 'T', 'qfile': 'T'}
        logFile = self.getProcessLogFile('fastq.info', True)
        self.factory.runJob('fastq.info', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def align_sequences(self, fastaFile):
        """Align sequences against the configured reference alignment."""
        outputFile = self.process_setup(fastaFile, 'Align.Seqs', suffix='align')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': fastaFile,
            'reference': self.alignment_reference,
            'flip': 't'
        }
        logFile = self.getProcessLogFile('align.seqs', True)
        self.factory.runJob('align.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def screen_sequences(self, alignFile, start=None, end=None, min_length=None):
        """Remove sequences outside the given alignment window or too short."""
        if alignFile.endswith('.align'):
            outputExt = 'good.align'
        elif alignFile.endswith('.fasta'):
            outputExt = 'good.fasta'
        else:
            # Previously fell through to a NameError on 'outputExt'
            raise ValueError('Expected a ".align" or ".fasta" file: "%s"' % alignFile)
        outputFile = self.process_setup(alignFile, 'Screen.Seqs', suffix=outputExt)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': alignFile,
            'start': start,
            'end': end,
            'minlength': min_length
        }
        logFile = self.getProcessLogFile('screen.seqs', True)
        self.factory.runJob('screen.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def summarize_sequences(self, fastaFile):
        """Summarize alignment positions via Mothur's summary.seqs."""
        outputFile = self.process_setup(fastaFile, 'Summary.Seqs', suffix='summary')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': fastaFile}
        logFile = self.getProcessLogFile('summary.seqs', True)
        self.factory.runJob('summary.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def parse_summary_file(self, summaryFile):
        """Parse the summary file and return (maxStart, minEnd) cutoffs."""
        log.info('Preparing to run SummaryReader...')
        parser = SummaryReader(summaryFile, self.fraction)
        log.info('Identifying full-length alignment positions...')
        start, end = parser.getFullLengthPositions()
        log.info('Full-length start is NAST Alignment position %s' % start)
        log.info('Full-length end is NAST Alignment position %s' % end)
        log.info('Calculating minimum allowed alignment positions...')
        maxStart, minEnd = parser.getAllowedPositions()
        log.info('Maximum allowed start is NAST Alignment position %s' % maxStart)
        log.info('Minimum allowed end is NAST Alignment position %s\n' % minEnd)
        return maxStart, minEnd

    def find_chimeras(self, alignFile):
        """Detect chimeric reads with reference-based UCHIME."""
        outputFile = self.process_setup(alignFile, 'UCHIME',
                                        suffix='uchime.accnos')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile,
                      'reference': self.chimera_reference}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def find_chimeras_denovo(self, alignFile, nameFile):
        """Detect chimeric reads with de-novo UCHIME (abundance-based)."""
        outputFile = self.process_setup(alignFile, 'UCHIME',
                                        suffix='uchime.accnos')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'name': nameFile}
        logFile = self.getProcessLogFile('chimera.uchime', True)
        self.factory.runJob('chimera.uchime', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def remove_sequences(self, alignFile, idFile):
        """Remove the sequences listed in an accnos file from an alignment."""
        inputSuffix = alignFile.split('.')[-1]
        outputSuffix = 'pick.%s' % inputSuffix
        outputFile = self.process_setup(alignFile, 'Remove.Seqs',
                                        suffix=outputSuffix)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'accnos': idFile}
        logFile = self.getProcessLogFile('remove.seqs', True)
        self.factory.runJob('remove.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def filter_sequences(self, alignFile, trump=None):
        """Drop all-gap columns (and trump columns, if given) from an alignment."""
        outputFile = self.process_setup(alignFile, 'Filter.Seqs',
                                        suffix='filter.fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {'fasta': alignFile, 'vertical': 'T', 'trump': trump}
        logFile = self.getProcessLogFile('filter.seqs', True)
        self.factory.runJob('filter.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def add_quality_to_alignment(self, fastqFile, alignFile):
        """Attach per-base quality values from the FASTQ to aligned reads."""
        outputFile = self.process_setup(alignFile, 'QualityAligner',
                                        suffix='fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        aligner = QualityAligner(fastqFile, alignFile, outputFile)
        aligner.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def mask_fastq_sequences(self, fastqFile):
        """Mask bases below the configured minimum QV."""
        outputFile = self.process_setup(fastqFile, 'QualityMasker',
                                        suffix='masked.fastq')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        masker = QualityMasker(fastqFile, outputFile, self.minQv)
        masker.run()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def unique_sequences(self, alignFile):
        """Collapse identical sequences; returns [uniqueFile, nameFile]."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['unique.align', 'names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['unique.fasta', 'names']
        else:
            # Previously fell through to a NameError on 'outputSuffixes'
            raise ValueError('Expected a ".align" or ".fasta" file: "%s"' % alignFile)
        outputList = self.process_setup(alignFile, 'Unique.Seqs',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {'fasta': alignFile}
        logFile = self.getProcessLogFile('unique.seqs', True)
        self.factory.runJob('unique.seqs', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def precluster_sequences(self, alignFile, nameFile):
        """Merge near-identical sequences; returns [fastaFile, nameFile]."""
        if alignFile.endswith('.align'):
            outputSuffixes = ['precluster.align', 'precluster.names']
        elif alignFile.endswith('.fasta'):
            outputSuffixes = ['precluster.fasta', 'precluster.names']
        else:
            # Previously fell through to a NameError on 'outputSuffixes'
            raise ValueError('Expected a ".align" or ".fasta" file: "%s"' % alignFile)
        outputList = self.process_setup(alignFile, 'Pre.Cluster',
                                        suffixList=outputSuffixes)
        if self.output_files_exist(output_list=outputList):
            return outputList
        mothurArgs = {
            'fasta': alignFile,
            'name': nameFile,
            'diffs': self.precluster_diffs
        }
        logFile = self.getProcessLogFile('pre.cluster', True)
        self.factory.runJob('pre.cluster', mothurArgs, logFile)
        self.process_cleanup(output_list=outputList)
        return outputList

    def calculate_distance_matrix(self, alignFile):
        """Compute a lower-triangle Phylip distance matrix for the alignment."""
        outputFile = self.process_setup(alignFile, 'Dist.Seqs',
                                        suffix='phylip.dist')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'fasta': alignFile,
            'calc': 'onegap',
            'countends': 'F',
            'output': 'lt'
        }
        logFile = self.getProcessLogFile('dist.seqs', True)
        self.factory.runJob('dist.seqs', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cluster_sequences(self, distanceMatrix, nameFile):
        """Cluster sequences with the configured linkage method."""
        if self.clusteringMethod == 'nearest':
            outputSuffix = 'nn.list'
        elif self.clusteringMethod == 'average':
            outputSuffix = 'an.list'
        elif self.clusteringMethod == 'furthest':
            outputSuffix = 'fn.list'
        else:
            # Previously fell through to a NameError on 'outputSuffix'
            raise ValueError('Unrecognized clustering method: "%s"'
                             % self.clusteringMethod)
        outputFile = self.process_setup(distanceMatrix, 'Cluster',
                                        suffix=outputSuffix)
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        mothurArgs = {
            'phylip': distanceMatrix,
            'name': nameFile,
            'method': self.clusteringMethod
        }
        logFile = self.getProcessLogFile('cluster', True)
        self.factory.runJob('cluster', mothurArgs, logFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def separate_cluster_sequences(self, listFile, sequenceFile):
        """Split a Mothur list file into per-cluster sequence files."""
        outputFile = self.process_setup(listFile, 'ClusterSeparator',
                                        suffix='list.clusters')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        separator = ClusterSeparator(listFile, sequenceFile, outputFile,
                                     self.distance, self.min_cluster_size)
        separator()
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def generate_consensus_sequences(self, clusterListFile):
        """Run the consensus tool on each cluster; write (reference, consensus) pairs."""
        outputFile = self.process_setup(clusterListFile, 'ClusterResequencer',
                                        suffix='consensus')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        consensusFiles = []
        with open(clusterListFile) as handle:
            for line in handle:
                sequenceFile, referenceFile, count = line.strip().split()
                if referenceFile.endswith('None'):
                    # Clusters without a reference get no consensus
                    consensusFiles.append((sequenceFile, 'None'))
                else:
                    consensus = self.consensusTool(sequenceFile, referenceFile)
                    consensusFiles.append((referenceFile, consensus))
        with open(outputFile, 'w') as handle:
            for filenamePair in consensusFiles:
                handle.write('%s\t%s\n' % filenamePair)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_uchime_output(self, screenedFile):
        """Delete UCHIME's '*_formatted' temporaries and write a sentinel file."""
        outputFile = self.process_setup(screenedFile, 'UchimeCleanup',
                                        suffix='uchime.cleanup')
        uchimePath = os.getcwd()
        for filename in os.listdir(uchimePath):
            if filename.endswith('_formatted'):
                file_path = os.path.join(uchimePath, filename)
                os.remove(file_path)
        self.write_dummy_file(outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def cleanup_consensus_folder(self, consensusFile):
        """Delete intermediate resequencer files and write a sentinel file."""
        outputFile = self.process_setup(consensusFile, 'ConsensusCleanup',
                                        suffix='consensus.cleanup')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        reseqPath = os.path.join(os.getcwd(), 'reseq')
        for filename in os.listdir(reseqPath):
            filePath = os.path.join(reseqPath, filename)
            # Remove alignment inputs/outputs left behind by the resequencer
            if filePath.endswith(('_input.fa', '_input.fa.aln',
                                  '_input.fa.aln_unsorted')):
                os.remove(filePath)
        self.write_dummy_file(outputFile)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def select_final_sequences(self, consensusFile):
        """Pick the consensus (when unambiguous) or reference read per cluster."""
        outputFile = self.process_setup(consensusFile, 'SequenceSelector',
                                        suffix='consensus.selected')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        selectedFiles = []
        with open(consensusFile) as handle:
            for line in handle:
                # Renamed locals: the original rebound the 'consensusFile'
                # parameter inside this loop
                referencePath, consensusPath = line.strip().split()
                if consensusPath.endswith('None'):
                    pass
                elif fasta_count(consensusPath) == 1:
                    selectedFiles.append(consensusPath)
                else:
                    selectedFiles.append(referencePath)
        with open(outputFile, 'w') as handle:
            for filename in selectedFiles:
                handle.write(filename + '\n')
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def output_final_sequences(self, finalSequenceList):
        """Concatenate all selected sequence files into one FASTA output."""
        outputFile = self.process_setup(finalSequenceList, 'SequenceWriter',
                                        suffix='fasta')
        if self.output_files_exist(output_file=outputFile):
            return outputFile
        with FastaWriter(outputFile) as writer:
            with open(finalSequenceList) as handle:
                for line in handle:
                    sequenceFile = line.strip()
                    copy_fasta_sequences(sequenceFile, writer)
        self.process_cleanup(output_file=outputFile)
        return outputFile

    def run(self):
        """Execute the full pipeline on the input data file."""
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile
        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq(fastqFile)
            fastaFile, qualFile = self.separate_fastq(filteredFastq)
        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences(fastaFile)
        summaryFile = self.summarize_sequences(alignedFile)
        maxStart, minEnd = self.parse_summary_file(summaryFile)
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)
        #filteredFile = self.filter_sequences( screenedFile, trump='.' )
        filteredFile = self.filter_sequences(screenedFile)
        # If masking is enabled, create an aligned FASTQ, mask the
        # low-quality bases and remove over-masked reads
        if self.enable_masking:
            alignedFastqFile = self.add_quality_to_alignment(fastqFile,
                                                             filteredFile)
            maskedFastq = self.mask_fastq_sequences(alignedFastqFile)
            maskedFasta = self.convert_fastq_to_fasta(maskedFastq)
            screenedFile = self.screen_sequences(maskedFasta,
                                                 min_length=self.min_length)
            # NOTE(review): this branch never assigns 'nameFile', so the
            # clustering step below would raise NameError when masking and
            # clustering are both enabled -- confirm intended behavior.
        # Otherwise if masking is disabled, we'll use unique-ify and
        # pre-cluster our sequences
        else:
            uniqueFile, nameFile = self.unique_sequences(filteredFile)
            screenedFile, nameFile = self.precluster_sequences(uniqueFile,
                                                               nameFile)
        # Identify and remove chimeric reads
        #chimera_ids = self.find_chimeras_denovo( screenedFile, nameFile )
        chimera_ids = self.find_chimeras(screenedFile)
        self.cleanup_uchime_output(screenedFile)
        if file_exists(chimera_ids):
            fileForClustering = self.remove_sequences(screenedFile, chimera_ids)
        else:
            fileForClustering = screenedFile
        # If enabled, calculate sequence distances and cluster
        if self.enable_clustering:
            distanceMatrix = self.calculate_distance_matrix(fileForClustering)
            listFile = self.cluster_sequences(distanceMatrix, nameFile)
            # If enabled, generate a consensus for each cluster from above
            if self.enable_consensus:
                clusterListFile = self.separate_cluster_sequences(listFile,
                                                                  fastqFile)
                consensusFile = self.generate_consensus_sequences(clusterListFile)
                self.cleanup_consensus_folder(consensusFile)
                selectedFile = self.select_final_sequences(consensusFile)
                finalFile = self.output_final_sequences(selectedFile)