Esempio n. 1
0
 def checkOutputFile( self, outputFile ):
     """
     Verify that an expected output file exists on disk.

     Logs a confirmation when the file is present; otherwise logs the
     failure and raises an IOError so the caller can abort the pipeline.
     """
     if file_exists( outputFile ):
         log.info('Expected output "%s" found' % outputFile)
     else:
         msg = 'Expected output "%s" not found!' % outputFile
         # This condition is fatal (we raise immediately below), so it
         # belongs at ERROR level rather than the original INFO.
         log.error( msg )
         raise IOError( msg )
Esempio n. 2
0
 def output_files_exist(self, output_file=None, output_list=None):
     """
     Report whether the expected output already exists.

     Checks either a single file (``output_file``) or every file in
     ``output_list``, logging the outcome either way.

     Returns True when the output is already present (process can be
     skipped) and False otherwise.
     """
     if output_file:
         if file_exists(output_file):
             log.info('Output files detected, skipping process...\n')
             return True
         else:
             log.info('Output files not found, running process...')
             return False
     elif output_list:
         if all_files_exist(output_list):
             log.info('Output files detected, skipping process...\n')
             return True
         else:
             log.info('Output files not found, running process...')
             return False
     # Neither argument was supplied: previously this fell through and
     # returned None implicitly.  Return False explicitly so callers
     # always receive a boolean (None and False are equally falsy).
     return False
Esempio n. 3
0
 def output_files_exist( self, outputFile=None, outputList=None ):
     """
     Return True if the expected output already exists, False if not.

     Accepts either a single file (``outputFile``) or a list of files
     (``outputList``); logs whether the process can be skipped.
     """
     if outputFile:
         found = file_exists( outputFile )
     elif outputList:
         found = all_files_exist( outputList )
     else:
         # No arguments given -- matches the original implicit fallthrough.
         return None

     if found:
         log.info('Output files detected, skipping process...\n')
         return True
     log.info('Output files not found, running process...')
     return False
Esempio n. 4
0
    def run(self):
        """
        Execute the single-pass clustering pipeline: ingest reads, filter
        and align them, strip partial and chimeric sequences, cluster,
        and emit final consensus sequences.
        """
        # Normalize the input into FASTQ and/or FASTA form
        if self.data_type == 'bash5':
            fastq_file = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastq_file = self.sequenceFile
        elif self.data_type == 'fasta':
            fastq_file = None
            fasta_file = self.sequenceFile

        # With a Fastq in hand, drop low-quality reads and split to FASTA
        if fastq_file:
            filtered_fastq = self.filter_fastq(fastq_file)
            fasta_file, qual_file = self.separate_fastq(filtered_fastq)

        # Align, then trim partial reads using the alignment summary bounds
        aligned_file = self.align_sequences(fasta_file)
        summary_file = self.summarize_sequences(aligned_file)
        max_start, min_end = self.parse_summary_file(summary_file)
        screened_file = self.screen_sequences(aligned_file,
                                              start=max_start,
                                              end=min_end)

        # Strip chimeric reads, if any were detected
        chimera_ids = self.find_chimeras(screened_file)
        self.cleanup_uchime_output(screened_file)
        if file_exists(chimera_ids):
            chimera_free_file = self.remove_sequences(screened_file, chimera_ids)
        else:
            chimera_free_file = screened_file

        # Drop un-used alignment columns to speed up re-alignment/clustering
        filtered_file = self.filter_sequences(chimera_free_file, trump='.')

        unique_file, name_file = self.unique_sequences(filtered_file)
        preclustered_file, name_file = self.precluster_sequences(
            unique_file, name_file)
        file_for_clustering = preclustered_file

        distance_matrix = self.calculate_distance_matrix(file_for_clustering)
        list_file = self.cluster_sequences(distance_matrix, name_file)

        # Build, polish, and write out the per-cluster consensus sequences
        cluster_list_file = self.separate_cluster_sequences(list_file, fastq_file)
        consensus_file = self.generate_consensus_sequences(cluster_list_file)
        self.cleanup_consensus_folder(consensus_file)
        selected_file = self.select_final_sequences(consensus_file)
        final_file = self.output_final_sequences(selected_file)
    def run(self):
        """
        Run the single-pass clustering pipeline end-to-end: read intake,
        quality filtering, alignment, chimera removal, clustering, and
        consensus generation.
        """
        # Obtain a FASTQ (or FASTA) representation of the input data
        if self.data_type == 'bash5':
            fastq = self.extract_raw_ccs( self.sequenceFile )
        elif self.data_type == 'fastq':
            fastq = self.sequenceFile
        elif self.data_type == 'fasta':
            fastq = None
            fasta = self.sequenceFile

        # Filter low-quality FASTQ reads and convert the survivors to FASTA
        if fastq:
            filtered_fastq = self.filter_fastq( fastq )
            fasta, quals = self.separate_fastq( filtered_fastq )

        # Align the sequences, then screen out partial reads
        aligned = self.align_sequences( fasta )
        summary = self.summarize_sequences( aligned )
        max_start, min_end = self.parse_summary_file( summary )
        screened = self.screen_sequences(aligned,
                                         start=max_start,
                                         end=min_end)

        # Detect chimeras and remove them when any were found
        chimera_ids = self.find_chimeras( screened )
        self.cleanup_uchime_output( screened )
        if file_exists( chimera_ids ):
            dechimered = self.remove_sequences( screened, chimera_ids )
        else:
            dechimered = screened

        # Trim un-used columns to accelerate re-alignment and clustering
        filtered = self.filter_sequences( dechimered, trump='.' )

        uniques, names = self.unique_sequences( filtered )
        preclustered, names = self.precluster_sequences( uniques, names )
        to_cluster = preclustered

        matrix = self.calculate_distance_matrix( to_cluster )
        clusters = self.cluster_sequences( matrix, names )

        # Derive, tidy and emit the final consensus sequences
        cluster_lists = self.separate_cluster_sequences( clusters, fastq )
        consensus = self.generate_consensus_sequences( cluster_lists )
        self.cleanup_consensus_folder( consensus )
        selected = self.select_final_sequences( consensus )
        final = self.output_final_sequences( selected )
Esempio n. 6
0
    def run(self):
        print self.step_list
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq(fastqFile)
            fastaFile, qualFile = self.separate_fastq(filteredFastq)

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences(fastaFile)
        summaryFile = self.summarize_sequences(alignedFile)
        maxStart, minEnd = self.parse_summary_file(summaryFile)
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)

        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras(screenedFile)
        self.cleanup_uchime_output(screenedFile)
        if file_exists(chimera_ids):
            no_chimera_file = self.remove_sequences(screenedFile, chimera_ids)
        else:
            no_chimera_file = screenedFile

        filteredFile = self.filter_sequences(no_chimera_file, trump='.')
        uniqueFile, nameFile = self.unique_sequences(filteredFile)
        preclusteredFile, nameFile = self.precluster_sequences(
            uniqueFile, nameFile)
        fileToCluster = preclusteredFile

        clusterFileRoot = '.'.join(fileToCluster.split('.')[:-1])
        for i, step in enumerate(self.step_list):
            log.info("Beginning iteration #%s - %s" % (i + 1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile(fileToCluster, iterationInput)
            distanceMatrix = self.calculate_distance_matrix(iterationInput)
            listFile = self.cluster_sequences(distanceMatrix, nameFile)

            # Include all clusters during intermediate stages, others use min_cluster_size
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, self.min_cluster_size)
            else:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, 1)

            # Generate the consensus sequences for the next round
            if step == self.distance and self.enable_consensus:
                # If consensus is enabled and this is the last round, generate a GCON consensus
                log.info(
                    "Generating consensus sequences for iteration #%s - %s" %
                    (i + 1, step))
                consensusFile = self.generate_consensus_sequences(
                    clusterListFile, step)
                self.cleanup_consensus_folder(consensusFile, step)
                selectedFile = self.select_sequences(consensusFile)
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile)
            else:
                # Otherwise generate reference sequences by picking high-QV reads
                log.info(
                    "Selecting reference sequences for iteration #%s - %s" %
                    (i + 1, step))
                consensusFile = self.generate_ref_sequences(
                    clusterListFile, step)
                selectedFile = self.select_ref_sequences(consensusFile)
                selectedSequenceFile = self.output_selected_sequences(
                    selectedFile)

            # Whichever method was used, we need to update the nameFile accordingly
            nameFile = self.write_name_file(consensusFile, selectedFile,
                                            selectedSequenceFile)

            # If this isn't the last round, we must re-align and re-filter the new consensus sequences
            if step != self.distance:
                log.info(
                    "Iterative clustering not finished, preparing sequences for next iteration"
                )
                alignedFile = self.align_sequences(selectedSequenceFile)
                fileToCluster = self.filter_sequences(alignedFile, trump='.')
            log.info("Finished iteration #%s - %s" % (i + 1, step))

        try:
            os.symlink(selectedSequenceFile, "Final_Output.fasta")
        except:
            pass

        try:
            os.symlink(nameFile, "Final_Output.names")
        except:
            pass
    def run(self):
        print self.step_list
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs( self.sequenceFile )
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq( fastqFile )
            fastaFile, qualFile = self.separate_fastq( filteredFastq )

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences( fastaFile )
        summaryFile = self.summarize_sequences( alignedFile )
        maxStart, minEnd = self.parse_summary_file( summaryFile )
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)

        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras( screenedFile )
        self.cleanup_uchime_output( screenedFile )
        if file_exists( chimera_ids ):
            no_chimera_file = self.remove_sequences( screenedFile, chimera_ids )
        else:
            no_chimera_file = screenedFile

        filteredFile = self.filter_sequences( no_chimera_file, trump='.' )
        uniqueFile, nameFile = self.unique_sequences( filteredFile )
        preclusteredFile, nameFile = self.precluster_sequences( uniqueFile, nameFile )
        fileToCluster = preclusteredFile

        clusterFileRoot = '.'.join( fileToCluster.split('.')[:-1] )
        for i, step in enumerate( self.step_list ):
            log.info("Beginning iteration #%s - %s" % (i+1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile( fileToCluster, iterationInput )
            distanceMatrix = self.calculate_distance_matrix( iterationInput )
            listFile = self.cluster_sequences( distanceMatrix, nameFile )

            # Include all clusters during intermediate stages, others use min_cluster_size
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                                   step, self.min_cluster_size )
            else:
                clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                                   step, 1 )

            # Generate the consensus sequences for the next round
            if step == self.distance and self.enable_consensus:
                # If consensus is enabled and this is the last round, generate a GCON consensus
                log.info("Generating consensus sequences for iteration #%s - %s" % (i+1, step))
                consensusFile = self.generate_consensus_sequences( clusterListFile, step )
                self.cleanup_consensus_folder( consensusFile, step )
                selectedFile = self.select_sequences( consensusFile )
                selectedSequenceFile = self.output_selected_sequences( selectedFile )
            else:
                # Otherwise generate reference sequences by picking high-QV reads
                log.info("Selecting reference sequences for iteration #%s - %s" % (i+1, step))
                consensusFile = self.generate_ref_sequences( clusterListFile, step )
                selectedFile = self.select_ref_sequences( consensusFile )
                selectedSequenceFile = self.output_selected_sequences( selectedFile )

            # Whichever method was used, we need to update the nameFile accordingly
            nameFile = self.write_name_file( consensusFile, selectedFile, selectedSequenceFile )

            # If this isn't the last round, we must re-align and re-filter the new consensus sequences
            if step != self.distance:
                log.info("Iterative clustering not finished, preparing sequences for next iteration")
                alignedFile = self.align_sequences( selectedSequenceFile )
                fileToCluster = self.filter_sequences( alignedFile, trump='.' )
            log.info("Finished iteration #%s - %s" % (i+1, step))

        try:
            os.symlink( selectedSequenceFile, "Final_Output.fasta")
        except:
            pass

        try:
            os.symlink( nameFile, "Final_Output.names")
        except:
            pass
    def run(self):
        print self.step_list
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs( self.sequenceFile )
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq( fastqFile )
            fastaFile, qualFile = self.separate_fastq( filteredFastq )

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences( fastaFile )
        summaryFile = self.summarize_sequences( alignedFile )
        maxStart, minEnd = self.parse_summary_file( summaryFile )
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)

        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras( screenedFile )
        self.cleanup_uchime_output( screenedFile )
        if file_exists( chimera_ids ):
            no_chimera_file = self.remove_sequences( screenedFile, chimera_ids )
        else:
            no_chimera_file = screenedFile

        filteredFile = self.filter_sequences( no_chimera_file, trump='.' )
        uniqueFile, nameFile = self.unique_sequences( filteredFile )
        preclusteredFile, nameFile = self.precluster_sequences( uniqueFile, nameFile )
        fileToCluster = preclusteredFile

        clusterFileRoot = '.'.join( fileToCluster.split('.')[:-1] )
        for i, step in enumerate([0.01, 0.03]):
            log.info("Beginning iteration #%s - %s" % (i+1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile( fileToCluster, iterationInput )
            distanceMatrix = self.calculate_distance_matrix( iterationInput )
            listFile = self.cluster_sequences( distanceMatrix, nameFile )

            # Include all clusters during intermediate stages, others use min_cluster_size
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                                   step, self.min_cluster_size )
            else:
                clusterListFile = self.separate_cluster_sequences( listFile, fastqFile,
                                                                   step, 1 )

            # Generate and combine cluster sequences from the cluster-specific files
            consensusFile = self.generate_consensus_sequences( clusterListFile, step )
            self.cleanup_consensus_folder( consensusFile, step )
            selectedFile = self.select_sequences( consensusFile )
            selectedSequenceFile = self.output_selected_sequences( selectedFile )
            log.info("Finished iteration #%s - %s" % (i+1, step))

            # If this isn't the last iteration, prepare the selected sequences for the next one:
            if step == self.distance:
                 log.info("Iterative clustering finished")
            else:
                 log.info("Iterative clustering not finished, preparing sequences for next iteration")
                 alignedFile = self.align_sequences( selectedSequenceFile )
                 fileToCluster = self.filter_sequences( alignedFile, trump='.' )
                 nameFile = self.write_name_file( consensusFile, selectedFile, selectedSequenceFile )

        try:
            os.symlink( selectedSequenceFile, "Final_Output.fasta")
        except:
            pass
        self.write_name_file( consensusFile, selectedFile )
Esempio n. 9
0
    def run(self):
        print self.step_list
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq(fastqFile)
            fastaFile, qualFile = self.separate_fastq(filteredFastq)

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences(fastaFile)
        summaryFile = self.summarize_sequences(alignedFile)
        maxStart, minEnd = self.parse_summary_file(summaryFile)
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)

        # Identify and remove chimeric reads
        chimera_ids = self.find_chimeras(screenedFile)
        self.cleanup_uchime_output(screenedFile)
        if file_exists(chimera_ids):
            no_chimera_file = self.remove_sequences(screenedFile, chimera_ids)
        else:
            no_chimera_file = screenedFile

        filteredFile = self.filter_sequences(no_chimera_file, trump='.')
        uniqueFile, nameFile = self.unique_sequences(filteredFile)
        preclusteredFile, nameFile = self.precluster_sequences(
            uniqueFile, nameFile)
        fileToCluster = preclusteredFile

        clusterFileRoot = '.'.join(fileToCluster.split('.')[:-1])
        for i, step in enumerate([0.01, 0.03]):
            log.info("Beginning iteration #%s - %s" % (i + 1, step))
            iterationInput = clusterFileRoot + '.%s.fasta' % step
            shutil.copyfile(fileToCluster, iterationInput)
            distanceMatrix = self.calculate_distance_matrix(iterationInput)
            listFile = self.cluster_sequences(distanceMatrix, nameFile)

            # Include all clusters during intermediate stages, others use min_cluster_size
            if step == self.distance:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, self.min_cluster_size)
            else:
                clusterListFile = self.separate_cluster_sequences(
                    listFile, fastqFile, step, 1)

            # Generate and combine cluster sequences from the cluster-specific files
            consensusFile = self.generate_consensus_sequences(
                clusterListFile, step)
            self.cleanup_consensus_folder(consensusFile, step)
            selectedFile = self.select_sequences(consensusFile)
            selectedSequenceFile = self.output_selected_sequences(selectedFile)
            log.info("Finished iteration #%s - %s" % (i + 1, step))

            # If this isn't the last iteration, prepare the selected sequences for the next one:
            if step == self.distance:
                log.info("Iterative clustering finished")
            else:
                log.info(
                    "Iterative clustering not finished, preparing sequences for next iteration"
                )
                alignedFile = self.align_sequences(selectedSequenceFile)
                fileToCluster = self.filter_sequences(alignedFile, trump='.')
                nameFile = self.write_name_file(consensusFile, selectedFile,
                                                selectedSequenceFile)

        try:
            os.symlink(selectedSequenceFile, "Final_Output.fasta")
        except:
            pass
        self.write_name_file(consensusFile, selectedFile)
    def run(self):
        """
        Execute the clustering pipeline with optional masking, clustering
        and consensus stages controlled by instance flags.

        Pipeline: ingest reads, quality-filter, align, screen partials,
        then either mask low-quality bases (enable_masking) or
        unique/precluster, remove chimeras, and optionally cluster
        (enable_clustering) and build consensus output (enable_consensus).
        """
        # Normalize the input: extract CCS reads from bas.h5, or use the
        # supplied FASTQ/FASTA directly.  No fallback branch exists, so an
        # unexpected data_type leaves fastqFile unbound (NameError below).
        if self.data_type == 'bash5':
            fastqFile = self.extract_raw_ccs(self.sequenceFile)
        elif self.data_type == 'fastq':
            fastqFile = self.sequenceFile
        elif self.data_type == 'fasta':
            fastqFile = None
            fastaFile = self.sequenceFile

        # If we have a Fastq, filter low-quality reads and convert to FASTA
        if fastqFile:
            filteredFastq = self.filter_fastq(fastqFile)
            fastaFile, qualFile = self.separate_fastq(filteredFastq)

        # Align the Fasta sequences and remove partial reads
        alignedFile = self.align_sequences(fastaFile)
        summaryFile = self.summarize_sequences(alignedFile)
        maxStart, minEnd = self.parse_summary_file(summaryFile)
        screenedFile = self.screen_sequences(alignedFile,
                                             start=maxStart,
                                             end=minEnd)
        #filteredFile = self.filter_sequences( screenedFile, trump='.' )
        # Unlike other run() variants, filtering is done without trump='.'
        filteredFile = self.filter_sequences(screenedFile)

        # If masking is enabled, create an aligned FASTQ, mask the
        # low-quality bases and remove over-masked reads
        if self.enable_masking:
            alignedFastqFile = self.add_quality_to_alignment(
                fastqFile, filteredFile)
            maskedFastq = self.mask_fastq_sequences(alignedFastqFile)
            maskedFasta = self.convert_fastq_to_fasta(maskedFastq)
            screenedFile = self.screen_sequences(maskedFasta,
                                                 min_length=self.min_length)
        # Otherwise if masking is disabled, we'll use unique-ify and
        #    pre-cluster our sequences
        else:
            # NOTE(review): nameFile is only bound on this branch; with
            # enable_masking and enable_clustering both set, the
            # cluster_sequences call below raises NameError -- confirm
            # that flag combination is disallowed upstream.
            uniqueFile, nameFile = self.unique_sequences(filteredFile)
            screenedFile, nameFile = self.precluster_sequences(
                uniqueFile, nameFile)

        # Identify and remove chimeric reads
        #chimera_ids = self.find_chimeras_denovo( screenedFile, nameFile )
        chimera_ids = self.find_chimeras(screenedFile)

        self.cleanup_uchime_output(screenedFile)
        if file_exists(chimera_ids):
            fileForClustering = self.remove_sequences(screenedFile,
                                                      chimera_ids)
        else:
            fileForClustering = screenedFile

        # If enabled, calculate sequence distances and cluster
        if self.enable_clustering:
            distanceMatrix = self.calculate_distance_matrix(fileForClustering)
            listFile = self.cluster_sequences(distanceMatrix, nameFile)

        # If enabled, generate a consensus for each cluster from above
        # NOTE(review): this branch reads listFile, which is only bound
        # when enable_clustering is set -- enable_consensus without
        # enable_clustering would raise NameError.
        if self.enable_consensus:
            clusterListFile = self.separate_cluster_sequences(
                listFile, fastqFile)
            consensusFile = self.generate_consensus_sequences(clusterListFile)
            self.cleanup_consensus_folder(consensusFile)
            selectedFile = self.select_final_sequences(consensusFile)
            finalFile = self.output_final_sequences(selectedFile)