def objectprep(self): """Create objects to store data and metadata for each sample. Also, perform necessary file manipulations""" # Move the files to subfolders and create objects self.runmetadata = createobject.ObjectCreation(self) if self.runmetadata.extension == 'fastq': # To streamline the CLARK process, decompress and combine .gz and paired end files as required printtime( 'Decompressing and combining .fastq files for CLARK analysis', self.start) fileprep.Fileprep(self) else: printtime('Using .fasta files for CLARK analysis', self.start) for sample in self.runmetadata.samples: sample.general.combined = sample.general.fastqfiles[0]
def objectprep(self):
    """
    Create sample objects from the supplied data path, or, when no data path is
    provided, re-point previously populated sample metadata at the expected
    attribute names. Metadata is printed to file and the CSV results are parsed
    either way.
    """
    from spadespipeline import createobject
    if not self.datapath:
        # Samples already exist from an earlier stage: map the previously
        # calculated outputs onto the attributes this stage expects
        for metadata_sample in self.runmetadata.samples:
            metadata_sample.general.fastqfiles = [metadata_sample.general.combined]
            metadata_sample.general.abundancefile = metadata_sample.general.abundance
            metadata_sample.general.assignmentfile = metadata_sample.general.classification
    else:
        # A data path was supplied: locate the data files and build the objects
        self.runmetadata = createobject.ObjectCreation(self)
    # Write the metadata to file
    metadataprinter.MetadataPrinter(self)
    # Load the results in the csv files into dictionaries
    self.taxids()
def quality(self):
    """
    Creates sequence objects, organises the supplied .fasta files into per-sample
    subfolders, and runs the quality assessment / typing stages of the pipeline:
    CLARK (when enough RAM is available), prodigal gene prediction, rMLST typing,
    and quast assembly metrics. Metadata is printed to file after each stage.
    """
    from spadespipeline import mMLST
    from spadespipeline import quaster
    from spadespipeline import prodigal
    from spadespipeline import createobject
    import shutil
    # Create the objects
    self.runmetadata = createobject.ObjectCreation(self)
    # Determine the amount of physical memory in the system
    from psutil import virtual_memory
    mem = virtual_memory()
    # If the total amount of memory is greater than 100GB (this could probably be lowered), run CLARK
    if mem.total >= 100000000000:
        # Run CLARK typing on the .fastq and .fasta files
        from metagenomefilter import automateCLARK
        automateCLARK.PipelineInit(self, 'typing')
    else:
        printtime('Not enough RAM to run CLARK!', self.start)
    # Create a list of the files to process. os.path.join is used rather than string
    # concatenation so the pattern is correct whether or not self.path has a trailing separator
    fasta_files = glob.glob(os.path.join(self.path, "*.fasta"))
    # For each of the fasta files, create a folder based on the name of the file - 2014-SEQ-0276.fasta will
    # be placed in a folder named 2014-SEQ-0276
    for fasta in fasta_files:
        # Set the name of the folder (file path with the extension stripped)
        fasta_dir = os.path.splitext(fasta)[0]
        # Create the folder
        make_path(fasta_dir)
        # Determine the name of the fasta file (with extension, without directory)
        fastaname = os.path.basename(fasta)
        # Copy the file into the appropriate folder
        shutil.copy(fasta, os.path.join(fasta_dir, fastaname))
    # Perform quality assessments. After each method completes, print the metadata to file
    # Run gene predictions
    prodigal.Prodigal(self)
    metadataprinter.MetadataPrinter(self)
    # Run rMLST
    mMLST.PipelineInit(self, 'rmlst')
    metadataprinter.MetadataPrinter(self)
    # Run quast assembly metrics; quast reads .filteredfile, so point it at the best assembly
    for sample in self.runmetadata.samples:
        sample.general.filteredfile = sample.general.bestassemblyfile
    quaster.Quast(self)
    # Calculate the depth of coverage as well as other quality metrics using Qualimap
    metadataprinter.MetadataPrinter(self)
def populate(self):
    """
    Create sample objects when not running inside the full pipeline, then attach
    and populate the analysis-type attribute (alleles, profile, directories) on
    every sample.
    """
    from spadespipeline import createobject
    # Move the files to subfolders and create objects, unless the pipeline has
    # already supplied populated metadata
    if not self.pipeline:
        self.metadata = createobject.ObjectCreation(self)
    # Create and populate the analysis-specific attribute on each sample
    for sample in self.metadata.samples:
        setattr(sample, self.analysistype, GenObject())
        sample[self.analysistype].profile = self.profile
        sample[self.analysistype].alleledir = self.coregenelocation
        sample[self.analysistype].alleles = self.genes
        # Allele names: the file name portion of each gene path, up to the first '.'
        sample[self.analysistype].allelenames = [
            os.path.split(gene)[1].split('.')[0] for gene in self.genes
        ]
        sample[self.analysistype].reportdir = os.path.join(
            sample.general.outputdirectory, self.analysistype)
def annotatethreads(self):
    """
    Perform multi-threaded prokka annotations of each strain: create sample
    objects, fix FASTA headers, start one worker thread per CPU, then queue each
    sample's dockerised prokka command for annotation. Finishes by building the
    core genome.
    """
    import spadespipeline.createobject as createobject
    # Move the files to subfolders and create objects
    self.runmetadata = createobject.ObjectCreation(self)
    # Fix headers
    self.headerthreads()
    printtime('Performing prokka analyses', self.start)
    # Create and start threads
    for _ in range(self.cpus):
        # Send the threads to the appropriate destination function
        thread = Thread(target=self.annotate, args=())
        # Daemonise the worker so it cannot block interpreter exit.
        # Thread.setDaemon() is deprecated (since Python 3.10); set the attribute instead
        thread.daemon = True
        # Start the threading
        thread.start()
    for sample in self.runmetadata.samples:
        # Create the prokka attribute in the metadata object
        setattr(sample, 'prokka', GenObject())
        # docker run -v /path/to/sequences:/path/to/sequences coreGenome
        # prokka 2014-SEQ-0275.fasta --force --genus Escherichia --species coli --usegenus --addgenes
        # --prefix 2014-SEQ-0275 --locustag EC0275 --outputdir /path/to/sequences/2014-SEQ-0275/prokka
        sample.prokka.outputdir = os.path.join(
            sample.general.outputdirectory, 'prokka')
        # TODO Incorporate MASH/rMLST/user inputted genus, species results in the system call
        # Create the system call
        sample.prokka.command = 'docker run -v {}:{} {} ' \
            'prokka {} ' \
            '--force ' \
            '--genus {} ' \
            '--species {} ' \
            '--usegenus ' \
            '--addgenes ' \
            '--prefix {} ' \
            '--locustag {} ' \
            '--outdir {}' \
            .format(self.sequencepath,
                    self.sequencepath,
                    self.dockerimage,
                    sample.general.fixedheaders,
                    self.genus,
                    self.species,
                    sample.name,
                    sample.name,
                    sample.prokka.outputdir)
        # sample.name.split('-')[-1]
        self.queue.put(sample)
    self.queue.join()
    # Create the core genome
    self.codingthreads()