def query_prep(self):
    """
    Create metadata objects for each sample
    """
    logging.info('Preparing query files')
    # Find all the sequence files in the path
    fastas = sorted(glob(os.path.join(self.query_path, '*.fasta')))
    for fasta in fastas:
        name = os.path.splitext(os.path.basename(fasta))[0]
        if name != 'combinedtargets':
            # Create a metadata object for each sample
            metadata = MetadataObject()
            metadata.samples = list()
            # Populate the metadata object with the required attributes
            metadata.name = name
            metadata.general = GenObject()
            metadata.commands = GenObject()
            metadata.alleles = GenObject()
            metadata.alleles.outputdirectory = os.path.join(self.query_path, metadata.name)
            # Set the name of the BLAST output file
            metadata.alleles.blast_report = os.path.join(
                metadata.alleles.outputdirectory,
                '{seq_id}.tsv'.format(seq_id=metadata.name))
            try:
                os.remove(metadata.alleles.blast_report)
            except FileNotFoundError:
                pass
            make_path(metadata.alleles.outputdirectory)
            metadata.general.bestassemblyfile = relative_symlink(
                src_file=fasta,
                output_dir=metadata.alleles.outputdirectory,
                export_output=True)
            metadata.samples.append(metadata)
            self.runmetadata.samples.append(metadata)
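# Illustrative sketch (not part of the pipeline): the per-sample preparation
# pattern used in query_prep() above, with types.SimpleNamespace standing in
# for MetadataObject/GenObject and os.makedirs standing in for make_path().
# The query path and file layout are assumptions for demonstration only.
import os
from glob import glob
from types import SimpleNamespace


def prepare_query_samples(query_path):
    """Return one lightweight sample object per .fasta file in query_path."""
    samples = []
    for fasta in sorted(glob(os.path.join(query_path, '*.fasta'))):
        name = os.path.splitext(os.path.basename(fasta))[0]
        # Skip the combined targets file
        if name == 'combinedtargets':
            continue
        sample = SimpleNamespace(name=name)
        outputdirectory = os.path.join(query_path, name)
        os.makedirs(outputdirectory, exist_ok=True)
        sample.blast_report = os.path.join(outputdirectory, '{}.tsv'.format(name))
        # Remove a stale report so the BLAST analyses are re-run from scratch
        if os.path.isfile(sample.blast_report):
            os.remove(sample.blast_report)
        samples.append(sample)
    return samples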
def helper(self):
    """
    Helper function for file creation (if desired), manipulation, quality assessment,
    and trimming, as well as the assembly
    """
    # Simple assembly without requiring accessory files (SampleSheet.csv, etc.)
    if self.basicassembly:
        self.runmetadata = Basic(inputobject=self)
    else:
        # Populate the runmetadata object by parsing the SampleSheet.csv,
        # GenerateFASTQRunStatistics.xml, and RunInfo.xml files
        self.runinfo = os.path.join(self.path, 'RunInfo.xml')
        self.runmetadata = runMetadata.Metadata(passed=self)
        # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
        self.runmetadata.parseruninfo()
        # Extract PhiX mapping information from the run
        phi = phix.PhiX(inputobject=self)
        phi.main()
        # Populate the lack of bclcall and nohup call into the metadata sheet
        for sample in self.runmetadata.samples:
            sample.commands = GenObject()
            sample.commands.nohupcall = 'NA'
            sample.commands.bclcall = 'NA'
        # Move/link the FASTQ files to strain-specific working directories
        fastqmover.FastqMover(inputobject=self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(inputobject=self)
def typing_reports(self):
    """
    Create empty attributes for analyses that were not performed, so that the metadata
    report can be created
    :return:
    """
    for sample in self.runmetadata.samples:
        sample.confindr = GenObject()
        sample.mapping = GenObject()
        sample.quast = GenObject()
        sample.qualimap = GenObject()
        sample.verotoxin = GenObject()
        if not GenObject.isattr(sample, 'sistr'):
            sample.sistr = GenObject()
        sample.mapping.MeanInsertSize = 0
        sample.mapping.MeanCoveragedata = 0
        sample.genesippr.report_output = set()
        sample.genesippr.results = dict()
        sample.verotoxin.verotoxin_subtypes_set = sample.legacy_vtyper.toxinprofile
        try:
            for gene, percentid in sample.genesippr.blastresults.items():
                if percentid > 95:
                    sample.genesippr.report_output.add(gene.split('_')[0])
        except AttributeError:
            sample.genesippr.report_output = list()
        sample.genesippr.report_output = sorted(list(sample.genesippr.report_output))
    # Create a report
    run_report = reporter.Reporter(self)
    # Create the standard and legacy reports
    run_report.metadata_reporter()
    run_report.legacy_reporter()
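# Hedged toy example of the GeneSippr filtering step in typing_reports(): keep
# genes whose BLAST percent identity exceeds 95, collapse allele suffixes (the
# text after '_'), and report a sorted, de-duplicated list. The gene names and
# percent identities below are invented for illustration.
blastresults = {'stx1_1': 99.1, 'stx2_3': 87.4, 'eae_2': 96.0}
report_output = sorted({gene.split('_')[0]
                        for gene, percentid in blastresults.items()
                        if percentid > 95})
# report_output == ['eae', 'stx1']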
def run_blast(self):
    """
    BLAST the alleles against the genomes
    """
    logging.info('BLASTing alleles against sequence files')
    for query_file in self.query_files:
        # Create a metadata object to store all the sample-specific information
        sample = MetadataObject()
        sample.alleles = GenObject()
        local_db = os.path.splitext(query_file)[0]
        sample.name = os.path.basename(local_db)
        # Set the name of the BLAST output file
        sample.alleles.blast_report = os.path.join(
            self.reportpath, '{seq_id}.tsv'.format(seq_id=sample.name))
        # Update the list of metadata objects with this sample
        self.runmetadata.samples.append(sample)
        self.blast_reports.append(sample.alleles.blast_report)
        # Create the appropriate BLAST command: BLASTn for nucleotide queries; tBLASTn for
        # amino acid queries against translated nucleotide databases
        if self.amino_acid:
            blast = NcbitblastnCommandline(db=local_db,
                                           query=self.target_file,
                                           num_alignments=100000000,
                                           evalue=0.001,
                                           num_threads=self.cpus,
                                           task='tblastn',
                                           outfmt=self.outfmt,
                                           word_size=3,
                                           out=sample.alleles.blast_report)
        else:
            blast = NcbiblastnCommandline(db=local_db,
                                          query=self.target_file,
                                          num_alignments=100000000,
                                          evalue=0.001,
                                          num_threads=self.cpus,
                                          task='blastn',
                                          outfmt=self.outfmt,
                                          out=sample.alleles.blast_report)
        if not os.path.isfile(sample.alleles.blast_report):
            try:
                # Run the BLAST analysis
                blast()
            except ApplicationError:
                # BLAST can have issues with genomes that have very large contigs. Retry the
                # analysis using only one thread
                os.remove(sample.alleles.blast_report)
                blast = NcbitblastnCommandline(db=local_db,
                                               query=self.target_file,
                                               num_alignments=100000000,
                                               evalue=0.001,
                                               num_threads=1,
                                               task='tblastn',
                                               outfmt=self.outfmt,
                                               word_size=3,
                                               out=sample.alleles.blast_report)
                blast()
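# Hedged sketch of a custom BLAST output format of the kind self.outfmt is
# assumed to hold above (the exact field list used by the pipeline may differ),
# plus a small parser for the resulting tab-separated report.
import csv

outfmt = ('6 qseqid sseqid positive mismatch gaps evalue bitscore slen length '
          'qstart qend qseq sstart send sseq')
fieldnames = outfmt.split()[1:]


def parse_blast_report(blast_report):
    """Yield one dictionary per hit from a format-6-style BLAST TSV report."""
    with open(blast_report) as report:
        for row in csv.DictReader(report, fieldnames=fieldnames, delimiter='\t'):
            yield row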
def test_sistr_seqsero():
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(var.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    metadata.general.trimmedcorrectedfastqfiles = [
        os.path.join(var.sequencepath, 'seqsero', '2014-SEQ-1049_seqsero.fastq.gz')
    ]
    # Set the destination folder
    outputdir = os.path.join(var.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.general.logout = os.path.join(outputdir, 'out')
    metadata.general.logerr = os.path.join(outputdir, 'err')
    metadata.run.outputdirectory = outputdir
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match in ('ERR586739', 'SAL_BA2732AA')
    method.seqsero()
    for sample in method.runmetadata.samples:
        assert sample.seqsero.predicted_serotype == '- 9:f,g,t:-'
    variable_update()
def sketch_reads(self):
    """
    Create MASH sketches of the normalised reads for each sample
    """
    # Create the threads for the analysis
    for i in range(self.cpus):
        threads = Thread(target=self.sketch, args=())
        threads.setDaemon(True)
        threads.start()
    for sample in self.metadata:
        # Create the analysis type-specific GenObject
        setattr(sample, 'paratyper', GenObject())
        sample.paratyper.sketchfilenoext = os.path.join(
            sample.general.outputdirectory, sample.name)
        sample.paratyper.sketchfile = sample.paratyper.sketchfilenoext + '.msh'
        # Define the system call to mash sketch
        sample.commands.sketch = 'mash sketch -m 2 -r {reads} -o {output_file}' \
            .format(reads=sample.general.normalised_reads,
                    output_file=sample.paratyper.sketchfilenoext)
        self.sketchqueue.put(sample)
    # Wait for all the queued samples to be processed
    self.sketchqueue.join()
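# The self.sketch() worker fed by sketch_reads() is not shown here; this is a
# hedged sketch of a conventional queue-consumer thread, with subprocess.run
# standing in for the pipeline's own subprocess and logging helpers.
import os
import subprocess
from queue import Queue


def sketch_worker(sketch_queue: Queue):
    """Consume samples from the queue and run their mash sketch commands."""
    while True:
        sample = sketch_queue.get()
        try:
            # Only run mash sketch if the sketch file doesn't already exist
            if not os.path.isfile(sample.paratyper.sketchfile):
                subprocess.run(sample.commands.sketch, shell=True, check=False)
        finally:
            sketch_queue.task_done()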
def taxids(self):
    for sample in self.runmetadata.samples:
        # Initialise a list to store the taxIDs of interest
        sample.general.taxids = list()
        # Read the abundance file into a dictionary
        abundancedict = DictReader(open(sample.general.abundancefile))
        # Filter abundance to taxIDs with at least self.cutoff% of the total proportion
        for row in abundancedict:
            # The UNKNOWN category doesn't contain a 'Lineage' column, and therefore, subsequent
            # columns are shifted out of proper alignment, and do not contain the appropriate data
            try:
                if float(row['Proportion_All(%)']) > self.cutoff:
                    sample.general.taxids.append(row['TaxID'])
            except ValueError:
                pass
        for taxid in sample.general.taxids:
            # Create an attribute for each taxID
            setattr(sample, taxid, GenObject())
            sample[taxid].readlist = list()
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
    # Load the assignment file to memory
    self.loadassignment()
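# Hedged toy example of the abundance filtering in taxids(): rows whose
# Proportion_All(%) value can't be parsed (such as the shifted UNKNOWN row)
# are skipped, and taxIDs above the cutoff are retained. The CSV contents and
# cutoff are invented for illustration.
import io
from csv import DictReader

cutoff = 1.0
abundancefile = io.StringIO(
    'Name,TaxID,Lineage,Count,Proportion_All(%)\n'
    'Escherichia coli,562,Bacteria;Proteobacteria,90000,85.2\n'
    'Salmonella enterica,28901,Bacteria;Proteobacteria,500,0.4\n'
    'UNKNOWN,NA,95000,9.1,\n'
)
taxids = []
for row in DictReader(abundancefile):
    try:
        if float(row['Proportion_All(%)']) > cutoff:
            taxids.append(row['TaxID'])
    except ValueError:
        pass
# taxids == ['562']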
def estimateabundance(self):
    """
    Estimate the abundance of taxonomic groups
    """
    logging.info('Estimating abundance of taxonomic groups')
    # Create and start threads
    for i in range(self.cpus):
        # Send the threads to the appropriate destination function
        threads = Thread(target=self.estimate, args=())
        # Set daemon to True so the worker threads exit when the main thread finishes
        threads.setDaemon(True)
        # Start the threading
        threads.start()
    with progressbar(self.runmetadata.samples) as bar:
        for sample in bar:
            try:
                if sample.general.combined != 'NA':
                    # Set the name of the abundance report
                    sample.general.abundance = sample.general.combined.split('.')[0] + '_abundance.csv'
                    # if not hasattr(sample, 'commands'):
                    if not sample.commands.datastore:
                        sample.commands = GenObject()
                    # Define system calls
                    sample.commands.target = self.targetcall
                    sample.commands.classify = self.classifycall
                    sample.commands.abundancecall = \
                        'cd {} && ./estimate_abundance.sh -D {} -F {} > {}'.format(
                            self.clarkpath,
                            self.databasepath,
                            sample.general.classification,
                            sample.general.abundance)
                    self.abundancequeue.put(sample)
            except KeyError:
                pass
    self.abundancequeue.join()
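# The abundance report name above is derived with combined.split('.')[0], which
# truncates at the first dot anywhere in the path. A hedged sketch of the same
# derivation using os.path.splitext, which only strips the final extension; the
# path shown is invented for illustration.
import os

combined = '/data/run.2023/sample1_combined.fastq'
abundance = os.path.splitext(combined)[0] + '_abundance.csv'
# abundance == '/data/run.2023/sample1_combined_abundance.csv'
# whereas combined.split('.')[0] + '_abundance.csv' == '/data/run_abundance.csv'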
def __init__(self, args, pipelinecommit, startingtime, scriptpath):
    # Initialise variables
    self.commit = str(pipelinecommit)
    self.start = startingtime
    self.homepath = scriptpath
    # Define variables based on supplied arguments
    self.args = args
    self.path = os.path.join(args.path)
    assert os.path.isdir(self.path), \
        u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    self.sequencepath = os.path.join(args.sequencepath, '')
    assert os.path.isdir(self.sequencepath), \
        u'Supplied sequence path is not a valid directory {0!r:s}'.format(self.sequencepath)
    self.databasepath = os.path.join(args.databasepath, '')
    assert os.path.isdir(self.databasepath), \
        u'Supplied database path is not a valid directory {0!r:s}'.format(self.databasepath)
    # There seems to be an issue with CLARK when running with a very high number of cores;
    # limit self.cpus to a small, fixed value
    self.cpus = 4
    # Set variables from the arguments
    self.database = args.database
    self.rank = args.rank
    self.clarkpath = args.clarkpath
    self.cutoff = float(args.cutoff) * 100
    # Initialise variables for the analysis
    self.targetcall = str()
    self.classifycall = str()
    self.devnull = open(os.devnull, 'wb')
    self.filelist = os.path.join(self.path, 'sampleList.txt')
    self.reportlist = os.path.join(self.path, 'reportList.txt')
    self.abundancequeue = Queue()
    self.datapath = str()
    self.reportpath = os.path.join(self.path, 'reports')
    self.clean_seqs = args.clean_seqs
    self.light = args.light
    self.extension = args.extension
    if self.clean_seqs:
        try:
            self.reffilepath = args.reffilepath
        except AttributeError:
            self.clean_seqs = False
    # If run as part of the assembly pipeline, a few modifications are necessary to ensure
    # that the metadata objects and variables play nice
    try:
        if args.runmetadata:
            self.runmetadata = args.runmetadata
            # Create the name of the final report
            self.report = os.path.join(
                self.reportpath, 'abundance_{ft}.xlsx'.format(ft=self.extension))
            # Only re-run the CLARK analyses if the CLARK report doesn't exist
            if not os.path.isfile(self.report):
                logging.info('Performing CLARK analysis on {ft} files'.format(ft=self.extension))
                if self.extension != 'fastq':
                    for sample in self.runmetadata.samples:
                        sample.general.combined = sample.general.bestassemblyfile
                    # Run the pipeline
                    self.main()
                else:
                    # Only perform FASTQ analyses if the sample is declared to be a metagenome
                    metagenome = False
                    for sample in self.runmetadata.samples:
                        try:
                            status = sample.run.Description
                        except AttributeError:
                            status = 'unknown'
                        if status == 'metagenome':
                            metagenome = True
                    # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                    if metagenome:
                        fileprep.Fileprep(self)
                        # Run the pipeline
                        self.main()
            # Clean up the files and create/delete attributes to be consistent with pipeline
            # metadata objects
            for sample in self.runmetadata.samples:
                # Create a GenObject to store metadata when this script is run as part of the pipeline
                clarkextension = 'clark{}'.format(self.extension)
                setattr(sample, clarkextension, GenObject())
                # Create a folder to store all the CLARK files
                sample[clarkextension].outputpath = os.path.join(
                    sample.general.outputdirectory, 'CLARK')
                make_path(sample[clarkextension].outputpath)
                if sample.general.bestassemblyfile != 'NA':
                    # Move the files to the CLARK folder
                    try:
                        move(sample.general.abundance,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.abundance)))
                        move(sample.general.classification,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.classification)))
                    except (AttributeError, FileNotFoundError):
                        pass
                    # Set the CLARK-specific attributes
                    try:
                        sample[clarkextension].abundance = sample.general.abundance
                        sample[clarkextension].classification = sample.general.classification
                        sample[clarkextension].combined = sample.general.combined
                    except AttributeError:
                        pass
                    if self.extension == 'fastq':
                        # Remove the combined .fastq files
                        try:
                            if type(sample[clarkextension].combined) is list:
                                os.remove(sample[clarkextension].combined)
                        except (OSError, AttributeError):
                            pass
                # Remove the text file lists of files and reports created by CLARK
                try:
                    for clark_file in ['reportList.txt', 'sampleList.txt']:
                        os.remove(os.path.join(self.path, clark_file))
                except OSError:
                    pass
        else:
            self.runmetadata = MetadataObject()
            self.report = os.path.join(self.reportpath, 'abundance.xlsx')
            # Create the objects
            self.objectprep()
            self.main()
    except AttributeError:
        self.runmetadata = MetadataObject()
        self.report = os.path.join(self.reportpath, 'abundance.xlsx')
        # Create the objects
        self.objectprep()
        # Set the run description to 'metagenome' in order to process the samples
        for sample in self.runmetadata.samples:
            sample.run.Description = 'metagenome'
        self.main()
    # Optionally filter the .fastq reads based on taxonomic assignment
    if args.filter:
        filtermetagenome.PipelineInit(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
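# Hedged sketch of constructing this class standalone. The argument names
# mirror those read from `args` in __init__ above, but the class name (CLARK)
# and all values are assumptions for illustration only.
import os
from argparse import Namespace
from time import time

if __name__ == '__main__':
    arguments = Namespace(
        path='/analysis',
        sequencepath='/analysis/sequences',
        databasepath='/databases/clark',
        database='bacteria',
        rank='species',
        clarkpath='/opt/clark/exe',
        cutoff=0.01,
        clean_seqs=False,
        light=False,
        extension='fasta',
        filter=False,
    )
    CLARK(args=arguments,
          pipelinecommit='0000000',
          startingtime=time(),
          scriptpath=os.path.dirname(os.path.abspath(__file__)))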
def probefinder(self):
    """
    Find the longest probe sequences
    """
    logging.info('Finding and filtering probe sequences')
    for sample in self.samples:
        # A list to store the metadata object for each alignment
        sample.gene = list()
        for align in sample.alignedalleles:
            # Create an object to store all the information for each alignment file
            metadata = GenObject()
            metadata.name = os.path.splitext(os.path.basename(align))[0]
            metadata.alignmentfile = align
            # Create an alignment object from the alignment file
            try:
                metadata.alignment = AlignIO.read(align, 'fasta')
            except ValueError:
                # If a "ValueError: Sequences must all be the same length" is raised, pad the
                # shorter sequences to be the length of the longest sequence
                # https://stackoverflow.com/q/32833230
                records = SeqIO.parse(align, 'fasta')
                # Make a copy, otherwise our generator is exhausted after calculating maxlen
                records = list(records)
                # Calculate the length of the longest sequence
                maxlen = max(len(record.seq) for record in records)
                # Pad sequences so that they all have the same length
                for record in records:
                    if len(record.seq) != maxlen:
                        sequence = str(record.seq).ljust(maxlen, '.')
                        record.seq = Seq(sequence)
                assert all(len(record.seq) == maxlen for record in records)
                # Write the padded records to file
                metadata.alignmentfile = '{}_padded.tfa'.format(os.path.splitext(align)[0])
                with open(metadata.alignmentfile, 'w') as padded:
                    SeqIO.write(records, padded, 'fasta')
                # Read the alignment of the padded sequences
                metadata.alignment = AlignIO.read(metadata.alignmentfile, 'fasta')
            metadata.summaryalign = AlignInfo.SummaryInfo(metadata.alignment)
            # The dumb consensus is a very simple consensus sequence calculated from the alignment.
            # Default parameters of threshold=.7, and ambiguous='X' are used
            consensus = metadata.summaryalign.dumb_consensus()
            metadata.consensus = str(consensus)
            # The position-specific scoring matrix (PSSM) stores the frequency of each base
            # observed at each location along the entire consensus sequence
            metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(consensus)
            metadata.identity = list()
            # Find the prevalence of each base for every location along the sequence
            for line in metadata.pssm:
                try:
                    bases = [line['A'], line['C'], line['G'], line['T'], line['-']]
                    # Calculate the frequency of the most common base - don't count gaps
                    metadata.identity.append(
                        float('{:.2f}'.format(max(bases[:4]) / sum(bases) * 100)))
                except KeyError:
                    bases = [line['A'], line['C'], line['G'], line['T']]
                    # Calculate the frequency of the most common base - don't count gaps
                    metadata.identity.append(
                        float('{:.2f}'.format(max(bases) / sum(bases) * 100)))
            # List to store metadata objects
            metadata.windows = list()
            # Variable to store whether a suitable probe has been found for the current
            # organism + gene pair. As the probe sizes are evaluated in descending order, as soon
            # as a probe has been discovered, the search can stop, since any subsequent probes
            # will be smaller than the one(s) already found
            passing = False
            # Create sliding windows of size self.max - self.min from the list of identities for
            # each column of the alignment
            for i in reversed(range(self.min, self.max + 1)):
                if not passing:
                    windowdata = MetadataObject()
                    windowdata.size = i
                    windowdata.max = 0
                    windowdata.min = 100
                    windowdata.sliding = list()
                    # Create a counter to store the starting location of the window in the sequence
                    n = 0
                    # Create sliding windows from the range of sizes for the list of identities
                    windows = self.window(metadata.identity, i)
                    # Go through each window from the collection of sliding windows to determine
                    # which window(s) has (have) the best results
                    for window in windows:
                        # Create another object to store all the data for the window
                        slidingdata = MetadataObject()
                        # Only consider the window if every position has a percent identity
                        # greater than the cutoff
                        if min(window) > self.cutoff:
                            # Populate the object with the necessary variables
                            slidingdata.location = '{}:{}'.format(n, n + i)
                            slidingdata.min = min(window)
                            slidingdata.mean = float('{:.2f}'.format(numpy.mean(window)))
                            slidingdata.sequence = str(consensus[n:n + i])
                            # Create attributes for evaluating windows. A greater windowdata.max
                            # or a lesser windowdata.min indicates a better or worse overall
                            # percent identity, respectively
                            windowdata.max = slidingdata.mean if slidingdata.mean >= windowdata.max \
                                else windowdata.max
                            windowdata.min = slidingdata.mean if slidingdata.mean <= windowdata.min \
                                else windowdata.min
                            # Add the object to the list of objects
                            windowdata.sliding.append(slidingdata)
                            passing = True
                        n += 1
                    # Add the object to the list of objects
                    metadata.windows.append(windowdata)
            # Add the object to the list of objects
            sample.gene.append(metadata)
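# The self.window() helper used in probefinder() is not shown; this is a hedged
# sketch of a conventional sliding-window generator over the per-position
# percent identity list, followed by a toy usage example.
def window(iterable, size):
    """Yield successive overlapping windows of `size` items from `iterable`."""
    values = list(iterable)
    for start in range(len(values) - size + 1):
        yield values[start:start + size]


# Example: three windows of size 3 over five per-position identities
# list(window([100.0, 98.5, 97.0, 100.0, 99.0], 3))
# == [[100.0, 98.5, 97.0], [98.5, 97.0, 100.0], [97.0, 100.0, 99.0]]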