def run_blast(self):
    """
    BLAST the alleles against the genomes
    """
    logging.info('BLASTing alleles against sequence files')
    for query_file in self.query_files:
        # Create a metadata object to store all the sample-specific information
        sample = MetadataObject()
        sample.alleles = GenObject()
        local_db = os.path.splitext(query_file)[0]
        sample.name = os.path.basename(local_db)
        # Set the name of the BLAST output file
        sample.alleles.blast_report = os.path.join(
            self.reportpath, '{seq_id}.tsv'.format(seq_id=sample.name))
        # Update the list of metadata objects with this sample
        self.runmetadata.samples.append(sample)
        self.blast_reports.append(sample.alleles.blast_report)
        # Create the appropriate BLAST command: BLASTn for nt queries; tBLASTn for aa queries against
        # translated nt databases
        if self.amino_acid:
            blast = NcbitblastnCommandline(db=local_db,
                                           query=self.target_file,
                                           num_alignments=100000000,
                                           evalue=0.001,
                                           num_threads=self.cpus,
                                           task='tblastn',
                                           outfmt=self.outfmt,
                                           word_size=3,
                                           out=sample.alleles.blast_report)
        else:
            blast = NcbiblastnCommandline(db=local_db,
                                          query=self.target_file,
                                          num_alignments=100000000,
                                          evalue=0.001,
                                          num_threads=self.cpus,
                                          task='blastn',
                                          outfmt=self.outfmt,
                                          out=sample.alleles.blast_report)
        # Only run the BLAST analysis if the report does not already exist
        if not os.path.isfile(sample.alleles.blast_report):
            try:
                blast()
            # BLAST can have issues with genomes that have very large contigs. Retry the analysis
            # using only one thread, rebuilding the command to match the original analysis type
            except ApplicationError:
                os.remove(sample.alleles.blast_report)
                if self.amino_acid:
                    blast = NcbitblastnCommandline(db=local_db,
                                                   query=self.target_file,
                                                   num_alignments=100000000,
                                                   evalue=0.001,
                                                   num_threads=1,
                                                   task='tblastn',
                                                   outfmt=self.outfmt,
                                                   word_size=3,
                                                   out=sample.alleles.blast_report)
                else:
                    blast = NcbiblastnCommandline(db=local_db,
                                                  query=self.target_file,
                                                  num_alignments=100000000,
                                                  evalue=0.001,
                                                  num_threads=1,
                                                  task='blastn',
                                                  outfmt=self.outfmt,
                                                  out=sample.alleles.blast_report)
                blast()
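# A minimal, illustrative sketch of how the Biopython BLAST wrappers used above behave: the
# command-line object renders to the full shell command via str(), and invoking it runs BLAST+
# and returns a (stdout, stderr) tuple. The paths ('local_db', 'query.fasta', 'report.tsv') are
# placeholders, not files from this project.
from Bio.Blast.Applications import NcbiblastnCommandline

demo = NcbiblastnCommandline(db='local_db', query='query.fasta', evalue=0.001,
                             outfmt=6, out='report.tsv')
# Render the command that would be executed, e.g.
# 'blastn -out report.tsv -outfmt 6 -query query.fasta -db local_db -evalue 0.001'
print(demo)
# stdout, stderr = demo()  # would execute BLAST+; requires blastn on the PATH and a formatted db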
def record_extraction(self):
    """
    Parse the input FASTA file, and create a metadata object storing the header and sequence of
    each entry
    """
    for record in SeqIO.parse(self.file, 'fasta'):
        metadata = MetadataObject()
        metadata.name = record.id
        metadata.records = record.seq.upper()
        self.samples.append(metadata)
def file_list(self):
    """
    Create metadata objects for every .ab1 file in the supplied sequence path
    """
    # Glob and sort a list of all the paths to the .ab1 files
    file_list = sorted(glob(os.path.join(self.sequencepath, '*.ab1')), reverse=True)
    for seq_file in file_list:
        # e.g. P19954_2019FCP-0000034876-4_VI0364_22 _1D06_M13-R17_E11_087.ab1
        file_name = os.path.splitext(os.path.basename(seq_file))[0]
        # Create a metadata object for each sample
        sample = MetadataObject()
        sample.name = file_name
        sample.filepath = seq_file
        self.samples.append(sample)
def helper(self):
    """
    Helper function for file creation (if desired), manipulation, quality assessment, and
    trimming, as well as the assembly
    """
    # Simple assembly without requiring accessory files (SampleSheet.csv, etc.)
    if self.basicassembly:
        self.runmetadata = Basic(inputobject=self)
    else:
        # Populate the runmetadata object by parsing the SampleSheet.csv,
        # GenerateFASTQRunStatistics.xml, and RunInfo.xml files
        self.runinfo = os.path.join(self.path, 'RunInfo.xml')
        self.runmetadata = runMetadata.Metadata(passed=self)
        # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
        self.runmetadata.parseruninfo()
        # Extract PhiX mapping information from the run
        phi = phix.PhiX(inputobject=self)
        phi.main()
        # Populate the lack of bclcall and nohup call into the metadata sheet
        for sample in self.runmetadata.samples:
            sample.commands = GenObject()
            sample.commands.nohupcall = 'NA'
            sample.commands.bclcall = 'NA'
        # Move/link the FASTQ files to strain-specific working directories
        fastqmover.FastqMover(inputobject=self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(inputobject=self)
def query_prep(self):
    """
    Create metadata objects for each sample
    """
    logging.info('Preparing query files')
    # Find all the sequence files in the path
    fastas = sorted(glob(os.path.join(self.query_path, '*.fasta')))
    for fasta in fastas:
        name = os.path.splitext(os.path.basename(fasta))[0]
        if name != 'combinedtargets':
            # Create a metadata object for each sample
            metadata = MetadataObject()
            metadata.samples = list()
            # Populate the metadata object with the required attributes
            metadata.name = name
            metadata.general = GenObject()
            metadata.commands = GenObject()
            metadata.alleles = GenObject()
            metadata.alleles.outputdirectory = os.path.join(self.query_path, metadata.name)
            # Set the name of the BLAST output file
            metadata.alleles.blast_report = os.path.join(
                metadata.alleles.outputdirectory,
                '{seq_id}.tsv'.format(seq_id=metadata.name))
            # Remove any BLAST reports from previous iterations
            try:
                os.remove(metadata.alleles.blast_report)
            except FileNotFoundError:
                pass
            make_path(metadata.alleles.outputdirectory)
            metadata.general.bestassemblyfile = relative_symlink(
                src_file=fasta,
                output_dir=metadata.alleles.outputdirectory,
                export_output=True)
            metadata.samples.append(metadata)
            self.runmetadata.samples.append(metadata)
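# relative_symlink is imported from elsewhere and is not shown in these snippets. Judging only
# from the call above (src_file, output_dir, export_output), it is assumed to create a relative
# symlink to src_file inside output_dir and return the path of the link. A minimal sketch of that
# assumed behaviour, not the actual library implementation:
import os


def relative_symlink_sketch(src_file, output_dir, export_output=False):
    """Create a relative symlink to src_file in output_dir; optionally return the link path."""
    link_path = os.path.join(output_dir, os.path.basename(src_file))
    try:
        # Link to the source via a path relative to the destination directory, so the link
        # survives moving the parent tree
        os.symlink(os.path.relpath(src_file, output_dir), link_path)
    except FileExistsError:
        pass
    if export_output:
        return link_path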
def create_args():
    # Use an ArgumentParser object as a simple namespace to hold the test arguments
    arguments = ArgumentParser()
    arguments.sequencepath = test_sequences_path
    arguments.starttime = time()
    arguments.reportpath = os.path.join(arguments.sequencepath, 'reports')
    arguments.runmetadata = MetadataObject()
    # Create metadata objects for the samples
    arguments.runmetadata.samples = Filer.filer(arguments)
    return arguments
def excelparse(self):
    """
    Parses the input excel file, and creates objects with headers as keys, and cell data as values
    for each row
    """
    logging.info('Loading excel file')
    # A dictionary to store the parsed excel file in a more readable format
    nesteddictionary = dict()
    # Use pandas to read in the excel file, and subsequently convert the pandas data frame to a
    # dictionary (.to_dict()). Only read the first fourteen columns (usecols=range(14)), as later
    # columns are not relevant to this script
    dictionary = pandas.read_excel(self.file, usecols=range(14)).to_dict()
    # Iterate through the dictionary - each header from the excel file
    for header in dictionary:
        # Sample is the primary key, and value is the value of the cell for that primary key +
        # header combination
        for sample, value in dictionary[header].items():
            # Update the dictionary with the new data
            try:
                nesteddictionary[sample].update({header: value})
            # Create the nested dictionary if it hasn't been created yet
            except KeyError:
                nesteddictionary[sample] = dict()
                nesteddictionary[sample].update({header: value})
    # Create objects for each of the samples, rather than using a nested dictionary. It may have
    # been possible to skip the creation of the nested dictionary, and create the objects from the
    # original dictionary, but there seemed to be too many possible places for something to go wrong
    for line in nesteddictionary:
        # Create an object for each sample
        metadata = MetadataObject()
        # Set the name of the metadata to be the primary key for the sample from the excel file
        metadata.name = line
        # Find the headers and values for every sample
        for header, value in nesteddictionary[line].items():
            # Try/except for value.encode() - some of the values are of type int, so they cannot
            # be encoded
            try:
                # Create each attribute - use the header (in lowercase, with spaces removed) as the
                # attribute name, and the value as the attribute value
                setattr(metadata, str(header).replace(' ', '').lower(), str(value))
            except TypeError:
                setattr(metadata, str(header).replace(' ', '').lower(), value)
        # Append the object to the list of objects
        self.metadata.append(metadata)
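# For clarity on why the inversion above is needed: DataFrame.to_dict() is column-oriented by
# default, returning {header: {row_index: value}}, so the loop re-keys it into
# {row_index: {header: value}} before building one object per row. The headers and values below
# are illustrative only.
import pandas

frame = pandas.DataFrame({'SampleName': {0: '2019-0001', 1: '2019-0002'},
                          'Genus': {0: 'Escherichia', 1: 'Salmonella'}})
assert frame.to_dict() == {'SampleName': {0: '2019-0001', 1: '2019-0002'},
                           'Genus': {0: 'Escherichia', 1: 'Salmonella'}}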
def ultimatum(args):
    SetupLogging(debug=args.debug)
    # Create metadata objects for the samples
    args.runmetadata = MetadataObject()
    args.runmetadata.samples = Filer.filer(args)
    finder = Ultimatum(metadataobject=args.runmetadata.samples,
                       sequencepath=args.sequencepath,
                       reportpath=os.path.join(args.sequencepath, 'reports'),
                       primerfile=args.primerfile,
                       primer_format=args.primer_format,
                       mismatches=args.mismatches,
                       export_amplicons=args.export_amplicons)
    finder.main()
def __init__(self, path, amino_acid):
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    self.allele_path = os.path.join(self.path, 'alleles')
    self.aa_allele_path = os.path.join(self.path, 'aa_alleles')
    self.profile_path = os.path.join(self.path, 'profile')
    self.aa_profile_path = os.path.join(self.path, 'aa_profile')
    make_path(self.profile_path)
    # Ensure that the amino acid profile folder exists before its profile file is written
    make_path(self.aa_profile_path)
    self.profile_file = os.path.join(self.profile_path, 'profile.txt')
    self.aa_profile_file = os.path.join(self.aa_profile_path, 'aa_profile.txt')
    self.query_path = os.path.join(self.path, 'query')
    self.report_path = os.path.join(self.path, 'reports')
    self.aa_report_path = os.path.join(self.path, 'aa_reports')
    make_path(self.report_path)
    make_path(self.aa_report_path)
    # Remove any novel allele FASTA files written by previous runs
    novel_alleles = glob(os.path.join(self.report_path, '*.fasta'))
    for novel_allele in novel_alleles:
        os.remove(novel_allele)
    self.aa_notes_path = os.path.join(self.path, 'aa_notes')
    make_path(self.aa_notes_path)
    self.aa_profile_notes = os.path.join(self.aa_notes_path, 'aa_profile_notes.tsv')
    self.amino_acid = amino_acid
    if not self.amino_acid:
        self.combined_targets = os.path.join(self.allele_path, 'combinedtargets.fasta')
    else:
        self.combined_targets = os.path.join(self.aa_allele_path, 'combinedtargets.fasta')
    self.gene_names = list()
    self.runmetadata = MetadataObject()
    self.runmetadata.samples = list()
    self.cpus = multiprocessing.cpu_count() - 1
    self.profile_report = os.path.join(self.report_path, 'profiles.tsv')
    self.aa_profile_report = os.path.join(self.aa_report_path, 'aa_profiles.tsv')
    try:
        os.remove(self.profile_report)
    except FileNotFoundError:
        pass
    # Fields used for custom outfmt 6 BLAST output
    self.fieldnames = ['query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
                       'evalue', 'bit_score', 'query_length', 'subject_length',
                       'alignment_length', 'query_start', 'query_end', 'subject_start',
                       'subject_end', 'query_sequence', 'subject_sequence']
    self.extended_fieldnames = self.fieldnames.copy()
    self.extended_fieldnames.insert(14, 'percent_match')
    self.outfmt = '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length ' \
                  'qstart qend sstart send qseq sseq'
    # A string of the header to use for formatting the profile file, and the report headers
    self.data = str()
    self.aa_allele_dict = dict()
    self.aa_nt_allele_link_dict = dict()
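# The custom outfmt string above and self.fieldnames describe the same sixteen tab-separated
# columns, so the BLAST reports can be read back with csv.DictReader. A minimal sketch, assuming
# a report ('sample.tsv') produced with this outfmt; note that percent_match is calculated, not
# reported by BLAST, which is why extended_fieldnames inserts it separately.
import csv

fieldnames = ['query_id', 'subject_id', 'identical', 'mismatches', 'gaps', 'evalue',
              'bit_score', 'query_length', 'subject_length', 'alignment_length',
              'query_start', 'query_end', 'subject_start', 'subject_end',
              'query_sequence', 'subject_sequence']
with open('sample.tsv', 'r') as report:
    for row in csv.DictReader(report, fieldnames=fieldnames, dialect='excel-tab'):
        # Recreate a percent_match-style column from the reported values (one plausible
        # definition: identical positions over subject length)
        percent_match = float(row['identical']) / float(row['subject_length']) * 100
        print(row['query_id'], row['subject_id'], '{:.2f}'.format(percent_match))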
def __init__(self, inputobject):
    # Define variables based on supplied arguments
    self.start = inputobject.start
    self.path = inputobject.path
    self.sequencepath = inputobject.sequencepath
    self.datapath = inputobject.datapath
    self.reportpath = inputobject.reportpath
    # Use the argument for the number of threads to use, or default to the number of cpus in
    # the system
    self.cpus = inputobject.cpus
    # Set the cutoff; it is assumed to have already been converted to a percentage by the caller
    self.cutoff = inputobject.cutoff
    # Initialise a variable to hold the sample objects
    self.runmetadata = inputobject.runmetadata if inputobject.runmetadata else MetadataObject()
    # Initialise queues
    self.loadqueue = Queue()
    self.listqueue = Queue()
    self.filterqueue = Queue()
    self.devnull = open(os.devnull, 'wb')
def legacy(args):
    # Prep the args object to be used in the legacy script
    SetupLogging(debug=args.debug)
    args.reportpath = os.path.join(args.sequencepath, 'reports')
    args.runmetadata = MetadataObject()
    # Create metadata objects for the samples
    args.runmetadata.samples = Filer.filer(args)
    if args.analysistype == 'vtyper':
        # Perform vtx typing
        vtyper = Vtyper(inputobject=args,
                        analysistype='vtyper_legacy',
                        mismatches=args.mismatches)
        vtyper.vtyper()
    else:
        epcr = Custom(inputobject=args,
                      analysistype='custom_epcr',
                      primerfile=args.primerfile,
                      ampliconsize=args.maxampliconsize,
                      mismatches=args.mismatches,
                      primer_format=args.primer_format,
                      export_amplicons=args.export_amplicons)
        epcr.main()
def identity(args):
    SetupLogging(debug=args.debug)
    # Create metadata objects for the samples
    args.runmetadata = MetadataObject()
    args.runmetadata.samples = Filer.filer(args)
    if args.analysistype == 'vtyper':
        epcr = VtyperIP(metadataobject=args.runmetadata.samples,
                        analysistype=args.analysistype,
                        reportpath=os.path.join(args.sequencepath, 'reports'))
        epcr.vtyper()
    else:
        epcr = CustomIP(metadataobject=args.runmetadata.samples,
                        sequencepath=args.sequencepath,
                        reportpath=os.path.join(args.sequencepath, 'reports'),
                        primerfile=args.primerfile,
                        min_amplicon_size=args.minampliconsize,
                        max_amplicon_size=args.maxampliconsize,
                        primer_format=args.primer_format,
                        mismatches=args.mismatches,
                        export_amplicons=args.export_amplicons,
                        contigbreaks=args.contigbreaks)
        epcr.main()
def __init__(self, start, sequencepath, referencefilepath, scriptpath, debug):
    """
    :param start: time the analyses were started
    :param sequencepath: path of the folder containing the sequence files to process
    :param referencefilepath: path of the folder containing the reference databases
    :param scriptpath: home path of the script
    :param debug: boolean for whether debug-level logging should be enabled
    """
    self.debug = debug
    SetupLogging(self.debug)
    logging.info('Welcome to the CFIA bacterial typing pipeline {}'.format(__version__))
    # Define variables from the arguments - there may be a more streamlined way to do this
    self.sequencepath = os.path.join(sequencepath)
    self.path = self.sequencepath
    self.targetpath = os.path.join(referencefilepath)
    self.reffilepath = self.targetpath
    # Define the start time
    self.starttime = start
    self.start = self.starttime
    # Default to one fewer than the number of cpus in the system
    self.cpus = multiprocessing.cpu_count() - 1
    # Assertions to ensure that the provided variables are valid
    assert os.path.isdir(self.sequencepath), \
        'Supplied path location is not a valid directory {0!r:s}'.format(self.sequencepath)
    self.reportpath = os.path.join(self.sequencepath, 'reports')
    assert os.path.isdir(self.targetpath), \
        'Reference file path is not a valid directory {0!r:s}'.format(self.targetpath)
    self.commit = __version__
    self.homepath = scriptpath
    self.analysistype = 'assembly_typing'
    self.genus_specific = False
    self.logfile = os.path.join(self.sequencepath, 'logfile')
    self.pipeline = True
    # Initialise the metadata object
    self.metadata = list()
    self.runmetadata = MetadataObject()
def test_sistr_seqsero():
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(var.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    metadata.general.trimmedcorrectedfastqfiles = [
        os.path.join(var.sequencepath, 'seqsero', '2014-SEQ-1049_seqsero.fastq.gz')
    ]
    # Set the destination folder
    outputdir = os.path.join(var.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.general.logout = os.path.join(outputdir, 'out')
    metadata.general.logerr = os.path.join(outputdir, 'err')
    metadata.run.outputdirectory = outputdir
    metadata.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'ERR586739' or \
            sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    method.seqsero()
    for sample in method.runmetadata.samples:
        assert sample.seqsero.predicted_serotype == '- 9:f,g,t:-'
    variable_update()
def variables():
    v = MetadataObject()
    v.sequencepath = os.path.join(testpath, 'testdata')
    v.referencefilepath = os.path.join(v.sequencepath, 'databases')
    v.customsamplesheet = os.path.join(v.sequencepath, 'SampleSheet.csv')
    v.debug = True
    v.numreads = 2
    v.kmerrange = '21'
    v.preprocess = False
    v.basicassembly = True
    v.threads = multiprocessing.cpu_count()
    v.startingtime = time()
    v.commit = b''
    v.homepath = scriptpath
    return v
def probefinder(self):
    """
    Find the longest probe sequences
    """
    logging.info('Finding and filtering probe sequences')
    for sample in self.samples:
        # A list to store the metadata object for each alignment
        sample.gene = list()
        for align in sample.alignedalleles:
            # Create an object to store all the information for each alignment file
            metadata = GenObject()
            metadata.name = os.path.splitext(os.path.basename(align))[0]
            metadata.alignmentfile = align
            # Create an alignment object from the alignment file
            try:
                metadata.alignment = AlignIO.read(align, 'fasta')
            except ValueError:
                # If a ValueError: Sequences must all be the same length is raised, pad the
                # shorter sequences to be the length of the longest sequence
                # https://stackoverflow.com/q/32833230
                records = SeqIO.parse(align, 'fasta')
                # Make a copy, otherwise our generator is exhausted after calculating maxlen
                records = list(records)
                # Calculate the length of the longest sequence
                maxlen = max(len(record.seq) for record in records)
                # Pad sequences so that they all have the same length
                for record in records:
                    if len(record.seq) != maxlen:
                        sequence = str(record.seq).ljust(maxlen, '.')
                        record.seq = Seq(sequence)
                assert all(len(record.seq) == maxlen for record in records)
                # Write the padded sequences to file
                metadata.alignmentfile = '{}_padded.tfa'.format(os.path.splitext(align)[0])
                with open(metadata.alignmentfile, 'w') as padded:
                    SeqIO.write(records, padded, 'fasta')
                # Align the padded sequences
                metadata.alignment = AlignIO.read(metadata.alignmentfile, 'fasta')
            metadata.summaryalign = AlignInfo.SummaryInfo(metadata.alignment)
            # The dumb consensus is a very simple consensus sequence calculated from the
            # alignment. Default parameters of threshold=.7 and ambiguous='X' are used
            consensus = metadata.summaryalign.dumb_consensus()
            metadata.consensus = str(consensus)
            # The position-specific scoring matrix (PSSM) stores the frequency of each base
            # observed at each location along the entire consensus sequence
            metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(consensus)
            metadata.identity = list()
            # Find the prevalence of each base for every location along the sequence
            for line in metadata.pssm:
                try:
                    bases = [line['A'], line['C'], line['G'], line['T'], line['-']]
                    # Calculate the frequency of the most common base - don't count gaps
                    metadata.identity.append(
                        float('{:.2f}'.format(max(bases[:4]) / sum(bases) * 100)))
                except KeyError:
                    bases = [line['A'], line['C'], line['G'], line['T']]
                    # Calculate the frequency of the most common base
                    metadata.identity.append(
                        float('{:.2f}'.format(max(bases) / sum(bases) * 100)))
            # List to store metadata objects
            metadata.windows = list()
            # Variable to store whether a suitable probe has been found for the current organism +
            # gene pair. As the probe sizes are evaluated in descending order, the search can stop
            # as soon as a probe has been discovered, since any subsequent probes would be smaller
            # than the one(s) already found
            passing = False
            # Create sliding windows of size self.max - self.min from the list of identities for
            # each column of the alignment
            for i in reversed(range(self.min, self.max + 1)):
                if not passing:
                    windowdata = MetadataObject()
                    windowdata.size = i
                    # Initialise the best (max) and worst (min) window means observed for this
                    # window size
                    windowdata.max = 0
                    windowdata.min = 100
                    windowdata.sliding = list()
                    # Create a counter to store the starting location of the window in the sequence
                    n = 0
                    # Create sliding windows from the range of sizes for the list of identities
                    windows = self.window(metadata.identity, i)
                    # Go through each window from the collection of sliding windows to determine
                    # which window(s) has (have) the best results
                    for window in windows:
                        # Create another object to store all the data for the window
                        slidingdata = MetadataObject()
                        # Only consider the window if every position has a percent identity
                        # greater than the cutoff
                        if min(window) > self.cutoff:
                            # Populate the object with the necessary variables
                            slidingdata.location = '{}:{}'.format(n, n + i)
                            slidingdata.min = min(window)
                            slidingdata.mean = float('{:.2f}'.format(numpy.mean(window)))
                            slidingdata.sequence = str(consensus[n:n + i])
                            # Track the best and worst window means for this size: a greater
                            # windowdata.max (or lesser windowdata.min) indicates a better (or
                            # worse) overall percent identity
                            windowdata.max = slidingdata.mean if slidingdata.mean >= windowdata.max \
                                else windowdata.max
                            windowdata.min = slidingdata.mean if slidingdata.mean <= windowdata.min \
                                else windowdata.min
                            # Add the object to the list of objects
                            windowdata.sliding.append(slidingdata)
                            passing = True
                        n += 1
                    # Add the object to the list of objects
                    metadata.windows.append(windowdata)
            # Add the object to the list of objects
            sample.gene.append(metadata)
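# The self.window helper called above is not included in these snippets. Based on its use (it
# yields successive fixed-size windows over the per-column identity list), a minimal sketch of
# the assumed implementation, in the spirit of the classic itertools sliding-window recipe:
from itertools import islice


def window(sequence, size):
    """Yield successive overlapping windows of length size from sequence."""
    iterator = iter(sequence)
    current = tuple(islice(iterator, size))
    if len(current) == size:
        yield current
    for item in iterator:
        # Slide the window one position to the right
        current = current[1:] + (item,)
        yield current


# e.g. list(window([99.0, 100.0, 98.5, 100.0], 3)) ->
# [(99.0, 100.0, 98.5), (100.0, 98.5, 100.0)]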
def __init__(self, inputobject, extension='fasta', light=False):
    # Create an object to mimic the command line arguments necessary for the script
    args = MetadataObject()
    args.path = inputobject.path
    args.sequencepath = inputobject.path
    args.databasepath = os.path.join(inputobject.reffilepath, 'clark')
    make_path(args.databasepath)
    # The CLARK scripts are located in ../opt/clark/ relative to the CLARK executable
    args.clarkpath = os.path.dirname(which('CLARK'))
    args.clarkpath += '/../opt/clark/'
    args.cutoff = 0.005
    args.database = 'bacteria'
    args.rank = 'species'
    args.filter = False
    args.threads = inputobject.cpus
    args.runmetadata = inputobject.runmetadata
    args.clean_seqs = False
    args.reffilepath = inputobject.reffilepath
    args.extension = extension
    args.light = light
    # Run CLARK
    CLARK(args, inputobject.commit, inputobject.starttime, inputobject.homepath)
def __init__(self, args, pipelinecommit, startingtime, scriptpath):
    # Initialise variables
    self.commit = str(pipelinecommit)
    self.start = startingtime
    self.homepath = scriptpath
    # Define variables based on supplied arguments
    self.args = args
    self.path = os.path.join(args.path)
    assert os.path.isdir(self.path), \
        u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    self.sequencepath = os.path.join(args.sequencepath, '')
    assert os.path.isdir(self.sequencepath), \
        u'Supplied sequence path is not a valid directory {0!r:s}'.format(self.sequencepath)
    self.databasepath = os.path.join(args.databasepath, '')
    assert os.path.isdir(self.databasepath), \
        u'Supplied database path is not a valid directory {0!r:s}'.format(self.databasepath)
    # There seems to be an issue with CLARK when running with a very high number of cores.
    # Limit self.cpus to 4
    self.cpus = 4
    # Set variables from the arguments
    self.database = args.database
    self.rank = args.rank
    self.clarkpath = args.clarkpath
    self.cutoff = float(args.cutoff) * 100
    # Initialise variables for the analysis
    self.targetcall = str()
    self.classifycall = str()
    self.devnull = open(os.devnull, 'wb')
    self.filelist = os.path.join(self.path, 'sampleList.txt')
    self.reportlist = os.path.join(self.path, 'reportList.txt')
    self.abundancequeue = Queue()
    self.datapath = str()
    self.reportpath = os.path.join(self.path, 'reports')
    self.clean_seqs = args.clean_seqs
    self.light = args.light
    self.extension = args.extension
    if self.clean_seqs:
        try:
            self.reffilepath = args.reffilepath
        except AttributeError:
            self.clean_seqs = False
    # If run as part of the assembly pipeline, a few modifications are necessary to ensure that
    # the metadata objects and variables play nice
    try:
        if args.runmetadata:
            self.runmetadata = args.runmetadata
            # Create the name of the final report
            self.report = os.path.join(self.reportpath,
                                       'abundance_{ft}.xlsx'.format(ft=self.extension))
            # Only re-run the CLARK analyses if the report doesn't already exist
            if not os.path.isfile(self.report):
                logging.info('Performing CLARK analysis on {ft} files'.format(ft=self.extension))
                if self.extension != 'fastq':
                    for sample in self.runmetadata.samples:
                        sample.general.combined = sample.general.bestassemblyfile
                    # Run the pipeline
                    self.main()
                else:
                    # Only perform FASTQ analyses if the sample is declared to be a metagenome
                    metagenome = False
                    for sample in self.runmetadata.samples:
                        try:
                            status = sample.run.Description
                        except AttributeError:
                            status = 'unknown'
                        if status == 'metagenome':
                            metagenome = True
                    # If any of the samples are metagenomes, run the CLARK analysis on the raw
                    # files
                    if metagenome:
                        fileprep.Fileprep(self)
                        # Run the pipeline
                        self.main()
            # Clean up the files and create/delete attributes to be consistent with pipeline
            # metadata objects
            for sample in self.runmetadata.samples:
                # Create a GenObject to store metadata when this script is run as part of the
                # pipeline
                clarkextension = 'clark{}'.format(self.extension)
                setattr(sample, clarkextension, GenObject())
                # Create a folder to store all the CLARK files
                sample[clarkextension].outputpath = os.path.join(
                    sample.general.outputdirectory, 'CLARK')
                make_path(sample[clarkextension].outputpath)
                if sample.general.bestassemblyfile != 'NA':
                    # Move the files to the CLARK folder
                    try:
                        move(sample.general.abundance,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.abundance)))
                        move(sample.general.classification,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.classification)))
                    except (AttributeError, FileNotFoundError):
                        pass
                    # Set the CLARK-specific attributes
                    try:
                        sample[clarkextension].abundance = sample.general.abundance
                        sample[clarkextension].classification = sample.general.classification
                        sample[clarkextension].combined = sample.general.combined
                    except AttributeError:
                        pass
                    if self.extension == 'fastq':
                        # Remove the combined .fastq files
                        try:
                            if type(sample[clarkextension].combined) is list:
                                os.remove(sample[clarkextension].combined)
                        except (OSError, AttributeError):
                            pass
            # Remove the text file lists of files and reports created by CLARK. Note that a bare
            # map() call is lazy in Python 3, so the removals are performed with an explicit loop
            for list_file in ['reportList.txt', 'sampleList.txt']:
                try:
                    os.remove(os.path.join(self.path, list_file))
                except OSError:
                    pass
        else:
            self.runmetadata = MetadataObject()
            self.report = os.path.join(self.reportpath, 'abundance.xlsx')
            # Create the objects
            self.objectprep()
            self.main()
    except AttributeError:
        self.runmetadata = MetadataObject()
        self.report = os.path.join(self.reportpath, 'abundance.xlsx')
        # Create the objects
        self.objectprep()
        # Set the run description to 'metagenome' in order to process the samples
        for sample in self.runmetadata.samples:
            sample.run.Description = 'metagenome'
        self.main()
    # Optionally filter the .fastq reads based on taxonomic assignment
    if args.filter:
        filtermetagenome.PipelineInit(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
class RunAssemble(object):

    def main(self):
        """
        Run the methods in the correct order
        """
        # Start the assembly
        self.helper()
        # Create the quality object
        self.create_quality_object()
        # Run the quality analyses
        self.quality()
        # Perform assembly
        self.assemble()
        # Perform genus-agnostic typing
        self.agnostictyping()
        # Perform typing
        self.typing()
        # Compress or remove all large, temporary files created by the pipeline
        if not self.debug:
            compress.Compress(self)
        metadataprinter.MetadataPrinter(inputobject=self)

    def helper(self):
        """
        Helper function for file creation (if desired), manipulation, quality assessment, and
        trimming, as well as the assembly
        """
        # Simple assembly without requiring accessory files (SampleSheet.csv, etc.)
        if self.basicassembly:
            self.runmetadata = Basic(inputobject=self)
        else:
            # Populate the runmetadata object by parsing the SampleSheet.csv,
            # GenerateFASTQRunStatistics.xml, and RunInfo.xml files
            self.runinfo = os.path.join(self.path, 'RunInfo.xml')
            self.runmetadata = runMetadata.Metadata(passed=self)
            # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
            self.runmetadata.parseruninfo()
            # Extract PhiX mapping information from the run
            phi = phix.PhiX(inputobject=self)
            phi.main()
            # Populate the lack of bclcall and nohup call into the metadata sheet
            for sample in self.runmetadata.samples:
                sample.commands = GenObject()
                sample.commands.nohupcall = 'NA'
                sample.commands.bclcall = 'NA'
            # Move/link the FASTQ files to strain-specific working directories
            fastqmover.FastqMover(inputobject=self)
        # Print the metadata to file
        metadataprinter.MetadataPrinter(inputobject=self)

    def create_quality_object(self):
        """
        Create the quality object
        """
        self.qualityobject = quality.Quality(inputobject=self)

    def quality(self):
        """
        Creates quality objects and runs quality assessments and quality processes on the supplied
        sequences
        """
        # Validate that the FASTQ files are in the proper format, and that there are no issues,
        # e.g. different numbers of forward and reverse reads, or read lengths longer than quality
        # score lengths, and that the files have the proper extension
        if not self.debug:
            self.fastq_validate()
        # Run FastQC on the unprocessed fastq files
        self.fastqc_raw()
        # Perform quality trimming and FastQC on the trimmed files
        self.quality_trim()
        # Run FastQC on the trimmed files
        self.fastqc_trimmed()
        # Perform error correcting on the reads
        self.error_correct()
        # Detect contamination in the reads
        self.contamination_detection()
        # Run FastQC on the processed fastq files
        self.fastqc_trimmedcorrected()
        metadataprinter.MetadataPrinter(inputobject=self)
        # Exit if only pre-processing of data is requested
        if self.preprocess:
            logging.info('Pre-processing complete')
            quit()

    def fastq_validate(self):
        """
        Attempt to detect and fix issues with the FASTQ files
        """
        self.qualityobject.validate_fastq()
        metadataprinter.MetadataPrinter(inputobject=self)

    def fastqc_raw(self):
        """
        Run FastQC on the unprocessed FASTQ files
        """
        self.qualityobject.fastqcthreader(level='Raw')
        metadataprinter.MetadataPrinter(inputobject=self)

    def quality_trim(self):
        """
        Perform quality trimming and FastQC on the trimmed files
        """
        self.qualityobject.trimquality()
        metadataprinter.MetadataPrinter(inputobject=self)

    def fastqc_trimmed(self):
        """
        Run FastQC on the quality trimmed FASTQ files
        """
        self.qualityobject.fastqcthreader(level='Trimmed')
        metadataprinter.MetadataPrinter(inputobject=self)

    def error_correct(self):
        """
        Perform error correcting on the reads
        """
        self.qualityobject.error_correction()
        metadataprinter.MetadataPrinter(inputobject=self)

    def contamination_detection(self):
        """
        Calculate the levels of contamination in the reads
        """
        self.qualityobject.contamination_finder(report_path=self.reportpath,
                                                debug=self.debug)
        metadataprinter.MetadataPrinter(inputobject=self)

    def fastqc_trimmedcorrected(self):
        """
        Run FastQC on the processed fastq files
        """
        self.qualityobject.fastqcthreader(level='trimmedcorrected')
        metadataprinter.MetadataPrinter(inputobject=self)

    def assemble(self):
        """
        Assemble genomes and perform some basic quality analyses
        """
        # Assemble genomes
        self.assemble_genomes()
        # Calculate assembly metrics on raw assemblies
        self.evaluate_assemblies()
        # ORF detection
        self.prodigal()
        # CLARK analyses
        self.clark()

    def assemble_genomes(self):
        """
        Use skesa to assemble genomes
        """
        assembly = skesa.Skesa(inputobject=self)
        assembly.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def evaluate_assemblies(self):
        """
        Evaluate assemblies with Quast
        """
        qual = evaluate.AssemblyEvaluation(inputobject=self)
        qual.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def prodigal(self):
        """
        Use prodigal to detect open reading frames in the assemblies
        """
        prodigal.Prodigal(self)
        metadataprinter.MetadataPrinter(self)

    def clark(self):
        """
        Run CLARK metagenome analyses on the raw reads and assemblies if the system has adequate
        resources
        """
        # Run CLARK typing on the .fastq and .fasta files
        automateCLARK.PipelineInit(inputobject=self,
                                   extension='fasta',
                                   light=True)
        automateCLARK.PipelineInit(inputobject=self,
                                   extension='fastq',
                                   light=True)

    def agnostictyping(self):
        """
        Perform typing that does not require the genus of the organism to be known
        """
        # Run mash
        self.mash()
        # Run rMLST on assemblies
        self.rmlst_assembled()
        # Create reports summarising the run and sample qualities
        self.quality_report()
        # Run the 16S analyses
        self.sixteens()
        # Find genes of interest
        self.genesippr()
        # Resistance finding - raw reads
        self.ressippr()
        # Resistance finding - assemblies
        self.resfinder()
        # Run MOB-suite
        self.mob_suite()
        # Prophage detection
        self.prophages()
        # Univec contamination search
        self.univec()
        # Virulence
        self.virulence()
        # cgMLST
        self.cgmlst()

    def mash(self):
        """
        Run mash to determine closest refseq genome
        """
        mash.Mash(inputobject=self,
                  analysistype='mash')
        metadataprinter.MetadataPrinter(inputobject=self)

    def rmlst_assembled(self):
        """
        Run rMLST analyses on assemblies
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'rmlst.csv')):
            rmlst = BLAST(args=self,
                          analysistype='rmlst',
                          cutoff=100)
            rmlst.seekr()
        else:
            parse = ReportParse(args=self,
                                analysistype='rmlst')
            parse.report_parse()
        metadataprinter.MetadataPrinter(inputobject=self)

    def quality_report(self):
        """
        Create reports summarising the run and sample quality outputs
        """
        qual_report = reporter.Reporter(self)
        qual_report.run_quality_reporter()
        qual_report.sample_quality_report()

    def sixteens(self):
        """
        Run the 16S analyses
        """
        SixteensFull(args=self,
                     pipelinecommit=self.commit,
                     startingtime=self.starttime,
                     scriptpath=self.homepath,
                     analysistype='sixteens_full',
                     cutoff=0.95)
        metadataprinter.MetadataPrinter(inputobject=self)

    def genesippr(self):
        """
        Find genes of interest
        """
        GeneSippr(args=self,
                  pipelinecommit=self.commit,
                  startingtime=self.starttime,
                  scriptpath=self.homepath,
                  analysistype='genesippr',
                  cutoff=0.95,
                  pipeline=False,
                  revbait=False)
        metadataprinter.MetadataPrinter(inputobject=self)

    def mob_suite(self):
        """
        Run MOB-suite plasmid reconstruction analyses
        """
        mob = MobRecon(metadata=self.runmetadata.samples,
                       analysistype='mobrecon',
                       databasepath=self.reffilepath,
                       threads=self.cpus,
                       logfile=self.logfile,
                       reportpath=self.reportpath)
        mob.mob_recon()
        metadataprinter.MetadataPrinter(inputobject=self)

    def ressippr(self):
        """
        Resistance finding - raw reads
        """
        res = Resistance(args=self,
                         pipelinecommit=self.commit,
                         startingtime=self.starttime,
                         scriptpath=self.homepath,
                         analysistype='resfinder',
                         cutoff=0.7,
                         pipeline=False,
                         revbait=True)
        res.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def resfinder(self):
        """
        Resistance finding - assemblies
        """
        resfinder = BLAST(args=self,
                          analysistype='resfinder_assembled')
        resfinder.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def prophages(self, cutoff=90):
        """
        Prophage detection
        :param cutoff: cutoff value to be used in the analyses
        """
        prophages = Prophages(args=self,
                              analysistype='prophages',
                              cutoff=cutoff,
                              unique=True)
        if not os.path.isfile(os.path.join(self.reportpath, 'prophages.csv')):
            prophages.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def univec(self):
        """
        Univec contamination search
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'univec.csv')):
            univec = Univec(args=self,
                            analysistype='univec',
                            cutoff=80,
                            unique=True)
            univec.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def virulence(self):
        """
        Virulence gene detection
        """
        vir = Virulence(args=self,
                        pipelinecommit=self.commit,
                        startingtime=self.starttime,
                        scriptpath=self.homepath,
                        analysistype='virulence',
                        cutoff=0.9,
                        pipeline=False,
                        revbait=True)
        if not os.path.isfile(os.path.join(self.reportpath, 'virulence.csv')):
            vir.reporter()
        metadataprinter.MetadataPrinter(inputobject=self)

    def cgmlst(self):
        """
        Run cgMLST analyses on raw reads
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'cgmlst.csv')):
            cgmlst = KMAMLST(args=self,
                             pipeline=True,
                             analysistype='cgmlst',
                             cutoff=98,
                             kma_kwargs=' -cge -and')
            cgmlst.main()
        else:
            parse = ReportParse(args=self,
                                analysistype='cgmlst')
            parse.report_parse()
        metadataprinter.MetadataPrinter(inputobject=self)

    def typing(self):
        """
        Perform analyses that use genera-specific databases
        """
        # Run modules and print metadata to file
        # MLST on assemblies
        self.mlst_assembled()
        # Assembly-based serotyping
        self.ec_typer()
        # Serotyping
        self.serosippr()
        # SeqSero
        self.seqsero()
        # Assembly-based vtyper
        self.legacy_vtyper()
        # Raw read verotoxin typing
        self.verotoxin()
        # Sistr
        self.sistr()
        # Calculate the presence/absence of GDCS
        self.run_gdcs()
        # Create a final summary report
        self.run_report()

    def mlst_assembled(self):
        """
        Run MLST analyses on assemblies
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'mlst.csv')):
            mlst = BLAST(args=self,
                         analysistype='mlst',
                         cutoff=100,
                         genus_specific=True)
            mlst.seekr()
        else:
            parse = ReportParse(args=self,
                                analysistype='mlst')
            parse.report_parse()
        metadataprinter.MetadataPrinter(inputobject=self)

    def ec_typer(self):
        """
        Assembly-based serotyping
        """
        ec = ECTyper(metadata=self.runmetadata,
                     report_path=self.reportpath,
                     assembly_path=os.path.join(self.path, 'raw_assemblies'),
                     threads=self.cpus,
                     logfile=self.logfile)
        ec.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def serosippr(self):
        """
        Serotyping analyses
        """
        Serotype(args=self,
                 pipelinecommit=self.commit,
                 startingtime=self.starttime,
                 scriptpath=self.homepath,
                 analysistype='serosippr',
                 cutoff=0.90,
                 pipeline=True)
        metadataprinter.MetadataPrinter(inputobject=self)

    def seqsero(self):
        """
        Run SeqSero2 on Salmonella samples
        """
        seqsero = SeqSero(self)
        seqsero.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def legacy_vtyper(self):
        """
        Legacy vtyper - uses ePCR
        """
        legacy_vtyper = LegacyVtyper(inputobject=self,
                                     analysistype='legacy_vtyper',
                                     mismatches=2)
        legacy_vtyper.vtyper()
        metadataprinter.MetadataPrinter(inputobject=self)

    def verotoxin(self):
        """
        Raw read verotoxin typing
        """
        vero = Verotoxin(args=self,
                         pipeline=True,
                         analysistype='verotoxin',
                         cutoff=90)
        vero.main()

    def sistr(self):
        """
        Sistr
        """
        sistr_obj = sistr.Sistr(inputobject=self,
                                analysistype='sistr')
        sistr_obj.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def run_gdcs(self):
        """
        Determine the presence of genomically-dispersed conserved sequences (genes from MLST,
        rMLST, and cgMLST analyses)
        """
        # Run the GDCS analysis
        gdcs = GDCS(inputobject=self)
        gdcs.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def run_report(self):
        """
        Create the final combinedMetadata report
        """
        run_report = reporter.Reporter(self)
        # Create the standard and legacy reports
        run_report.metadata_reporter()
        run_report.legacy_reporter()
        # Clean the large attributes from the metadata objects
        run_report.clean_object()

    def __init__(self, args):
        """
        Initialises the variables required for this class
        :param args: list of arguments passed to the script
        """
        self.debug = args.debug
        SetupLogging(self.debug)
        logging.info('Welcome to the CFIA OLC Workflow for Bacterial Assembly and Typing (COWBAT) '
                     'version {version}'.format(version=__version__))
        # Define variables from the arguments - there may be a more streamlined way to do this
        self.args = args
        if args.sequencepath.startswith('~'):
            self.path = os.path.abspath(os.path.expanduser(os.path.join(args.sequencepath)))
        else:
            self.path = os.path.abspath(os.path.join(args.sequencepath))
        self.sequencepath = self.path
        if args.referencefilepath.startswith('~'):
            self.reffilepath = os.path.expanduser(
                os.path.abspath(os.path.join(args.referencefilepath)))
        else:
            self.reffilepath = os.path.abspath(os.path.join(args.referencefilepath))
        self.numreads = args.numreads
        self.preprocess = args.preprocess
        # Define the start time
        self.starttime = args.startingtime
        if args.customsamplesheet:
            if args.customsamplesheet.startswith('~'):
                self.customsamplesheet = os.path.expanduser(
                    os.path.abspath(os.path.join(args.customsamplesheet)))
            else:
                self.customsamplesheet = os.path.abspath(os.path.join(args.customsamplesheet))
        else:
            self.customsamplesheet = args.customsamplesheet
        if self.customsamplesheet:
            assert os.path.isfile(self.customsamplesheet), \
                'Cannot find custom sample sheet as specified {css}'.format(css=self.customsamplesheet)
        self.basicassembly = args.basicassembly
        if not self.customsamplesheet and not os.path.isfile(
                os.path.join(self.path, 'SampleSheet.csv')):
            self.basicassembly = True
            logging.warning('Could not find a sample sheet. Performing basic assembly '
                            '(no run metadata captured)')
        # Use the argument for the number of threads to use, or default to the number of cpus in
        # the system
        self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
        # Assertions to ensure that the provided variables are valid
        make_path(self.path)
        assert os.path.isdir(self.path), \
            'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
        self.reportpath = os.path.join(self.path, 'reports')
        make_path(self.reportpath)
        assert os.path.isdir(self.reffilepath), \
            'Reference file path is not a valid directory {0!r:s}'.format(self.reffilepath)
        self.commit = __version__
        self.homepath = args.homepath
        self.logfile = os.path.join(self.path, 'logfile')
        self.runinfo = str()
        self.pipeline = True
        self.qualityobject = MetadataObject()
        # Initialise the metadata object
        self.runmetadata = MetadataObject()
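# A hedged sketch of how RunAssemble is presumably driven from the command line. The attribute
# names match what the class reads from args above, but this wrapper itself (including the flag
# spellings) is illustrative, not part of the pipeline code shown here.
import os
import multiprocessing
from argparse import ArgumentParser
from time import time

if __name__ == '__main__':
    parser = ArgumentParser(description='Assemble and type bacterial genomes')
    parser.add_argument('-s', '--sequencepath', required=True)
    parser.add_argument('-r', '--referencefilepath', required=True)
    parser.add_argument('-n', '--numreads', default=2, type=int)
    parser.add_argument('-t', '--threads', type=int)
    parser.add_argument('-c', '--customsamplesheet')
    parser.add_argument('-b', '--basicassembly', action='store_true')
    parser.add_argument('-p', '--preprocess', action='store_true')
    parser.add_argument('-d', '--debug', action='store_true')
    arguments = parser.parse_args()
    # The class also expects a start time and the home path of the script on the args object
    arguments.startingtime = time()
    arguments.homepath = os.path.split(os.path.abspath(__file__))[0]
    assembler = RunAssemble(args=arguments)
    assembler.main()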
def __init__(self, path, fasta_path, records, amino_acid):
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    if fasta_path.startswith('~'):
        self.fasta_path = os.path.abspath(os.path.expanduser(os.path.join(fasta_path)))
    else:
        self.fasta_path = os.path.abspath(os.path.join(fasta_path))
    self.working_path = os.path.join(self.path, 'strain_profiles')
    self.sequencepath = os.path.join(self.working_path, 'query')
    make_path(self.sequencepath)
    # Exclude both the combined targets file and the custom targets file from the query list
    target_files = [
        fasta for fasta in sorted(glob(os.path.join(self.fasta_path, '*.fasta')))
        if os.path.basename(fasta) not in ('combinedtargets.fasta', 'custom.tfa')
    ]
    self.query_files = list()
    # Create symlinks of the target files in the local path
    for target in target_files:
        try:
            query_file = os.path.join(self.sequencepath,
                                      os.path.basename(target).replace('.tfa', '.fasta'))
            self.query_files.append(query_file)
            os.symlink(target, query_file)
        except FileExistsError:
            pass
    self.targetpath = os.path.join(self.working_path, 'targets')
    make_path(self.targetpath)
    self.profilepath = os.path.join(self.working_path, 'sequence_profile')
    make_path(self.profilepath)
    self.profile_file = os.path.join(self.profilepath, 'profile.txt')
    self.target_file = os.path.join(self.targetpath, 'combinedtargets.fasta')
    shutil.copyfile(src=os.path.join(self.path, 'alleles', 'combinedtargets.fasta'),
                    dst=self.target_file)
    self.reportpath = os.path.join(self.working_path, 'reports')
    make_path(self.reportpath)
    self.strain_profile_path = os.path.join(self.working_path, 'strain_profiles')
    make_path(self.strain_profile_path)
    self.profile_report = os.path.join(self.strain_profile_path, 'profiles.tsv')
    self.cpus = multiprocessing.cpu_count() - 1
    self.starttime = time()
    self.start = self.starttime
    self.runmetadata = MetadataObject()
    self.runmetadata.samples = list()
    self.records = records
    # Set the BLAST program to use: tblastn for amino acid queries; blastn for nucleotide queries
    self.amino_acid = amino_acid if amino_acid else None
    self.program = 'tblastn' if self.amino_acid else 'blastn'
    # Fields used for custom outfmt 6 BLAST output
    self.fieldnames = ['query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
                       'evalue', 'bit_score', 'query_length', 'subject_length',
                       'alignment_length', 'query_start', 'query_end', 'subject_start',
                       'subject_end', 'query_sequence', 'subject_sequence']
    self.extended_fieldnames = self.fieldnames.copy()
    self.extended_fieldnames.insert(14, 'percent_match')
    self.outfmt = '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length ' \
                  'qstart qend sstart send qseq sseq'
    self.blast_reports = list()
    self.profile_dict = dict()
    self.profile_data = dict()
    self.profile_set = list()
    self.sequence_profile = dict()
    self.profile_matches = dict()
    self.new_profiles = list()
    # A string of the header to use for formatting the profile file, and the report headers
    genes = '\t'.join(sorted(self.records))
    self.data = 'ST\t{genes}\n'.format(genes=genes.rstrip())
    self.gene_names = list()
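# To make the header format above concrete: with three gene records, self.data becomes a single
# tab-separated header line for the profile file. The gene names here are placeholders.
records = {'gene_a': 'ATG...', 'gene_c': 'ATG...', 'gene_b': 'ATG...'}
genes = '\t'.join(sorted(records))
data = 'ST\t{genes}\n'.format(genes=genes.rstrip())
assert data == 'ST\tgene_a\tgene_b\tgene_c\n'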
def __init__(self):
    # Get the current commit of the pipeline from git
    # Extract the path of the current script from the full path + file name
    homepath = os.path.split(os.path.abspath(__file__))[0]
    # Find the commit of the script by running a command to change to the directory containing
    # the script, and running a git command to return the short version of the commit hash
    commit = subprocess.Popen('cd {} && git tag | tail -n 1'.format(homepath),
                              shell=True,
                              stdout=subprocess.PIPE).communicate()[0].rstrip()
    # Parser for arguments
    parser = ArgumentParser(description='Filter reads based on taxonomic assignment')
    parser.add_argument('-v', '--version',
                        action='version',
                        version='%(prog)s commit {}'.format(commit))
    parser.add_argument('path',
                        help='Specify path')
    parser.add_argument('-t', '--threads',
                        type=int,
                        help='Number of threads. Default is the number of cpus in the system')
    parser.add_argument('-s', '--sequencepath',
                        required=True,
                        help='Path of .fastq(.gz) files to process.')
    parser.add_argument('-d', '--datapath',
                        required=True,
                        help='Path of .csv files created by CLARK with read ID, length, and '
                             'assignment.')
    parser.add_argument('-c', '--cutoff',
                        type=float,
                        default=0.01,
                        help='Cutoff value for deciding which taxIDs to use when sorting .fastq '
                             'files. Defaults to 1 percent. Please note that you must use a '
                             'decimal format: enter 0.05 to get a 5 percent cutoff value')
    parser.add_argument('-x', '--taxids',
                        help='NOT IMPLEMENTED: CSV of desired taxIDs from each sample.')
    # Get the arguments into an object
    args = parser.parse_args()
    self.start = time()
    # Define variables based on supplied arguments
    self.path = os.path.join(args.path)
    assert os.path.isdir(self.path), \
        'Supplied path is not a valid directory {path}'.format(path=self.path)
    self.sequencepath = os.path.join(args.sequencepath)
    assert os.path.isdir(self.sequencepath), \
        'Sequence location supplied is not a valid directory {seq_path}'.format(
            seq_path=self.sequencepath)
    self.datapath = os.path.join(args.datapath)
    self.reportpath = os.path.join(self.path, 'reports')
    # Use the argument for the number of threads to use, or default to the number of cpus in the
    # system
    self.cpus = args.threads if args.threads else multiprocessing.cpu_count()
    # Convert the cutoff to a percentage (type=float above ensures the arithmetic works when the
    # value is supplied on the command line)
    self.cutoff = args.cutoff * 100
    # Run the pipeline
    self.runmetadata = MetadataObject()
    genome = FilterGenome(self)
    genome.objectprep()
    logging.info('Filtering complete')
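# Example invocation of the filtering script above (paths and the script name are placeholders):
#
#   python filter_reads.py /analyses/run1 \
#       --sequencepath /analyses/run1/sequences \
#       --datapath /analyses/run1/clark_csv \
#       --cutoff 0.05 \
#       --threads 8
#
# The positional argument is the working path; per the help text above, --cutoff 0.05 keeps
# taxIDs representing at least five percent of the reads when sorting the .fastq files.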