def __init__(self, inputobject):
    self.metadata = inputobject.runmetadata.samples
    self.start = inputobject.starttime
    self.commit = inputobject.commit
    # Determine the versions of the software used
    printtime('Populating metadata', self.start)
    self.python = sys.version.replace('\n', '')
    self.arch = ", ".join(os.uname())
    self.blast = get_version(['blastn', '-version']).decode('utf-8').split('\n')[0].split()[1]
    self.spades = get_version(['spades.py', '-v']).decode('utf-8').split('\n')[0].split()[1]
    self.bowversion = Bowtie2CommandLine(version=True)()[0].split('\n')[0].split()[-1]
    self.samversion = get_version(['samtools', '--version']).decode('utf-8').split('\n')[0].split()[1]
    # Qualimap can print a Java warning message that doesn't necessarily show up on every system
    # Only capture the line that starts with 'QualiMap'
    qualimaplist = get_version(['qualimap', '--help']).decode('utf-8').split('\n')
    for line in qualimaplist:
        if 'QualiMap' in line:
            self.qualimap = line.split()[1]
    self.mash = get_version(['mash']).decode('utf-8').split('\n')[1].split()[2]
    self.prodigal = get_version(['prodigal', '-v']).decode('utf-8').split('\n')[1].split()[1]
    self.bbmap = get_version(['bbversion.sh']).decode('utf-8')
    self.fastqc = get_version(['fastqc', '--version']).decode('utf-8').split('\n')[0].split()[1]
    # Uncomment this once you figure out where this file is stored.
    self.bcl2fastq = "2"
    self.perl = get_version(['perl', '-v']).decode('utf-8').split('\n')[1].split('This is ')[1]
    self.biopython = Bio.__version__
    self.java = get_version(['java', '-showversion']).decode('utf-8').split('\n')[0].split()[2].replace('"', '')
    # self.docker = get_version(['docker', 'version']).split('\n')[1].split()[1]
    self.versions()
def getrmlsthelper(self):
    """
    Makes a system call to rest_auth.py, a Python script modified from
    https://github.com/kjolley/BIGSdb/tree/develop/scripts/test, and downloads the most up-to-date rMLST
    profile and alleles
    """
    printtime('Downloading {} alleles'.format(self.analysistype), self.start)
    # Extract the path of the current script from the full path + file name
    homepath = os.path.split(os.path.abspath(__file__))[0]
    # Set the path/name of the folder to contain the new alleles and profile
    newfolder = os.path.join(self.path, self.analysistype)
    # Create the path
    make_path(newfolder)
    # Create arguments to feed into the rest_auth_class script; an ArgumentParser instance is used
    # as a simple namespace
    args = ArgumentParser()
    args.secret_file = os.path.join(homepath, 'secret.txt')
    args.file_path = homepath
    args.output_path = newfolder
    args.start = self.start
    rmlst = rest_auth_class.REST(args)
    # Download the profile and alleles
    rmlst.main()
    # Get the new alleles into a list, and create the combinedAlleles file
    alleles = glob(os.path.join(newfolder, '*.tfa'))
    self.combinealleles(newfolder, alleles)
def remove(self):
    """Removes unnecessary temporary files generated by the pipeline"""
    import shutil
    printtime('Removing large and/or temporary files', self.start)
    removefolder = list()
    for sample in self.metadata:
        # Use os.walk to iterate through all the files in the sample output directory
        for path, dirs, files in os.walk(sample.general.outputdirectory):
            for item in files:
                # Use regex to find files to remove
                if re.search(".fastq$", item) or re.search(".fastq.gz$", item) or re.search(".bam$", item) \
                        or re.search(".bt2$", item) or re.search(".tab$", item) or re.search("^before", item) \
                        or re.search("^baitedtargets", item) or re.search("_combined.csv$", item) \
                        or re.search("^scaffolds", item) or re.search(".fastg$", item) or re.search(".gfa$", item) \
                        or re.search(".bai$", item) or 'coregenome' in path or 'prophages' in path:
                    # Keep the baitedtargets.fa, core genome, and merged metagenome files
                    if item != 'baitedtargets.fa' and not re.search("coregenome", item) \
                            and not re.search("paired", item):
                        # Remove the unnecessary files
                        try:
                            os.remove(os.path.join(path, item))
                        except IOError:
                            pass
    # Clear out the folders
    for folder in removefolder:
        try:
            shutil.rmtree(folder)
        except (OSError, TypeError):
            pass
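# A minimal, standalone sketch of the same file-matching decision expressed with str.endswith/str.startswith
# over tuples instead of chained re.search calls. This is an illustrative alternative, not the pipeline's
# implementation; the constant and helper names are assumptions.
REMOVE_SUFFIXES = ('.fastq', '.fastq.gz', '.bam', '.bt2', '.tab', '.fastg', '.gfa', '.bai', '_combined.csv')
REMOVE_PREFIXES = ('before', 'baitedtargets', 'scaffolds')


def removable(path, item):
    """Return True if the cleanup step should delete this file."""
    flagged = item.endswith(REMOVE_SUFFIXES) or item.startswith(REMOVE_PREFIXES) \
        or 'coregenome' in path or 'prophages' in path
    # Keep the baitedtargets.fa, core genome, and merged metagenome files
    keep = item == 'baitedtargets.fa' or 'coregenome' in item or 'paired' in item
    return flagged and not keep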
def reporter(self):
    """
    Creates a report of the results
    """
    printtime('Creating {} report'.format(self.analysistype), self.starttime)
    # Create the path in which the reports are stored
    make_path(self.reportpath)
    header = 'Strain,Serotype\n'
    data = ''
    with open(os.path.join(self.reportpath, '{}.csv'.format(self.analysistype)), 'w') as report:
        for sample in self.runmetadata.samples:
            if sample.general.bestassemblyfile != 'NA':
                data += sample.name + ','
                if sample[self.analysistype].results:
                    serotype = '{oset} ({opid}):{hset} ({hpid}),' \
                        .format(oset=';'.join(sample.serosippr.o_set),
                                opid=sample.serosippr.best_o_pid,
                                hset=';'.join(sample.serosippr.h_set),
                                hpid=sample.serosippr.best_h_pid)
                    data += '{}\n'.format(serotype)
                else:
                    data += '\n'
        report.write(header)
        report.write(data)
def parse_qaml(self):
    """
    Parse the GenomeQAML report, and populate metadata objects
    """
    printtime('Parsing GenomeQAML outputs', self.start)
    # A dictionary to store the parsed CSV file in a more readable format
    nesteddictionary = dict()
    # Use pandas to read in the CSV file, and convert the pandas data frame to a dictionary (.to_dict())
    dictionary = pandas.read_csv(self.qaml_report).to_dict()
    # Iterate through the dictionary - each header from the CSV file
    for header in dictionary:
        # Sample is the primary key, and value is the value of the cell for that primary key + header combination
        for sample, value in dictionary[header].items():
            # Update the dictionary with the new data
            try:
                nesteddictionary[sample].update({header: value})
            # Create the nested dictionary if it hasn't been created yet
            except KeyError:
                nesteddictionary[sample] = dict()
                nesteddictionary[sample].update({header: value})
    # Get the results into the metadata object
    for sample in self.metadata:
        # Initialise the GenomeQAML GenObject
        setattr(sample, self.analysistype, GenObject())
        # Initialise the predicted class attribute
        sample[self.analysistype].prediction = str()
        # Iterate through the dictionary of results
        for line in nesteddictionary:
            # Extract the sample name from the dictionary
            name = nesteddictionary[line]['Sample']
            # Ensure that the names match
            if name == sample.name:
                # Set the predicted class extracted from the dictionary
                sample[self.analysistype].prediction = nesteddictionary[line]['Predicted_Class']
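# A minimal, standalone sketch of the re-nesting performed above. pandas.DataFrame.to_dict() returns a
# column-major mapping {column: {row_index: value}}; the loop inverts it to {row_index: {column: value}}.
# The sample names and column names used here are illustrative assumptions only.
import pandas

df = pandas.DataFrame({'Sample': ['2017-SEQ-0001', '2017-SEQ-0002'],
                       'Predicted_Class': ['Pass', 'Fail']})
column_major = df.to_dict()  # {'Sample': {0: '2017-SEQ-0001', ...}, 'Predicted_Class': {0: 'Pass', ...}}
row_major = dict()
for header, cells in column_major.items():
    for row_index, value in cells.items():
        row_major.setdefault(row_index, dict())[header] = value
# row_major[0] == {'Sample': '2017-SEQ-0001', 'Predicted_Class': 'Pass'}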
def mashing(self):
    printtime('Performing {} analyses'.format(self.analysistype), self.starttime)
    # Create the threads for the analysis
    for i in range(self.cpus):
        threads = Thread(target=self.mash, args=())
        threads.setDaemon(True)
        threads.start()
    # Populate threads for each gene, genome combination
    for sample in self.metadata:
        sample[self.analysistype].mashresults = os.path.join(sample[self.analysistype].reportdir,
                                                             '{}.tab'.format(sample.name))
        sample.commands.mash = \
            'mash dist -p {} {} {} | sort -gk3 > {}'.format(self.threads,
                                                            sample[self.analysistype].refseqsketch,
                                                            sample[self.analysistype].sketchfile,
                                                            sample[self.analysistype].mashresults)
        try:
            self.mashqueue.put(sample)
        except (KeyboardInterrupt, SystemExit):
            printtime('Received keyboard interrupt, quitting threads', self.starttime)
            quit()
    # Join the threads
    self.mashqueue.join()
    self.parse()
def targets(self):
    """
    Using the data from the BLAST analyses, set the targets folder, and create the 'mapping file'. This is the
    genera-specific FASTA file that will be used for all the reference mapping; it replaces the 'bait file'
    in the code
    """
    printtime('Performing analysis with {} targets folder'.format(self.analysistype), self.start,
              output=self.portallog)
    for sample in self.runmetadata:
        if sample.general.bestassemblyfile != 'NA':
            sample[self.analysistype].targetpath = \
                os.path.join(self.targetpath, 'genera', sample[self.analysistype].genus, '')
            # There is a relatively strict databasing scheme necessary for the custom targets. Eventually,
            # there will be a helper script to combine individual files into a properly formatted combined file
            try:
                sample[self.analysistype].mappingfile = \
                    glob('{}*.fa'.format(sample[self.analysistype].targetpath))[0]
            # If the FASTA file is missing, raise a custom error
            except IndexError as e:
                # noinspection PyPropertyAccess
                e.args = ['Cannot find the combined FASTA file in {}. Please note that the file must have a '
                          '.fa extension'.format(sample[self.analysistype].targetpath)]
                if os.path.isdir(sample[self.analysistype].targetpath):
                    raise
                else:
                    sample.general.bestassemblyfile = 'NA'
def download_profile(self):
    """
    Download the profile from the database
    """
    printtime('Downloading profile', self.start)
    # Set the name of the profile file
    profile_file = os.path.join(self.output_path, 'profile.txt')
    size = 0
    # Ensure that the file exists, and that it is not too small; a small file likely indicates a failed download
    try:
        stats = os.stat(profile_file)
        size = stats.st_size
    except FileNotFoundError:
        pass
    # Only download the profile if the file doesn't exist, or is likely truncated
    if not os.path.isfile(profile_file) or size <= 100:
        # Create a new session
        session = OAuth1Session(self.consumer_key,
                                self.consumer_secret,
                                access_token=self.session_token,
                                access_token_secret=self.session_secret)
        # The profile file is called profiles_csv on the server. Update the URL appropriately
        r = session.get(self.profile + '/1/profiles_csv')
        # On a successful GET request, parse the returned data appropriately
        if r.status_code == 200 or r.status_code == 201:
            if re.search('json', r.headers['content-type'], flags=0):
                decoded = r.json()
            else:
                decoded = r.text
            # Write the profile file to disk
            with open(profile_file, 'w') as profile:
                profile.write(decoded)
def numberofsamples(self):
    """Count the number of samples in the sample sheet"""
    # Initialise variables to store line data
    idline = 0
    linenumber = 0
    # Parse the sample sheet to find the number of samples
    with open(self.samplesheet, "r") as ssheet:
        # Use enumerate to iterate through the lines in the sample sheet to retrieve the line number and the data
        for linenumber, entry in enumerate(ssheet):
            # Once Sample_ID is encountered
            if "Sample_ID" in entry:
                # Set the id line as the current line number
                idline = linenumber
    # :samplecount is the last line number in the file minus the line number of Sample_ID
    self.samplecount = linenumber - idline
    printtime('There are {} samples in this run. '
              'Running off-hours module with the following parameters:\n'
              'MiSeqPath: {},\n'
              'MiSeqFolder: {},\n'
              'SampleSheet: {}'.format(self.samplecount, self.miseqpath, self.miseqfolder, self.samplesheet),
              self.start)
    # Run the fastqmover module now that the number of sequences is known
    self.fastqlinker()
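# A standalone sketch of the counting logic above: every line after the 'Sample_ID' header row in a
# MiSeq SampleSheet.csv is treated as one sample. The function name and default file name are
# illustrative assumptions only.
def count_samples(samplesheet='SampleSheet.csv'):
    idline = 0
    linenumber = 0
    with open(samplesheet, 'r') as ssheet:
        for linenumber, entry in enumerate(ssheet):
            if 'Sample_ID' in entry:
                idline = linenumber
    # Lines remaining after the header row correspond to samples
    return linenumber - idline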
def clean_sequences(self):
    """Removes reads/contigs that contain plasmids, and masks phage sequences."""
    printtime('Removing plasmids and masking phages', self.start)
    plasmid_db = os.path.join(self.reffilepath, 'plasmidfinder', 'plasmid_database.fa')
    phage_db = os.path.join(self.reffilepath, 'prophages', 'combinedtargets.tfa')
    for sample in self.runmetadata.samples:
        # Remove reads/contigs matching the plasmid database with bbduk
        plasmid_removal = 'bbduk.sh ref={} in={} out={} overwrite' \
            .format(plasmid_db, sample.general.combined, sample.general.combined.replace('.f', '_noplasmid.f'))
        subprocess.call(plasmid_removal, shell=True, stdout=self.devnull, stderr=self.devnull)
        # Mask phage sequences in the remaining reads/contigs with Ns (kmask=N)
        phage_masking = 'bbduk.sh ref={} in={} out={} kmask=N overwrite' \
            .format(phage_db, sample.general.combined.replace('.f', '_noplasmid.f'),
                    sample.general.combined.replace('.f', '_clean.f'))
        subprocess.call(phage_masking, shell=True, stdout=self.devnull, stderr=self.devnull)
        # Replace the original combined file with the cleaned file, and remove the intermediate file
        os.remove(sample.general.combined)
        os.rename(sample.general.combined.replace('.f', '_clean.f'), sample.general.combined)
        os.remove(sample.general.combined.replace('.f', '_noplasmid.f'))
def classifymetagenome(self):
    """Run the classify metagenome of the CLARK package on the samples"""
    printtime('Classifying metagenomes', self.start)
    # Define the system call
    self.classifycall = 'cd {} && ./classify_metagenome.sh -O {} -R {} -n {} --light' \
        .format(self.clarkpath, self.filelist, self.reportlist, self.cpus)
    # Variable to store classification state
    classify = True
    for sample in self.runmetadata.samples:
        try:
            # Define the name of the .csv classification file
            sample.general.classification = sample.general.combined.split('.')[0] + '.csv'
            # If the file exists, then set classify to False
            if os.path.isfile(sample.general.classification):
                classify = False
        except KeyError:
            pass
    # Run the system call if the samples have not been classified
    if classify:
        # Run the call
        subprocess.call(self.classifycall, shell=True, stdout=self.devnull, stderr=self.devnull)
def normalise_reads(self):
    """
    Use bbnorm from the bbmap suite of tools to perform read normalisation
    """
    printtime('Normalising reads to a kmer depth of 100', self.start)
    for sample in self.metadata:
        # Set the name of the normalised read files
        sample.general.normalisedreads = [fastq.split('.fastq.gz')[0] + '_normalised.fastq.gz'
                                          for fastq in sorted(sample.general.fastqfiles)]
        try:
            # Run the normalisation command
            out, err, cmd = bbtools.bbnorm(forward_in=sorted(sample.general.trimmedcorrectedfastqfiles)[0],
                                           forward_out=sample.general.normalisedreads[0],
                                           returncmd=True,
                                           threads=self.cpus)
            sample[self.analysistype].normalisecmd = cmd
            write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr, None, None)
        except CalledProcessError:
            sample.general.normalisedreads = sample.general.trimmedfastqfiles
        except IndexError:
            sample.general.normalisedreads = list()
def quality(self):
    """
    Creates quality objects and runs quality assessments and quality processes on the supplied sequences
    """
    # Validate that the FASTQ files are in the proper format, and that there are no issues e.g. different numbers
    # of forward and reverse reads, read length longer than quality score length, proper extension
    self.fastq_validate()
    # Run FastQC on the unprocessed fastq files
    self.fastqc_raw()
    # Perform quality trimming and FastQC on the trimmed files
    self.quality_trim()
    # Run FastQC on the trimmed files
    self.fastqc_trimmed()
    # Perform error correcting on the reads
    self.error_correct()
    # Detect contamination in the reads
    self.contamination_detection()
    # Run FastQC on the processed fastq files
    self.fastqc_trimmedcorrected()
    # Print the metadata, and exit if only pre-processing of data is requested
    metadataprinter.MetadataPrinter(self)
    if self.preprocess:
        printtime('Pre-processing complete', self.starttime)
        quit()
def estimateabundance(self):
    """
    Estimate the abundance of taxonomic groups
    """
    printtime('Estimating abundance of taxonomic groups', self.start)
    # Create and start threads
    for i in range(self.cpus):
        # Send the threads to the appropriate destination function
        threads = Thread(target=self.estimate, args=())
        # Set the daemon to true - something to do with thread management
        threads.setDaemon(True)
        # Start the threading
        threads.start()
    for sample in self.runmetadata.samples:
        try:
            if sample.general.combined != 'NA':
                # Set the name of the abundance report
                sample.general.abundance = sample.general.combined.split('.')[0] + '_abundance.csv'
                # if not hasattr(sample, 'commands'):
                if not sample.commands.datastore:
                    sample.commands = GenObject()
                # Define system calls
                sample.commands.target = self.targetcall
                sample.commands.classify = self.classifycall
                sample.commands.abundancecall = \
                    'cd {} && ./estimate_abundance.sh -D {} -F {} > {}'.format(self.clarkpath,
                                                                               self.databasepath,
                                                                               sample.general.classification,
                                                                               sample.general.abundance)
                self.abundancequeue.put(sample)
        except KeyError:
            pass
    self.abundancequeue.join()
def reporter(self):
    """
    Runs the necessary methods to parse raw read outputs
    """
    printtime('Preparing reports', self.starttime)
    # Populate self.plusdict in order to reuse parsing code from an assembly-based method
    for sample in self.runmetadata.samples:
        if sample.general.bestassemblyfile != 'NA':
            for gene in sample[self.analysistype].allelenames:
                for allele, percentidentity in sample[self.analysistype].results.items():
                    if gene in allele:
                        # Split the allele number from the gene name using the appropriate delimiter
                        if '_' in allele:
                            splitter = '_'
                        elif '-' in allele:
                            splitter = '-'
                        else:
                            splitter = ''
                        # Create the plusdict dictionary as in the assembly-based (r)MLST method. Allows all the
                        # parsing and sequence typing code to be reused.
                        try:
                            self.plusdict[sample.name][gene][allele.split(splitter)[1]][percentidentity] \
                                = sample[self.analysistype].avgdepth[allele]
                        except IndexError:
                            pass
    self.profiler()
    self.sequencetyper()
    self.mlstreporter()
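# A minimal sketch of the plusdict structure populated above, assuming an allele named in the
# 'gene_allelenumber' style (e.g. 'BACT000001_12'); the sample name and numeric values are
# illustrative assumptions only.
allele = 'BACT000001_12'
gene, allele_number = allele.split('_')[0], allele.split('_')[1]
plusdict = {'2017-SEQ-0001': {gene: {allele_number: {98.7: 42.3}}}}
# i.e. plusdict[sample name][gene][allele number][percent identity] = average read depth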
def extract_rmlst_reads(self, fastq_pairs, fastq_singles):
    """
    Extracts rmlst reads and puts them in a folder.
    :param fastq_pairs: List of fastq pairs in nested array [[forward1, reverse1], [forward2, reverse2]]
    :param fastq_singles: List of fastq singles.
    :return: Zip, zilch, nada.
    """
    for pair in fastq_pairs:
        cmd = 'bbduk.sh ref={} in1={} in2={} outm={}' \
              ' outm2={}'.format(self.database, pair[0], pair[1],
                                 self.output_file + 'rmlsttmp/' + pair[0].split('/')[-1],
                                 self.output_file + 'rmlsttmp/' + pair[1].split('/')[-1])
        with open(self.output_file + 'tmp/junk.txt', 'w') as outjunk:
            try:
                # This should give bbduk more than enough time to run, unless user's computer is super slow.
                # Maybe adjust the value later.
                subprocess.call(cmd, shell=True, stderr=outjunk, timeout=3600)
            except subprocess.TimeoutExpired:
                printtime(pair[0] + ' appears to be making BBDUK run forever. Killing...', self.start)
                os.remove(self.output_file + 'rmlsttmp/' + pair[0].split('/')[-1])
                os.remove(self.output_file + 'rmlsttmp/' + pair[1].split('/')[-1])
    for single in fastq_singles:
        cmd = 'bbduk.sh ref={} in={} outm={}' \
              ''.format(self.database, single, self.output_file + 'rmlsttmp/' + single.split('/')[-1])
        with open(self.output_file + 'tmp/junk.txt', 'w') as outjunk:
            try:
                # This should give bbduk more than enough time to run, unless user's computer is super slow.
                # Maybe adjust the value later.
                subprocess.call(cmd, shell=True, stderr=outjunk, timeout=3600)
            except subprocess.TimeoutExpired:
                printtime(single + ' appears to be making BBDUK run forever. Killing...', self.start)
                os.remove(self.output_file + 'rmlsttmp/' + single.split('/')[-1])
def blast(self):
    """
    Run BLAST analyses of the subsampled FASTQ reads against the NCBI 16S reference database
    """
    printtime('BLASTing FASTA files against {} database'.format(self.analysistype), self.starttime,
              output=self.portallog)
    for _ in range(self.cpus):
        threads = Thread(target=self.blastthreads, args=())
        threads.setDaemon(True)
        threads.start()
    for sample in self.runmetadata.samples:
        if sample.general.bestassemblyfile != 'NA':
            # Set the name of the BLAST report
            sample[self.analysistype].blastreport = os.path.join(
                sample[self.analysistype].outputdir,
                '{}_{}_blastresults.csv'.format(sample.name, self.analysistype))
            # Use the NCBI BLASTn command line wrapper module from BioPython to set the parameters of the search
            blastn = NcbiblastnCommandline(query=sample[self.analysistype].fasta,
                                           db=os.path.splitext(sample[self.analysistype].baitfile)[0],
                                           max_target_seqs=1,
                                           num_threads=self.threads,
                                           outfmt="'6 qseqid sseqid positive mismatch gaps "
                                                  "evalue bitscore slen length qstart qend qseq sstart send sseq'",
                                           out=sample[self.analysistype].blastreport)
            # Add a string of the command to the metadata object
            sample[self.analysistype].blastcall = str(blastn)
            # Add the object and the command to the BLAST queue
            self.blastqueue.put((sample, blastn))
    self.blastqueue.join()
def quast(self):
    printtime('Performing Quast analyses', self.start)
    for i in range(len([sample.general for sample in self.metadata
                        if sample.general.bestassemblyfile != 'NA'])):
        # Send the threads to the runquast method. args is empty
        threads = Thread(target=self.runquast, args=())
        # Set the daemon to true - something to do with thread management
        threads.setDaemon(True)
        # Start the threading
        threads.start()
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            # Create the quast output directory
            quastoutputdirectory = '{}/quast_results/'.format(sample.general.outputdirectory)
            make_path(quastoutputdirectory)
            # Set the quast system call
            quastcall = 'quast.py {} -o {}'.format(sample.general.filteredfile, quastoutputdirectory)
            # Add the command to the metadata
            sample.commands.quast = quastcall
            self.quastqueue.put((sample, quastoutputdirectory))
        else:
            sample.commands.quast = 'NA'
    self.quastqueue.join()
def get_session_token(self):
    """
    Use the access token to request a new session token
    """
    printtime('Getting session token', self.start)
    # Rather than testing any previous session tokens to see if they are still valid, simply delete old tokens in
    # preparation of the creation of new ones
    try:
        os.remove(os.path.join(self.file_path, 'session_token'))
    except FileNotFoundError:
        pass
    # Create a new session
    session_request = OAuth1Session(self.consumer_key,
                                    self.consumer_secret,
                                    access_token=self.access_token,
                                    access_token_secret=self.access_secret)
    # Set the URL appropriately
    url = self.test_rest_url + '/oauth/get_session_token'
    # Perform a GET request with the appropriate keys and tokens
    r = session_request.get(url)
    # If the status code is '200' (OK), proceed
    if r.status_code == 200:
        # Save the JSON-decoded token secret and token
        self.session_token = r.json()['oauth_token']
        self.session_secret = r.json()['oauth_token_secret']
        # Write the token and secret to file
        self.write_token('session_token', self.session_token, self.session_secret)
    # Any other status than 200 is considered a failure
    else:
        print('Failed:')
        print(r.json()['message'])
def error_correction(self):
    """
    Use tadpole from the bbmap suite of tools to perform error correction of the reads
    """
    printtime('Error correcting reads', self.start)
    for sample in self.metadata:
        sample.general.trimmedcorrectedfastqfiles = [fastq.split('.fastq.gz')[0] + '_trimmed_corrected.fastq.gz'
                                                     for fastq in sorted(sample.general.fastqfiles)]
        try:
            out, err, cmd = bbtools.tadpole(forward_in=sorted(sample.general.trimmedfastqfiles)[0],
                                            forward_out=sample.general.trimmedcorrectedfastqfiles[0],
                                            returncmd=True,
                                            mode='correct',
                                            threads=self.cpus)
            # Set the command in the object
            sample[self.analysistype].errorcorrectcmd = cmd
            write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr, None, None)
        except CalledProcessError:
            sample.general.trimmedcorrectedfastqfiles = sample.general.trimmedfastqfiles
        except KeyError:
            sample.general.trimmedcorrectedfastqfiles = list()
def profiler(self):
    """Creates a dictionary from the profile scheme(s)"""
    printtime('Loading profiles', self.starttime)
    from csv import DictReader
    # Initialise variables
    profiledata = defaultdict(make_dict)
    profileset = set()
    genedict = dict()
    # Find all the unique profiles to use with a set
    for sample in self.runmetadata.samples:
        if sample.general.bestassemblyfile != 'NA':
            if sample[self.analysistype].profile != 'NA':
                profileset.add(sample[self.analysistype].profile)
    # Extract the profiles for each set
    for sequenceprofile in profileset:
        # Clear the list of genes
        genelist = list()
        for sample in self.runmetadata.samples:
            if sample.general.bestassemblyfile != 'NA':
                if sequenceprofile == sample[self.analysistype].profile:
                    genelist = [allele for allele in sample[self.analysistype].alleles]
        try:
            # Open the sequence profile file as a dictionary
            profile = DictReader(open(sequenceprofile), dialect='excel-tab')
        # Revert to standard comma separated values
        except KeyError:
            # Open the sequence profile file as a dictionary
            profile = DictReader(open(sequenceprofile))
        # Iterate through the rows
        for row in profile:
            # Iterate through the genes
            for gene in genelist:
                # Add the sequence profile, and type, the gene name and the allele number to the dictionary
                try:
                    profiledata[sequenceprofile][row['ST']][gene] = row[gene]
                except KeyError:
                    try:
                        profiledata[sequenceprofile][row['rST']][gene] = row[gene]
                    except KeyError:
                        raise
        # Add the gene list to a dictionary
        genedict[sequenceprofile] = sorted(genelist)
        # Add the profile data, and gene list to each sample
        for sample in self.runmetadata.samples:
            if sample.general.bestassemblyfile != 'NA':
                if sequenceprofile == sample[self.analysistype].profile:
                    # Populate the metadata with the profile data
                    sample[self.analysistype].profiledata = profiledata[sample[self.analysistype].profile]
        dotter()
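# A minimal, standalone sketch of the nested profile dictionary built above, assuming a tab-delimited
# rMLST-style profile file with an 'rST' column. The file name, gene names and overall shape are
# illustrative assumptions only.
from csv import DictReader
from collections import defaultdict

profiledata = defaultdict(lambda: defaultdict(dict))
with open('profile.txt') as profile_file:
    for row in DictReader(profile_file, dialect='excel-tab'):
        for gene in ('BACT000001', 'BACT000002'):
            # profiledata[profile file][sequence type][gene] = allele number
            profiledata['profile.txt'][row['rST']][gene] = row[gene]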
def create_database_folder(self, database):
    """
    Create an appropriately named folder in which the database is to be stored
    :param database: the name of the database folder to create
    :return: the absolute path of the folder
    """
    printtime('Setting up {} database'.format(database), self.start)
    # Define the path to store the database files
    databasepath = os.path.join(self.databasepath, database)
    # Create the path as required
    make_path(databasepath)
    return databasepath
def settargets(self):
    """Set the targets to be used in the analyses. Involves the path of the database files, the database files
    to use, and the level of classification for the analysis"""
    # Define the set targets call. Include the path to the script, the database path and files, as well
    # as the taxonomic rank to use
    printtime('Setting up database', self.start)
    self.targetcall = 'cd {} && ./set_targets.sh {} {} --{}'.format(self.clarkpath,
                                                                    self.databasepath,
                                                                    self.database,
                                                                    self.rank)
    # subprocess.call(self.targetcall, shell=True, stdout=self.devnull, stderr=self.devnull)
def reporter(self):
    """
    Create a report of the results
    """
    printtime('Writing report', self.starttime)
    data = 'Strain,Profile\n'
    for sample in self.runmetadata.samples:
        # Only add to the string if there are results
        if sample[self.analysistype].toxinprofile:
            data += '{},{}\n'.format(sample.name, sample[self.analysistype].toxinprofile)
    # Create the report, and write to it
    with open(os.path.join(self.reportpath, '{}.csv'.format(self.analysistype)), 'w') as report:
        report.write(data)
def main(self):
    """
    Run the necessary methods in the correct order
    """
    printtime('Starting {} analysis pipeline'.format(self.analysistype), self.starttime)
    # Create the objects to be used in the analyses
    objects = Objectprep(self)
    objects.objectprep()
    self.runmetadata = objects.samples
    self.threads = int(self.cpus / len(self.runmetadata.samples)) \
        if self.cpus / len(self.runmetadata.samples) > 1 else 1
    # Run the genesippr analyses
    self.analysistype = 'genesippr'
    self.targetpath = os.path.join(self.reffilepath, self.analysistype, '')
    Sippr(self, 0.90)
    # Create the reports
    self.reports = Reports(self)
    Reports.reporter(self.reports)
    # Run the 16S analyses using the filtered database
    self.targetpath = self.reffilepath
    self.analysistype = 'sixteens_full'
    SixteensFull(self, self.commit, self.starttime, self.homepath, 'sixteens_full', 0.985)
    # Run the ResFinder analyses
    Resistance(self, self.commit, self.starttime, self.homepath, 'resfinder', 0.90, False, True)
    # Run the GDCS analysis
    self.analysistype = 'GDCS'
    self.pipeline = True
    self.targetpath = os.path.join(self.targetpath, self.analysistype)
    Sippr(self, 0.95)
    # Create the reports
    Reports.gdcsreporter(self.reports)
    # Perform serotyping for samples classified as Escherichia
    for sample in self.runmetadata.samples:
        if sample.general.bestassemblyfile != 'NA':
            sample.mash = GenObject()
            try:
                sample.mash.closestrefseqgenus = sample.general.closestrefseqgenus
                for genus, species in self.taxonomy.items():
                    if genus == sample.mash.closestrefseqgenus:
                        sample.mash.closestrefseqspecies = species
            except KeyError:
                sample.mash.closestrefseqgenus = 'NA'
                sample.mash.closestrefseqspecies = 'NA'
        else:
            sample.mash.closestrefseqgenus = 'NA'
            sample.mash.closestrefseqspecies = 'NA'
    SeroSippr(self, self.commit, self.starttime, self.homepath, 'serosippr', 0.95, True)
    # Print the metadata
    printer = MetadataPrinter(self)
    printer.printmetadata()
def sistr(self):
    """Perform sistr analyses on Salmonella"""
    printtime('Performing sistr analyses', self.start)
    for sample in self.metadata:
        # Create the analysis-type specific attribute
        setattr(sample, self.analysistype, GenObject())
        if sample.general.bestassemblyfile != 'NA':
            try:
                # Only process strains that have been determined to be Salmonella
                if sample.general.referencegenus == 'Salmonella':
                    # Set and create the path of the directory to store the strain-specific reports
                    sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                                       self.analysistype)
                    # Name of the .json output file
                    sample[self.analysistype].jsonoutput = os.path.join(sample[self.analysistype].reportdir,
                                                                        '{}.json'.format(sample.name))
                    # Set the sistr system call
                    sample.commands.sistr = \
                        'sistr -f json -o {} -t {} -T {} {}' \
                        .format(sample[self.analysistype].jsonoutput,
                                self.cpus,
                                os.path.join(sample[self.analysistype].reportdir, 'tmp'),
                                sample.general.bestassemblyfile)
                    sample[self.analysistype].logout = os.path.join(sample[self.analysistype].reportdir, 'logout')
                    sample[self.analysistype].logerr = os.path.join(sample[self.analysistype].reportdir, 'logerr')
                    # Only run the analyses if the output json file does not exist
                    if not os.path.isfile(sample[self.analysistype].jsonoutput):
                        out, err = run_subprocess(sample.commands.sistr)
                        write_to_logfile(sample.commands.sistr, sample.commands.sistr, self.logfile,
                                         sample.general.logout, sample.general.logerr,
                                         sample[self.analysistype].logout, sample[self.analysistype].logerr)
                        write_to_logfile(out, err, self.logfile,
                                         sample.general.logout, sample.general.logerr,
                                         sample[self.analysistype].logout, sample[self.analysistype].logerr)
                    self.queue.task_done()
            except (ValueError, KeyError):
                pass
    self.queue.join()
    self.report()
def objectprep(self):
    """Create objects to store data and metadata for each sample. Also, perform necessary file manipulations"""
    # Move the files to subfolders and create objects
    self.runmetadata = createobject.ObjectCreation(self)
    if self.runmetadata.extension == 'fastq':
        # To streamline the CLARK process, decompress and combine .gz and paired end files as required
        printtime('Decompressing and combining .fastq files for CLARK analysis', self.start)
        fileprep.Fileprep(self)
    else:
        printtime('Using .fasta files for CLARK analysis', self.start)
        for sample in self.runmetadata.samples:
            sample.general.combined = sample.general.fastqfiles[0]
def predictthreads(self):
    printtime('Performing gene predictions', self.start)
    # Create the threads for the analyses
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            threads = Thread(target=self.predict, args=())
            threads.setDaemon(True)
            threads.start()
    for sample in self.metadata:
        # Create the .prodigal attribute
        sample.prodigal = GenObject()
        if sample.general.bestassemblyfile != 'NA':
            self.predictqueue.put(sample)
    self.predictqueue.join()
def movefastq(self):
    """Find .fastq files for each sample and move them to an appropriately named folder"""
    printtime('Moving FASTQ files', self.start)
    # Iterate through each sample
    for sample in self.metadata.runmetadata.samples:
        # Retrieve the output directory
        outputdir = os.path.join(self.path, sample.name)
        # Find any fastq files with the sample name
        fastqfiles = sorted(glob(os.path.join(self.path, '{}_*.fastq*'.format(sample.name)))) \
            if sorted(glob(os.path.join(self.path, '{}_*.fastq*'.format(sample.name)))) \
            else sorted(glob(os.path.join(self.path, '{}.fastq*'.format(sample.name)))) \
            if sorted(glob(os.path.join(self.path, '{}.fastq*'.format(sample.name)))) \
            else sorted(glob(os.path.join(self.path, '{}*.fastq*'.format(sample.name))))
        # Only try and move the files if the files exist
        if fastqfiles:
            make_path(outputdir)
            # Symlink the fastq files to the directory
            try:
                list(map(lambda x: os.symlink(os.path.join('..', os.path.basename(x)),
                                              os.path.join(outputdir, os.path.basename(x))), fastqfiles))
            except OSError:
                pass
            # Find any fastq files with the sample name
            fastqfiles = [fastq for fastq in
                          sorted(glob(os.path.join(outputdir, '{}*.fastq*'.format(sample.name))))
                          if 'trimmed' not in fastq and 'normalised' not in fastq and 'corrected' not in fastq
                          and 'paired' not in fastq and 'unpaired' not in fastq]
        else:
            if outputdir:
                # Find any fastq files with the sample name
                fastqfiles = [fastq for fastq in
                              sorted(glob(os.path.join(outputdir, '{}*.fastq*'.format(sample.name))))
                              if 'trimmed' not in fastq and 'normalised' not in fastq and 'corrected' not in fastq
                              and 'paired' not in fastq and 'unpaired' not in fastq]
        sample.general.fastqfiles = fastqfiles
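# A minimal sketch of the same "first glob pattern that matches wins" lookup written as a loop instead of
# nested conditional expressions; the helper name is an illustrative assumption, the patterns mirror the
# method above.
from glob import glob
import os


def find_fastq(path, name):
    """Return the first non-empty, sorted set of FASTQ matches for a sample name."""
    for pattern in ('{}_*.fastq*', '{}.fastq*', '{}*.fastq*'):
        matches = sorted(glob(os.path.join(path, pattern.format(name))))
        if matches:
            return matches
    return []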
def __init__(self, inputobject):
    self.metadata = inputobject.runmetadata.samples
    self.start = inputobject.starttime
    self.kmers = inputobject.kmers
    self.cpus = inputobject.cpus
    try:
        self.threads = int(self.cpus / len(self.metadata)) if self.cpus / len(self.metadata) > 1 else 1
    except TypeError:
        self.threads = self.cpus
    self.path = inputobject.path
    self.logfile = inputobject.logfile
    self.assemblequeue = Queue(maxsize=self.threads)
    printtime('Assembling sequences', self.start)
    self.spades()
def combinealleles(self, allelepath, alleles):
    printtime('Creating combined rMLST allele file', self.start)
    with open(os.path.join(allelepath, 'rMLST_combined.fasta'), 'w') as combinedfile:
        # Open each allele file
        for allele in sorted(alleles):
            # with open(allele, 'rU') as fasta:
            for record in SeqIO.parse(open(allele, "r"), "fasta"):
                # Extract the sequence record from each entry in the multifasta
                # Replace any dashes in the record.id with underscores
                record.id = record.id.replace('-', '_')
                # Remove any dashes or 'N's from the sequence data - makeblastdb can't handle sequences
                # with gaps
                # noinspection PyProtectedMember
                record.seq._data = record.seq._data.replace('-', '').replace('N', '')
                # Clear the name and description attributes of the record
                record.name = ''
                record.description = ''
                # Write each record to the combined file
                SeqIO.write(record, combinedfile, 'fasta')
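# A minimal sketch of an alternative that avoids mutating the private Seq._data attribute, which is not
# guaranteed to be a str (it is bytes in recent Biopython releases). This is an assumption-based
# illustration, not the pipeline's implementation; the file names are hypothetical.
from Bio import SeqIO
from Bio.Seq import Seq

with open('rMLST_combined.fasta', 'w') as combinedfile:
    for record in SeqIO.parse('BACT000001.tfa', 'fasta'):
        record.id = record.id.replace('-', '_')
        # Rebuild the sequence without gaps or Ns instead of editing the private attribute in place
        record.seq = Seq(str(record.seq).replace('-', '').replace('N', ''))
        record.name = ''
        record.description = ''
        SeqIO.write(record, combinedfile, 'fasta')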
def find_loci(self):
    """
    Finds the URLs for all allele files
    """
    printtime('Downloading alleles', self.start)
    session = OAuth1Session(self.consumer_key,
                            self.consumer_secret,
                            access_token=self.session_token,
                            access_token_secret=self.session_secret)
    # Use the URL for all loci determined above
    r = session.get(self.loci)
    if r.status_code == 200 or r.status_code == 201:
        if re.search('json', r.headers['content-type'], flags=0):
            decoded = r.json()
        else:
            decoded = r.text
        # Extract all the URLs in the decoded dictionary under the key 'loci'
        for locus in decoded['loci']:
            # Add each URL to the list
            self.loci_url.append(locus)