def basic(self):
    # Grab any .fastq files in the path
    fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
    # Extract the base name of the globbed name + path provided
    fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
    # Iterate through the names of the fastq files
    for fastqname in sorted(fastqnames):
        # Set the name
        metadata = MetadataObject()
        metadata.name = fastqname
        # Set the destination folder
        outputdir = os.path.join(self.path, fastqname)
        # Make the destination folder
        make_path(outputdir)
        # Get the fastq files specific to the fastqname
        specificfastq = glob(os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
        # Link the files to the output folder
        try:
            # Link the .gz files to :self.path/:filename
            list(map(lambda x: os.symlink('../{}'.format(os.path.basename(x)),
                                          '{}/{}'.format(outputdir, os.path.basename(x))), specificfastq))
        # Except os errors
        except OSError as exception:
            # If there is an exception other than the file exists, raise it
            if exception.errno != errno.EEXIST:
                raise
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Populate the .fastqfiles category of :self.metadata
        metadata.general.fastqfiles = [
            fastq for fastq in sorted(glob(os.path.join(outputdir, '{}*.fastq*'.format(metadata.name))))
            if 'trimmed' not in fastq and 'normalised' not in fastq and 'corrected' not in fastq
            and 'paired' not in fastq and 'unpaired' not in fastq
        ]
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.general.logout = os.path.join(self.path, metadata.name,
                                               '{}_log_out.txt'.format(metadata.name))
        metadata.general.logerr = os.path.join(self.path, metadata.name,
                                               '{}_log_err.txt'.format(metadata.name))
        # Append the metadata to the list of samples
        self.samples.append(metadata)
    # Grab metadata from previous runs
    previousmetadata = metadataReader.MetadataReader(self)
    # Update self.samples (if required)
    if previousmetadata.samples:
        self.samples = previousmetadata.samples
    # Run the read length method
    self.readlength()
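
# The filer() helper used above comes from accessoryFunctions and is assumed to collapse paired
# FASTQ file names down to one shared base name per sample. A minimal sketch of that behaviour,
# assuming standard Illumina _R1/_R2 (or _1/_2) naming; the real helper may handle more layouts:
def filer_sketch(filelist):
    import re
    # Strip the read-direction token and everything after it to recover one name per sample
    return sorted({re.split(r'_R[12]|_[12]\.', name)[0] for name in filelist})

# Example: filer_sketch(['strains/A_R1.fastq.gz', 'strains/A_R2.fastq.gz', 'strains/B_R1.fastq.gz'])
# returns ['strains/A', 'strains/B']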
def test_sistr(variables):
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    # Set the destination folder
    outputdir = os.path.join(variables.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.run.outputdirectory = outputdir
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
def createobject(self):
    # Grab any .fastq files in the path
    fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
    # Extract the base name of the globbed name + path provided
    fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
    # Iterate through the names of the fastq files
    for fastqname in sorted(fastqnames):
        # Set the name
        metadata = MetadataObject()
        metadata.name = fastqname
        # Set the destination folder
        outputdir = os.path.join(self.path, fastqname)
        # Make the destination folder
        make_path(outputdir)
        # Get the fastq files specific to the fastqname
        specificfastq = glob(os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
        # Make relative symlinks to the files in :self.path
        try:
            for fastq in specificfastq:
                # Get the basename of the file
                fastqfile = os.path.split(fastq)[-1]
                # Set the destination fastq path as the base name plus the destination folder
                destinationfastq = os.path.join(outputdir, fastqfile)
                # Symlink the files
                os.symlink('../{}'.format(fastqfile), destinationfastq)
        # Except os errors
        except OSError as exception:
            # If there is an exception other than the file exists, raise it
            if exception.errno != errno.EEXIST:
                raise
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Populate the .fastqfiles category of :self.metadata
        metadata.general.fastqfiles = [
            fastq for fastq in glob(os.path.join(outputdir, '{}*.fastq*'.format(fastqname)))
            if 'trimmed' not in fastq
        ]
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.run.outputdirectory = outputdir
        metadata.general.bestassemblyfile = True
        metadata.general.trimmedcorrectedfastqfiles = metadata.general.fastqfiles
        metadata.general.logout = os.path.join(metadata.general.outputdirectory, 'logout')
        metadata.general.logerr = os.path.join(metadata.general.outputdirectory, 'logerr')
        # Initialise an attribute to store commands
        metadata.commands = GenObject()
        # Append the metadata to the list of samples
        self.samples.append(metadata)
def estimateabundance(self):
    """
    Estimate the abundance of taxonomic groups
    """
    printtime('Estimating abundance of taxonomic groups', self.start)
    # Create and start threads
    for i in range(self.cpus):
        # Send the threads to the appropriate destination function
        threads = Thread(target=self.estimate, args=())
        # Set daemon to True so the threads are terminated when the main thread exits
        threads.setDaemon(True)
        # Start the threading
        threads.start()
    for sample in self.runmetadata.samples:
        try:
            if sample.general.combined != 'NA':
                # Set the name of the abundance report
                sample.general.abundance = sample.general.combined.split('.')[0] + '_abundance.csv'
                # if not hasattr(sample, 'commands'):
                if not sample.commands.datastore:
                    sample.commands = GenObject()
                # Define system calls
                sample.commands.target = self.targetcall
                sample.commands.classify = self.classifycall
                sample.commands.abundancecall = \
                    'cd {} && ./estimate_abundance.sh -D {} -F {} > {}' \
                    .format(self.clarkpath, self.databasepath, sample.general.classification,
                            sample.general.abundance)
                self.abundancequeue.put(sample)
        except KeyError:
            pass
    self.abundancequeue.join()
def metaparse(self, sample, quastoutputdirectory):
    import functools
    # Tuples of strings to replace when parsing the results file
    repls = ('>=', 'Over'), ('000 Bp', 'kbp'), ('#', 'Num'), ("'", ''), ('(', ''), (')', ''), \
            (' ', ''), ('>', 'Less'), ('Gc%', 'GC%')
    # Initialise the results dictionary
    quast = dict()
    # The results file is gage_report.tsv if that file exists, otherwise it is report.tsv
    resfile = "{0:s}/gage_report.tsv".format(quastoutputdirectory) \
        if os.path.isfile("{0:s}/gage_report.tsv".format(quastoutputdirectory)) \
        else "{0:s}/report.tsv".format(quastoutputdirectory)
    with open(resfile) as report:
        for line in report:
            # Use the headings in the report as keys for the GenObject supplied from the generator,
            # applying each replacement incrementally with functools.reduce and the lambda below
            k, v = [functools.reduce(lambda a, kv: a.replace(*kv), repls, s.title())
                    for s in line.rstrip().split('\t')]
            quast[k] = v
    # Create the quast metadata object
    sample.quast = GenObject(quast)
    sample.quast.outputdirectory = quastoutputdirectory
    sample.quast.kmers = self.kmers
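
# A worked example of the reduce() call above: a QUAST heading such as "# contigs (>= 1000 bp)"
# is title-cased to "# Contigs (>= 1000 Bp)", and then each (old, new) pair in repls is applied
# in turn, yielding "NumContigsOver1kbp" - a string safe to use as a GenObject attribute name.
import functools
repls = ('>=', 'Over'), ('000 Bp', 'kbp'), ('#', 'Num'), ("'", ''), ('(', ''), (')', ''), \
        (' ', ''), ('>', 'Less'), ('Gc%', 'GC%')
key = functools.reduce(lambda a, kv: a.replace(*kv), repls, '# contigs (>= 1000 bp)'.title())
assert key == 'NumContigsOver1kbp'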
def __init__(self, inputobject):
    self.metadata = inputobject.runmetadata.samples
    self.cpus = inputobject.cpus
    try:
        self.threads = int(self.cpus / len(self.metadata)) if self.cpus / len(self.metadata) > 1 else 1
    except TypeError:
        self.threads = self.cpus
    # self.devnull = open(os.devnull, 'wb')
    self.qcqueue = Queue(maxsize=self.cpus)
    self.trimqueue = Queue(maxsize=self.cpus)
    self.correctqueue = Queue(maxsize=self.cpus)
    self.start = inputobject.starttime
    try:
        self.forwardlength = inputobject.forwardlength
        self.reverselength = inputobject.reverselength
    except AttributeError:
        self.forwardlength = 'full'
        self.reverselength = 'full'
    self.numreads = inputobject.numreads
    self.logfile = inputobject.logfile
    self.path = inputobject.path
    self.analysistype = 'quality'
    self.reffilepath = inputobject.reffilepath
    # Initialise the quality attribute in the metadata object
    for sample in self.metadata:
        setattr(sample, self.analysistype, GenObject())
def __init__(self, passed):
    """Initialise variables"""
    self.path = passed.path
    self.runinfo = passed.runinfo
    self.flowcell = "NA"
    self.instrument = "NA"
    self.samples = list()
    self.ids = list()
    self.date = str()
    self.totalreads = 0
    self.runid = str()
    self.runnumber = str()
    self.commit = passed.commit
    # Create and start to populate the header object
    self.header = GenObject()
    # If a custom sample sheet has been provided, use it
    if passed.customsamplesheet:
        self.samplesheet = passed.customsamplesheet
        assert os.path.isfile(self.samplesheet), u'Could not find CustomSampleSheet as entered: {0!r:s}' \
            .format(self.samplesheet)
    else:
        self.samplesheet = os.path.join(self.path, "SampleSheet.csv")
    # Extract data from SampleSheet.csv
    self.parsesamplesheet()
def parse_qaml(self):
    """
    Parse the GenomeQAML report, and populate metadata objects
    """
    printtime('Parsing GenomeQAML outputs', self.start)
    # A dictionary to store the parsed CSV file in a more readable format
    nesteddictionary = dict()
    # Use pandas to read in the CSV file, and convert the pandas data frame to a dictionary (.to_dict())
    dictionary = pandas.read_csv(self.qaml_report).to_dict()
    # Iterate through the dictionary - each header from the CSV file
    for header in dictionary:
        # Sample is the primary key, and value is the value of the cell for that primary key + header combination
        for sample, value in dictionary[header].items():
            # Update the dictionary with the new data
            try:
                nesteddictionary[sample].update({header: value})
            # Create the nested dictionary if it hasn't been created yet
            except KeyError:
                nesteddictionary[sample] = dict()
                nesteddictionary[sample].update({header: value})
    # Get the results into the metadata object
    for sample in self.metadata:
        # Initialise the GenomeQAML-specific GenObject
        setattr(sample, self.analysistype, GenObject())
        # Initialise the predicted class attribute
        sample[self.analysistype].prediction = str()
        # Iterate through the dictionary of results
        for line in nesteddictionary:
            # Extract the sample name from the dictionary
            name = nesteddictionary[line]['Sample']
            # Ensure that the names match
            if name == sample.name:
                # Set the predicted class extracted from the dictionary
                sample[self.analysistype].prediction = nesteddictionary[line]['Predicted_Class']
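
# The default DataFrame.to_dict() orientation is column-major: {header: {row_index: value}}, which
# is why the loop above inverts it into {row_index: {header: value}}. A minimal illustration using
# a hypothetical two-row report:
import pandas
frame = pandas.DataFrame({'Sample': ['A', 'B'], 'Predicted_Class': ['Pass', 'Fail']})
assert frame.to_dict() == {'Sample': {0: 'A', 1: 'B'}, 'Predicted_Class': {0: 'Pass', 1: 'Fail'}}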
def helper(self):
    """Helper function for file creation (if desired), manipulation, quality assessment,
    and trimming, as well as the assembly"""
    # Simple assembly without requiring accessory files (SampleSheet.csv, etc.)
    if self.basicassembly:
        self.runmetadata = Basic(self)
    else:
        # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml, and
        # RunInfo.xml files
        self.runinfo = os.path.join(self.path, 'RunInfo.xml')
        self.runmetadata = runMetadata.Metadata(self)
        # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
        self.runmetadata.parseruninfo()
        # Extract PhiX mapping information from the run
        phi = phix.PhiX(self)
        phi.main()
        # Populate the lack of bclcall and nohup call into the metadata sheet
        for sample in self.runmetadata.samples:
            sample.commands = GenObject()
            sample.commands.nohupcall = 'NA'
            sample.commands.bclcall = 'NA'
        # Move/link the FASTQ files to strain-specific working directories
        fastqmover.FastqMover(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def primers(self):
    """Setup and create threads for ePCR"""
    # Create the threads for the ePCR analysis
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            threads = Thread(target=self.epcr, args=())
            threads.setDaemon(True)
            threads.start()
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            setattr(sample, self.analysistype, GenObject())
            # Get the primers ready
            try:
                sample[self.analysistype].primers = glob(os.path.join(self.reffilepath, self.analysistype,
                                                                      sample.general.referencegenus,
                                                                      'primers', '*.txt'))[0]
                # Find the name of the probe file
                sample[self.analysistype].probes = glob(os.path.join(self.reffilepath, self.analysistype,
                                                                     sample.general.referencegenus,
                                                                     'probes', '*.fa'))[0]
                # Create the BLAST database of the probes (if necessary)
                self.makeblastdb(sample[self.analysistype].probes)
                # Initialise a list to store the names of the targets
                sample[self.analysistype].targets = list()
                # Open the primer file, and read the names of the targets into a list
                with open(sample[self.analysistype].primers, 'r') as primerfile:
                    for line in primerfile:
                        sample[self.analysistype].targets.append(line.split('\t')[0])
            # Organisms without primer/probe files will fail. Populate metadata with 'NA' values
            except IndexError:
                sample[self.analysistype].primers = 'NA'
                sample[self.analysistype].probes = 'NA'
            # Only try to process organisms with primer files
            if sample[self.analysistype].primers != 'NA':
                # Make the output path
                sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                                   self.analysistype)
                make_path(sample[self.analysistype].reportdir)
                # Set the base name of the output file
                outfile = sample[self.analysistype].reportdir + sample.name
                # Set the hashing and mapping commands
                sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
                sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
                # re-PCR uses the subtyping primers list to search the contigs file using the following parameters
                # -S {hash file} (Perform STS lookup using hash-file), -r + (Enable/disable reverse STS lookup)
                # -m 10000 (Set variability for STS size for lookup),
                # -n 2 (Set max allowed mismatches per primer for lookup)
                # -g 0 (Set max allowed indels per primer for lookup),
                # -G (Print alignments in comments), -o {output file}
                sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 2 -g 0 -G -q -o {}.txt {}' \
                    .format(outfile, outfile, sample[self.analysistype].primers)
                # Add the variables to the queue
                self.epcrqueue.put((sample, outfile))
    self.epcrqueue.join()
def error(sample, message):
    """
    Check to see if the run GenObject exists. If so, update the run.status attribute to reflect the error
    :param sample: metadata sample object
    :param message: error message to add to the sample.run.status attribute
    """
    # Set the .fastqfiles attribute to 'NA' to remove this strain from the analyses
    sample.general.fastqfiles = ['NA']
    # Ensure that the run attribute exists
    if GenObject.isattr(sample, 'run'):
        # If the status attribute exists, overwrite it, otherwise create and populate it
        if GenObject.isattr(sample.run, 'status'):
            sample.run.status = message
        else:
            setattr(sample.run, 'status', message)
    # Otherwise create and populate the attribute
    else:
        setattr(sample, 'run', GenObject())
        sample.run.status = message
def extract_rmlst_reads(self):
    """
    rMLST read extraction. Should be the first thing called after parsing the fastq directory.
    """
    for sample in self.metadata:
        # Create the object to store the variables
        setattr(sample, self.analysistype, GenObject())
        # Initialise variables
        sample[self.analysistype].snv_count = list()
        # Initialise a starting value for the number of unique kmers found in each sample
        sample[self.analysistype].unique_kmers = -1
        # Set and create the output directory
        try:
            sample[self.analysistype].outputdir = os.path.join(sample.run.outputdirectory, self.analysistype)
        except KeyError:
            sample[self.analysistype].outputdir = os.path.join(sample.general.outputdirectory, self.analysistype)
        make_path(sample[self.analysistype].outputdir)
        sample[self.analysistype].logout = os.path.join(sample[self.analysistype].outputdir, 'logout.txt')
        sample[self.analysistype].logerr = os.path.join(sample[self.analysistype].outputdir, 'logerr.txt')
        sample[self.analysistype].baitedfastq = os.path.join(
            sample[self.analysistype].outputdir, '{}_targetMatches.fastq.gz'.format(self.analysistype))
        # Create the command to run the baiting - paired inputs and a single, zipped output
        sample[self.analysistype].bbdukcmd = 'bbduk.sh ref={} in1={} in2={} threads={} outm={}' \
            .format(self.database,
                    sample.general.trimmedcorrectedfastqfiles[0],
                    sample.general.trimmedcorrectedfastqfiles[1],
                    str(self.threads),
                    sample[self.analysistype].baitedfastq)
        # Sometimes bbduk hangs forever, so that needs to be handled. Give it a very generous timeout.
        try:
            # Run the call, and write any errors to the logfile
            command = sample[self.analysistype].bbdukcmd
            if self.analyse:
                out, err = run_subprocess(command)
            else:
                out = str()
                err = str()
            write_to_logfile(command, command, self.logfile, sample.general.logout, sample.general.logerr,
                             sample[self.analysistype].logout, sample[self.analysistype].logerr)
            write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr,
                             sample[self.analysistype].logout, sample[self.analysistype].logerr)
        except TimeoutExpired:
            print('ERROR: Could not extract rMLST reads from sample {}'.format(sample.name))
def setup(self):
    """
    Set up the metadata object to be passed to Vtyper()
    """
    from glob import glob
    files = sorted(glob('{}*.fasta'.format(self.sequencepath)))
    samples = list()
    # Create the metadata for each file
    for fasta in files:
        # Create a metadata object to store all metadata associated with each strain
        metadata = MetadataObject()
        metadata.general = GenObject()
        metadata.commands = GenObject()
        # Set the name
        metadata.name = os.path.basename(fasta).split('.')[0]
        metadata.general.bestassemblyfile = fasta
        metadata.general.stx = True
        metadata.general.outputdirectory = self.path
        # Strip the file extension; os.path.splitext handles paths containing additional periods
        metadata.general.filenoext = os.path.splitext(fasta)[0]
        metadata.general.fastqfiles = list()
        samples.append(metadata)
    return samples
def main(self):
    """
    Run the necessary methods in the correct order
    """
    printtime('Starting {} analysis pipeline'.format(self.analysistype), self.starttime)
    # Create the objects to be used in the analyses
    objects = Objectprep(self)
    objects.objectprep()
    self.runmetadata = objects.samples
    self.threads = int(self.cpus / len(self.runmetadata.samples)) \
        if self.cpus / len(self.runmetadata.samples) > 1 else 1
    # Run the genesippr analyses
    self.analysistype = 'genesippr'
    self.targetpath = os.path.join(self.reffilepath, self.analysistype, '')
    Sippr(self, 0.90)
    # Create the reports
    self.reports = Reports(self)
    Reports.reporter(self.reports)
    # Run the 16S analyses using the filtered database
    self.targetpath = self.reffilepath
    # Run the 16S analyses
    self.analysistype = 'sixteens_full'
    SixteensFull(self, self.commit, self.starttime, self.homepath, 'sixteens_full', 0.985)
    # ResFinding
    Resistance(self, self.commit, self.starttime, self.homepath, 'resfinder', 0.90, False, True)
    # Run the GDCS analysis
    self.analysistype = 'GDCS'
    self.pipeline = True
    self.targetpath = os.path.join(self.targetpath, self.analysistype)
    Sippr(self, 0.95)
    # Create the reports
    Reports.gdcsreporter(self.reports)
    # Perform serotyping for samples classified as Escherichia
    for sample in self.runmetadata.samples:
        # Create the mash attribute for every sample, so the 'NA' values can be set in the else clause
        sample.mash = GenObject()
        if sample.general.bestassemblyfile != 'NA':
            try:
                sample.mash.closestrefseqgenus = sample.general.closestrefseqgenus
                for genus, species in self.taxonomy.items():
                    if genus == sample.mash.closestrefseqgenus:
                        sample.mash.closestrefseqspecies = species
            except KeyError:
                sample.mash.closestrefseqgenus = 'NA'
                sample.mash.closestrefseqspecies = 'NA'
        else:
            sample.mash.closestrefseqgenus = 'NA'
            sample.mash.closestrefseqspecies = 'NA'
    SeroSippr(self, self.commit, self.starttime, self.homepath, 'serosippr', 0.95, True)
    # Print the metadata
    printer = MetadataPrinter(self)
    printer.printmetadata()
def predictthreads(self):
    printtime('Performing gene predictions', self.start)
    # Create the threads for the analyses
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            threads = Thread(target=self.predict, args=())
            threads.setDaemon(True)
            threads.start()
    for sample in self.metadata:
        # Create the .prodigal attribute
        sample.prodigal = GenObject()
        if sample.general.bestassemblyfile != 'NA':
            self.predictqueue.put(sample)
    self.predictqueue.join()
def sistr(self):
    """Perform sistr analyses on Salmonella"""
    printtime('Performing sistr analyses', self.start)
    for sample in self.metadata:
        # Create the analysis-type specific attribute
        setattr(sample, self.analysistype, GenObject())
        if sample.general.bestassemblyfile != 'NA':
            try:
                # Only process strains that have been determined to be Salmonella
                if sample.general.referencegenus == 'Salmonella':
                    # Set and create the path of the directory to store the strain-specific reports
                    sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                                       self.analysistype)
                    # Name of the .json output file
                    sample[self.analysistype].jsonoutput = os.path.join(sample[self.analysistype].reportdir,
                                                                        '{}.json'.format(sample.name))
                    # Set the sistr system call
                    sample.commands.sistr = \
                        'sistr -f json -o {} -t {} -T {} {}' \
                        .format(sample[self.analysistype].jsonoutput,
                                self.cpus,
                                os.path.join(sample[self.analysistype].reportdir, 'tmp'),
                                sample.general.bestassemblyfile)
                    sample[self.analysistype].logout = os.path.join(sample[self.analysistype].reportdir,
                                                                    'logout')
                    sample[self.analysistype].logerr = os.path.join(sample[self.analysistype].reportdir,
                                                                    'logerr')
                    # Only run the analyses if the output json file does not exist
                    if not os.path.isfile(sample[self.analysistype].jsonoutput):
                        out, err = run_subprocess(sample.commands.sistr)
                        write_to_logfile(sample.commands.sistr, sample.commands.sistr, self.logfile,
                                         sample.general.logout, sample.general.logerr,
                                         sample[self.analysistype].logout, sample[self.analysistype].logerr)
                        write_to_logfile(out, err, self.logfile,
                                         sample.general.logout, sample.general.logerr,
                                         sample[self.analysistype].logout, sample[self.analysistype].logerr)
                    self.queue.task_done()
            except (ValueError, KeyError):
                pass
    self.queue.join()
    self.report()
def fasta_records(self):
    """
    Use SeqIO to create dictionaries of all records for each FASTA file
    """
    for sample in self.metadata:
        # Create the analysis-type specific attribute
        setattr(sample, self.analysistype, GenObject())
        # Create a dictionary of records for each file
        try:
            record_dict = SeqIO.to_dict(SeqIO.parse(sample.general.bestassemblyfile, "fasta"))
        except FileNotFoundError:
            record_dict = dict()
        # Set the records dictionary as the attribute for the object
        sample[self.analysistype].record_dict = record_dict
def epcrparse(self):
    """
    Parse the ePCR text file outputs
    """
    printtime('Parsing ePCR results', self.start)
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            if 'stx' in sample.general.datastore:
                # Initialise count - this allows for the population of vtyperresults with unique values
                uniquecount = 0
                # This populates vtyperresults with the verotoxin subtypes
                toxinlist = []
                if os.path.isfile(sample[self.analysistype].resultsfile):
                    with open(sample[self.analysistype].resultsfile, 'r') as epcrresults:
                        for result in epcrresults:
                            # Only the lines without a # contain results
                            if "#" not in result:
                                uniquecount += 1
                                # Split on \t
                                data = result.split('\t')
                                # The subtyping primer pair is the first entry on lines with results
                                vttype = data[0].split('_')[0]
                                # Push the name of the primer pair - stripped of anything after a _ - to the
                                # list of toxins
                                if vttype not in toxinlist:
                                    toxinlist.append(vttype)
                # Create a string of the entries in toxinlist joined with ";"
                toxinstring = ";".join(sorted(toxinlist))
                # Save the string to the metadata
                sample[self.analysistype].toxinprofile = toxinstring
            else:
                setattr(sample, self.analysistype, GenObject())
                sample[self.analysistype].toxinprofile = 'NA'
        else:
            setattr(sample, self.analysistype, GenObject())
            sample[self.analysistype].toxinprofile = 'NA'
def reader(self):
    import os
    import json
    from accessoryFunctions.accessoryFunctions import GenObject, MetadataObject
    for sample in self.metadata:
        metadatafile = '{}{}/{}_metadata.json'.format(self.path, sample.name, sample.name)
        if os.path.isfile(metadatafile):
            size = os.stat(metadatafile).st_size
            if size != 0:
                try:
                    with open(metadatafile) as metadatareport:
                        jsondata = json.load(metadatareport)
                    # Create the metadata objects
                    metadata = MetadataObject()
                    # Initialise the metadata categories as GenObjects created using the appropriate key
                    for attr in jsondata:
                        if not isinstance(jsondata[attr], dict):
                            setattr(metadata, attr, jsondata[attr])
                        else:
                            setattr(metadata, attr, GenObject(jsondata[attr]))
                    # As files often need to be reanalysed after being moved, test to see if it is possible
                    # to use the metadata from the previous assembly
                    jsonfile = '{}/{}_metadata.json'.format(metadata.general.outputdirectory, sample.name)
                    try:
                        # Open the metadata file to write; use 'w' rather than 'wb', as this is text in python3
                        with open(jsonfile, 'w') as jsonreport:
                            # Write the json dump of the object dump to the metadata file
                            json.dump(sample.dump(), jsonreport, sort_keys=True, indent=4,
                                      separators=(',', ': '))
                        # Set the name
                        metadata.name = sample.name
                        self.samples.append(metadata)
                    except IOError:
                        self.samples.append(sample)
                except ValueError:
                    self.samples.append(sample)
            else:
                self.samples.append(sample)
def vtyper(self):
    """Setup and create threads for ePCR"""
    printtime('Running ePCR', self.start)
    # Create the threads for the ePCR analysis
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            threads = Thread(target=self.epcr, args=())
            threads.setDaemon(True)
            threads.start()
    # Create the system calls for famap, fahash, and ePCR
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            if 'stx' in sample.general.datastore:
                setattr(sample, self.analysistype, GenObject())
                # Get the primers ready
                if self.reffilepath:
                    sample[self.analysistype].primers = '{}{}/vtx_subtyping_primers.txt' \
                        .format(self.reffilepath, self.analysistype)
                else:
                    sample[self.analysistype].primers = self.primerfile
                # Make the output path
                sample[self.analysistype].reportdir = '{}/{}/'.format(sample.general.outputdirectory,
                                                                      self.analysistype)
                make_path(sample[self.analysistype].reportdir)
                outfile = sample[self.analysistype].reportdir + sample.name
                # Set the hashing and mapping commands
                sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
                sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
                # re-PCR uses the subtyping primers list to search the contigs file using the following parameters
                # -S {hash file} (Perform STS lookup using hash-file),
                # -r + (Enable/disable reverse STS lookup)
                # -m 10000 (Set variability for STS size for lookup),
                # -n 1 (Set max allowed mismatches per primer for lookup)
                # -g 0 (Set max allowed indels per primer for lookup),
                # -G (Print alignments in comments),
                # -q quiet
                # -o {output file}
                sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 1 -g 0 -G -q -o {}.txt {}' \
                    .format(outfile, outfile, sample[self.analysistype].primers)
                sample[self.analysistype].resultsfile = '{}.txt'.format(outfile)
                self.epcrqueue.put((sample, outfile))
    self.epcrqueue.join()
    self.epcrparse()
def runner(self):
    """
    Run the necessary methods in the correct order
    """
    printtime('Starting {} analysis pipeline'.format(self.analysistype), self.starttime,
              output=self.portallog)
    if not self.pipeline:
        # If the metadata has been passed from the method script, self.pipeline must still be False in order
        # for Sippr() to function correctly, but the metadata shouldn't be recreated
        try:
            _ = vars(self.runmetadata)['samples']
        except KeyError:
            # Create the objects to be used in the analyses
            objects = Objectprep(self)
            objects.objectprep()
            self.runmetadata = objects.samples
    else:
        for sample in self.runmetadata.samples:
            setattr(sample, self.analysistype, GenObject())
            sample.run.outputdirectory = sample.general.outputdirectory
    self.threads = int(self.cpus / len(self.runmetadata.samples)) \
        if self.cpus / len(self.runmetadata.samples) > 1 else 1
    # Use a custom sippr method to use the full reference database as bait, and run mirabait against the FASTQ
    # reads - do not perform reference mapping yet
    SixteenSBait(self, self.cutoff)
    # Subsample 1000 reads from the FASTQ files
    self.subsample()
    # Convert the subsampled FASTQ files to FASTA format
    self.fasta()
    # Create BLAST databases if required
    self.makeblastdb()
    # Run BLAST analyses of the subsampled FASTA files against the NCBI 16S reference database
    self.blast()
    # Parse the BLAST results
    self.blastparse()
    # Feed the BLAST results into a modified sippr method to perform reference mapping using the calculated
    # genus of the sample as the mapping file
    SixteenSSipper(self, self.cutoff)
    # Create reports
    self.reporter()
def sketching(self):
    printtime('Indexing assemblies for mash analysis', self.starttime)
    # Create the threads for the analysis
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            threads = Thread(target=self.sketch, args=())
            threads.setDaemon(True)
            threads.start()
    # Populate threads for each gene, genome combination
    for sample in self.metadata:
        # Create the analysis type-specific GenObject
        setattr(sample, self.analysistype, GenObject())
        if sample.general.bestassemblyfile != 'NA':
            # Set attributes
            sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                               self.analysistype)
            sample[self.analysistype].targetpath = os.path.join(self.referencefilepath, self.analysistype)
            sample[self.analysistype].refseqsketch = \
                sample[self.analysistype].targetpath + '/RefSeqSketchesDefaults.msh'
            sample[self.analysistype].sketchfilenoext = '{}/{}'.format(sample[self.analysistype].reportdir,
                                                                       sample.name)
            sample[self.analysistype].sketchfile = sample[self.analysistype].sketchfilenoext + '.msh'
            # Make the mash output directory if necessary
            make_path(sample[self.analysistype].reportdir)
            # Create a file containing the path/name of the filtered, corrected fastq files
            sample[self.analysistype].filelist = '{}/{}_fastqfiles.txt'.format(
                sample[self.analysistype].reportdir, sample.name)
            with open(sample[self.analysistype].filelist, 'w') as filelist:
                filelist.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))
            # Create the system call
            sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
                .format(self.cpus, sample[self.analysistype].filelist,
                        sample[self.analysistype].sketchfilenoext)
            # Add each sample to the threads
            self.sketchqueue.put(sample)
    # Join the threads
    self.sketchqueue.join()
    self.mashing()
def sketching(self):
    printtime('Indexing files for {} analysis'.format(self.analysistype), self.starttime)
    # Create the threads for the analysis
    for i in range(self.cpus):
        threads = Thread(target=self.sketch, args=())
        threads.setDaemon(True)
        threads.start()
    # Populate threads for each gene, genome combination
    for sample in self.metadata:
        # Create the analysis type-specific GenObject
        setattr(sample, self.analysistype, GenObject())
        # Set attributes, and make the mash output directory if necessary
        sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
        make_path(sample[self.analysistype].reportdir)
        sample[self.analysistype].targetpath = self.referencefilepath if not self.pipeline \
            else os.path.join(self.referencefilepath, self.analysistype)
        sample[self.analysistype].refseqsketch = os.path.join(sample[self.analysistype].targetpath,
                                                              'RefSeqSketchesDefaults.msh')
        sample[self.analysistype].sketchfilenoext = os.path.join(sample[self.analysistype].reportdir,
                                                                 sample.name)
        sample[self.analysistype].sketchfile = sample[self.analysistype].sketchfilenoext + '.msh'
        # Create a file containing the path/name of the filtered, corrected fastq files
        sample[self.analysistype].filelist = os.path.join(sample[self.analysistype].reportdir,
                                                          '{}_fastqfiles.txt'.format(sample.name))
        with open(sample[self.analysistype].filelist, 'w') as filelist:
            filelist.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))
        # Create the system call
        sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
            .format(self.cpus, sample[self.analysistype].filelist,
                    sample[self.analysistype].sketchfilenoext)
        # Add each sample to the threads
        try:
            self.sketchqueue.put(sample)
        except (KeyboardInterrupt, SystemExit):
            printtime('Received keyboard interrupt, quitting threads', self.starttime)
            quit()
    # Join the threads
    self.sketchqueue.join()
    self.mashing()
def versions(self):
    for sample in self.metadata:
        # Initialise the attribute
        sample.software = GenObject()
        # Populate the versions of the software used
        ss = sample.software
        ss.python = self.python
        ss.arch = self.arch
        ss.blast = self.blast
        ss.bowtie2 = self.bowversion
        ss.samtools = self.samversion
        ss.qualimap = self.qualimap
        ss.mash = self.mash
        ss.prodigal = self.prodigal
        ss.pipeline = self.commit
        ss.spades = self.spades
        ss.bbmap = self.bbmap
        ss.fastqc = self.fastqc
        ss.blc2fastq = self.bcl2fastq
        ss.perl = self.perl
        ss.biopython = self.biopython
        ss.java = self.java
def targets(self):
    """
    Create the GenObject for the analysis type, create the hash file for baiting (if necessary)
    """
    for sample in self.runmetadata:
        if sample.general.bestassemblyfile != 'NA':
            setattr(sample, self.analysistype, GenObject())
            sample[self.analysistype].runanalysis = True
            sample[self.analysistype].targetpath = self.targetpath
            baitpath = os.path.join(self.targetpath, 'bait')
            sample[self.analysistype].baitfile = glob(os.path.join(baitpath, '*.fa'))[0]
            sample[self.analysistype].outputdir = os.path.join(sample.run.outputdirectory, self.analysistype)
            sample[self.analysistype].logout = os.path.join(sample[self.analysistype].outputdir, 'logout.txt')
            sample[self.analysistype].logerr = os.path.join(sample[self.analysistype].outputdir, 'logerr.txt')
            sample[self.analysistype].baitedfastq = os.path.join(
                sample[self.analysistype].outputdir, '{}_targetMatches.fastq'.format(self.analysistype))
            sample[self.analysistype].complete = False
arguments = parser.parse_args()
# Define the start time
arguments.starttime = time.time()
# Find the files
fastas = sorted(glob(os.path.join(arguments.sequencepath, '*.fa*')))
# Create a metadata object
arguments.runmetadata = MetadataObject()
arguments.runmetadata.samples = list()
for fasta in fastas:
    metadata = MetadataObject()
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    # Set the destination folder
    outputdir = os.path.join(arguments.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.run.outputdirectory = outputdir
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    arguments.runmetadata.samples.append(metadata)
def contamination_finder(self, input_path=None, report_path=None, portal_log=None):
    """
    Helper function to get confindr integrated into the assembly pipeline
    """
    if portal_log is not None:
        printtime('Calculating contamination in reads', self.start, output=portal_log)
    else:
        printtime('Calculating contamination in reads', self.start)
    if input_path is not None:
        input_dir = input_path
    else:
        input_dir = self.path
    if report_path is not None:
        reportpath = report_path
    else:
        reportpath = os.path.join(input_dir, 'confindr')
    report = os.path.join(reportpath, 'confindr_report.csv')
    if not os.path.isfile(report):
        # Create an object to store attributes to pass to confindr
        args = MetadataObject()
        args.input_directory = input_dir
        args.output_name = reportpath
        args.databases = os.path.join(self.reffilepath, 'ConFindr', 'databases')
        args.forward_id = '_R1'
        args.reverse_id = '_R2'
        args.threads = self.cpus
        args.kmer_size = 31
        args.number_subsamples = 3
        args.subsample_depth = 20
        args.kmer_cutoff = 2
        try:
            shutil.rmtree(args.output_name)
        except IOError:
            pass
        make_path(reportpath)
        # Open the output report file
        with open(os.path.join(report), 'w') as f:
            f.write('Strain,Genus,NumContamSNVs,NumUniqueKmers,ContamStatus\n')
        for sample in self.metadata:
            if len(sample.general.trimmedcorrectedfastqfiles) == 2:
                confindr.find_contamination(sample.general.trimmedcorrectedfastqfiles, args)
            elif len(sample.general.trimmedcorrectedfastqfiles) == 1:
                confindr.find_contamination_unpaired(args, sample.general.trimmedcorrectedfastqfiles[0])
        if portal_log:
            printtime('Contamination detection complete!', self.start, output=portal_log)
        else:
            printtime('Contamination detection complete!', self.start)
    # Load the confindr report into a dictionary using pandas
    # https://stackoverflow.com/questions/33620982/reading-csv-file-as-dictionary-using-pandas
    confindr_results = pandas.read_csv(report, index_col=0).T.to_dict()
    # Find the results for each of the samples
    for sample in self.metadata:
        # Create a GenObject to store the results
        sample.confindr = GenObject()
        # Iterate through the dictionary to find the outputs for each sample
        for line in confindr_results:
            # If the current line corresponds to the sample of interest
            if sample.name in line:
                # Set the values using the appropriate keys as the attributes
                sample.confindr.genus = confindr_results[line]['Genus']
                sample.confindr.num_contaminated_snvs = confindr_results[line]['NumContamSNVs']
                sample.confindr.unique_kmers = confindr_results[line]['NumUniqueKmers']
                try:
                    sample.confindr.cross_contamination = confindr_results[line]['CrossContamination']
                except KeyError:
                    sample.confindr.cross_contamination = str()
                sample.confindr.contam_status = confindr_results[line]['ContamStatus']
                if sample.confindr.contam_status is True:
                    sample.confindr.contam_status = 'Contaminated'
                elif sample.confindr.contam_status is False:
                    sample.confindr.contam_status = 'Clean'
    # Re-write the output to be consistent with the rest of the pipeline
    with open(os.path.join(reportpath, 'confindr_report.csv'), 'w') as csv:
        data = 'Strain,Genus,NumContamSNVs,NumUniqueKmers,ContamStatus\n'
        for sample in self.metadata:
            data += '{str},{genus},{numcontamsnv},{numuniqkmer},{status}\n'.format(
                str=sample.name,
                genus=sample.confindr.genus,
                numcontamsnv=sample.confindr.num_contaminated_snvs,
                numuniqkmer=sample.confindr.unique_kmers,
                status=sample.confindr.contam_status)
        csv.write(data)
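
# Unlike the column-major default of DataFrame.to_dict(), read_csv(..., index_col=0).T.to_dict()
# produces a row-major mapping keyed by strain name: {strain: {header: value}}, which is why the
# loop above can match on sample.name directly. A minimal illustration with hypothetical report
# contents (the strain name is invented):
import io
import pandas
report_text = 'Strain,Genus,NumContamSNVs\n2017-SEQ-0001,Escherichia,0\n'
results = pandas.read_csv(io.StringIO(report_text), index_col=0).T.to_dict()
assert results['2017-SEQ-0001']['Genus'] == 'Escherichia'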
def probefinder(self):
    """
    Find the longest probe sequences
    """
    logging.info('Finding and filtering probe sequences')
    for sample in self.samples:
        # A list to store the metadata object for each alignment
        sample.gene = list()
        for align in sample.alignedalleles:
            # Create an object to store all the information for each alignment file
            metadata = GenObject()
            metadata.name = os.path.splitext(os.path.basename(align))[0]
            metadata.alignmentfile = align
            # Create an alignment object from the alignment file
            try:
                metadata.alignment = AlignIO.read(align, 'fasta')
            except ValueError:
                # If a ValueError: Sequences must all be the same length is raised, pad the shorter
                # sequences to be the length of the longest sequence
                # https://stackoverflow.com/questions/32833230/biopython-alignio-valueerror-says-strings-must-be-same-length
                records = SeqIO.parse(align, 'fasta')
                # Make a copy, otherwise our generator is exhausted after calculating maxlen
                records = list(records)
                # Calculate the length of the longest sequence
                maxlen = max(len(record.seq) for record in records)
                # Pad sequences so that they all have the same length
                for record in records:
                    if len(record.seq) != maxlen:
                        sequence = str(record.seq).ljust(maxlen, '.')
                        record.seq = Seq(sequence)
                assert all(len(record.seq) == maxlen for record in records)
                # Write to file and do alignment
                metadata.alignmentfile = '{}_padded.tfa'.format(os.path.splitext(align)[0])
                with open(metadata.alignmentfile, 'w') as padded:
                    SeqIO.write(records, padded, 'fasta')
                # Align the padded sequences
                metadata.alignment = AlignIO.read(metadata.alignmentfile, 'fasta')
            metadata.summaryalign = AlignInfo.SummaryInfo(metadata.alignment)
            # The dumb consensus is a very simple consensus sequence calculated from the alignment. Default
            # parameters of threshold=.7, and ambiguous='X' are used
            consensus = metadata.summaryalign.dumb_consensus()
            metadata.consensus = str(consensus)
            # The position-specific scoring matrix (PSSM) stores the frequency of each base observed at each
            # location along the entire consensus sequence
            metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(consensus)
            metadata.identity = list()
            # Find the prevalence of each base for every location along the sequence
            for line in metadata.pssm:
                try:
                    bases = [line['A'], line['C'], line['G'], line['T'], line['-']]
                    # Calculate the frequency of the most common base - don't count gaps
                    metadata.identity.append(float('{:.2f}'.format(max(bases[:4]) / sum(bases) * 100)))
                except KeyError:
                    bases = [line['A'], line['C'], line['G'], line['T']]
                    # Calculate the frequency of the most common base - don't count gaps
                    metadata.identity.append(float('{:.2f}'.format(max(bases) / sum(bases) * 100)))
            # List to store metadata objects
            metadata.windows = list()
            # Variable to store whether a suitable probe has been found for the current organism + gene pair.
            # As the probe sizes are evaluated in descending order, as soon as a probe has been discovered,
            # the search for more probes can stop, since subsequent probes will be smaller than the one(s)
            # already found
            passing = False
            # Create sliding windows of size self.max - self.min from the list of identities for each column
            # of the alignment
            for i in reversed(range(self.min, self.max + 1)):
                if not passing:
                    windowdata = MetadataObject()
                    windowdata.size = i
                    windowdata.max = 0
                    # Initialise the minimum with a value greater than any possible percent identity
                    windowdata.min = 100
                    windowdata.sliding = list()
                    # Create a counter to store the starting location of the window in the sequence
                    n = 0
                    # Create sliding windows from the range of sizes for the list of identities
                    windows = self.window(metadata.identity, i)
                    # Go through each window from the collection of sliding windows to determine which
                    # window(s) has (have) the best results
                    for window in windows:
                        # Create another object to store all the data for the window
                        slidingdata = MetadataObject()
                        # Only consider the window if every position has a percent identity greater than
                        # the cutoff
                        if min(window) > self.cutoff:
                            # Populate the object with the necessary variables
                            slidingdata.location = '{}:{}'.format(n, n + i)
                            slidingdata.min = min(window)
                            slidingdata.mean = float('{:.2f}'.format(numpy.mean(window)))
                            slidingdata.sequence = str(consensus[n:n + i])
                            # Track the best (windowdata.max) and worst (windowdata.min) mean percent
                            # identities observed across all windows of this size
                            windowdata.max = slidingdata.mean if slidingdata.mean >= windowdata.max \
                                else windowdata.max
                            windowdata.min = slidingdata.mean if slidingdata.mean <= windowdata.min \
                                else windowdata.min
                            # Add the object to the list of objects
                            windowdata.sliding.append(slidingdata)
                            passing = True
                        n += 1
                    # Add the object to the list of objects
                    metadata.windows.append(windowdata)
            # Add the object to the list of objects
            sample.gene.append(metadata)
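
# The self.window() helper referenced above is assumed to be a standard sliding-window generator
# over the per-column identity list. A minimal sketch of that behaviour; the actual helper in the
# source may be implemented differently, but should yield len(sequence) - size + 1 windows:
def window_sketch(sequence, size):
    """Yield successive overlapping slices of length 'size' from 'sequence'"""
    for start in range(len(sequence) - size + 1):
        yield sequence[start:start + size]

# Example: list(window_sketch([99.0, 100.0, 98.5, 100.0], 3)) yields
# [[99.0, 100.0, 98.5], [100.0, 98.5, 100.0]]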
def __init__(self, args, pipelinecommit, startingtime, scriptpath):
    # Initialise variables
    self.commit = str(pipelinecommit)
    self.start = startingtime
    self.homepath = scriptpath
    # Define variables based on supplied arguments
    self.args = args
    self.path = os.path.join(args.path, '')
    assert os.path.isdir(self.path), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    self.sequencepath = os.path.join(args.sequencepath, '')
    assert os.path.isdir(self.sequencepath), u'Supplied sequence path is not a valid directory {0!r:s}' \
        .format(self.sequencepath)
    self.databasepath = os.path.join(args.databasepath, '')
    assert os.path.isdir(self.databasepath), u'Supplied database path is not a valid directory {0!r:s}' \
        .format(self.databasepath)
    # There seems to be an issue with CLARK when running with a very high number of cores. Limit self.cpus to 1
    self.cpus = 1
    # Set variables from the arguments
    self.database = args.database
    self.rank = args.rank
    self.clarkpath = args.clarkpath
    self.cutoff = float(args.cutoff) * 100
    # Initialise variables for the analysis
    self.targetcall = str()
    self.classifycall = str()
    self.devnull = open(os.devnull, 'wb')
    self.filelist = os.path.join(self.path, 'sampleList.txt')
    self.reportlist = os.path.join(self.path, 'reportList.txt')
    self.abundancequeue = Queue()
    self.datapath = str()
    self.reportpath = os.path.join(self.path, 'reports')
    self.clean_seqs = args.clean_seqs
    self.light = args.light
    if self.clean_seqs:
        try:
            self.reffilepath = args.reffilepath
        except AttributeError:
            self.clean_seqs = False
    # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the metadata
    # objects and variables play nice
    try:
        if args.runmetadata:
            self.runmetadata = args.runmetadata
            self.extension = self.runmetadata.extension
            # Create the name of the final report
            self.report = os.path.join(self.reportpath, 'abundance{}.xlsx'.format(self.extension))
            # Only re-run the CLARK analyses if the CLARK report doesn't exist
            if not os.path.isfile(self.report):
                printtime('Performing CLARK analysis on {} files'.format(self.extension), self.start)
                if self.extension != 'fastq':
                    for sample in self.runmetadata.samples:
                        sample.general.combined = sample.general.bestassemblyfile
                    # Run the pipeline
                    self.main()
                else:
                    # Only perform FASTQ analyses if the sample is declared to be a metagenome
                    metagenome = False
                    for sample in self.runmetadata.samples:
                        try:
                            status = sample.run.Description
                        except KeyError:
                            status = 'unknown'
                        if status == 'metagenome':
                            metagenome = True
                    # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                    if metagenome:
                        fileprep.Fileprep(self)
                        # Run the pipeline
                        self.main()
            # Clean up the files and create/delete attributes to be consistent with pipeline Metadata objects
            for sample in self.runmetadata.samples:
                if sample.general.bestassemblyfile != 'NA':
                    # Create a GenObject to store metadata when this script is run as part of the pipeline
                    clarkextension = 'clark{}'.format(self.extension)
                    setattr(sample, clarkextension, GenObject())
                    # Create a folder to store all the CLARK files
                    sample[clarkextension].outputpath = os.path.join(sample.general.outputdirectory, 'CLARK')
                    make_path(sample[clarkextension].outputpath)
                    # Move the files to the CLARK folder
                    try:
                        move(sample.general.abundance,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.abundance)))
                        move(sample.general.classification,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.classification)))
                    except (KeyError, FileNotFoundError):
                        pass
                    # Set the CLARK-specific attributes
                    try:
                        sample[clarkextension].abundance = sample.general.abundance
                        sample[clarkextension].classification = sample.general.classification
                        sample[clarkextension].combined = sample.general.combined
                    except KeyError:
                        pass
                    if self.extension == 'fastq':
                        # Remove the combined .fastq files
                        try:
                            if type(sample[clarkextension].combined) is list:
                                for combinedfile in sample[clarkextension].combined:
                                    os.remove(combinedfile)
                        except (OSError, KeyError):
                            pass
                    # Remove the attributes from .general
                    for attribute in ['abundance', 'classification', 'combined']:
                        delattr(sample.general, attribute)
                    # Remove the text file lists of files and reports created by CLARK
                    try:
                        for clarkfile in ['reportList.txt', 'sampleList.txt']:
                            os.remove(os.path.join(self.path, clarkfile))
                    except OSError:
                        pass
        else:
            self.runmetadata = MetadataObject()
            self.report = os.path.join(self.reportpath, 'abundance.xlsx')
            # Create the objects
            self.objectprep()
            self.main()
    except AttributeError:
        self.runmetadata = MetadataObject()
        self.report = os.path.join(self.reportpath, 'abundance.xlsx')
        # Create the objects
        self.objectprep()
        self.main()
    # Optionally filter the .fastq reads based on taxonomic assignment
    if args.filter:
        filtermetagenome.PipelineInit(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def createfastq(self):
    """Uses bcl2fastq to create .fastq files from a MiSeq run"""
    # Initialise samplecount
    samplecount = 0
    # If the fastq destination folder is not provided, make the default value of :path/:miseqfoldername
    self.fastqdestination = self.fastqdestination if self.fastqdestination \
        else self.path + self.miseqfoldername
    # Make the path
    make_path(self.fastqdestination)
    # Initialise variables for storing index information
    index = ''
    indexlength = int()
    # bcl2fastq requires an older version of the sample sheet, this recreates the required version
    # Create the new sample sheet
    with open('{}/SampleSheet_modified.csv'.format(self.fastqdestination), "w") as modifiedsamplesheet:
        # Write the required headings to the file
        modifiedsamplesheet.write(
            "FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject\n")
        for strain in self.samples:
            # Create a combined index of index1-index2
            try:
                strain.run.modifiedindex = '{}-{}'.format(strain.run.index, strain.run.index2)
                indexlength = 16
                index = 'I8,I8'
            except KeyError:
                strain.run.modifiedindex = strain.run.index
                indexlength = 6
                index = 'I6'
            # The list of items to print to each line of the modified sample sheet
            printlist = [self.flowcell, '1', strain.name, str(strain.run.SampleNumber),
                         strain.run.modifiedindex, strain.run.Description, 'N', 'NA',
                         strain.run.InvestigatorName, self.projectname]
            modifiedsamplesheet.write('{}\n'.format(",".join(printlist)))
            samplecount += 1
    # Set :forwardlength to :header.forwardlength if the argument is not provided, or if it is 'full';
    # otherwise use the supplied argument
    self.forwardlength = self.metadata.header.forwardlength \
        if self.forwardlength.lower() == 'full' else self.forwardlength
    # Set :reverselength to :header.reverselength in the same manner
    self.reverselength = self.metadata.header.reverselength \
        if self.reverselength.lower() == 'full' else self.reverselength
    # The number of cycles required is the number of forward reads + the index(8) + the second index(8)
    # Also set the basemask variable as required
    if self.reverselength != '0':
        self.readsneeded = int(self.forwardlength) + int(self.reverselength) + indexlength
        basemask = "Y{}n*,{},Y{}n*".format(self.forwardlength, index, self.reverselength)
        nohup = "nohup make -j 16 > nohup.out"
    else:
        self.readsneeded = int(self.forwardlength) + indexlength
        basemask = "Y{}n*,{},n*".format(self.forwardlength, index)
        nohup = "nohup make -j 16 r1 > nohup.out"
    # Handle plurality appropriately
    samples = 'samples' if samplecount > 1 else 'sample'
    number = 'are' if samplecount > 1 else 'is'
    printtime('There {} {} {} in this run. '
              'Running fastq creation module with the following parameters:\n'
              'MiSeqPath: {},\n'
              'MiSeqFolder: {},\n'
              'Fastq destination: {},\n'
              'SampleSheet: {}'
              .format(number, samplecount, samples, self.miseqpath, self.miseqfolder,
                      self.fastqdestination,
                      '{}/SampleSheet_modified.csv'.format(self.fastqdestination)), self.start)
    # Count the number of completed cycles in the run of interest
    cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(self.miseqfolder))
    while len(cycles) < self.readsneeded:
        printtime('Currently at {} cycles. Waiting until the MiSeq reaches cycle {}'
                  .format(len(cycles), self.readsneeded), self.start)
        sleep(1800)
        cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(self.miseqfolder))
    # configureBclToFastq.pl requires :self.miseqfolder/Data/Intensities/BaseCalls/config.xml in order to work.
    # When you download runs from BaseSpace, this file is not provided. There is an empty config.xml file that
    # can be populated with run-specific values and moved to the appropriate folder
    if not os.path.isfile('{}Data/Intensities/BaseCalls/config.xml'.format(self.miseqfolder)):
        self.configfilepopulator()
    # Define the bcl2fastq system call
    bclcall = "configureBclToFastq.pl --input-dir {}Data/Intensities/BaseCalls " \
              "--output-dir {} --force --sample-sheet {}/SampleSheet_modified.csv " \
              "--mismatches 1 --no-eamss --fastq-cluster-count 0 --compression none --use-bases-mask {}" \
        .format(self.miseqfolder, self.fastqdestination, self.fastqdestination, basemask)
    # Define the nohup system call
    nohupcall = "cd {} && {}".format(self.fastqdestination, nohup)
    # fnull = open(os.devnull, 'wb')
    if not os.path.isdir("{}/Project_{}".format(self.fastqdestination, self.projectname)):
        # Call configureBclToFastq.pl
        printtime('Running bcl2fastq', self.start)
        # Run the commands
        threadlock = threading.Lock()
        outstr = ''
        outerr = ''
        out, err = run_subprocess(bclcall)
        outstr += out
        outerr += err
        out, err = run_subprocess(nohupcall)
        outstr += out
        outerr += err
        # call(bclcall, shell=True, stdout=fnull, stderr=fnull)
        # call(nohupcall, shell=True, stdout=fnull, stderr=fnull)
        threadlock.acquire()
        write_to_logfile(bclcall, bclcall, self.logfile)
        write_to_logfile(nohupcall, nohupcall, self.logfile)
        write_to_logfile(outstr, outerr, self.logfile)
        threadlock.release()
    # Populate the metadata
    for sample in self.metadata.samples:
        sample.commands = GenObject()
        sample.commands.nohup = nohupcall
        sample.commands.bcl = bclcall
        sample.run.forwardlength = self.forwardlength
        sample.run.reverselength = self.reverselength
    # Copy the fastq files to a central folder so they can be processed
    self.fastqmover()
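
# A worked example of the base-mask logic above, assuming a hypothetical paired-end 2 x 251 bp run
# with dual 8 bp indices: sixteen index cycles are added to the forward and reverse read lengths,
# and the mask tells bcl2fastq to treat the cycles as read 1, index 1, index 2, then read 2.
forwardlength, reverselength, index, indexlength = 251, 251, 'I8,I8', 16
readsneeded = int(forwardlength) + int(reverselength) + indexlength
basemask = "Y{}n*,{},Y{}n*".format(forwardlength, index, reverselength)
assert readsneeded == 518
assert basemask == "Y251n*,I8,I8,Y251n*"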