Example #1
 def basic(self):
     # Grab any .fastq files in the path
     fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
     # Extract the base names of the globbed FASTQ files, with paired files collapsed by filer()
     fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
     # Iterate through the names of the fastq files
     for fastqname in sorted(fastqnames):
         # Set the name
         metadata = MetadataObject()
         metadata.name = fastqname
         # Set the destination folder
         outputdir = os.path.join(self.path, fastqname)
         # Make the destination folder
         make_path(outputdir)
         # Get the fastq files specific to the fastqname
         specificfastq = glob(
             os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
         # Link the files to the output folder
         try:
             # Create relative symlinks from :outputdir to the .gz files in :self.path
             for fastq in specificfastq:
                 fastqfile = os.path.basename(fastq)
                 os.symlink('../{}'.format(fastqfile),
                            os.path.join(outputdir, fastqfile))
         # Except os errors
         except OSError as exception:
             # If there is an exception other than the file exists, raise it
             if exception.errno != errno.EEXIST:
                 raise
         # Initialise the general and run categories
         metadata.general = GenObject()
         metadata.run = GenObject()
         # Populate the .fastqfiles category of :self.metadata
         metadata.general.fastqfiles = [
             fastq for fastq in sorted(
                 glob(
                     os.path.join(outputdir, '{}*.fastq*'.format(
                         metadata.name)))) if 'trimmed' not in fastq
             and 'normalised' not in fastq and 'corrected' not in fastq
             and 'paired' not in fastq and 'unpaired' not in fastq
         ]
         # Add the output directory to the metadata
         metadata.general.outputdirectory = outputdir
         metadata.general.logout = os.path.join(
             self.path, metadata.name,
             '{}_log_out.txt'.format(metadata.name))
         metadata.general.logerr = os.path.join(
             self.path, metadata.name,
             '{}_log_err.txt'.format(metadata.name))
         # Append the metadata to the list of samples
         self.samples.append(metadata)
     # Grab metadata from previous runs
     previousmetadata = metadataReader.MetadataReader(self)
     # Update self.samples (if required)
     if previousmetadata.samples:
         self.samples = previousmetadata.samples
     # Run the read length method
     self.readlength()
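Note: the filer() helper used above (and in Example #3) is not reproduced in this listing. A minimal, hypothetical sketch of the assumed behaviour, collapsing paired-read FASTQ file names to a set of common base names, purely for illustration:

    import os
    import re

    def filer(filelist):
        """Hypothetical sketch: reduce paired FASTQ names (e.g. sample_R1.fastq.gz,
        sample_R2.fastq.gz) to a set of unique base names (sample)"""
        fileset = set()
        for seqfile in filelist:
            basename = os.path.basename(seqfile)
            # Strip an assumed _R1/_R2 read designator and everything following it
            basename = re.split('_R[12]', basename)[0]
            # Strip any remaining FASTQ extension
            fileset.add(basename.split('.fastq')[0])
        return fileset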
Example #2
def test_sistr(variables):
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    # Set the destination folder
    outputdir = os.path.join(variables.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.run.outputdirectory = outputdir
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
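Note: every example here leans on GenObject and MetadataObject from accessoryFunctions. The pipeline's real classes are not reproduced in this listing; the following is a minimal, hypothetical sketch of the assumed behaviour (dict-backed storage with both attribute- and item-style access, which is also why failed lookups raise KeyError in the try/except blocks throughout):

    class GenObject(object):
        """Hypothetical sketch of a dict-backed object with attribute-style access"""
        def __init__(self, x=None):
            # Bypass the custom __setattr__ when creating the backing dictionary
            super(GenObject, self).__setattr__('datastore', x if x else dict())

        def __getattr__(self, key):
            # Raises KeyError (not AttributeError) for missing attributes
            return self.datastore[key]

        def __setattr__(self, key, value):
            self.datastore[key] = value

        @staticmethod
        def isattr(obj, key):
            # True if the supplied attribute exists in the object's datastore
            return key in obj.datastore

    class MetadataObject(GenObject):
        """Hypothetical sample-level container; the real class also supplies dump()"""
        def __getitem__(self, key):
            # Allow the sample[self.analysistype] item-style access seen throughout
            return self.datastore[key]

        def __setitem__(self, key, value):
            self.datastore[key] = value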
Example #3
 def createobject(self):
     # Grab any .fastq files in the path
     fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
     # Extract the base names of the globbed FASTQ files, with paired files collapsed by filer()
     fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
     # Iterate through the names of the fastq files
     for fastqname in sorted(fastqnames):
         # Set the name
         metadata = MetadataObject()
         metadata.name = fastqname
         # Set the destination folder
         outputdir = os.path.join(self.path, fastqname)
         # Make the destination folder
         make_path(outputdir)
         # Get the fastq files specific to the fastqname
         specificfastq = glob(
             os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
         # Make relative symlinks to the files in :self.path
         try:
             for fastq in specificfastq:
                 # Get the basename of the file
                 fastqfile = os.path.split(fastq)[-1]
                 # Set the destination fastq path as the base name plus the destination folder
                 destinationfastq = os.path.join(outputdir, fastqfile)
                 # Symlink the files
                 os.symlink('../{}'.format(fastqfile), destinationfastq)
         # Except os errors
         except OSError as exception:
             # If there is an exception other than the file exists, raise it
             if exception.errno != errno.EEXIST:
                 raise
         # Initialise the general and run categories
         metadata.general = GenObject()
         metadata.run = GenObject()
         # Populate the .fastqfiles category of :self.metadata
         metadata.general.fastqfiles = [
             fastq for fastq in glob(
                 os.path.join(outputdir, '{}*.fastq*'.format(fastqname)))
             if 'trimmed' not in fastq
         ]
         # Add the output directory to the metadata
         metadata.general.outputdirectory = outputdir
         metadata.run.outputdirectory = outputdir
         metadata.general.bestassemblyfile = True
         metadata.general.trimmedcorrectedfastqfiles = metadata.general.fastqfiles
         metadata.general.logout = os.path.join(
             metadata.general.outputdirectory, 'logout')
         metadata.general.logerr = os.path.join(
             metadata.general.outputdirectory, 'logerr')
         # Initialise an attribute to store commands
         metadata.commands = GenObject()
         # Append the metadata to the list of samples
         self.samples.append(metadata)
Example #4
    def estimateabundance(self):
        """
        Estimate the abundance of taxonomic groups
        """
        printtime('Estimating abundance of taxonomic groups', self.start)
        # Create and start threads
        for i in range(self.cpus):
            # Send the threads to the appropriate destination function
            threads = Thread(target=self.estimate, args=())
            # Set daemon to True so the worker threads exit when the main program does
            threads.setDaemon(True)
            # Start the threading
            threads.start()
        for sample in self.runmetadata.samples:
            try:
                if sample.general.combined != 'NA':
                    # Set the name of the abundance report
                    sample.general.abundance = sample.general.combined.split(
                        '.')[0] + '_abundance.csv'
                    # Create the commands GenObject if it hasn't been initialised
                    if not sample.commands.datastore:
                        sample.commands = GenObject()

                    # Define system calls
                    sample.commands.target = self.targetcall
                    sample.commands.classify = self.classifycall
                    sample.commands.abundancecall = \
                        'cd {} && ./estimate_abundance.sh -D {} -F {} > {}'.format(self.clarkpath,
                                                                                   self.databasepath,
                                                                                   sample.general.classification,
                                                                                   sample.general.abundance)
                    self.abundancequeue.put(sample)
            except KeyError:
                pass
        self.abundancequeue.join()
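Note: the estimate() worker targeted by the threads above is not shown in this listing. The same daemon-thread/queue pattern recurs in Examples #10, #15, #20, #22, and #23; a minimal, self-contained sketch of how the pieces fit together:

    from queue import Queue
    from threading import Thread

    def worker(queue):
        # Hypothetical sketch of the daemon worker pattern: block on get(),
        # process the item, then mark the task done so that queue.join() can return
        while True:
            item = queue.get()
            print('processing {}'.format(item))
            queue.task_done()

    queue = Queue()
    for _ in range(4):
        thread = Thread(target=worker, args=(queue,))
        thread.setDaemon(True)
        thread.start()
    for item in ('sampleA', 'sampleB'):
        queue.put(item)
    # Block until every queued item has been marked done
    queue.join()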
Example #5
 def metaparse(self, sample, quastoutputdirectory):
     import functools
     # Tuples of strings to replace when parsing the results file
     repls = ('>=', 'Over'), ('000 Bp', 'kbp'), ('#', 'Num'), \
             ("'", ''), ('(', ''), (')', ''), (' ', ''), ('>', 'Less'), ('Gc%', 'GC%')
     # Initialise the results dictionary
     quast = dict()
     # The results file is gage_report.tsv if that file exists, otherwise it is report.tsv
     resfile = os.path.join(quastoutputdirectory, 'gage_report.tsv') \
         if os.path.isfile(os.path.join(quastoutputdirectory, 'gage_report.tsv')) \
         else os.path.join(quastoutputdirectory, 'report.tsv')
     with open(resfile) as report:
         for line in report:
             # Use headings in report as keys for the GenObject supplied from generator and replace incrementally
             # with reduce and lambda function below
             k, v = [
                 functools.reduce(lambda a, kv: a.replace(*kv), repls,
                                  s.title())
                 for s in line.rstrip().split('\t')
             ]
             quast[k] = v
     # Create the quast metadata object
     sample.quast = GenObject(quast)
     sample.quast.outputdirectory = quastoutputdirectory
     sample.quast.kmers = self.kmers
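Note on the reduce() pattern in metaparse() above: functools.reduce() threads the running string through successive str.replace() calls, one (old, new) pair at a time. A minimal standalone demonstration with a subset of the same replacement tuples:

    import functools

    repls = ('>=', 'Over'), ('000 Bp', 'kbp'), ('#', 'Num'), ('(', ''), (')', ''), (' ', '')
    header = '# contigs (>= 1000 bp)'.title()
    # Apply each (old, new) pair in turn to the title-cased header
    key = functools.reduce(lambda a, kv: a.replace(*kv), repls, header)
    print(key)  # NumContigsOver1kbp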
Example #6
 def __init__(self, inputobject):
     self.metadata = inputobject.runmetadata.samples
     self.cpus = inputobject.cpus
     try:
         self.threads = int(self.cpus / len(
             self.metadata)) if self.cpus / len(self.metadata) > 1 else 1
     except TypeError:
         self.threads = self.cpus
     # self.devnull = open(os.devnull, 'wb')
     self.qcqueue = Queue(maxsize=self.cpus)
     self.trimqueue = Queue(maxsize=self.cpus)
     self.correctqueue = Queue(maxsize=self.cpus)
     self.start = inputobject.starttime
     try:
         self.forwardlength = inputobject.forwardlength
         self.reverselength = inputobject.reverselength
     except AttributeError:
         self.forwardlength = 'full'
         self.reverselength = 'full'
     self.numreads = inputobject.numreads
     self.logfile = inputobject.logfile
     self.path = inputobject.path
     self.analysistype = 'quality'
     self.reffilepath = inputobject.reffilepath
     # Initialise the quality attribute in the metadata object
     for sample in self.metadata:
         setattr(sample, self.analysistype, GenObject())
Example #7
    def __init__(self, passed):
        """Initialise variables"""
        self.path = passed.path
        self.runinfo = passed.runinfo
        self.flowcell = "NA"
        self.instrument = "NA"
        self.samples = list()
        self.ids = list()
        self.date = str()
        self.totalreads = 0
        self.runid = str()
        self.runnumber = str()
        self.commit = passed.commit

        # Create and start to populate the header object
        self.header = GenObject()
        # If a custom sample sheet has been provided, use it
        if passed.customsamplesheet:
            self.samplesheet = passed.customsamplesheet
            assert os.path.isfile(self.samplesheet), u'Could not find CustomSampleSheet as entered: {0!r:s}'\
                .format(self.samplesheet)
        else:
            self.samplesheet = os.path.join(self.path, "SampleSheet.csv")
        # Extract data from SampleSheet.csv
        self.parsesamplesheet()
Example #8
 def parse_qaml(self):
     """
     Parse the GenomeQAML report, and populate metadata objects
     """
     printtime('Parsing GenomeQAML outputs', self.start)
     # A dictionary to store the parsed excel file in a more readable format
     nesteddictionary = dict()
     # Use pandas to read in the CSV file, and convert the pandas data frame to a dictionary (.to_dict())
     dictionary = pandas.read_csv(self.qaml_report).to_dict()
     # Iterate through the dictionary - each header from the CSV file
     for header in dictionary:
         # Sample is the primary key, and value is the value of the cell for that primary key + header combination
         for sample, value in dictionary[header].items():
             # Update the dictionary with the new data
             try:
                 nesteddictionary[sample].update({header: value})
             # Create the nested dictionary if it hasn't been created yet
             except KeyError:
                 nesteddictionary[sample] = dict()
                 nesteddictionary[sample].update({header: value})
     # Get the results into the metadata object
     for sample in self.metadata:
         # Initialise the GenomeQAML GenObject
         setattr(sample, self.analysistype, GenObject())
         # Initialise the class prediction attribute
         sample[self.analysistype].prediction = str()
         # Iterate through the dictionary of results
         for line in nesteddictionary:
             # Extract the sample name from the dictionary
             name = nesteddictionary[line]['Sample']
             # Ensure that the names match
             if name == sample.name:
                 # Set the predicted class extracted from the dictionary
                 sample[self.analysistype].prediction = nesteddictionary[
                     line]['Predicted_Class']
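Note on the dictionary shape in parse_qaml() above: pandas DataFrame.to_dict() keys by column header first, then by row index, which is why the nested loop inverts it into a per-sample dictionary. A small illustration:

    import pandas

    frame = pandas.DataFrame({'Sample': ['strainA'], 'Predicted_Class': ['Pass']})
    # to_dict() returns {column_header: {row_index: value}}
    print(frame.to_dict())  # {'Sample': {0: 'strainA'}, 'Predicted_Class': {0: 'Pass'}}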
Example #9
 def helper(self):
     """Helper function for file creation (if desired), manipulation, quality assessment,
     and trimming as well as the assembly"""
     # Simple assembly without requiring accessory files (SampleSheet.csv, etc).
     if self.basicassembly:
         self.runmetadata = Basic(self)
     else:
         # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml, and
         # RunInfo.xml files
         self.runinfo = os.path.join(self.path, 'RunInfo.xml')
         self.runmetadata = runMetadata.Metadata(self)
         # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
         self.runmetadata.parseruninfo()
         # Extract PhiX mapping information from the run
         phi = phix.PhiX(self)
         phi.main()
         # Populate the metadata with placeholder 'NA' values for the bcl2fastq and nohup calls
         for sample in self.runmetadata.samples:
             sample.commands = GenObject()
             sample.commands.nohupcall = 'NA'
             sample.commands.bclcall = 'NA'
         # Move/link the FASTQ files to strain-specific working directories
         fastqmover.FastqMover(self)
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)
Example #10
 def primers(self):
     """Setup and create threads for ePCR"""
     # Create the threads for the ePCR analysis
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             threads = Thread(target=self.epcr, args=())
             threads.setDaemon(True)
             threads.start()
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             setattr(sample, self.analysistype, GenObject())
             # Get the primers ready
             try:
                 sample[self.analysistype].primers = glob(os.path.join(self.reffilepath,
                                                                       self.analysistype,
                                                                       sample.general.referencegenus,
                                                                       'primers',
                                                                       '*.txt'))[0]
                 # Find the name of the probe file
                 sample[self.analysistype].probes = glob(os.path.join(self.reffilepath,
                                                                      self.analysistype,
                                                                      sample.general.referencegenus,
                                                                      'probes',
                                                                      '*.fa'))[0]
                 # Create the BLAST database of the probes (if necessary)
                 self.makeblastdb(sample[self.analysistype].probes)
                 # Initialise a list to store the names of the targets
                 sample[self.analysistype].targets = list()
                 # Open the primer file, and read the names of the targets into a list
                 with open(sample[self.analysistype].primers, 'r') as primerfile:
                     for line in primerfile:
                         sample[self.analysistype].targets.append(line.split('\t')[0])
             # Organisms without primer/probe files will fail. Populate metadata with 'NA' values
             except IndexError:
                 sample[self.analysistype].primers = 'NA'
                 sample[self.analysistype].probes = 'NA'
             # Only try to process organisms with primer files
             if sample[self.analysistype].primers != 'NA':
                 # Make the output path
                 sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                                    self.analysistype)
                 make_path(sample[self.analysistype].reportdir)
                 # Set the base name of the output file
                 outfile = os.path.join(sample[self.analysistype].reportdir, sample.name)
                 # Set the hashing and mapping commands
                 sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
                 sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
                 # re-PCR uses the subtyping primers list to search the contigs file using the following parameters
                 # -S {hash file} (Perform STS lookup using hash-file), -r + (Enable/disable reverse STS lookup)
                 # -m 10000 (Set variability for STS size for lookup),
                 # -n 2 (Set max allowed mismatches per primer for lookup)
                 # -g 0 (Set max allowed indels per primer for lookup),
                 # -G (Print alignments in comments), -q (quiet), -o {output file}
                 sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 2 -g 0 -G -q -o {}.txt {}' \
                     .format(outfile, outfile, sample[self.analysistype].primers)
                 # Add the variables to the queue
                 self.epcrqueue.put((sample, outfile))
     self.epcrqueue.join()
Example #11
 def error(sample, message):
     """
      Check to see if the run GenObject exists. If so, update run.status to reflect the error
      :param sample: metadata sample object
      :param message: error message to add to the sample.run.status attribute
     """
     # Set the .fastqfiles attribute to 'NA' to remove this strain from the analyses
     sample.general.fastqfiles = ['NA']
     # Ensure that the run attribute exists
     if GenObject.isattr(sample, 'run'):
         # If the status attribute exists, overwrite it, otherwise create and populate it
         if GenObject.isattr(sample.run, 'status'):
             sample.run.status = message
         else:
             setattr(sample.run, 'status', message)
     # Otherwise create the run attribute, and populate run.status
     else:
         setattr(sample, 'run', GenObject())
         sample.run.status = message
Example #12
 def extract_rmlst_reads(self):
     """
     rMLST read extraction. Should be the first thing called after parsing the fastq directory.
     """
     for sample in self.metadata:
         # Create the object to store the variables
         setattr(sample, self.analysistype, GenObject())
         # Initialise variables
         sample[self.analysistype].snv_count = list()
         # Initialise a starting value for the number of unique kmers found in each sample
         sample[self.analysistype].unique_kmers = -1
         # Set and create the output directory
         try:
             sample[self.analysistype].outputdir = os.path.join(
                 sample.run.outputdirectory, self.analysistype)
         except KeyError:
             sample[self.analysistype].outputdir = os.path.join(
                 sample.general.outputdirectory, self.analysistype)
         make_path(sample[self.analysistype].outputdir)
         sample[self.analysistype].logout = os.path.join(
             sample[self.analysistype].outputdir, 'logout.txt')
         sample[self.analysistype].logerr = os.path.join(
             sample[self.analysistype].outputdir, 'logerr.txt')
         sample[self.analysistype].baitedfastq = os.path.join(
             sample[self.analysistype].outputdir,
             '{}_targetMatches.fastq.gz'.format(self.analysistype))
         # Create the command to run the baiting - paired inputs and a single, zipped output
         sample[self.analysistype].bbdukcmd = 'bbduk.sh ref={} in1={} in2={} threads={} outm={}'\
             .format(self.database,
                     sample.general.trimmedcorrectedfastqfiles[0],
                     sample.general.trimmedcorrectedfastqfiles[1],
                     str(self.threads),
                     sample[self.analysistype].baitedfastq)
         # Sometimes bbduk hangs forever, so that needs to be handled. Give it a very generous timeout.
         try:
             # Run the call, and write any errors to the logfile
             command = sample[self.analysistype].bbdukcmd
             if self.analyse:
                 out, err = run_subprocess(command)
             else:
                 out = str()
                 err = str()
             write_to_logfile(command, command, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         except TimeoutExpired:
             print('ERROR: Could not extract rMLST reads from sample {}'.
                   format(sample.name))
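Note: extract_rmlst_reads() above catches TimeoutExpired, so the run_subprocess() wrapper is assumed to enforce a timeout internally; that wrapper is not reproduced in this listing. A minimal, hypothetical sketch of one way such a wrapper could work (the 12-hour timeout is an assumption, not the pipeline's actual value):

    import subprocess

    def run_subprocess(command, timeout=43200):
        """Hypothetical sketch: run a shell command and return decoded stdout/stderr.
        subprocess.run() raises subprocess.TimeoutExpired if the call hangs"""
        process = subprocess.run(command,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 timeout=timeout)
        return process.stdout.decode(), process.stderr.decode()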
Example #13
 def setup(self):
     """
     Set up the metadata object to be passed to Vtyper()
     """
      from glob import glob
      files = sorted(glob(os.path.join(self.sequencepath, '*.fasta')))
     samples = list()
     # Create the metadata for each file
     for fasta in files:
         # Create a metadata object to store all metadata associated with each strain
         metadata = MetadataObject()
         metadata.general = GenObject()
         metadata.commands = GenObject()
         # Set the name
         metadata.name = os.path.basename(fasta).split('.')[0]
         metadata.general.bestassemblyfile = fasta
         metadata.general.stx = True
         metadata.general.outputdirectory = self.path
          metadata.general.filenoext = os.path.splitext(fasta)[0]
         metadata.general.fastqfiles = list()
         samples.append(metadata)
     return samples
Example #14
 def main(self):
     """
     Run the necessary methods in the correct order
     """
     printtime('Starting {} analysis pipeline'.format(self.analysistype), self.starttime)
     # Create the objects to be used in the analyses
     objects = Objectprep(self)
     objects.objectprep()
     self.runmetadata = objects.samples
     self.threads = int(self.cpus / len(self.runmetadata.samples)) if self.cpus / len(self.runmetadata.samples) > 1 \
         else 1
     # Run the genesippr analyses
     self.analysistype = 'genesippr'
     self.targetpath = os.path.join(self.reffilepath, self.analysistype, '')
     Sippr(self, 0.90)
     # Create the reports
     self.reports = Reports(self)
     self.reports.reporter()
     # Use the filtered database as the target path for the 16S analyses
     self.targetpath = self.reffilepath
     # Run the 16S analyses
     self.analysistype = 'sixteens_full'
     SixteensFull(self, self.commit, self.starttime, self.homepath, 'sixteens_full', 0.985)
     # ResFinding
     Resistance(self, self.commit, self.starttime, self.homepath, 'resfinder', 0.90, False, True)
     # Run the GDCS analysis
     self.analysistype = 'GDCS'
     self.pipeline = True
     self.targetpath = os.path.join(self.targetpath, self.analysistype)
     Sippr(self, 0.95)
     # Create the GDCS report
     self.reports.gdcsreporter()
     # Perform serotyping for samples classified as Escherichia
     for sample in self.runmetadata.samples:
         # Create the GenObject for all samples, so the 'NA' values below can be set
         sample.mash = GenObject()
         if sample.general.bestassemblyfile != 'NA':
             try:
                 sample.mash.closestrefseqgenus = sample.general.closestrefseqgenus
                 for genus, species in self.taxonomy.items():
                     if genus == sample.mash.closestrefseqgenus:
                         sample.mash.closestrefseqspecies = species
             except KeyError:
                 sample.mash.closestrefseqgenus = 'NA'
                 sample.mash.closestrefseqspecies = 'NA'
         else:
             sample.mash.closestrefseqgenus = 'NA'
             sample.mash.closestrefseqspecies = 'NA'
     SeroSippr(self, self.commit, self.starttime, self.homepath, 'serosippr', 0.95, True)
     # Print the metadata
     printer = MetadataPrinter(self)
     printer.printmetadata()
Example #15
 def predictthreads(self):
     printtime('Performing gene predictions', self.start)
     # Create the threads for the analyses
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             threads = Thread(target=self.predict, args=())
             threads.setDaemon(True)
             threads.start()
     for sample in self.metadata:
         # Create the .prodigal attribute
         sample.prodigal = GenObject()
         if sample.general.bestassemblyfile != 'NA':
             self.predictqueue.put(sample)
     self.predictqueue.join()
Example #16
 def sistr(self):
     """Perform sistr analyses on Salmonella"""
     printtime('Performing sistr analyses', self.start)
     for sample in self.metadata:
         # Create the analysis-type specific attribute
         setattr(sample, self.analysistype, GenObject())
         if sample.general.bestassemblyfile != 'NA':
             try:
                 # Only process strains that have been determined to be Salmonella
                 if sample.general.referencegenus == 'Salmonella':
                     # Set and create the path of the directory to store the strain-specific reports
                     sample[self.analysistype].reportdir = os.path.join(
                         sample.general.outputdirectory, self.analysistype)
                     # Name of the .json output file
                     sample[self.analysistype].jsonoutput = os.path.join(
                         sample[self.analysistype].reportdir,
                         '{}.json'.format(sample.name))
                     # Set the sistr system call
                     sample.commands.sistr = \
                         'sistr -f json -o {} -t {} -T {} {}'\
                         .format(sample[self.analysistype].jsonoutput,
                                 self.cpus,
                                 os.path.join(sample[self.analysistype].reportdir, 'tmp'),
                                 sample.general.bestassemblyfile)
                     # Set the paths of the log files
                     sample[self.analysistype].logout = os.path.join(
                         sample[self.analysistype].reportdir, 'logout')
                     sample[self.analysistype].logerr = os.path.join(
                         sample[self.analysistype].reportdir, 'logerr')
                     # Only run the analyses if the output json file does not exist
                     if not os.path.isfile(
                             sample[self.analysistype].jsonoutput):
                         out, err = run_subprocess(sample.commands.sistr)
                         write_to_logfile(sample.commands.sistr,
                                          sample.commands.sistr,
                                          self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                         write_to_logfile(out, err, self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                     self.queue.task_done()
             except (ValueError, KeyError):
                 pass
     self.queue.join()
     self.report()
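Note: write_to_logfile() above is another accessoryFunctions helper not shown in this listing. A minimal sketch of the assumed behaviour, appending stdout and stderr to the pipeline-, sample-, and analysis-level logs; the file-naming convention is an assumption:

    def write_to_logfile(out, err, logfile, samplelog=None, sampleerr=None,
                         analysislog=None, analysiserr=None):
        """Hypothetical sketch: append the supplied stdout/stderr to every log pair provided"""
        log_pairs = [(logfile + '_out.txt', logfile + '_err.txt'),
                     (samplelog, sampleerr),
                     (analysislog, analysiserr)]
        for out_log, err_log in log_pairs:
            if out_log and err_log:
                with open(out_log, 'a') as log:
                    log.write(out + '\n')
                with open(err_log, 'a') as log:
                    log.write(err + '\n')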
Example #17
 def fasta_records(self):
     """
     Use SeqIO to create dictionaries of all records for each FASTA file
     """
     for sample in self.metadata:
         # Create the analysis-type specific attribute
         setattr(sample, self.analysistype, GenObject())
         # Create a dictionary of records for each file
         try:
             record_dict = SeqIO.to_dict(
                 SeqIO.parse(sample.general.bestassemblyfile, "fasta"))
         except FileNotFoundError:
             record_dict = dict()
         # Set the records dictionary as the attribute for the object
         sample[self.analysistype].record_dict = record_dict
Example #18
    def epcrparse(self):
        """
        Parse the ePCR text file outputs
        """
        printtime('Parsing ePCR results', self.start)
        for sample in self.metadata:
            if sample.general.bestassemblyfile != 'NA':
                if 'stx' in sample.general.datastore:
                    # Initialise count - this allows for the population of vtyperresults with unique values
                    uniquecount = 0
                    # This populates vtyperresults with the verotoxin subtypes
                    toxinlist = []
                    if os.path.isfile(sample[self.analysistype].resultsfile):
                        with open(sample[self.analysistype].resultsfile, 'r') as epcrresults:
                            for result in epcrresults:
                                # Only the lines without a # contain results
                                if "#" not in result:
                                    uniquecount += 1
                                    # Split on \t
                                    data = result.split('\t')
                                    # The subtyping primer pair is the first entry on lines with results
                                    vttype = data[0].split('_')[0]
                                    # Add the name of the primer pair - stripped of anything after a _ - to the list
                                    if vttype not in toxinlist:
                                        toxinlist.append(vttype)
                    # Create a string of the sorted entries in toxinlist joined with ";"
                    toxinstring = ";".join(sorted(toxinlist))
                    # Save the string to the metadata
                    sample[self.analysistype].toxinprofile = toxinstring
                else:
                    setattr(sample, self.analysistype, GenObject())
                    sample[self.analysistype].toxinprofile = 'NA'
            else:
                setattr(sample, self.analysistype, GenObject())
                sample[self.analysistype].toxinprofile = 'NA'
Example #19
 def reader(self):
     import os
     import json
     from accessoryFunctions.accessoryFunctions import GenObject, MetadataObject
     for sample in self.metadata:
         metadatafile = os.path.join(self.path, sample.name,
                                     '{}_metadata.json'.format(sample.name))
         if os.path.isfile(metadatafile):
             size = os.stat(metadatafile).st_size
             if size != 0:
                 try:
                     with open(metadatafile) as metadatareport:
                         jsondata = json.load(metadatareport)
                     # Create the metadata objects
                     metadata = MetadataObject()
                     # Initialise the metadata categories as GenObjects created using the appropriate key
                     for attr in jsondata:
                         if not isinstance(jsondata[attr], dict):
                             setattr(metadata, attr, jsondata[attr])
                         else:
                             setattr(metadata, attr,
                                     GenObject(jsondata[attr]))
                     # As files often need to be reanalysed after being moved, test to see if it possible to use the
                     # metadata from the previous assembly
                     jsonfile = os.path.join(metadata.general.outputdirectory,
                                             '{}_metadata.json'.format(sample.name))
                     try:
                         # Open the metadata file to write (text mode, since this is text in python3)
                         with open(jsonfile, 'w') as metadatawriter:
                             # Write the json dump of the object dump to the metadata file
                             json.dump(sample.dump(),
                                       metadatawriter,
                                       sort_keys=True,
                                       indent=4,
                                       separators=(',', ': '))
                         # Set the name
                         metadata.name = sample.name
                         self.samples.append(metadata)
                     except IOError:
                         self.samples.append(sample)
                 except ValueError:
                     self.samples.append(sample)
         else:
             self.samples.append(sample)
Example #20
 def vtyper(self):
     """Set up and create threads for ePCR"""
     printtime('Running ePCR', self.start)
     # Create the threads for the BLAST analysis
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             threads = Thread(target=self.epcr, args=())
             threads.setDaemon(True)
             threads.start()
     # Create the system calls for famap, fahash, and ePCR
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             if 'stx' in sample.general.datastore:
                 setattr(sample, self.analysistype, GenObject())
                 # Get the primers ready
                 if self.reffilepath:
                     sample[self.analysistype].primers = os.path.join(self.reffilepath, self.analysistype,
                                                                      'vtx_subtyping_primers.txt')
                 else:
                     sample[self.analysistype].primers = self.primerfile
                 # Make the output path
                 sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                                    self.analysistype)
                 make_path(sample[self.analysistype].reportdir)
                 outfile = os.path.join(sample[self.analysistype].reportdir, sample.name)
                 # Set the hashing and mapping commands
                 sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
                 sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
                 # re-PCR uses the subtyping primers list to search the contigs file using the following parameters
                 # -S {hash file} (Perform STS lookup using hash-file),
                 # -r + (Enable/disable reverse STS lookup)
                 # -m 10000 (Set variability for STS size for lookup),
                 # -n 1 (Set max allowed mismatches per primer for lookup)
                 # -g 0 (Set max allowed indels per primer for lookup),
                 # -G (Print alignments in comments),
                 # -q quiet
                 # -o {output file},
                 sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 1 -g 0 -G -q -o {}.txt {}'\
                     .format(outfile, outfile, sample[self.analysistype].primers)
                 sample[self.analysistype].resultsfile = '{}.txt'.format(outfile)
                 self.epcrqueue.put((sample, outfile))
     self.epcrqueue.join()
     self.epcrparse()
Example #21
    def runner(self):
        """
        Run the necessary methods in the correct order
        """
        printtime('Starting {} analysis pipeline'.format(self.analysistype),
                  self.starttime,
                  output=self.portallog)
        if not self.pipeline:
            # If the metadata has been passed from the method script, self.pipeline must still be false in order to
            # get Sippr() to function correctly, but the metadata shouldn't be recreated
            try:
                _ = vars(self.runmetadata)['samples']
            except KeyError:
                # Create the objects to be used in the analyses
                objects = Objectprep(self)
                objects.objectprep()
                self.runmetadata = objects.samples

        else:
            for sample in self.runmetadata.samples:
                setattr(sample, self.analysistype, GenObject())
                sample.run.outputdirectory = sample.general.outputdirectory
        self.threads = int(self.cpus / len(self.runmetadata.samples)) \
            if self.cpus / len(self.runmetadata.samples) > 1 \
            else 1
        # Use a custom sippr method to use the full reference database as bait, and run mirabait against the FASTQ
        # reads - do not perform reference mapping yet
        SixteenSBait(self, self.cutoff)
        # Subsample 1000 reads from the FASTQ files
        self.subsample()
        # Convert the subsampled FASTQ files to FASTA format
        self.fasta()
        # Create BLAST databases if required
        self.makeblastdb()
        # Run BLAST analyses of the subsampled FASTA files against the NCBI 16S reference database
        self.blast()
        # Parse the BLAST results
        self.blastparse()
        # Feed the BLAST results into a modified sippr method to perform reference mapping using the calculated
        # genus of the sample as the mapping file
        SixteenSSipper(self, self.cutoff)
        # Create reports
        self.reporter()
Example #22
    def sketching(self):
        printtime('Indexing assemblies for mash analysis', self.starttime)
        # Create the threads for the analysis
        for sample in self.metadata:
            if sample.general.bestassemblyfile != 'NA':
                threads = Thread(target=self.sketch, args=())
                threads.setDaemon(True)
                threads.start()
        # Populate threads for each gene, genome combination
        for sample in self.metadata:
            # Create the analysis type-specific GenObject
            setattr(sample, self.analysistype, GenObject())
            if sample.general.bestassemblyfile != 'NA':
                # Set attributes
                sample[self.analysistype].reportdir = os.path.join(
                    sample.general.outputdirectory, self.analysistype)
                sample[self.analysistype].targetpath = os.path.join(
                    self.referencefilepath, self.analysistype)
                sample[self.analysistype].refseqsketch = os.path.join(
                    sample[self.analysistype].targetpath, 'RefSeqSketchesDefaults.msh')
                sample[self.analysistype].sketchfilenoext = os.path.join(
                    sample[self.analysistype].reportdir, sample.name)
                sample[self.analysistype].sketchfile = sample[self.analysistype].sketchfilenoext + '.msh'
                # Make the mash output directory if necessary
                make_path(sample[self.analysistype].reportdir)
                # Create a file containing the path/name of the filtered, corrected fastq files
                sample[self.analysistype].filelist = os.path.join(
                    sample[self.analysistype].reportdir, '{}_fastqfiles.txt'.format(sample.name))
                with open(sample[self.analysistype].filelist, 'w') as filelist:
                    filelist.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))

                # Create the system call
                sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
                    .format(self.cpus, sample[self.analysistype].filelist, sample[self.analysistype].sketchfilenoext)
                # Add each sample to the threads
                self.sketchqueue.put(sample)
        # Join the threads
        self.sketchqueue.join()
        self.mashing()
Example #23
    def sketching(self):
        printtime('Indexing files for {} analysis'.format(self.analysistype), self.starttime)
        # Create the threads for the analysis
        for i in range(self.cpus):
            threads = Thread(target=self.sketch, args=())
            threads.setDaemon(True)
            threads.start()
        # Populate threads for each gene, genome combination
        for sample in self.metadata:
            # Create the analysis type-specific GenObject
            setattr(sample, self.analysistype, GenObject())
            # Set attributes
            sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
            sample[self.analysistype].targetpath = self.referencefilepath if not self.pipeline else os.path.join(
                self.referencefilepath, self.analysistype)
            sample[self.analysistype].refseqsketch = os.path.join(sample[self.analysistype].targetpath,
                                                                  'RefSeqSketchesDefaults.msh')
            sample[self.analysistype].sketchfilenoext = os.path.join(sample[self.analysistype].reportdir, sample.name)
            sample[self.analysistype].sketchfile = sample[self.analysistype].sketchfilenoext + '.msh'
            # Make the mash output directory if necessary
            make_path(sample[self.analysistype].reportdir)
            # Create a file containing the path/name of the filtered, corrected fastq files
            sample[self.analysistype].filelist = os.path.join(sample[self.analysistype].reportdir,
                                                              '{}_fastqfiles.txt'.format(sample.name))
            with open(sample[self.analysistype].filelist, 'w') as filelist:
                filelist.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))

            # Create the system call
            sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
                .format(self.cpus, sample[self.analysistype].filelist, sample[self.analysistype].sketchfilenoext)
            # Add each sample to the threads
            try:
                self.sketchqueue.put(sample)
            except (KeyboardInterrupt, SystemExit):
                printtime('Received keyboard interrupt, quitting threads', self.starttime)
                quit()
        # Join the threads
        self.sketchqueue.join()
        self.mashing()
Example #24
 def versions(self):
     for sample in self.metadata:
         # Initialise the attribute
         sample.software = GenObject()
         # Populate the versions of the software used
         ss = sample.software
         ss.python = self.python
         ss.arch = self.arch
         ss.blast = self.blast
         ss.bowtie2 = self.bowversion
         ss.samtools = self.samversion
         ss.qualimap = self.qualimap
         ss.mash = self.mash
         ss.prodigal = self.prodigal
         ss.pipeline = self.commit
         ss.spades = self.spades
         ss.bbmap = self.bbmap
         ss.fastqc = self.fastqc
         ss.bcl2fastq = self.bcl2fastq
         ss.perl = self.perl
         ss.biopython = self.biopython
         ss.java = self.java
Example #25
 def targets(self):
     """
     Create the GenObject for the analysis type, create the hash file for baiting (if necessary)
     """
     for sample in self.runmetadata:
         if sample.general.bestassemblyfile != 'NA':
             setattr(sample, self.analysistype, GenObject())
             sample[self.analysistype].runanalysis = True
             sample[self.analysistype].targetpath = self.targetpath
             baitpath = os.path.join(self.targetpath, 'bait')
             sample[self.analysistype].baitfile = glob(
                 os.path.join(baitpath, '*.fa'))[0]
             sample[self.analysistype].outputdir = os.path.join(
                 sample.run.outputdirectory, self.analysistype)
             sample[self.analysistype].logout = os.path.join(
                 sample[self.analysistype].outputdir, 'logout.txt')
             sample[self.analysistype].logerr = os.path.join(
                 sample[self.analysistype].outputdir, 'logerr.txt')
             sample[self.analysistype].baitedfastq = os.path.join(
                 sample[self.analysistype].outputdir,
                 '{}_targetMatches.fastq'.format(self.analysistype))
             sample[self.analysistype].complete = False
Example #26
    arguments = parser.parse_args()

    # Define the start time
    arguments.starttime = time.time()

    # Find the files
    fastas = sorted(glob(os.path.join(arguments.sequencepath, '*.fa*')))

    # Create a metadata object
    arguments.runmetadata = MetadataObject()
    arguments.runmetadata.samples = list()
    for fasta in fastas:
        metadata = MetadataObject()
        metadata.name = os.path.split(fasta)[1].split('.')[0]
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Set the destination folder
        outputdir = os.path.join(arguments.sequencepath, metadata.name)
        make_path(outputdir)
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.run.outputdirectory = outputdir
        # Initialise an attribute to store commands
        metadata.commands = GenObject()
        # Assume that all samples are Salmonella
        metadata.general.referencegenus = 'Salmonella'
        # Set the .fasta file as the best assembly
        metadata.general.bestassemblyfile = fasta
        arguments.runmetadata.samples.append(metadata)
Example #27
    def contamination_finder(self,
                             input_path=None,
                             report_path=None,
                             portal_log=None):
        """
        Helper function to get confindr integrated into the assembly pipeline
        """
        if portal_log is not None:
            printtime('Calculating contamination in reads',
                      self.start,
                      output=portal_log)
        else:
            printtime('Calculating contamination in reads', self.start)
        if input_path is not None:
            input_dir = input_path
        else:
            input_dir = self.path
        if report_path is not None:
            reportpath = report_path
        else:
            reportpath = os.path.join(input_dir, 'confindr')
        report = os.path.join(reportpath, 'confindr_report.csv')

        if not os.path.isfile(report):
            # Create an object to store attributes to pass to confinder
            args = MetadataObject()
            args.input_directory = input_dir
            args.output_name = reportpath
            args.databases = os.path.join(self.reffilepath, 'ConFindr',
                                          'databases')
            args.forward_id = '_R1'
            args.reverse_id = '_R2'
            args.threads = self.cpus
            args.kmer_size = 31
            args.number_subsamples = 3
            args.subsample_depth = 20
            args.kmer_cutoff = 2
            try:
                shutil.rmtree(args.output_name)
            except IOError:
                pass
            make_path(reportpath)
            # Open the output report file, and write the header
            with open(report, 'w') as f:
                f.write(
                    'Strain,Genus,NumContamSNVs,NumUniqueKmers,ContamStatus\n')
            for sample in self.metadata:
                if len(sample.general.trimmedcorrectedfastqfiles) == 2:
                    confindr.find_contamination(
                        sample.general.trimmedcorrectedfastqfiles, args)
                elif len(sample.general.trimmedcorrectedfastqfiles) == 1:
                    confindr.find_contamination_unpaired(
                        args, sample.general.trimmedcorrectedfastqfiles[0])
            if portal_log:
                printtime('Contamination detection complete!',
                          self.start,
                          output=portal_log)
            else:
                printtime('Contamination detection complete!', self.start)
        # Load the confindr report into a dictionary using pandas
        # https://stackoverflow.com/questions/33620982/reading-csv-file-as-dictionary-using-pandas
        confindr_results = pandas.read_csv(report, index_col=0).T.to_dict()
        # Find the results for each of the samples
        for sample in self.metadata:
            # Create a GenObject to store the results
            sample.confindr = GenObject()
            # Iterate through the dictionary to find the outputs for each sample
            for line in confindr_results:
                # If the current line corresponds to the sample of interest
                if sample.name in line:
                    # Set the values using the appropriate keys as the attributes
                    sample.confindr.genus = confindr_results[line]['Genus']
                    sample.confindr.num_contaminated_snvs = confindr_results[
                        line]['NumContamSNVs']
                    sample.confindr.unique_kmers = confindr_results[line][
                        'NumUniqueKmers']
                    try:
                        sample.confindr.cross_contamination = confindr_results[
                            line]['CrossContamination']
                    except KeyError:
                        sample.confindr.cross_contamination = str()
                    sample.confindr.contam_status = confindr_results[line][
                        'ContamStatus']
                    if str(sample.confindr.contam_status) == 'True':
                        sample.confindr.contam_status = 'Contaminated'
                    elif str(sample.confindr.contam_status) == 'False':
                        sample.confindr.contam_status = 'Clean'
        # Re-write the output to be consistent with the rest of the pipeline
        with open(os.path.join(reportpath, 'confindr_report.csv'), 'w') as csv:
            data = 'Strain,Genus,NumContamSNVs,NumUniqueKmers,ContamStatus\n'
            for sample in self.metadata:
                data += '{str},{genus},{numcontamsnv},{numuniqkmer},{status}\n'.format(
                    str=sample.name,
                    genus=sample.confindr.genus,
                    numcontamsnv=sample.confindr.num_contaminated_snvs,
                    numuniqkmer=sample.confindr.unique_kmers,
                    status=sample.confindr.contam_status)
            csv.write(data)
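Note on the dictionary shape in contamination_finder() above: read_csv(report, index_col=0).T.to_dict() keys the results by strain name first, then by column header. A small illustration of the same transpose pattern:

    import pandas

    frame = pandas.DataFrame({'Strain': ['strainA'], 'Genus': ['Escherichia']})
    # With the strain as the index, .T.to_dict() keys by strain name first
    print(frame.set_index('Strain').T.to_dict())  # {'strainA': {'Genus': 'Escherichia'}}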
Example #28
    def probefinder(self):
        """
        Find the longest probe sequences
        """
        logging.info('Finding and filtering probe sequences')
        for sample in self.samples:
            # A list to store the metadata object for each alignment
            sample.gene = list()
            for align in sample.alignedalleles:
                # Create an object to store all the information for each alignment file
                metadata = GenObject()
                metadata.name = os.path.splitext(os.path.basename(align))[0]
                metadata.alignmentfile = align
                # Create an alignment object from the alignment file
                try:
                    metadata.alignment = AlignIO.read(align, 'fasta')
                except ValueError:
                    # If a ValueError: Sequences must all be the same length is raised, pad the shorter sequences
                    # to be the length of the longest sequence
                    # https://stackoverflow.com/questions/32833230/biopython-alignio-valueerror-says-strings-must-be-same-length
                    records = SeqIO.parse(align, 'fasta')
                    # Make a copy, otherwise our generator is exhausted after calculating maxlen
                    records = list(records)
                    # Calculate the length of the longest sequence
                    maxlen = max(len(record.seq) for record in records)
                    # Pad sequences so that they all have the same length
                    for record in records:
                        if len(record.seq) != maxlen:
                            sequence = str(record.seq).ljust(maxlen, '.')
                            record.seq = Seq(sequence)
                    assert all(len(record.seq) == maxlen for record in records)
                    # Write to file and do alignment
                    metadata.alignmentfile = '{}_padded.tfa'.format(
                        os.path.splitext(align)[0])
                    with open(metadata.alignmentfile, 'w') as padded:
                        SeqIO.write(records, padded, 'fasta')
                    # Align the padded sequences
                    metadata.alignment = AlignIO.read(metadata.alignmentfile,
                                                      'fasta')

                metadata.summaryalign = AlignInfo.SummaryInfo(
                    metadata.alignment)
                # The dumb consensus is a very simple consensus sequence calculated from the alignment. Default
                # parameters of threshold=.7, and ambiguous='X' are used
                consensus = metadata.summaryalign.dumb_consensus()
                metadata.consensus = str(consensus)
                # The position-specific scoring matrix (PSSM) stores the frequency of each base observed at each
                # location along the entire consensus sequence
                metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(
                    consensus)
                metadata.identity = list()
                # Find the prevalence of each base for every location along the sequence
                for line in metadata.pssm:
                    try:
                        bases = [
                            line['A'], line['C'], line['G'], line['T'],
                            line['-']
                        ]
                        # Calculate the frequency of the most common base - don't count gaps
                        metadata.identity.append(
                            float('{:.2f}'.format(
                                max(bases[:4]) / sum(bases) * 100)))
                    except KeyError:
                        bases = [line['A'], line['C'], line['G'], line['T']]
                        # Calculate the frequency of the most common base - don't count gaps
                        metadata.identity.append(
                            float('{:.2f}'.format(
                                max(bases) / sum(bases) * 100)))
                # List to store metadata objects
                metadata.windows = list()
                # Variable to store whether a suitable probe has been found for the current organism + gene pair.
                # As the probe sizes are evaluated in descending size, as soon as a probe has been discovered, the
                # search for more probes can stop, and subsequent probes will be smaller than the one(s) already found
                passing = False
                # Create sliding windows of size self.max - self.min from the list of identities for each column
                # of the alignment
                for i in reversed(range(self.min, self.max + 1)):
                    if not passing:
                        windowdata = MetadataObject()
                        windowdata.size = i
                        windowdata.max = 0
                        # Initialise the minimum to 100% so that the first passing window can lower it
                        windowdata.min = 100
                        windowdata.sliding = list()
                        # Create a counter to store the starting location of the window in the sequence
                        n = 0
                        # Create sliding windows from the range of sizes for the list of identities
                        windows = self.window(metadata.identity, i)
                        # Go through each window from the collection of sliding windows to determine which window(s)
                        # has (have) the best results
                        for window in windows:
                            # Create another object to store all the data for the window
                            slidingdata = MetadataObject()
                            # Only consider the window if every position has a percent identity greater than the cutoff
                            if min(window) > self.cutoff:
                                # Populate the object with the necessary variables
                                slidingdata.location = '{}:{}'.format(n, n + i)
                                slidingdata.min = min(window)
                                slidingdata.mean = float('{:.2f}'.format(
                                    numpy.mean(window)))
                                slidingdata.sequence = str(consensus[n:n + i])
                                # Create attributes for evaluating windows: a greater windowdata.max (or a
                                # lesser windowdata.min) indicates a better (or worse) overall percent identity
                                windowdata.max = slidingdata.mean if slidingdata.mean >= windowdata.max \
                                    else windowdata.max
                                windowdata.min = slidingdata.mean if slidingdata.mean <= windowdata.min \
                                    else windowdata.min
                                # Add the object to the list of objects
                                windowdata.sliding.append(slidingdata)
                                passing = True
                            n += 1
                        # Add the window object to the list of windows for this size
                        metadata.windows.append(windowdata)
                # Add the gene-specific metadata object to the sample
                sample.gene.append(metadata)
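The self.window helper called above is not shown in this example. Below is a minimal sketch of a
sliding-window generator consistent with the call self.window(metadata.identity, i) (the name and
signature are inferred from the call site; the body is an assumption, not the author's implementation):

from itertools import islice


def window(sequence, size):
    """Yield successive overlapping tuples of length `size` from `sequence`."""
    iterator = iter(sequence)
    # Prime the first full window
    result = tuple(islice(iterator, size))
    if len(result) == size:
        yield result
    # Slide the window forward one element at a time
    for element in iterator:
        result = result[1:] + (element,)
        yield result

For example, list(window([100.0, 98.5, 97.0, 99.0], 3)) yields [(100.0, 98.5, 97.0), (98.5, 97.0, 99.0)].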
Example #29
0
 def __init__(self, args, pipelinecommit, startingtime, scriptpath):
     # Initialise variables
     self.commit = str(pipelinecommit)
     self.start = startingtime
     self.homepath = scriptpath
     # Define variables based on supplied arguments
     self.args = args
     self.path = os.path.join(args.path, '')
     assert os.path.isdir(
         self.path
     ), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     self.sequencepath = os.path.join(args.sequencepath, '')
     assert os.path.isdir(self.sequencepath), u'Supplied sequence path is not a valid directory {0!r:s}' \
         .format(self.sequencepath)
     self.databasepath = os.path.join(args.databasepath, '')
     assert os.path.isdir(self.databasepath), u'Supplied database path is not a valid directory {0!r:s}' \
         .format(self.databasepath)
     # There seems to be an issue with CLARK when running with a very high number of cores. Limit self.cpus to 1
     self.cpus = 1
     # Set variables from the arguments
     self.database = args.database
     self.rank = args.rank
     self.clarkpath = args.clarkpath
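     # Convert the fractional cutoff argument (e.g. 0.9) to a percentage (90.0)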
     self.cutoff = float(args.cutoff) * 100
     # Initialise variables for the analysis
     self.targetcall = str()
     self.classifycall = str()
     self.devnull = open(os.devnull, 'wb')
     self.filelist = os.path.join(self.path, 'sampleList.txt')
     self.reportlist = os.path.join(self.path, 'reportList.txt')
     self.abundancequeue = Queue()
     self.datapath = str()
     self.reportpath = os.path.join(self.path, 'reports')
     self.clean_seqs = args.clean_seqs
     self.light = args.light
     if self.clean_seqs:
         try:
             self.reffilepath = args.reffilepath
         except AttributeError:
             self.clean_seqs = False
     # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the metadata objects
     # and variables play nice
     try:
         if args.runmetadata:
             self.runmetadata = args.runmetadata
             self.extension = self.runmetadata.extension
             # Create the name of the final report
             self.report = os.path.join(
                 self.reportpath, 'abundance{}.xlsx'.format(self.extension))
             # Only re-run the CLARK analyses if the CLARK report doesn't already exist
             if not os.path.isfile(self.report):
                 printtime(
                     'Performing CLARK analysis on {} files'.format(
                         self.extension), self.start)
                 if self.extension != 'fastq':
                     for sample in self.runmetadata.samples:
                         sample.general.combined = sample.general.bestassemblyfile
                     # Run the pipeline
                     self.main()
                 else:
                     # Only perform FASTQ analyses if the sample is declared to be a metagenome
                     metagenome = False
                     for sample in self.runmetadata.samples:
                         try:
                             status = sample.run.Description
                         except KeyError:
                             status = 'unknown'
                         if status == 'metagenome':
                             metagenome = True
                     # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                     if metagenome:
                         fileprep.Fileprep(self)
                         # Run the pipeline
                         self.main()
                 # Clean up the files and create/delete attributes to be consistent with pipeline Metadata objects
                 for sample in self.runmetadata.samples:
                     if sample.general.bestassemblyfile != 'NA':
                         # Create a GenObject to store metadata when this script is run as part of the pipeline
                         clarkextension = 'clark{}'.format(self.extension)
                         setattr(sample, clarkextension, GenObject())
                         # Create a folder to store all the CLARK files
                         sample[clarkextension].outputpath = os.path.join(
                             sample.general.outputdirectory, 'CLARK')
                         make_path(sample[clarkextension].outputpath)
                         # Move the files to the CLARK folder
                         try:
                             move(
                                 sample.general.abundance,
                                 os.path.join(
                                     sample[clarkextension].outputpath,
                                     os.path.basename(
                                         sample.general.abundance)))
                             move(
                                 sample.general.classification,
                                 os.path.join(
                                     sample[clarkextension].outputpath,
                                     os.path.basename(
                                         sample.general.classification)))
                         except (KeyError, FileNotFoundError):
                             pass
                         # Set the CLARK-specific attributes
                         try:
                             sample[clarkextension].abundance = sample.general.abundance
                             sample[clarkextension].classification = sample.general.classification
                             sample[clarkextension].combined = sample.general.combined
                         except KeyError:
                             pass
                         if self.extension == 'fastq':
                             # Remove the combined .fastq file - only attempt the removal when
                             # .combined is a single file path rather than a list of paired files
                             try:
                                 if not isinstance(sample[clarkextension].combined, list):
                                     os.remove(sample[clarkextension].combined)
                             except (OSError, KeyError):
                                 pass
                     # Remove the CLARK attributes from .general. Note that map() is lazy in Python 3,
                     # so an explicit loop is required for the deletions to actually happen; guard
                     # against attributes that were never set
                     for attribute in ['abundance', 'classification', 'combined']:
                         try:
                             delattr(sample.general, attribute)
                         except AttributeError:
                             pass
                     # Remove the text file lists of files and reports created by CLARK - again using
                     # a loop rather than a lazy map()
                     for textfile in ['reportList.txt', 'sampleList.txt']:
                         try:
                             os.remove(os.path.join(self.path, textfile))
                         except OSError:
                             pass
         else:
             self.runmetadata = MetadataObject()
             self.report = os.path.join(self.reportpath, 'abundance.xlsx')
             # Create the objects
             self.objectprep()
             self.main()
     except AttributeError:
         self.runmetadata = MetadataObject()
         self.report = os.path.join(self.reportpath, 'abundance.xlsx')
         # Create the objects
         self.objectprep()
         self.main()
     # Optionally filter the .fastq reads based on taxonomic assignment
     if args.filter:
         filtermetagenome.PipelineInit(self)
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)
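A note on the cleanup code above: in Python 3, map() returns a lazy iterator, so calling it purely for
its side effects does nothing until the iterator is consumed; this is why the attribute and file removal
is written with explicit loops. A minimal demonstration of the pitfall:

removed = []
# Nothing happens: map() only builds a lazy iterator, so the lambda never runs
map(lambda x: removed.append(x), ['reportList.txt', 'sampleList.txt'])
assert removed == []
# Consuming the iterator (or using a plain for loop) actually performs the side effect
list(map(lambda x: removed.append(x), ['reportList.txt', 'sampleList.txt']))
assert removed == ['reportList.txt', 'sampleList.txt']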
Example #30
0
 def createfastq(self):
     """Uses bcl2fastq to create .fastq files from a MiSeqRun"""
     # Initialise samplecount
     samplecount = 0
     # If the fastq destination folder is not provided, use the default value of :path/:miseqfoldername
     self.fastqdestination = self.fastqdestination if self.fastqdestination \
         else os.path.join(self.path, self.miseqfoldername)
     # Make the path
     make_path(self.fastqdestination)
     # Initialise variables for storing index information
     index = ''
     indexlength = int()
     # bcl2fastq requires an older version of the sample sheet; this recreates the required version
     # Create the new sample sheet
     with open('{}/SampleSheet_modified.csv'.format(self.fastqdestination),
               "w") as modifiedsamplesheet:
         # Write the required headings to the file
         modifiedsamplesheet.write(
             "FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject\n"
         )
         for strain in self.samples:
             # Create a combined index of index1-index2
             try:
                 strain.run.modifiedindex = '{}-{}'.format(
                     strain.run.index, strain.run.index2)
                 indexlength = 16
                 index = 'I8,I8'
             except KeyError:
                 strain.run.modifiedindex = strain.run.index
                 indexlength = 6
                 index = 'I6'
             # The list of items to print to each line of the modified sample sheet
             printlist = [
                 self.flowcell, '1', strain.name,
                 str(strain.run.SampleNumber), strain.run.modifiedindex,
                 strain.run.Description, 'N', 'NA',
                 strain.run.InvestigatorName, self.projectname
             ]
             modifiedsamplesheet.write('{}\n'.format(",".join(printlist)))
             samplecount += 1
     # Set :forwardlength to :header.forwardlength if the argument is not provided or is 'full';
     # otherwise use the supplied argument
     self.forwardlength = self.metadata.header.forwardlength if self.forwardlength.lower()\
         == 'full' else self.forwardlength
     # Set :reverselength to :header.reverselength
     self.reverselength = self.metadata.header.reverselength if self.reverselength.lower() \
         == 'full' else self.reverselength
     # The number of cycles required is the forward read length + the total index length + the reverse
     # read length. Also set the basemask variable as required
     if self.reverselength != '0':
         self.readsneeded = int(self.forwardlength) + int(
             self.reverselength) + indexlength
         basemask = "Y{}n*,{},Y{}n*".format(self.forwardlength, index,
                                            self.reverselength)
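         # e.g. a 2 x 251 bp run with dual 8-bp indices yields the basemask 'Y251n*,I8,I8,Y251n*'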
         nohup = "nohup make -j 16 > nohup.out"
     else:
         self.readsneeded = int(self.forwardlength) + indexlength
         basemask = "Y{}n*,{},n*".format(self.forwardlength, index)
         nohup = "nohup make -j 16 r1 > nohup.out"
     # Handle plurality appropriately
     samples = 'samples' if samplecount > 1 else 'sample'
     number = 'are' if samplecount > 1 else 'is'
     printtime(
         'There {} {} {} in this run. '
         'Running the fastq creation module with the following parameters:\n'
         'MiSeqPath: {},\n'
         'MiSeqFolder: {},\n'
         'Fastq destination: {},\n'
         'SampleSheet: {}'.format(
             number, samplecount, samples, self.miseqpath, self.miseqfolder,
             self.fastqdestination,
             '{}/SampleSheet_modified.csv'.format(self.fastqdestination)),
         self.start)
     # Count the number of completed cycles in the run of interest
     cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(
         self.miseqfolder))
     while len(cycles) < self.readsneeded:
         printtime(
             'Currently at {} cycles. Waiting until the MiSeq reaches cycle {}'
             .format(len(cycles), self.readsneeded), self.start)
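         # Wait 30 minutes before re-checking the number of completed cycles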
         sleep(1800)
         cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(
             self.miseqfolder))
     # configureBclToFastq.pl requires :self.miseqfolder/Data/Intensities/BaseCalls/config.xml in order to work
     # When you download runs from BaseSpace, this file is not provided. There is an empty config.xml file that
     # can be populated with run-specific values and moved to the appropriate folder
     if not os.path.isfile('{}Data/Intensities/BaseCalls/config.xml'.format(
             self.miseqfolder)):
         self.configfilepopulator()
     # Define the bcl2fastq system call
     bclcall = "configureBclToFastq.pl --input-dir {}Data/Intensities/BaseCalls " \
               "--output-dir {} --force --sample-sheet {}/SampleSheet_modified.csv " \
               "--mismatches 1 --no-eamss --fastq-cluster-count 0 --compression none --use-bases-mask {}"\
         .format(self.miseqfolder, self.fastqdestination, self.fastqdestination, basemask)
     # Define the nohup system call
     nohupcall = "cd {} && {}".format(self.fastqdestination, nohup)
     if not os.path.isdir("{}/Project_{}".format(self.fastqdestination,
                                                 self.projectname)):
         # Call configureBclToFastq.pl
         printtime('Running bcl2fastq', self.start)
         # Run the commands
         threadlock = threading.Lock()
         outstr = ''
         outerr = ''
         out, err = run_subprocess(bclcall)
         outstr += out
         outerr += err
         out, err = run_subprocess(nohupcall)
         outstr += out
         outerr += err
         threadlock.acquire()
         write_to_logfile(bclcall, bclcall, self.logfile)
         write_to_logfile(nohupcall, nohupcall, self.logfile)
         write_to_logfile(outstr, outerr, self.logfile)
         threadlock.release()
     # Populate the metadata
     for sample in self.metadata.samples:
         sample.commands = GenObject()
         sample.commands.nohup = nohupcall
         sample.commands.bcl = bclcall
         sample.run.forwardlength = self.forwardlength
         sample.run.reverselength = self.reverselength
     # Copy the fastq files to a central folder so they can be processed
     self.fastqmover()
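To make the cycle arithmetic above concrete, here is a small worked example (the 2 x 251 bp read lengths
and dual 8-bp indices are illustrative values, not taken from any particular run):

forwardlength, reverselength, indexlength, index = 251, 251, 16, 'I8,I8'
# Cycles required: forward reads + both 8-bp indices + reverse reads
readsneeded = forwardlength + reverselength + indexlength
# The matching --use-bases-mask value passed to configureBclToFastq.pl
basemask = "Y{}n*,{},Y{}n*".format(forwardlength, index, reverselength)
print(readsneeded, basemask)  # 518 Y251n*,I8,I8,Y251n*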