Ejemplo n.º 1
0
 def quast(self):
     """
     Run QUAST on every sample that has an assembly

     Starts one worker thread (running :self.runquast) for each sample with an assembly, adds each of
     those samples and its QUAST output directory to :self.quastqueue, and waits until the queue has
     been processed. Samples without an assembly have their QUAST command set to 'NA'
     """
     printtime('Performing Quast analyses', self.start)
     # Count the samples with an assembly without building a throwaway list
     assembled = sum(1 for sample in self.metadata if sample.general.bestassemblyfile != 'NA')
     for _ in range(assembled):
         # Send the threads to the runquast method. :args is empty
         threads = Thread(target=self.runquast, args=())
         # Daemon threads do not keep the interpreter alive at exit. The .daemon attribute replaces
         # the deprecated setDaemon() method
         threads.daemon = True
         # Start the threading
         threads.start()
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             # Create the quast output directory
             quastoutputdirectory = '{}/quast_results/'.format(
                 sample.general.outputdirectory)
             make_path(quastoutputdirectory)
             # Set the quast system call
             quastcall = 'quast.py {} -o {}'.format(
                 sample.general.filteredfile, quastoutputdirectory)
             # Add the command to the metadata
             sample.commands.quast = quastcall
             self.quastqueue.put((sample, quastoutputdirectory))
         else:
             sample.commands.quast = 'NA'
     # Block until every queued sample has been marked as done
     self.quastqueue.join()
Ejemplo n.º 2
0
def test_sistr(variables):
    """
    Run sistr on the NC_003198 assembly, and confirm the cgMLST genome match of every sample
    :param variables: object supplying the :sequencepath folder that contains the test FASTA file
    """
    sample_metadata = MetadataObject()
    # Start with an empty list of samples
    method.runmetadata.samples = []
    assembly = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    # The sample name is the file name up to the first '.'
    sample_metadata.name = os.path.basename(assembly).split('.')[0]
    # Create the general and run categories
    sample_metadata.general = GenObject()
    sample_metadata.run = GenObject()
    sample_metadata.general.fastqfiles = []
    # Outputs are written to a folder named after the sample
    destination = os.path.join(variables.sequencepath, sample_metadata.name)
    make_path(destination)
    # Record the output folder in both categories
    sample_metadata.general.outputdirectory = destination
    sample_metadata.run.outputdirectory = destination
    # Placeholder value - replaced with the path of the assembly below
    sample_metadata.general.bestassemblyfile = True
    # Category in which the commands are stored
    sample_metadata.commands = GenObject()
    # Every sample is treated as Salmonella
    sample_metadata.general.referencegenus = 'Salmonella'
    # The FASTA file is used as the best assembly
    sample_metadata.general.bestassemblyfile = assembly
    method.runmetadata.samples.append(sample_metadata)
    method.sistr()
    for analysed in method.runmetadata.samples:
        assert analysed.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
Ejemplo n.º 3
0
 def basic(self):
     """
     Create a metadata object for every sample with FASTQ files in :self.path

     Each sample gets its own folder in :self.path containing relative symlinks to its FASTQ files.
     The populated metadata objects are appended to :self.samples. If the metadata reader returns
     samples from a previous run, those replace :self.samples. Finishes by calling :self.readlength
     """
     # Grab any .fastq files in the path
     fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
     # Extract the base name of the globbed name + path provided
     fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
     # Files whose path contains any of these terms are excluded from :fastqfiles. Note that
     # 'paired' also matches 'unpaired', so a separate check for 'unpaired' is not required
     excluded = ('trimmed', 'normalised', 'corrected', 'paired')
     # Iterate through the names of the fastq files
     for fastqname in sorted(fastqnames):
         # Set the name
         metadata = MetadataObject()
         metadata.name = fastqname
         # Set the destination folder
         outputdir = os.path.join(self.path, fastqname)
         # Make the destination folder
         make_path(outputdir)
         # Get the fastq files specific to the fastqname
         specificfastq = glob(
             os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
         # Link the files to the output folder. Each link is attempted separately, so that a link
         # that already exists does not prevent the remaining files from being linked
         for fastq in specificfastq:
             basename = os.path.basename(fastq)
             try:
                 os.symlink('../{}'.format(basename),
                            '{}/{}'.format(outputdir, basename))
             except OSError as exception:
                 # If there is an exception other than the file exists, raise it
                 if exception.errno != errno.EEXIST:
                     raise
         # Initialise the general and run categories
         metadata.general = GenObject()
         metadata.run = GenObject()
         # Populate the .fastqfiles category of :self.metadata
         linkedfastq = sorted(
             glob(os.path.join(outputdir, '{}*.fastq*'.format(metadata.name))))
         metadata.general.fastqfiles = [
             fastq for fastq in linkedfastq
             if not any(term in fastq for term in excluded)
         ]
         # Add the output directory to the metadata
         metadata.general.outputdirectory = outputdir
         metadata.general.logout = os.path.join(
             self.path, metadata.name,
             '{}_log_out.txt'.format(metadata.name))
         metadata.general.logerr = os.path.join(
             self.path, metadata.name,
             '{}_log_err.txt'.format(metadata.name))
         # Append the metadata to the list of samples
         self.samples.append(metadata)
     # Grab metadata from previous runs
     previousmetadata = metadataReader.MetadataReader(self)
     # Update self.samples (if required)
     if previousmetadata.samples:
         self.samples = previousmetadata.samples
     # Run the read length method
     self.readlength()
Ejemplo n.º 4
0
def test_sistr(variables):
    """
    Run sistr on the NC_003198 assembly, and confirm the cgMLST genome match of every sample
    :param variables: object supplying the :sequencepath folder that contains the test FASTA file
    """
    sample_metadata = MetadataObject()
    # Start with an empty list of samples
    method.runmetadata.samples = []
    assembly = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    # The sample name is the file name up to the first '.'
    sample_metadata.name = os.path.basename(assembly).split('.')[0]
    # Create the general and run categories
    sample_metadata.general = GenObject()
    sample_metadata.run = GenObject()
    sample_metadata.general.fastqfiles = []
    # Outputs are written to a folder named after the sample
    destination = os.path.join(variables.sequencepath, sample_metadata.name)
    make_path(destination)
    # Record the output folder, and the log files stored within it
    sample_metadata.general.outputdirectory = destination
    sample_metadata.general.logout = os.path.join(destination, 'out')
    sample_metadata.general.logerr = os.path.join(destination, 'err')
    sample_metadata.run.outputdirectory = destination
    # Placeholder value - replaced with the path of the assembly below
    sample_metadata.general.bestassemblyfile = True
    # Category in which the commands are stored
    sample_metadata.commands = GenObject()
    # Every sample is treated as Salmonella
    sample_metadata.general.referencegenus = 'Salmonella'
    # The FASTA file is used as the best assembly
    sample_metadata.general.bestassemblyfile = assembly
    method.runmetadata.samples.append(sample_metadata)
    method.sistr()
    for analysed in method.runmetadata.samples:
        assert analysed.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
Ejemplo n.º 5
0
 def reporter(self):
     """
     Write a CSV report (:self.analysistype.csv in :self.reportpath) with the serotype of every
     sample that has an assembly
     """
     printtime('Creating {} report'.format(self.analysistype),
               self.starttime)
     # The report folder must exist before the report is opened
     make_path(self.reportpath)
     reportfile = os.path.join(self.reportpath, '{}.csv'.format(self.analysistype))
     with open(reportfile, 'w') as report:
         # Collect the pieces of the report; the header comes first
         rows = ['Strain,Serotype\n']
         for sample in self.runmetadata.samples:
             # Samples without an assembly are not reported
             if sample.general.bestassemblyfile == 'NA':
                 continue
             rows.append(sample.name + ',')
             if not sample[self.analysistype].results:
                 # No results: the row only contains the strain name
                 rows.append('\n')
                 continue
             serosippr = sample.serosippr
             # O-type and H-type, each followed by its best percent identity
             serotype = '{oset} ({opid}):{hset} ({hpid}),'.format(
                 oset=';'.join(serosippr.o_set),
                 opid=serosippr.best_o_pid,
                 hset=';'.join(serosippr.h_set),
                 hpid=serosippr.best_h_pid)
             rows.append('{}\n'.format(serotype))
         report.write(''.join(rows))
Ejemplo n.º 6
0
    def allelealigner(self):
        """
        Perform a multiple sequence alignment of the allele sequences

        Starts :self.cpus worker threads running :self.alignthreads, adds one Clustal Omega job to
        :self.queue for every allele file of every sample, and waits until the queue has been processed
        """

        logging.info('Aligning alleles')
        # Create the threads for the analysis
        for _ in range(self.cpus):
            threads = Thread(target=self.alignthreads, args=())
            # The .daemon attribute replaces the deprecated setDaemon() method
            threads.daemon = True
            threads.start()
        for sample in self.samples:
            # Alignments are stored in a folder named after the organism
            sample.alignpath = os.path.join(self.path, 'alignedalleles',
                                            sample.organism)
            make_path(sample.alignpath)
            # Create a list to store the paths of the aligned files
            sample.alignedalleles = list()
            for outputfile in sample.allelefiles:
                # The aligned file keeps the name of the allele file
                aligned = os.path.join(sample.alignpath,
                                       os.path.basename(outputfile))
                sample.alignedalleles.append(aligned)
                # Create the command line call
                clustalomega = ClustalOmegaCommandline(infile=outputfile,
                                                       outfile=aligned,
                                                       threads=4,
                                                       auto=True)
                # Only the command of the most recently processed allele file is kept on the sample
                sample.clustalomega = str(clustalomega)
                self.queue.put((sample, clustalomega, outputfile, aligned))
        # Block until every queued alignment has been marked as done
        self.queue.join()
Ejemplo n.º 7
0
    def getrmlsthelper(self):
        """
        Download the most up-to-date rMLST profile and alleles using rest_auth_class.REST, a class
        modified from https://github.com/kjolley/BIGSdb/tree/develop/scripts/test
        The downloaded allele files (*.tfa) are then passed to :self.combinealleles
        """
        # Local import: only this method needs a namespace object
        from argparse import Namespace

        printtime('Downloading {} alleles'.format(self.analysistype), self.start)
        # Extract the path of the current script from the full path + file name
        homepath = os.path.split(os.path.abspath(__file__))[0]
        # Set the path/name of the folder to contain the new alleles and profile
        newfolder = os.path.join(self.path, self.analysistype)
        # Create the path
        make_path(newfolder)
        # Create the arguments to feed into the rest_auth_class script. A private namespace is used,
        # so that the attributes are not set on the shared ArgumentParser class
        args = Namespace(secret_file=os.path.join(homepath, 'secret.txt'),
                         file_path=homepath,
                         output_path=newfolder,
                         start=self.start)
        rmlst = rest_auth_class.REST(args)
        # Download the profile and alleles
        rmlst.main()

        # Get the new alleles into a list, and create the combinedAlleles file
        alleles = glob(os.path.join(newfolder, '*.tfa'))
        self.combinealleles(newfolder, alleles)
Ejemplo n.º 8
0
 def predict(self):
     """
     Worker method: run prodigal on every sample taken from :self.predictqueue

     Loops forever. For each sample, the prodigal attributes and command are populated, and the
     command is run only if the results file is missing or empty. The queue item is always marked
     as done, even if the analysis raises an exception, so that a failure in one sample cannot
     leave :self.predictqueue.join() waiting forever
     """
     while True:
         sample = self.predictqueue.get()
         try:
             # Populate attributes
             sample.prodigal.reportdir = os.path.join(
                 sample.general.outputdirectory, 'prodigal')
             sample.prodigal.results_file = os.path.join(
                 sample.prodigal.reportdir,
                 '{}_prodigalresults.sco'.format(sample.name))
             sample.prodigal.results = sample.prodigal.results_file
             sample.commands.prodigal = 'prodigal -i {in1} -o {out1} -f sco -d {genes}'\
                 .format(in1=sample.general.bestassemblyfile,
                         out1=sample.prodigal.results_file,
                         genes=os.path.join(sample.prodigal.reportdir, '{}_genes.fa'.format(sample.name)))
             # Create the folder to store the reports
             make_path(sample.prodigal.reportdir)
             # Determine the size of the report; a missing report is treated as being empty
             size = 0
             if os.path.isfile(sample.prodigal.results_file):
                 size = os.stat(sample.prodigal.results_file).st_size
             if size == 0:
                 # Run the command
                 out, err = run_subprocess(sample.commands.prodigal)
                 # Only one thread at a time may write to the log files
                 threadlock.acquire()
                 try:
                     write_to_logfile(sample.commands.prodigal,
                                      sample.commands.prodigal, self.logfile,
                                      sample.general.logout, sample.general.logerr,
                                      None, None)
                     write_to_logfile(out, err, self.logfile, sample.general.logout,
                                      sample.general.logerr, None, None)
                 finally:
                     # Release the lock even if the logging fails, so other workers are not blocked
                     threadlock.release()
         finally:
             self.predictqueue.task_done()
Ejemplo n.º 9
0
    def getrmlsthelper(self):
        """
        Download the most up-to-date rMLST profile and alleles using rest_auth_class.REST, a class
        modified from https://github.com/kjolley/BIGSdb/tree/develop/scripts/test
        The downloaded allele files (*.tfa) are then passed to :self.combinealleles
        """
        # Local import: only this method needs a namespace object
        from argparse import Namespace

        printtime('Downloading {} alleles'.format(self.analysistype),
                  self.start)
        # Extract the path of the current script from the full path + file name
        homepath = os.path.split(os.path.abspath(__file__))[0]
        # Set the path/name of the folder to contain the new alleles and profile
        newfolder = os.path.join(self.path, self.analysistype)
        # Create the path
        make_path(newfolder)
        # Create the arguments to feed into the rest_auth_class script. A private namespace is used,
        # so that the attributes are not set on the shared ArgumentParser class
        args = Namespace(secret_file=os.path.join(homepath, 'secret.txt'),
                         file_path=homepath,
                         output_path=newfolder,
                         start=self.start)
        rmlst = rest_auth_class.REST(args)
        # Download the profile and alleles
        rmlst.main()

        # Get the new alleles into a list, and create the combinedAlleles file
        alleles = glob(os.path.join(newfolder, '*.tfa'))
        self.combinealleles(newfolder, alleles)
Ejemplo n.º 10
0
 def __init__(self, args, pipelinecommit, startingtime, scriptpath):
     """
     Store the run settings on the object, then start the analyses by calling :self.main
     :param args: command line arguments
     :param pipelinecommit: pipeline commit or version
     :param startingtime: time the script was started
     :param scriptpath: home path of the script
     """
     # Values supplied directly by the caller
     self.commit = str(pipelinecommit)
     self.starttime = startingtime
     self.homepath = scriptpath
     self.args = args
     # Paths taken from the arguments. Joining with '' adds a trailing separator if one is missing
     self.path = os.path.join(args.path, '')
     assert os.path.isdir(self.path), \
         u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     self.sequencepath = os.path.join(args.sequencepath, '')
     # Alias of the sequence path
     self.seqpath = self.sequencepath
     self.targetpath = os.path.join(args.targetpath, '')
     # Submodule code uses a different naming scheme for the target path
     self.reffilepath = self.targetpath
     # The report folder is created up front
     self.reportpath = os.path.join(self.path, 'reports')
     make_path(self.reportpath)
     assert os.path.isdir(self.targetpath), \
         u'Target path is not a valid directory {0!r:s}'.format(self.targetpath)
     # Settings for the FASTQ creation
     self.bcltofastq = args.bcl2fastq
     self.miseqpath = args.miseqpath
     self.miseqfolder = args.miseqfolder
     self.fastqdestination = args.destinationfastq
     self.forwardlength = args.readlengthforward
     self.reverselength = args.readlengthreverse
     # A reverse read length of zero means that only a single read is used
     self.numreads = 1 if self.reverselength == 0 else 2
     self.customsamplesheet = args.customsamplesheet
     # Custom cutoff value
     self.cutoff = args.customcutoffs
     # Fall back to the number of cpus in the system if the number of threads was not supplied
     self.cpus = int(args.numthreads or multiprocessing.cpu_count())
     self.threads = 0
     self.runmetadata = MetadataObject()
     self.taxonomy = {
         'Escherichia': 'coli',
         'Listeria': 'monocytogenes',
         'Salmonella': 'enterica',
     }
     self.analysistype = 'GeneSippr'
     self.copy = args.copy
     self.pipeline = False
     # Attributes populated later in the run start out empty
     self.forward = ''
     self.reverse = ''
     self.index = ''
     self.header = {}
     self.rundata = {}
     self.completed = []
     self.incomplete = []
     self.analysescomplete = False
     self.final = False
     self.sum = 0
     self.completemetadata = []
     self.samplesheetpath = ''
     self.samples = []
     self.logfile = os.path.join(self.path, 'log')
     self.reports = ''
     # Start the analyses
     self.main()
Ejemplo n.º 11
0
 def samplesheet(self):
     """
     Write a custom sample sheet (SampleSheet.csv in :self.samplesheetpath) modelled on the original
     sample sheet of the run, with data rows only for the samples listed in :self.incomplete
     """
     make_path(self.samplesheetpath)
     self.customsamplesheet = os.path.join(self.samplesheetpath, 'SampleSheet.csv')
     # Column names of the [Data] section
     columns = [
         'Sample_ID', 'Sample_Name', 'Sample_Plate', 'Sample_Well',
         'I7_Index_ID', 'index', 'I5_Index_ID', 'index2', 'Sample_Project',
         'Description'
     ]
     with open(self.customsamplesheet, 'w') as samplesheet:
         runinfo = self.header
         # Collect the pieces of the sample sheet in order; they are written in one go at the end
         content = [
             '[Header]\n',
             'IEMFileVersion,{}\n'.format(runinfo['IEMFileVersion']),
             'Investigator Name,{}\n'.format(runinfo['InvestigatorName']),
             'Experiment Name,{}\n'.format(runinfo['ExperimentName']),
             'Date,{}\n'.format(runinfo['Date']),
             'Workflow,{}\n'.format(runinfo['Workflow']),
             'Application,{}\n'.format(runinfo['Application']),
             'Assay,{}\n'.format(runinfo['Assay']),
             'Description,{}\n'.format(runinfo['Description']),
             'Chemistry,{}\n'.format(runinfo['Chemistry']),
             '\n',
             '[Reads]\n',
             str(self.forward) + '\n',
             str(self.reverse) + '\n',
             '\n',
             '[Settings]\n',
             'ReverseComplement,{}\n'.format(runinfo['ReverseComplement']),
             'Adapter,{}\n'.format(runinfo['Adapter']),
             '\n',
             '[Data]\n',
             ','.join(columns),
             '\n',
         ]
         # Match every incomplete sample with its entry in the run data
         for incomplete in self.incomplete:
             for entry in self.rundata:
                 if incomplete != entry['SampleID']:
                     continue
                 # The run data keys are the column names without underscores.
                 # NOTE(review): str.replace removes every occurrence of 'NA', including those inside
                 # longer values - confirm that this is intended
                 values = [entry[column.replace('_', '')].replace('NA', '') for column in columns]
                 # Values are comma-separated; the final value (Description) ends the line
                 content.append(','.join(values) + '\n')
         # Write the string to the sample sheet
         samplesheet.write(''.join(content))
Ejemplo n.º 12
0
 def primers(self):
     """
     Set up and create the threads for ePCR

     Starts one worker thread (running :self.epcr) for each sample with an assembly. For each of these
     samples, the primer and probe files of its reference genus are located, the famap, fahash, and
     re-PCR commands are prepared, and the sample is added to :self.epcrqueue. Samples without
     primer/probe files have these attributes set to 'NA', and are not queued
     """
     # Create the threads for the ePCR analysis
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             threads = Thread(target=self.epcr, args=())
             # The .daemon attribute replaces the deprecated setDaemon() method
             threads.daemon = True
             threads.start()
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             setattr(sample, self.analysistype, GenObject())
             # Get the primers ready
             try:
                 sample[self.analysistype].primers = glob(os.path.join(self.reffilepath,
                                                                       self.analysistype,
                                                                       sample.general.referencegenus,
                                                                       'primers',
                                                                       '*.txt'))[0]
                 # Find the name of the probe file
                 sample[self.analysistype].probes = glob(os.path.join(self.reffilepath,
                                                                      self.analysistype,
                                                                      sample.general.referencegenus,
                                                                      'probes',
                                                                      '*.fa'))[0]
                 # Create the BLAST database of the probes (if necessary)
                 self.makeblastdb(sample[self.analysistype].probes)
                 # Initialise a list to store the names of the targets
                 sample[self.analysistype].targets = list()
                 # Open the primer file, and read the names of the targets into a list
                 with open(sample[self.analysistype].primers, 'r') as primerfile:
                     for line in primerfile:
                         sample[self.analysistype].targets.append(line.split('\t')[0])
             # Organisms without primer/probe files will fail. Populate metadata with 'NA' values
             except IndexError:
                 sample[self.analysistype].primers = 'NA'
                 sample[self.analysistype].probes = 'NA'
             # Only try to process organisms with primer files
             if sample[self.analysistype].primers != 'NA':
                 # Make the output path
                 sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                                    self.analysistype)
                 make_path(sample[self.analysistype].reportdir)
                 # Set the base name of the output file. The report directory has no trailing
                 # separator, so the paths must be joined in order for the files to be created inside it
                 outfile = os.path.join(sample[self.analysistype].reportdir, sample.name)
                 # Set the hashing and mapping commands
                 sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
                 sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
                 # re-PCR uses the subtyping primers list to search the contigs file using the following parameters
                 # -S {hash file} (Perform STS lookup using hash-file), -r + (Enable/disable reverse STS lookup)
                 # -m 10000 (Set variability for STS size for lookup),
                 # -n 2 (Set max allowed mismatches per primer for lookup)
                 # -g 0 (Set max allowed indels per primer for lookup),
                 # -G (Print alignments in comments), -o {output file}
                 sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 2 -g 0 -G -q -o {}.txt {}' \
                     .format(outfile, outfile, sample[self.analysistype].primers)
                 # Add the variables to the queue
                 self.epcrqueue.put((sample, outfile))
     # Block until every queued sample has been marked as done
     self.epcrqueue.join()
Ejemplo n.º 13
0
 def __init__(self, args):
     """
     Prepare the database folder and its log file, and locate the CLARK scripts
     :param args: object with :databasepath and :start attributes
     """
     databasepath = os.path.join(args.databasepath)
     self.databasepath = databasepath
     make_path(databasepath)
     self.start = args.start
     # The CLARK scripts are taken to be in the folder that contains the CLARK executable.
     # NOTE(review): shutil.which returns None if CLARK is not on the PATH, in which case
     # os.path.dirname raises a TypeError
     clark = shutil.which('CLARK')
     self.clarkpath = os.path.dirname(clark)
     logfile = os.path.join(databasepath, 'logfile')
     self.logfile = logfile
     # Remove the log files from previous iterations of the script in this folder
     clear_logfile(logfile)
 def extract_rmlst_reads(self):
     """
     rMLST read extraction. Should be the first thing called after parsing the fastq directory.

     For every sample in :self.metadata, an output folder and log files are set up, and a bbduk.sh
     command is created to bait the paired, trimmed and corrected FASTQ files against :self.database.
     The command is only run if :self.analyse is set, but it is always written to the log files
     """
     for sample in self.metadata:
         # Create the object to store the variables
         setattr(sample, self.analysistype, GenObject())
         # Initialise variables
         sample[self.analysistype].snv_count = list()
         # Initialise a starting value for the number of unique kmers found in each sample
         sample[self.analysistype].unique_kmers = -1
         # Set and create the output directory. Fall back to the general output directory if the run
         # category does not have one; a missing attribute may surface as either exception type
         try:
             sample[self.analysistype].outputdir = os.path.join(
                 sample.run.outputdirectory, self.analysistype)
         except (AttributeError, KeyError):
             sample[self.analysistype].outputdir = os.path.join(
                 sample.general.outputdirectory, self.analysistype)
         make_path(sample[self.analysistype].outputdir)
         sample[self.analysistype].logout = os.path.join(
             sample[self.analysistype].outputdir, 'logout.txt')
         sample[self.analysistype].logerr = os.path.join(
             sample[self.analysistype].outputdir, 'logerr.txt')
         sample[self.analysistype].baitedfastq = os.path.join(
             sample[self.analysistype].outputdir,
             '{}_targetMatches.fastq.gz'.format(self.analysistype))
         # Create the command to run the baiting - paired inputs and a single, zipped output
         sample[self.analysistype].bbdukcmd = 'bbduk.sh ref={} in1={} in2={} threads={} outm={}'\
             .format(self.database,
                     sample.general.trimmedcorrectedfastqfiles[0],
                     sample.general.trimmedcorrectedfastqfiles[1],
                     str(self.threads),
                     sample[self.analysistype].baitedfastq)
         # Sometimes bbduk hangs forever, so that needs to be handled.
         # NOTE(review): no timeout is set in this method, so TimeoutExpired is only raised if
         # run_subprocess applies a timeout itself - confirm
         try:
             # Run the call, and write any errors to the logfile
             command = sample[self.analysistype].bbdukcmd
             if self.analyse:
                 out, err = run_subprocess(command)
             else:
                 out = str()
                 err = str()
             write_to_logfile(command, command, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         except TimeoutExpired:
             print('ERROR: Could not extract rMLST reads from sample {}'.
                   format(sample.name))
Ejemplo n.º 15
0
 def createobject(self):
     """
     Create a metadata object for every sample with FASTQ files in :self.path

     Each sample gets its own folder in :self.path containing relative symlinks to its FASTQ files.
     The populated metadata objects are appended to :self.samples
     """
     # Grab any .fastq files in the path
     fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
     # Extract the base name of the globbed name + path provided
     fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
     # Iterate through the names of the fastq files
     for fastqname in sorted(fastqnames):
         # Set the name
         metadata = MetadataObject()
         metadata.name = fastqname
         # Set the destination folder
         outputdir = os.path.join(self.path, fastqname)
         # Make the destination folder
         make_path(outputdir)
         # Get the fastq files specific to the fastqname
         specificfastq = glob(
             os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
         # Make relative symlinks to the files in :self.path. Each link is attempted separately, so
         # that a link that already exists does not prevent the remaining files from being linked
         for fastq in specificfastq:
             # Get the basename of the file
             fastqfile = os.path.split(fastq)[-1]
             # Set the destination fastq path as the base name plus the destination folder
             destinationfastq = os.path.join(outputdir, fastqfile)
             try:
                 # Symlink the files
                 os.symlink('../{}'.format(fastqfile), destinationfastq)
             except OSError as exception:
                 # If there is an exception other than the file exists, raise it
                 if exception.errno != errno.EEXIST:
                     raise
         # Initialise the general and run categories
         metadata.general = GenObject()
         metadata.run = GenObject()
         # Populate the .fastqfiles category of :self.metadata
         metadata.general.fastqfiles = [
             fastq for fastq in glob(
                 os.path.join(outputdir, '{}*.fastq*'.format(fastqname)))
             if 'trimmed' not in fastq
         ]
         # Add the output directory to the metadata
         metadata.general.outputdirectory = outputdir
         metadata.run.outputdirectory = outputdir
         metadata.general.bestassemblyfile = True
         # The trimmed and corrected files are the same list object as the FASTQ files
         metadata.general.trimmedcorrectedfastqfiles = metadata.general.fastqfiles
         metadata.general.logout = os.path.join(
             metadata.general.outputdirectory, 'logout')
         metadata.general.logerr = os.path.join(
             metadata.general.outputdirectory, 'logerr')
         # Initialise an attribute to store commands
         metadata.commands = GenObject()
         # Append the metadata to the list of samples
         self.samples.append(metadata)
Ejemplo n.º 16
0
 def create_database_folder(self, database):
     """
     Make (if necessary) the folder in :self.databasepath that stores the named database
     :param database: name of the database; also used as the name of the folder
     :return: the path of the database folder
     """
     printtime('Setting up {} database'.format(database), self.start)
     # The folder is named after the database, and is located in the root database path
     folder = os.path.join(self.databasepath, database)
     make_path(folder)
     return folder
Ejemplo n.º 17
0
    def objects(self):
        """
        Create the metadata objects, and link the assembly of every sample into the BestAssemblies
        folder in :self.path
        """
        self.runmetadata = ObjectCreation(inputobject=self)
        assemblyfolder = os.path.join(self.path, 'BestAssemblies')
        make_path(assemblyfolder)
        for sample in self.runmetadata.samples:
            assembly = sample.general.bestassemblyfile
            # GenomeQAML requires the assemblies to be linked to the BestAssemblies folder
            relative_symlink(assembly, assemblyfolder)
            # Downstream analyses read the assembly from this attribute
            sample.general.trimmedcorrectedfastqfiles = [assembly]
Ejemplo n.º 18
0
 def __init__(self, seqids, outdir, copyflag, filetype, verboseflag):
     """
     Set up logging, create the output directory, and define the expected NAS folder structure
     :param seqids: list of SEQ IDs provided
     :param outdir: directory in which sequence files are to be copied/linked
     :param copyflag: Boolean for whether files are to be copied or relatively symbolically linked
     :param filetype: file type to process: either FASTQ or FASTA
     :param verboseflag: Boolean for whether debug messages should be printed
     """
     SetupLogging(verboseflag)
     # Store the arguments, and make sure that the output directory exists
     self.seqids = seqids
     self.outdir = outdir
     make_path(self.outdir)
     self.copyflag = copyflag
     self.filetype = filetype
     # Expected folders on the NAS
     self.nas_dir = os.path.join('/mnt', 'nas2')
     self.processed_sequence_data = os.path.join(self.nas_dir, 'processed_sequence_data')
     self.raw_sequence_data = os.path.join(self.nas_dir, 'raw_sequence_data')
     # Merged sequences are stored within the raw sequence data folder
     self.merge_backup = os.path.join(self.raw_sequence_data, 'merged_sequences')
     # For each folder: the file type it contains, and the nested folder structure to search
     self.nas_folders = {
         self.raw_sequence_data: {'fastq': ['*/*']},
         self.merge_backup: {'fastq': ['']},
         self.processed_sequence_data: {'fasta': ['*/*/BestAssemblies']},
     }
     # All the folders, in the order in which they were added to the dictionary
     self.folders = list(self.nas_folders)
     # Glob pattern of each file type
     self.extensions = {'fastq': '*.fastq.gz', 'fasta': '*.fasta'}
     # FASTQ files are (usually) paired, so two files are expected for each SEQ ID; a single file is
     # expected for FASTA. Finding more files than this triggers the duplicate copies warning
     if self.filetype == 'fastq':
         self.lengths = 2
     else:
         self.lengths = 1
     # Term used in messages, depending on whether files are copied or linked
     self.verb = 'Linking' if not copyflag else 'Copying'
     # Sequence files located on the NAS
     self.new_file_dict = {}
     # SEQ IDs for which sequence files cannot be located
     self.missing = []
Ejemplo n.º 19
0
    def runblast(self):
        """
        Worker loop: run BLASTn for each (assembly, target, sample) job taken from self.blastqueue.

        A raw results report that already exists for the genome and is not empty is reused instead of running
        BLAST again. The BLAST command is stored on the sample, and the report is handed to self.blastparser.
        The loop never exits on its own.
        """
        while True:  # while daemon
            # Take the next job off the queue
            assembly, target, sample = self.blastqueue.get()
            # The genome name is the assembly file name up to the first '.'
            genome = os.path.split(assembly)[1].split('.')[0]
            reportdir = sample[self.analysistype].reportdir
            make_path(reportdir)
            # Look for a report from a previous run, so the BLAST search is not repeated needlessly
            previous = glob('{}{}*rawresults*'.format(reportdir, genome))
            if previous:
                report = previous[0]
                size = os.path.getsize(report)
            else:
                size = 0
                # No earlier report was found: create a time-stamped name for a new one
                report = '{}{}_rawresults_{:}.csv'.format(reportdir, genome,
                                                          time.strftime("%Y.%m.%d.%H.%M.%S"))
            # The database name is the target file name up to the first '.'
            db = target.split('.')[0]
            # BLAST command line call with the assembly as the query. Note the mildly restrictive evalue, and the
            # high number of alignments: all the targets are combined into one database, so this ensures that all
            # potential alignments are reported. The quotes nested in the custom outfmt are required
            blastn = NcbiblastnCommandline(
                query=assembly,
                db=db,
                reward=1,
                penalty=-5,
                gapopen=3,
                gapextend=3,
                dust="yes",
                soft_masking="true",
                evalue=0.1,
                num_alignments=1000000,
                num_threads=24,
                outfmt="'6 qseqid sacc stitle positive mismatch gaps "
                       "evalue bitscore slen length'",
                out=report)
            # Record the command in the metadata
            sample[self.analysistype].blastcommand = str(blastn)
            # Only run BLAST when there is no usable report
            if not os.path.isfile(report) or size == 0:
                try:
                    blastn()
                except:
                    # Mark this job as finished and wait for the rest of the queue before cleaning up
                    self.blastqueue.task_done()
                    self.blastqueue.join()
                    # Remove whatever was written to the report, then propagate the error
                    try:
                        os.remove(report)
                    except IOError:
                        pass
                    raise
            # Parse the BLAST results
            self.blastparser(report, sample)
            # Tell the queue that this job is complete
            self.blastqueue.task_done()
Ejemplo n.º 20
0
 def movefastq(self):
     """
     Link each sample's FASTQ files into a folder named after the sample, and record them in the metadata.

     For every sample in self.metadata.runmetadata.samples, FASTQ files in self.path are located using
     progressively looser name patterns. Any files found are symbolically linked (relative links) into
     <self.path>/<sample name>. sample.general.fastqfiles is then set to the sorted FASTQ files present in that
     folder, excluding files whose path contains 'trimmed', 'normalised', 'corrected', 'paired' or 'unpaired'.
     """
     printtime('Moving FASTQ files', self.start)
     # Substrings used to exclude files from the recorded FASTQ list
     excluded_terms = ('trimmed', 'normalised', 'corrected', 'paired', 'unpaired')
     # File name patterns tried in order; the first pattern that matches anything is used
     name_patterns = ('{}_*.fastq*', '{}.fastq*', '{}*.fastq*')

     def raw_fastqs(directory, name):
         """Return the sorted FASTQ files for name in directory, skipping paths with an excluded term"""
         candidates = sorted(glob(os.path.join(directory, '{}*.fastq*'.format(name))))
         return [fastq for fastq in candidates
                 if not any(term in fastq for term in excluded_terms)]

     # Iterate through each sample
     for sample in self.metadata.runmetadata.samples:
         # Sample-specific output directory
         outputdir = os.path.join(self.path, sample.name)
         # Find any fastq files with the sample name
         fastqfiles = list()
         for pattern in name_patterns:
             fastqfiles = sorted(glob(os.path.join(self.path, pattern.format(sample.name))))
             if fastqfiles:
                 break
         # Only try and link the files if the files exist
         if fastqfiles:
             make_path(outputdir)
             for fastq in fastqfiles:
                 filename = os.path.basename(fastq)
                 # Handle each link separately, so that one failure (e.g. a link left by a previous run)
                 # does not prevent the remaining files from being linked
                 try:
                     os.symlink(os.path.join('..', filename), os.path.join(outputdir, filename))
                 except OSError:
                     pass
         # Record the files in the sample folder. This also covers the case where nothing was found in
         # self.path: files already present in the sample folder are picked up by sample name
         sample.general.fastqfiles = raw_fastqs(outputdir, sample.name)
Ejemplo n.º 21
0
 def alleleretriever(self):
     """
     Extract the alleles listed in self.alleledict from self.allelefile into organism-specific FASTA files.

     For each organism, one <gene>.tfa file per gene and a combined gdcs_alleles.fasta file are written under
     <self.path>/outputalleles/<organism>/. A metadata object describing the outputs is appended to self.samples.
     """
     logging.info('Retrieving alleles')
     logging.info('Loading rMLST records')
     # Index every record in the allele file
     recorddict = SeqIO.index(self.allelefile, 'fasta')
     logging.info('Creating allele output files')
     for organism in sorted(self.alleledict):
         # Object storing the details for this organism
         strain = MetadataObject()
         strain.organism = organism
         strain.path = self.path
         strain.outpath = os.path.join(self.path, 'outputalleles', organism, '')
         # Start from an empty output folder: the gene files are opened in append mode, so anything left from
         # an earlier run would otherwise be duplicated
         try:
             shutil.rmtree(strain.outpath)
         except OSError:
             pass
         make_path(strain.outpath)
         strain.combined = os.path.join(strain.outpath, 'gdcs_alleles.fasta')
         strain.allelefiles = list()
         with open(strain.combined, 'w') as combined:
             for gene, alleles in sorted(self.alleledict[organism].items()):
                 genefile = os.path.join(strain.outpath, '{}.tfa'.format(gene))
                 strain.allelefiles.append(genefile)
                 with open(genefile, 'a') as genehandle:
                     for allele in sorted(alleles):
                         # Alleles missing from the index raise a KeyError and are skipped
                         try:
                             record = recorddict['{}_{}'.format(gene, allele)]
                             # Write the record to the gene-specific file, then to the combined file
                             SeqIO.write(record, genehandle, 'fasta')
                             SeqIO.write(record, combined, 'fasta')
                         except KeyError:
                             pass
         # Store the populated metadata
         self.samples.append(strain)
Ejemplo n.º 22
0
 def __init__(self, args):
     """
     Set up the attributes used by the pipeline from the supplied arguments
     :param args: object with the script's arguments as attributes
     """
     # The commit is supplied as bytes; decode it once for the banner and for self.commit
     commit = args.commit.decode('utf-8')
     printtime(
         'Welcome to the CFIA de novo bacterial assembly pipeline {}'.format(commit),
         args.startingtime,
         '\033[1;94m')
     # Copy the arguments onto the object
     self.args = args
     self.path = os.path.join(args.sequencepath)
     self.reffilepath = os.path.join(args.referencefilepath)
     self.numreads = args.numreads
     self.preprocess = args.preprocess
     # Start time used when printing messages
     self.starttime = args.startingtime
     self.customsamplesheet = args.customsamplesheet
     # A custom sample sheet must exist if it was supplied
     if self.customsamplesheet:
         assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {}'\
             .format(self.customsamplesheet)
     self.basicassembly = args.basicassembly
     # Without a custom sample sheet or a SampleSheet.csv in the sequence path, force a basic assembly
     default_samplesheet = os.path.join(self.path, 'SampleSheet.csv')
     if not (self.customsamplesheet or os.path.isfile(default_samplesheet)):
         self.basicassembly = True
         printtime(
             'Could not find a sample sheet. Performing basic assembly (no run metadata captured)',
             self.starttime)
     # Number of threads: the supplied value, or one fewer than the number of CPUs in the system
     self.cpus = args.threads or multiprocessing.cpu_count() - 1
     # Create the sequence path if necessary, and ensure that the supplied paths are valid
     make_path(self.path)
     assert os.path.isdir(self.path), \
         'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
     self.reportpath = os.path.join(self.path, 'reports')
     assert os.path.isdir(self.reffilepath), \
         'Reference file path is not a valid directory {0!r:s}'.format(self.reffilepath)
     self.commit = commit
     self.homepath = args.homepath
     self.logfile = os.path.join(self.path, 'logfile')
     self.runinfo = str()
     self.pipeline = True
     # Objects that will store the quality and run metadata
     self.qualityobject = MetadataObject()
     self.runmetadata = MetadataObject()
Ejemplo n.º 23
0
 def fastqc(self):
     """
     Worker loop: run the FastQC system calls for each job taken from self.qcqueue.

     Each job is a (sample, systemcall, outputdir, fastqcreads) tuple. The calls are skipped when outputdir
     already contains an HTML file. The commands and their output are written to the log files, and the
     stdin_fastqc outputs are renamed after the sample. The loop never exits on its own.
     """
     while True:  # while daemon
         # NOTE(review): this lock is created anew on every iteration, so it is private to this worker and
         # cannot serialise log writes between threads - confirm whether a shared lock was intended
         threadlock = threading.Lock()
         # Unpack the job
         sample, systemcall, outputdir, fastqcreads = self.qcqueue.get()
         # Only run the analyses if no output HTML file exists yet
         if not glob(os.path.join(outputdir, '*.html')):
             make_path(outputdir)
             # Run both system calls, accumulating their stdout and stderr
             outstr = str()
             errstr = str()
             for command in (systemcall, fastqcreads):
                 out, err = run_subprocess(command)
                 outstr += out
                 errstr += err
             # Write the commands, followed by their combined output, to the logs while holding the lock
             with threadlock:
                 for logout, logerr in ((systemcall, systemcall), (fastqcreads, fastqcreads), (outstr, errstr)):
                     write_to_logfile(logout, logerr, self.logfile,
                                      sample.general.logout, sample.general.logerr,
                                      None, None)
             # Rename the outputs after the sample; stop at the first file that cannot be moved
             renames = (('stdin_fastqc.html', '{}_fastqc.html'),
                        ('stdin_fastqc.zip', '{}_fastqc.zip'))
             try:
                 for source, destination in renames:
                     shutil.move(os.path.join(outputdir, source),
                                 os.path.join(outputdir, destination.format(sample.name)))
             except IOError:
                 pass
         # Signal to qcqueue that job is done
         self.qcqueue.task_done()
Ejemplo n.º 24
0
 def reporter(self):
     """
     Write mash.csv to self.reportpath, with one row for every sample that has an assembly
     (sample.general.bestassemblyfile is not 'NA')
     """
     make_path(self.reportpath)
     header = 'Strain,ReferenceGenus,ReferenceFile,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
     rows = list()
     for sample in self.metadata:
         # Samples without an assembly are left out of the report
         if sample.general.bestassemblyfile == 'NA':
             continue
         results = sample[self.analysistype]
         rows.append('{},{},{},{},{},{}\n'.format(
             sample.name,
             results.closestrefseqgenus,
             results.closestrefseq,
             results.mashdistance,
             results.pvalue,
             results.nummatches))
     # Create the report file
     reportfile = '{}/mash.csv'.format(self.reportpath)
     with open(reportfile, 'w') as report:
         report.write(header)
         report.write(''.join(rows))
Ejemplo n.º 25
0
 def run_qaml(self):
     """
     Create and run the GenomeQAML system call.

     The classify.py command is built from self.qaml_path and self.qaml_report, and is only run when
     self.qaml_report does not already exist. The command and its output are written to self.logfile.
     """
     printtime('Running GenomeQAML quality assessment', self.start)
     qaml_call = 'classify.py -t {tf} -r {rf}'\
         .format(tf=self.qaml_path,
                 rf=self.qaml_report)
     make_path(self.reportpath)
     # Only attempt to assess assemblies if the report doesn't already exist
     if not os.path.isfile(self.qaml_report):
         # Run the system call
         out, err = run_subprocess(qaml_call)
         # Hold the lock while writing the logs. The with statement guarantees that the lock is released even
         # if writing to the log file raises an exception
         with self.threadlock:
             write_to_logfile(qaml_call, qaml_call, self.logfile)
             write_to_logfile(out, err, self.logfile)
Ejemplo n.º 26
0
 def vtyper(self):
     """
     Start the ePCR worker threads, queue every sample with an assembly and 'stx' in its datastore,
     wait for the queue to empty, and then parse the results with self.epcrparse
     """
     printtime('Running ePCR', self.start)
     # Samples that have an assembly
     assembled = [sample for sample in self.metadata if sample.general.bestassemblyfile != 'NA']
     # Start one daemon worker thread per assembled sample
     for _ in assembled:
         worker = Thread(target=self.epcr, args=())
         worker.setDaemon(True)
         worker.start()
     # Create the system calls for famap, fahash, and ePCR
     for sample in assembled:
         if 'stx' not in sample.general.datastore:
             continue
         setattr(sample, self.analysistype, GenObject())
         # Use the primers in the reference file path if one is set; otherwise use the supplied primer file
         if self.reffilepath:
             primers = '{}{}/vtx_subtyping_primers.txt'.format(self.reffilepath, self.analysistype)
         else:
             primers = self.primerfile
         sample[self.analysistype].primers = primers
         # Make the output path
         reportdir = '{}/{}/'.format(sample.general.outputdirectory, self.analysistype)
         sample[self.analysistype].reportdir = reportdir
         make_path(reportdir)
         # Prefix shared by all the output files for this sample
         outfile = reportdir + sample.name
         # Set the hashing and mapping commands
         sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
         sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
         # re-PCR uses the subtyping primers list to search the contigs file using the following parameters
         # -S {hash file} (Perform STS lookup using hash-file),
         # -r + (Enable/disable reverse STS lookup)
         # -m 10000 (Set variability for STS size for lookup),
         # -n 1 (Set max allowed mismatches per primer for lookup)
         # -g 0 (Set max allowed indels per primer for lookup),
         # -G (Print alignments in comments),
         # -q quiet
         # -o {output file},
         sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 1 -g 0 -G -q -o {}.txt {}'\
             .format(outfile, outfile, primers)
         sample[self.analysistype].resultsfile = '{}.txt'.format(outfile)
         self.epcrqueue.put((sample, outfile))
     # Wait for the workers to process every queued sample before parsing
     self.epcrqueue.join()
     self.epcrparse()
 def __init__(self, inputobject, samplebasestarget=700000):
     """
     Copy the required settings from the input object, then run self.main
     :param inputobject: object supplying runmetadata.samples, reffilepath, logfile, cpus, starttime and
     reportpath
     :param samplebasestarget: value stored as self.samplebasestarget
     """
     self.analysistype = 'confinder'
     self.number_subsamples = 5
     self.samplebasestarget = samplebasestarget
     # Settings taken from the input object
     self.metadata = inputobject.runmetadata.samples
     self.logfile = inputobject.logfile
     self.threads = inputobject.cpus
     self.start = inputobject.starttime
     # First FASTA file found in the rMLST folder of the reference file path
     rmlst_fastas = glob(os.path.join(inputobject.reffilepath, 'rMLST', '*.fasta'))
     self.database = rmlst_fastas[0]
     # Ensure that the report folder exists
     self.reportpath = inputobject.reportpath
     make_path(self.reportpath)
     self.reportfile = os.path.join(self.reportpath, self.analysistype + '.csv')
     # The analyse flag is only set when the report does not exist yet
     self.analyse = not os.path.isfile(self.reportfile)
     self.main()
Ejemplo n.º 28
0
 def reporter(self):
     """
     Write mash.csv to self.reportpath with one row per sample. A sample for which the results raise an
     AttributeError gets a row containing only its name
     """
     make_path(self.reportpath)
     header = 'Strain,ReferenceGenus,ReferenceFile,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
     rows = list()
     for sample in self.metadata:
         try:
             results = sample[self.analysistype]
             row = '{},{},{},{},{},{}\n'.format(
                 sample.name,
                 results.closestrefseqgenus,
                 results.closestrefseq,
                 results.mashdistance,
                 results.pvalue,
                 results.nummatches)
         except AttributeError:
             # Fall back to a row with just the strain name
             row = '{}\n'.format(sample.name)
         rows.append(row)
     # Create the report file
     reportfile = os.path.join(self.reportpath, 'mash.csv')
     with open(reportfile, 'w') as report:
         report.write(header)
         report.write(''.join(rows))
Ejemplo n.º 29
0
 def probes(self):
     """
     Write the 'best' probe for each window of each gene to FASTA files.

     For every sample, a <gene>_gdcs.tfa file per gene and a combined <organism>_gdcs_combined.fasta file are
     created in <self.gdcsoutputpath>/<organism>. Within a window, the first sliding object that has data, has a
     mean equal to the window maximum, and has a mean of at least the window minimum is written; later
     candidates in the same window are ignored.
     """
     logging.info('Determining optimal probe sequences')
     for sample in self.samples:
         # Folder and combined file storing the probes for this organism
         sample.gdcsoutputpath = os.path.join(self.gdcsoutputpath, sample.organism)
         sample.gdcscombined = os.path.join(sample.gdcsoutputpath,
                                            '{}_gdcs_combined.fasta'.format(sample.organism))
         make_path(sample.gdcsoutputpath)
         with open(sample.gdcscombined, 'w') as combined:
             for gene in sample.gene:
                 # Gene-specific output file
                 gene.gdcsoutputfile = os.path.join(sample.gdcsoutputpath,
                                                    '{}_gdcs.tfa'.format(gene.name))
                 with open(gene.gdcsoutputfile, 'w') as allelefile:
                     for window in gene.windows:
                         # Records whether a probe has already been written for this window
                         passed = False
                         for sliding in window.sliding:
                             suitable = sliding.datastore and sliding.mean == window.max \
                                 and sliding.mean >= window.min and not passed
                             if not suitable:
                                 continue
                             # Create a sequence record using BioPython. The gene name is used as the header,
                             # and the empty description keeps the header properly formatted
                             fasta = SeqRecord(Seq(sliding.sequence, IUPAC.unambiguous_dna),
                                               description='',
                                               id=gene.name)
                             # Write the probe to the gene-specific file and to the combined file
                             SeqIO.write(fasta, allelefile, 'fasta')
                             SeqIO.write(fasta, combined, 'fasta')
                             passed = True
Ejemplo n.º 30
0
 def __init__(self):
     """
     Parse the command line arguments, run the ePCR analyses with Vtyper, create the report, and print the
     metadata to file
     """
     from argparse import ArgumentParser
     from time import time
     # Parser for arguments
     parser = ArgumentParser(
         description='Performs ePCR using a supplied primer file. The primers must be in the format: '
                     '<name>\t<forward primer>\t<reverse primer>\t<max size allowed between primers>\n.'
                     'Sequence files must be stored in <path>/sequences'
     )
     parser.add_argument(
         'path',
         help='Specify path in which reports are to be stored')
     parser.add_argument(
         '-s', '--sequencepath',
         required=True,
         help='Path to assembly files')
     parser.add_argument(
         '-f', '--primerfile',
         required=True,
         help='The name and path of the file containing the primers')
     # Get the arguments into an object
     arguments = parser.parse_args()
     self.starttime = time()
     # Joining with an empty string adds a trailing separator to the path variables
     self.path = os.path.join(arguments.path, '')
     self.sequencepath = os.path.join(arguments.sequencepath, '')
     self.primerfile = arguments.primerfile
     # Initialise variables
     self.analysistype = 'ePCR'
     self.reffilepath = False
     self.runmetadata = MetadataObject()
     self.reportpath = os.path.join(self.path, 'reports')
     make_path(self.reportpath)
     # Initialise metadata
     self.runmetadata.samples = self.setup()
     self.logfile = os.path.join(self.path, 'vtyper_logfile.txt')
     # Run the analyses
     Vtyper(self, self.analysistype)
     # Create a report
     self.reporter()
     # Print the metadata to file
     printtime('Printing metadata to file', self.starttime)
     metadataprinter.MetadataPrinter(self)
     # Print a bold, green exit statement
     elapsed = time() - self.starttime
     print(u'\033[92m' + u'\033[1m' + u'\nElapsed Time: %0.2f seconds' % elapsed + u'\033[0m')
Ejemplo n.º 31
0
    def sketching(self):
        """
        Start the sketch worker threads, prepare the mash sketch command for every sample with an assembly,
        queue the samples, wait for the queue to empty, and then call self.mashing
        """
        printtime('Indexing assemblies for mash analysis', self.starttime)
        # Start one daemon worker thread for each sample with an assembly
        for sample in self.metadata:
            if sample.general.bestassemblyfile != 'NA':
                worker = Thread(target=self.sketch, args=())
                worker.setDaemon(True)
                worker.start()
        for sample in self.metadata:
            # Every sample gets the analysis type-specific GenObject, even those without an assembly
            setattr(sample, self.analysistype, GenObject())
            if sample.general.bestassemblyfile == 'NA':
                continue
            # Work out the paths used by the analysis
            reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
            targetpath = os.path.join(self.referencefilepath, self.analysistype)
            sketchfilenoext = '{}/{}'.format(reportdir, sample.name)
            filelist = '{}/{}_fastqfiles.txt'.format(reportdir, sample.name)
            # Set attributes
            sample[self.analysistype].reportdir = reportdir
            sample[self.analysistype].targetpath = targetpath
            sample[self.analysistype].refseqsketch = targetpath + '/RefSeqSketchesDefaults.msh'
            sample[self.analysistype].sketchfilenoext = sketchfilenoext
            sample[self.analysistype].sketchfile = sketchfilenoext + '.msh'
            # Make the mash output directory if necessary
            make_path(reportdir)
            # Create a file containing the path/name of the filtered, corrected fastq files
            sample[self.analysistype].filelist = filelist
            with open(filelist, 'w') as handle:
                handle.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))

            # Create the system call
            sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
                .format(self.cpus, filelist, sketchfilenoext)
            # Hand the sample to the worker threads
            self.sketchqueue.put(sample)
        # Wait for all the queued samples to be processed
        self.sketchqueue.join()
        self.mashing()
Ejemplo n.º 32
0
    def sketching(self):
        """
        Start self.cpus sketch worker threads, prepare the mash sketch command for every sample, queue the
        samples, wait for the queue to empty, and then call self.mashing
        """
        printtime('Indexing files for {} analysis'.format(self.analysistype), self.starttime)
        # Start the daemon worker threads
        for _ in range(self.cpus):
            worker = Thread(target=self.sketch, args=())
            worker.setDaemon(True)
            worker.start()
        for sample in self.metadata:
            # Create the analysis type-specific GenObject
            setattr(sample, self.analysistype, GenObject())
            reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
            sample[self.analysistype].reportdir = reportdir
            make_path(reportdir)
            # In pipeline mode, the targets are in an analysis-specific folder of the reference file path
            if self.pipeline:
                targetpath = os.path.join(self.referencefilepath, self.analysistype)
            else:
                targetpath = self.referencefilepath
            sketchfilenoext = os.path.join(reportdir, sample.name)
            sample[self.analysistype].targetpath = targetpath
            sample[self.analysistype].refseqsketch = os.path.join(targetpath, 'RefSeqSketchesDefaults.msh')
            sample[self.analysistype].sketchfilenoext = sketchfilenoext
            sample[self.analysistype].sketchfile = sketchfilenoext + '.msh'
            # Second make_path call on the same folder, kept from the original
            # NOTE(review): presumably redundant if make_path only creates missing folders - confirm
            make_path(reportdir)
            # Create a file containing the path/name of the filtered, corrected fastq files
            filelist = os.path.join(reportdir, '{}_fastqfiles.txt'.format(sample.name))
            sample[self.analysistype].filelist = filelist
            with open(filelist, 'w') as handle:
                handle.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))

            # Create the system call
            sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
                .format(self.cpus, filelist, sketchfilenoext)
            # Hand the sample to the worker threads
            try:
                self.sketchqueue.put(sample)
            except (KeyboardInterrupt, SystemExit):
                printtime('Received keyboard interrupt, quitting threads', self.starttime)
                quit()
        # Wait for all the queued samples to be processed
        self.sketchqueue.join()
        self.mashing()
Ejemplo n.º 33
0
 def __init__(self, args):
     """
     Set up logging and the attributes used by the pipeline from the supplied arguments
     :param args: object with the script's arguments as attributes
     """
     SetupLogging()
     logging.info('Welcome to the CFIA de novo bacterial assembly pipeline {}'.format(__version__))
     # Copy the arguments onto the object
     self.args = args
     self.path = os.path.join(args.sequencepath)
     self.reffilepath = os.path.join(args.referencefilepath)
     self.numreads = args.numreads
     self.preprocess = args.preprocess
     # Start time
     self.starttime = args.startingtime
     self.customsamplesheet = args.customsamplesheet
     # A custom sample sheet must exist if it was supplied
     if self.customsamplesheet:
         assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {}'\
             .format(self.customsamplesheet)
     self.basicassembly = args.basicassembly
     # Without a custom sample sheet or a SampleSheet.csv in the sequence path, force a basic assembly
     default_samplesheet = os.path.join(self.path, 'SampleSheet.csv')
     if not (self.customsamplesheet or os.path.isfile(default_samplesheet)):
         self.basicassembly = True
         logging.warning('Could not find a sample sheet. Performing basic assembly (no run metadata captured)')
     # Number of threads: the supplied value, or one fewer than the number of CPUs in the system
     self.cpus = args.threads or multiprocessing.cpu_count() - 1
     # Create the sequence path if necessary, and ensure that the supplied paths are valid
     make_path(self.path)
     assert os.path.isdir(self.path), \
         'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
     self.reportpath = os.path.join(self.path, 'reports')
     assert os.path.isdir(self.reffilepath), \
         'Reference file path is not a valid directory {0!r:s}'.format(self.reffilepath)
     self.commit = __version__
     self.homepath = args.homepath
     self.logfile = os.path.join(self.path, 'logfile')
     self.runinfo = str()
     self.pipeline = True
     # Objects that will store the quality and run metadata
     self.qualityobject = MetadataObject()
     self.runmetadata = MetadataObject()
Ejemplo n.º 34
0
 def __init__(self, inputobject, extension='fasta', light=False):
     """
     Build an arguments object from the input object and use it to run CLARK
     :param inputobject: object supplying path, reffilepath, cpus, runmetadata, commit, starttime and homepath
     :param extension: value stored as the extension attribute of the run metadata
     :param light: value stored as the light attribute of the arguments
     """
     # Object mimicking the command line arguments required by CLARK
     args = MetadataObject()
     # Paths
     args.path = inputobject.path
     args.sequencepath = inputobject.path
     databasepath = os.path.join(inputobject.reffilepath, 'clark')
     args.databasepath = databasepath
     make_path(databasepath)
     # The CLARK folder is located relative to the folder containing the CLARK executable
     args.clarkpath = os.path.dirname(which('CLARK')) + '/../opt/clark/'
     # Fixed analysis settings
     args.cutoff = 0.005
     args.database = 'bacteria'
     args.rank = 'species'
     args.filter = False
     # Settings taken from the input object
     args.threads = inputobject.cpus
     args.runmetadata = inputobject.runmetadata
     args.clean_seqs = False
     args.reffilepath = inputobject.reffilepath
     # The run metadata object is shared with the input object, so this also sets the extension there
     args.runmetadata.extension = extension
     args.light = light
     # Run CLARK
     CLARK(args, inputobject.commit, inputobject.starttime, inputobject.homepath)
Ejemplo n.º 35
0
def _fetch_text(address):
    """
    Download a remote resource with url.urlretrieve and return its content as text.

    :param address: URL to download
    :return: content of the downloaded temporary file, read in text mode
    """
    local_copy, _ = url.urlretrieve(address)
    with open(local_copy, 'r') as downloaded:
        return downloaded.read()


def main(args):
    """
    Download the MLST profile and allele files for the single scheme matching the requested organism.

    The repository XML at args.repository_url is parsed, every 'species' element is offered to
    getspeciesinfo, and the download only proceeds when exactly one scheme matches. In that case the
    following files are written to args.path: the profile definitions, one file per locus, a FASTA file
    containing every locus concatenated, and a log describing where each file was sourced from. When no
    scheme, or more than one scheme, matches, a message is printed and nothing is downloaded.

    :param args: object with the attributes path, genus, repository_url and force_scheme_name, and
        optionally species. args.genus is overwritten with the scheme name that is actually queried
    :return: None
    """
    # Create the path to store the schemes (if necessary)
    make_path(args.path)
    # Allow for Shigella to use the Escherichia MLST profile/alleles
    args.genus = args.genus if args.genus != 'Shigella' else 'Escherichia'
    # As there are multiple profiles for certain organisms, this dictionary has the schemes I use as values
    organismdictionary = {'Escherichia': 'Escherichia coli#1',
                          'Vibrio': 'Vibrio parahaemolyticus',
                          'Campylobacter': 'Campylobacter jejuni',
                          'Listeria': 'Listeria monocytogenes',
                          'Bacillus': 'Bacillus cereus',
                          'Staphylococcus': "Staphylococcus aureus",
                          'Salmonella': 'Salmonella enterica'}
    # Set the appropriate profile based on the dictionary key:value pairs
    # NOTE(review): the table is keyed by genus but indexed with args.species - confirm this is intended
    try:
        args.genus = organismdictionary[args.species]
    except (KeyError, AttributeError):
        # Deliberate best effort: without a preferred scheme, args.genus is queried unchanged
        pass
    # Collect every scheme in the repository that getspeciesinfo accepts for the query
    with url.urlopen(args.repository_url) as docfile:
        root = xml.parse(docfile).childNodes[0]
        found_species = []
        for species_node in root.getElementsByTagName('species'):
            info = getspeciesinfo(species_node, args.genus, args.force_scheme_name)
            if info is not None:
                found_species.append(info)
    if not found_species:
        print("No species matched your query.")
        return
    if len(found_species) > 1:
        print("The following {} species match your query, please be more specific:".format(len(found_species)))
        # List every candidate before giving up, so the user can see all of the names to choose from
        for info in found_species:
            print(info.name)
        return

    # output information for the single matching species
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_').replace('/', '_')
    species_all_fasta_filename = species_name_underscores + '.fasta'
    log_filename = "mlst_data_download_{}_{}.log".format(species_name_underscores, species_info.retrieved)
    profile_filename = urlparse(species_info.profiles_url).path.split('/')[-1]
    # Context managers guarantee that both output files are closed even if a download or write fails
    with open(os.path.join(args.path, species_all_fasta_filename), 'w') as species_all_fasta_file:
        with open(os.path.join(args.path, log_filename), 'w') as log_file:
            log_file.write(species_info.retrieved + '\n')
            log_file.write("definitions: {}\n".format(profile_filename))
            log_file.write("{} profiles\n".format(species_info.profiles_count))
            log_file.write("sourced from: {}\n\n".format(species_info.profiles_url))
            # Store a local copy of the profile definitions
            profile_content = _fetch_text(species_info.profiles_url)
            with open(os.path.join(args.path, profile_filename), 'w') as profile_file:
                profile_file.write(profile_content)
            for locus in species_info.loci:
                locus_filename = urlparse(locus.url).path.split('/')[-1]
                log_file.write("locus {}\n".format(locus.name))
                log_file.write(locus_filename + '\n')
                log_file.write("Sourced from {}\n\n".format(locus.url))
                # Each locus is written to its own file and appended to the combined FASTA file
                locus_fasta_content = _fetch_text(locus.url)
                with open(os.path.join(args.path, locus_filename), 'w') as locus_file:
                    locus_file.write(locus_fasta_content)
                species_all_fasta_file.write(locus_fasta_content)
            log_file.write("all loci: {}\n".format(species_all_fasta_filename))