Code example #1
 def bbduker(self):
     """Run bbduk system calls"""
     while True:  # while daemon
         # Unpack the variables from the queue
         (sample, systemcall, reversename) = self.trimqueue.get()
         # Check to see if the forward file already exists
         if systemcall:
             threadlock = threading.Lock()
             if not os.path.isfile(reversename) and not os.path.isfile(
                     '{}.bz2'.format(reversename)):
                 # Run the call
                 out, err = run_subprocess(systemcall)
                 threadlock.acquire()
                 write_to_logfile(systemcall, systemcall, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr, None, None)
                 write_to_logfile(out, err, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr, None, None)
                 threadlock.release()
             # Define the output directory
             outputdir = sample.general.outputdirectory
             # Add the trimmed fastq files to a list
             trimmedfastqfiles = sorted(
                 glob(os.path.join(outputdir, '*trimmed.fastq.gz')))
             # Populate the metadata if the files exist
             sample.general.trimmedfastqfiles = trimmedfastqfiles if trimmedfastqfiles else 'NA'
         # Signal to trimqueue that job is done
         self.trimqueue.task_done()
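Every example on this page funnels its system calls through the same two helpers, run_subprocess and write_to_logfile (from accessoryFunctions in these OLCTools-based projects). Their implementations are not shown on this page, so here is a minimal sketch of the behaviour the examples appear to rely on; the exact signatures and log-file layout are assumptions:

import subprocess


def run_subprocess(command):
    """Run a shell command; return its decoded stdout and stderr strings."""
    process = subprocess.Popen(command,
                               shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    out, err = process.communicate()
    return out.decode('utf-8'), err.decode('utf-8')


def write_to_logfile(out, err, logfile, samplelog=None, sampleerr=None,
                     analysislog=None, analysiserr=None):
    """Append the out and err strings to the master logs, plus any per-sample
    and per-analysis logs that were supplied (None values are skipped)."""
    # Master logs: paired files for stdout and stderr (layout is an assumption)
    with open(logfile + '_out.txt', 'a') as master_out:
        master_out.write(out + '\n')
    with open(logfile + '_err.txt', 'a') as master_err:
        master_err.write(err + '\n')
    # Optional per-sample and per-analysis logs
    for log_name, stream in ((samplelog, out), (sampleerr, err),
                             (analysislog, out), (analysiserr, err)):
        if log_name:
            with open(log_name, 'a') as handle:
                handle.write(stream + '\n')

The examples call write_to_logfile with either three or seven arguments, which is why the per-sample and per-analysis log paths default to None in this sketch.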
Code example #2
 def error_correction(self):
     """
     Use tadpole from the bbmap suite of tools to perform error correction of the reads
     """
     printtime('Error correcting reads', self.start)
     for sample in self.metadata:
         sample.general.trimmedcorrectedfastqfiles = [
             fastq.split('.fastq.gz')[0] + '_trimmed_corrected.fastq.gz'
             for fastq in sorted(sample.general.fastqfiles)
         ]
         try:
             out, err, cmd = bbtools.tadpole(
                 forward_in=sorted(sample.general.trimmedfastqfiles)[0],
                 forward_out=sample.general.trimmedcorrectedfastqfiles[0],
                 returncmd=True,
                 mode='correct',
                 threads=self.cpus)
             # Set the command in the object
             sample[self.analysistype].errorcorrectcmd = cmd
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
         except CalledProcessError:
             sample.general.trimmedcorrectedfastqfiles = sample.general.trimmedfastqfiles
         except KeyError:
             sample.general.trimmedcorrectedfastqfiles = list()
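The bbtools wrappers used in these examples (tadpole, bbnorm, bbmerge, kmercountexact) share one pattern: build the system call, run it, and return out and err, plus the command string when returncmd=True, raising CalledProcessError on failure. A hedged sketch of that pattern for tadpole.sh follows; the argument handling is an assumption, not the OLCTools source:

import subprocess


def tadpole(forward_in, forward_out, returncmd=False, mode='correct', threads=1):
    """Build and run a tadpole.sh call; return (out, err) or (out, err, cmd)."""
    cmd = 'tadpole.sh in={fq_in} out={fq_out} mode={mode} threads={threads}'.format(
        fq_in=forward_in, fq_out=forward_out, mode=mode, threads=threads)
    # check=True raises CalledProcessError on a non-zero exit, matching the
    # exception handling in the example above
    process = subprocess.run(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             check=True)
    out = process.stdout.decode('utf-8')
    err = process.stderr.decode('utf-8')
    if returncmd:
        return out, err, cmd
    return out, err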
Code example #3
File: prodigal.py Project: carden24/OLCTools
 def predict(self):
     threadlock = threading.Lock()
     while True:
         sample = self.predictqueue.get()
         # Populate attributes
         sample.prodigal.reportdir = os.path.join(
             sample.general.outputdirectory, 'prodigal')
         sample.prodigal.results_file = os.path.join(
             sample.prodigal.reportdir,
             '{}_prodigalresults.sco'.format(sample.name))
         sample.prodigal.results = sample.prodigal.results_file
         sample.commands.prodigal = 'prodigal -i {in1} -o {out1} -f sco -d {genes}'\
             .format(in1=sample.general.bestassemblyfile,
                     out1=sample.prodigal.results_file,
                     genes=os.path.join(sample.prodigal.reportdir, '{}_genes.fa'.format(sample.name)))
         # Create the folder to store the reports
         make_path(sample.prodigal.reportdir)
         # Determine if the report already exists, and that it is not empty
         size = 0
         if os.path.isfile(sample.prodigal.results_file):
             size = os.stat(sample.prodigal.results_file).st_size
         if not os.path.isfile(sample.prodigal.results_file) or size == 0:
             # Run the command
             out, err = run_subprocess(sample.commands.prodigal)
             threadlock.acquire()
             write_to_logfile(sample.commands.prodigal,
                              sample.commands.prodigal, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             threadlock.release()
         self.predictqueue.task_done()
Code example #4
 def run_bbmap(self):
     """
     Runs bbmap on kmer fasta file, against kmer fasta file to generate a samfile which can then be parsed to find
     low frequency kmers that have one mismatch to high frequency kmers, indicating that they're from contaminating
     alleles.
     """
     for sample in self.metadata:
         # Create the name for the output bam file
         sample[self.analysistype].bamfile = sample[
             self.analysistype].mer_fasta.replace('.fasta', '.bam')
          # Set the bbmap call - use the overwrite option to overwrite files created on previous iterations,
          # ambig=all to use all highest-scoring mappings, nodisk to build the index in memory and only write
          # output to disk, and local to allow soft-clipping
         sample[self.analysistype].bbmapcmd = \
             'bbmap.sh ref={} in={} outm={} overwrite ambig=all nodisk local threads={}'\
             .format(sample[self.analysistype].solid_mers,
                     sample[self.analysistype].solid_mers,
                     sample[self.analysistype].bamfile,
                     str(self.threads))
         # Run the call, and write any errors to the logfile
         command = sample[self.analysistype].bbmapcmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
Code example #5
 def subsample_reads(self):
     """
     Subsampling of reads to 20X coverage of rMLST genes (roughly).
     To be called after rMLST extraction and read trimming, in that order.
     """
     for sample in self.metadata:
         # Create the name of the subsampled read file
         sample[self.analysistype].subsampledreads = os.path.join(
             sample[self.analysistype].outputdir,
             '{}_targetMatches_subsampled.fastq'.format(self.analysistype))
         # Set the reformat.sh command - as this command will be run multiple times, overwrite previous iterations
         # each time. Use samplebasestarget to provide an approximation of the number of bases to include in the
         # subsampled reads e.g. for rMLST: 700000 (approx. 35000 bp total length of genes x 20X coverage)
         sample[self.analysistype].subsamplecmd = 'reformat.sh in={} out={} overwrite samplebasestarget={}' \
             .format(sample[self.analysistype].baitedfastq,
                     sample[self.analysistype].subsampledreads,
                     self.samplebasestarget)
         # Run the call, and write any errors to the logfile
         command = sample[self.analysistype].subsamplecmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
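The samplebasestarget value described in the comment above is just the total target length multiplied by the desired coverage. A small illustration of that arithmetic (the helper name is hypothetical; the 35,000 bp and 20X figures come from the comment):

def calculate_samplebasestarget(total_target_length, target_coverage):
    """Approximate number of bases reformat.sh should keep when subsampling."""
    return total_target_length * target_coverage


# For rMLST: approx. 35000 bp of genes at 20X coverage
samplebasestarget = calculate_samplebasestarget(35000, 20)  # 700000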
Code example #6
 def normalise_reads(self):
     """
     Use bbnorm from the bbmap suite of tools to perform read normalisation
     """
     printtime('Normalising reads to a kmer depth of 100', self.start)
     for sample in self.metadata:
         # Set the name of the normalised read files
         sample.general.normalisedreads = [
             fastq.split('.fastq.gz')[0] + '_normalised.fastq.gz'
             for fastq in sorted(sample.general.fastqfiles)
         ]
         try:
             # Run the normalisation command
             out, err, cmd = bbtools.bbnorm(
                 forward_in=sorted(
                     sample.general.trimmedcorrectedfastqfiles)[0],
                 forward_out=sample.general.normalisedreads[0],
                 returncmd=True,
                 threads=self.cpus)
             sample[self.analysistype].normalisecmd = cmd
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
         except CalledProcessError:
             sample.general.normalisedreads = sample.general.trimmedfastqfiles
         except IndexError:
             sample.general.normalisedreads = list()
Code example #7
 def run_jellyfish(self):
     """
     Runs jellyfish to split subsample reads into kmers. Runs kmers through a bloom filter to get rid of singletons
     that are likely just sequencing errors. Should be run after subsampling reads.
     """
     for sample in self.metadata:
         # Set the name of the jellyfish count file
         sample[self.analysistype].jellyfish_file = os.path.join(
             sample[self.analysistype].outputdir,
             sample.name + '_jellyfish')
         # Set the system call
         sample[self.analysistype].jellyfishcountcmd \
             = 'jellyfish count -m 31 -s 100M --bf-size 100M -C -F 2 {} -o {} -t {}'\
             .format(sample[self.analysistype].subsampledreads,
                     sample[self.analysistype].jellyfish_file,
                     str(self.threads))
         # Run the call, and write any errors to the logfile
         command = sample[self.analysistype].jellyfishcountcmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
Code example #8
 def assemble(self):
     """Run the assembly command in a multi-threaded fashion"""
     threadlock = threading.Lock()
     while True:
         (sample, command) = self.assemblequeue.get()
         if command and not os.path.isfile(
                 os.path.join(sample.general.spadesoutput,
                              'contigs.fasta')):
             out, err = run_subprocess(command)
             threadlock.acquire()
             write_to_logfile(command, command, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             threadlock.release()
         dotter()
         # Signal to the queue that the job is done
         self.assemblequeue.task_done()
Code example #9
 def makeblastdb(self):
     """
     Makes blast database files from targets as necessary
     """
     # Iterate through the samples to set the bait file.
     for sample in self.runmetadata.samples:
         if sample.general.bestassemblyfile != 'NA':
             # Remove the file extension
             db = os.path.splitext(sample[self.analysistype].baitfile)[0]
             # Add '.nhr' for searching below
             nhr = '{}.nhr'.format(db)
             # Check for already existing database files
             if not os.path.isfile(str(nhr)):
                 # Create the databases
                 command = 'makeblastdb -in {} -parse_seqids -max_file_sz 2GB -dbtype nucl -out {}'\
                     .format(sample[self.analysistype].baitfile, db)
                 out, err = run_subprocess(command)
                 write_to_logfile(command, command, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr,
                                  sample[self.analysistype].logout,
                                  sample[self.analysistype].logerr)
                 write_to_logfile(out, err, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr,
                                  sample[self.analysistype].logout,
                                  sample[self.analysistype].logerr)
Code example #10
def kmerize_individual_fastas(potential_plasmid_list,
                              fasta_dir,
                              output_dir,
                              threads=1,
                              logfile=None):
    """
    Creates a KMC database for a list of potential plasmids that have FASTA-formatted sequences in fasta_dir.
    KMC databases are placed in output_dir.
    :param potential_plasmid_list: List of potential plasmids.
    :param fasta_dir: Directory where FASTA files for each potential plasmid are located.
    :param output_dir: Directory to store KMC Databases in. Created if it doesn't exist.
     :param threads: Number of threads to run KMC with.
     :param logfile: File to write output to.
    """
    if not os.path.isdir(output_dir):  # Make output dir if necessary.
        os.makedirs(output_dir)

    for plasmid in potential_plasmid_list:  # Call KMC in FASTA mode on each individual FASTA.
        out, err = kmc.kmc(forward_in=os.path.join(fasta_dir, plasmid),
                           database_name=os.path.join(output_dir, plasmid),
                           tmpdir=os.path.join(output_dir, 'tmp'),
                           fm='',
                           t=threads)
        if logfile:
            accessoryFunctions.write_to_logfile(out, err, logfile)
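A hedged usage sketch for the function above, with hypothetical directory and plasmid names; it assumes the kmc wrapper module shown in the snippet is importable:

# Kmerize two candidate plasmid FASTA files into KMC databases under kmc_dbs/
potential_plasmids = ['plasmid_1.fasta', 'plasmid_2.fasta']
kmerize_individual_fastas(potential_plasmid_list=potential_plasmids,
                          fasta_dir='fastas',
                          output_dir='kmc_dbs',
                          threads=4,
                          logfile='kmc_log')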
Code example #11
 def write_mer_file(self):
     """
     Writes the mer file created by jellyfish to a fasta format to be used by other things downstream.
      Only writes kmers that have been seen at least twice to attempt to get rid of sequencing errors.
     """
     for sample in self.metadata:
         # Set the name of the kmer file dumped from jellyfish
         sample[self.analysistype].mer_fasta = sample[
             self.analysistype].jellyfish_file + '.fasta'
         sample[self.analysistype].solid_mers = sample[
             self.analysistype].jellyfish_file + '_solid.fasta'
         # Set the system call
         sample[self.analysistype].jellyfishdumpcmd =\
             'jellyfish dump {} > {}'\
             .format(sample[self.analysistype].jellyfish_file,
                     sample[self.analysistype].mer_fasta)
         # Run the system call
         command = sample[self.analysistype].jellyfishdumpcmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         # Read in the dumped file to a list
         with open(sample[self.analysistype].mer_fasta, 'r') as mers:
             fastas = mers.readlines()
         # Initialise variables for use in parsing outputs
         num_mers = 0
         sequences = list()
         # Iterate through the list of the fasta outputs. Output is a multifasta e.g.:
         # >8
         # GCCTGGAAAACTGGCCACCGGCAAGCATCGC
         # where the header, >8, indicates that the sequence is present 8 times in the sample
         for i in range(len(fastas)):
             # Find the headers
             if '>' in fastas[i]:
                 # If the number of times the sequence is present in the sample is greater than one, increment
                 # the total number of kmers observed
                 if int(fastas[i].replace('>', '')) > 1:
                     num_mers += 1
                     # Append a string of the header plus the total number of mers, and the sequence information
                     # to the list of sequences e.g. ['>8_1\nGCCTGGAAAACTGGCCACCGGCAAGCATCGC\n']
                     sequences.append(fastas[i].rstrip() + '_' +
                                      str(num_mers) + '\n' + fastas[i + 1])
         # Write out our solid kmers to file to be used later.
         with open(sample[self.analysistype].solid_mers, 'w') as solidmers:
             solidmers.write(''.join(sequences))
         # Update the number of unique kmers
         if num_mers > sample[self.analysistype].unique_kmers:
             sample[self.analysistype].unique_kmers = num_mers
Code example #12
 def extract_rmlst_reads(self):
     """
     rMLST read extraction. Should be the first thing called after parsing the fastq directory.
     """
     for sample in self.metadata:
         # Create the object to store the variables
         setattr(sample, self.analysistype, GenObject())
         # Initialise variables
         sample[self.analysistype].snv_count = list()
         # Initialise a starting value for the number of unique kmers found in each sample
         sample[self.analysistype].unique_kmers = -1
         # Set and create the output directory
         try:
             sample[self.analysistype].outputdir = os.path.join(
                 sample.run.outputdirectory, self.analysistype)
         except KeyError:
             sample[self.analysistype].outputdir = os.path.join(
                 sample.general.outputdirectory, self.analysistype)
         make_path(sample[self.analysistype].outputdir)
         sample[self.analysistype].logout = os.path.join(
             sample[self.analysistype].outputdir, 'logout.txt')
         sample[self.analysistype].logerr = os.path.join(
             sample[self.analysistype].outputdir, 'logerr.txt')
         sample[self.analysistype].baitedfastq = os.path.join(
             sample[self.analysistype].outputdir,
             '{}_targetMatches.fastq.gz'.format(self.analysistype))
         # Create the command to run the baiting - paired inputs and a single, zipped output
         sample[self.analysistype].bbdukcmd = 'bbduk.sh ref={} in1={} in2={} threads={} outm={}'\
             .format(self.database,
                     sample.general.trimmedcorrectedfastqfiles[0],
                     sample.general.trimmedcorrectedfastqfiles[1],
                     str(self.threads),
                     sample[self.analysistype].baitedfastq)
         # Sometimes bbduk hangs forever, so that needs to be handled. Give it a very generous timeout.
         try:
             # Run the call, and write any errors to the logfile
             command = sample[self.analysistype].bbdukcmd
             if self.analyse:
                 out, err = run_subprocess(command)
             else:
                 out = str()
                 err = str()
             write_to_logfile(command, command, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         except TimeoutExpired:
             print('ERROR: Could not extract rMLST reads from sample {}'.
                   format(sample.name))
Code example #13
File: mash.py Project: carden24/OLCTools
 def mash(self):
     while True:
         sample = self.mashqueue.get()
         if not os.path.isfile(sample[self.analysistype].mashresults):
             threadlock = threading.Lock()
             out, err = run_subprocess(sample.commands.mash)
             threadlock.acquire()
             write_to_logfile(sample.commands.mash, sample.commands.mash,
                              self.logfile)
             write_to_logfile(out, err, self.logfile)
             threadlock.release()
         self.mashqueue.task_done()
Code example #14
 def epcr(self):
     while True:
         # I think this should work for getting output from processes and writing to a logfile - but it's long
         # and terrible - maybe possible to get it set up in accessoryFunctions so it's a bit less of a pain?
         # Setup a threadlock for later so multiple processes don't all try to write their output at once.
         threadlock = threading.Lock()
         # Get our stdout and stderr strings set up.
         outstr = ''
         errstr = ''
         sample, linkfile = self.epcrqueue.get()
         if not os.path.isfile('{}.famap'.format(linkfile)):
             # Run the subprocess, then get the stdout in outstr and stderr in errstr
             out, err = run_subprocess(sample.commands.famap)
             outstr += out
             errstr += err
         if not os.path.isfile('{}.hash'.format(linkfile)):
             out, err = run_subprocess(sample.commands.fahash)
             outstr += out
             errstr += err
         if not os.path.isfile('{}.txt'.format(linkfile)):
             out, err = run_subprocess(sample.commands.epcr)
             outstr += out
             errstr += err
         # Once processes are finished running, get the threadlock, because now it's output writing time.
         threadlock.acquire()
         # Write stuff to the logfile.
         write_to_logfile(sample.commands.famap, sample.commands.famap, self.logfile)
         write_to_logfile(sample.commands.fahash, sample.commands.fahash, self.logfile)
         write_to_logfile(sample.commands.epcr, sample.commands.epcr, self.logfile)
         write_to_logfile(outstr, errstr, self.logfile)
          # Release the threadlock so that other processes can get on with it.
          threadlock.release()
          self.epcrqueue.task_done()
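The comment at the top of this method suggests moving the run-then-accumulate pattern into accessoryFunctions. One possible helper, sketched under the run_subprocess behaviour assumed earlier (this is not part of the source):

import os


def run_if_missing(command, output_file):
    """Run command only if output_file does not exist; return (stdout, stderr)."""
    if not os.path.isfile(output_file):
        return run_subprocess(command)
    return '', ''


# Each step of epcr() could then collapse to a couple of lines, e.g.:
# out, err = run_if_missing(sample.commands.famap, '{}.famap'.format(linkfile))
# outstr += out
# errstr += err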
Code example #15
 def sistr(self):
     """Perform sistr analyses on Salmonella"""
     printtime('Performing sistr analyses', self.start)
     for sample in self.metadata:
         # Create the analysis-type specific attribute
         setattr(sample, self.analysistype, GenObject())
         if sample.general.bestassemblyfile != 'NA':
             try:
                 # Only process strains that have been determined to be Salmonella
                 if sample.general.referencegenus == 'Salmonella':
                     # Set and create the path of the directory to store the strain-specific reports
                     sample[self.analysistype].reportdir = os.path.join(
                         sample.general.outputdirectory, self.analysistype)
                     # Name of the .json output file
                     sample[self.analysistype].jsonoutput = os.path.join(
                         sample[self.analysistype].reportdir,
                         '{}.json'.format(sample.name))
                     # Set the sistr system call
                     sample.commands.sistr = \
                         'sistr -f json -o {} -t {} -T {} {}'\
                         .format(sample[self.analysistype].jsonoutput,
                                 self.cpus,
                                 os.path.join(sample[self.analysistype].reportdir, 'tmp'),
                                 sample.general.bestassemblyfile)
                      # Set the paths of the log files
                     sample[self.analysistype].logout = os.path.join(
                         sample[self.analysistype].reportdir, 'logout')
                     sample[self.analysistype].logerr = os.path.join(
                         sample[self.analysistype].reportdir, 'logerr')
                     # Only run the analyses if the output json file does not exist
                     if not os.path.isfile(
                             sample[self.analysistype].jsonoutput):
                         out, err = run_subprocess(sample.commands.sistr)
                         write_to_logfile(sample.commands.sistr,
                                          sample.commands.sistr,
                                          self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                         write_to_logfile(out, err, self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                     self.queue.task_done()
             except (ValueError, KeyError):
                 pass
     self.queue.join()
     self.report()
Code example #16
File: skesa.py Project: carden24/OLCTools
 def assemble(self):
     while True:
         sample = self.assemblequeue.get()
         if not os.path.isfile(sample.general.assemblyfile):
             # Run the assembly
             out, err = run_subprocess(sample.commands.assemble)
             self.threadlock.acquire()
             write_to_logfile(sample.commands.assemble,
                              sample.commands.assemble, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             self.threadlock.release()
         self.assemblequeue.task_done()
Code example #17
File: CHAS.py Project: carden24/OLCTools
 def makeblastdb(self, fastapath):
     """
     Makes blast database files from targets as necessary
     """
      # Remove the file extension for easier future globbing
      db = os.path.splitext(fastapath)[0]
      nhr = '{}.nhr'.format(db)  # add .nhr for searching
     if not os.path.isfile(str(nhr)):  # if check for already existing dbs
         # Create the databases
         threadlock = threading.Lock()
         command = 'makeblastdb -in {} -parse_seqids -max_file_sz 2GB -dbtype nucl -out {}'.format(fastapath, db)
         out, err = run_subprocess(command)
         threadlock.acquire()
         write_to_logfile(out, err, self.logfile)
         threadlock.release()
     dotter()
Code example #18
 def run_qaml(self):
     """
     Create and run the GenomeQAML system call
     """
     printtime('Running GenomeQAML quality assessment', self.start)
     qaml_call = 'classify.py -t {tf} -r {rf}'\
         .format(tf=self.qaml_path,
                 rf=self.qaml_report)
     make_path(self.reportpath)
     # Only attempt to assess assemblies if the report doesn't already exist
     if not os.path.isfile(self.qaml_report):
         # Run the system calls
         out, err = run_subprocess(qaml_call)
         # Acquire thread lock, and write the logs to file
         self.threadlock.acquire()
         write_to_logfile(qaml_call, qaml_call, self.logfile)
         write_to_logfile(out, err, self.logfile)
         self.threadlock.release()
Code example #19
 def fastathreads(self):
     while True:
         sample = self.fastaqueue.get()
         # Check to see if the FASTA file already exists
         if not os.path.isfile(sample[self.analysistype].fasta):
              # Run the system call
             out, err = run_subprocess(sample[self.analysistype].fastxcall)
             write_to_logfile(sample[self.analysistype].fastxcall,
                              sample[self.analysistype].fastxcall,
                              self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         self.fastaqueue.task_done()
Code example #20
 def subsamplethreads(self):
     while True:
         sample = self.samplequeue.get()
         # Check to see if the subsampled FASTQ file has already been created
         if not os.path.isfile(sample[self.analysistype].subsampledfastq):
             # Run the system call
             out, err = run_subprocess(sample[self.analysistype].seqtkcall)
             write_to_logfile(sample[self.analysistype].seqtkcall,
                              sample[self.analysistype].seqtkcall,
                              self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         self.samplequeue.task_done()
Code example #21
 def make_db(self):
     """
     Makes the blast database if it isn't present. Doesn't do anything if we already have database files.
     """
      # The database is only present if all of the component files exist
      db_files = ['.nhr', '.nin', '.nsq']
      db_present = all(os.path.isfile(self.database + db_file)
                       for db_file in db_files)
     if not db_present:
         printtime('Making database!', self.start)
         command = 'makeblastdb -dbtype nucl -in ' + self.database
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile, None, None, None,
                          None)
         write_to_logfile(out, err, self.logfile, None, None, None, None)
Code example #22
File: database_setup.py Project: carden24/COWBAT
 def database_download(self, targetcall, databasepath, complete=True):
     """
     Checks to see if the database has already been downloaded. If not, downloads the database, and writes stdout
     and stderr to the logfile
      :param targetcall: system call to download, and possibly set up, the database
     :param databasepath: absolute path of the database
     :param complete: boolean variable to determine whether the complete file should be created
     """
      # Create a file to store the logs; it will be used to determine if the database was downloaded and set up
     completefile = os.path.join(databasepath, 'complete')
     # Run the system call if the database is not already downloaded
     if not os.path.isfile(completefile):
         out, err = run_subprocess(targetcall)
         print(out, err)
         # Write the out and err streams to the master files
         write_to_logfile(out, err, self.logfile, None, None, None, None)
          if complete:
              # Create the database completeness assessment file and populate it with the out and err streams
              with open(completefile, 'w') as complete_log:
                  complete_log.write(out)
                  complete_log.write(err)
Code example #23
 def merge_pairs(self):
     """
     Use bbmerge from the bbmap suite of tools to merge paired-end reads
     """
     printtime('Merging paired reads', self.start)
     for sample in self.metadata:
         # Can only merge paired-end
         if len(sample.general.fastqfiles) == 2:
             # Set the name of the merged, and unmerged files
             sample.general.mergedreads = \
                 os.path.join(sample.general.outputdirectory, '{}_paired.fastq.gz'.format(sample.name))
             sample.general.unmergedforward = \
                 os.path.join(sample.general.outputdirectory, '{}_unpaired_R1.fastq.gz'.format(sample.name))
             sample.general.unmergedreverse = \
                 os.path.join(sample.general.outputdirectory, '{}_unpaired_R2.fastq.gz'.format(sample.name))
             try:
                  # Run the merging command
                 out, err, cmd = bbtools.bbmerge(
                     forward_in=sorted(
                         sample.general.trimmedcorrectedfastqfiles)[0],
                     merged_reads=sample.general.mergedreads,
                     returncmd=True,
                     outu1=sample.general.unmergedforward,
                     outu2=sample.general.unmergedreverse,
                     threads=self.cpus)
                 sample[self.analysistype].bbmergecmd = cmd
                 write_to_logfile(out, err, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr, None, None)
              except (CalledProcessError, IndexError):
                  delattr(sample.general, 'mergedreads')
                  delattr(sample.general, 'unmergedforward')
                  delattr(sample.general, 'unmergedreverse')
         else:
             sample.general.mergedreads = sorted(
                 sample.general.trimmedcorrectedfastqfiles)[0]
Code example #24
 def estimate_genome_size(self):
     """
     Use kmercountexact from the bbmap suite of tools to estimate the size of the genome
     """
     printtime('Estimating genome size using kmercountexact', self.start)
     for sample in self.metadata:
         # Initialise the name of the output file
         sample[self.analysistype].peaksfile = os.path.join(
             sample[self.analysistype].outputdir, 'peaks.txt')
         # Run the kmer counting command
         out, err, cmd = bbtools.kmercountexact(
             forward_in=sorted(sample.general.fastqfiles)[0],
             peaks=sample[self.analysistype].peaksfile,
             returncmd=True,
             threads=self.cpus)
         # Set the command in the object
         sample[self.analysistype].kmercountexactcmd = cmd
         # Extract the genome size from the peaks file
         sample[self.analysistype].genomesize = bbtools.genome_size(
             sample[self.analysistype].peaksfile)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr, None, None)
Code example #25
 def runquast(self):
     while True:
         sample, quastoutputdirectory = self.quastqueue.get()
         make_path(quastoutputdirectory)
         threadlock = threading.Lock()
         # Don't re-perform the analysis if the report file exists
         if not os.path.isfile(
                 '{}/report.tsv'.format(quastoutputdirectory)):
             out, err = run_subprocess(sample.commands.quast)
             threadlock.acquire()
             write_to_logfile(sample.commands.quast, sample.commands.quast,
                              self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             threadlock.release()
         # Following the analysis, parse the report (if it exists) into the metadata object
         if os.path.isfile('{}/report.tsv'.format(quastoutputdirectory)):
             self.metaparse(sample, quastoutputdirectory)
         self.quastqueue.task_done()
Code example #26
File: CHAS.py Project: carden24/OLCTools
 def epcr(self):
     while True:
         sample, linkfile = self.epcrqueue.get()
         # Set the names of the output files
         sample[self.analysistype].famap = '{}.famap'.format(linkfile)
         sample[self.analysistype].hash = '{}.hash'.format(linkfile)
         sample[self.analysistype].output = '{}.txt'.format(linkfile)
         # Initialise a list to store the results
         sample[self.analysistype].epcrresults = list()
          # If the files created by the analyses do not exist, run the necessary system calls
          threadlock = threading.Lock()
          # Get our stdout and stderr strings set up.
          outstr = ''
          errstr = ''
         if not os.path.isfile(sample[self.analysistype].famap):
             # Run the subprocess, then get the stdout in outstr and stderr in errstr
             out, err = run_subprocess(sample.commands.famap)
             outstr += out
             errstr += err
          if not os.path.isfile(sample[self.analysistype].hash):
             out, err = run_subprocess(sample.commands.fahash)
             outstr += out
             errstr += err
         if not os.path.isfile(sample[self.analysistype].output):
             out, err = run_subprocess(sample.commands.epcr)
             outstr += out
             errstr += err
         # Once processes are finished running, get the threadlock, because now it's output writing time.
         threadlock.acquire()
         # Write stuff to the logfile.
         write_to_logfile(sample.commands.famap, sample.commands.famap, self.logfile)
         write_to_logfile(sample.commands.fahash, sample.commands.fahash, self.logfile)
         write_to_logfile(sample.commands.epcr, sample.commands.epcr, self.logfile)
         write_to_logfile(outstr, errstr, self.logfile)
         threadlock.release()
         # Read the results into a list
         with open(sample[self.analysistype].output, 'r') as results:
             for line in results:
                 sample[self.analysistype].epcrresults.append(line.strip())
         self.epcrqueue.task_done()
Code example #27
 def fastqc(self):
     """Run fastqc system calls"""
     while True:  # while daemon
         threadlock = threading.Lock()
         # Unpack the variables from the queue
         (sample, systemcall, outputdir, fastqcreads) = self.qcqueue.get()
         # Check to see if the output HTML file already exists
         try:
             _ = glob(os.path.join(outputdir, '*.html'))[0]
         except IndexError:
             # Make the output directory
             make_path(outputdir)
             # Run the system calls
             outstr = str()
             errstr = str()
             out, err = run_subprocess(systemcall)
             outstr += out
             errstr += err
             out, err = run_subprocess(fastqcreads)
             outstr += out
             errstr += err
             # Acquire thread lock, and write the logs to file
             threadlock.acquire()
             write_to_logfile(systemcall, systemcall, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(fastqcreads, fastqcreads, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(outstr, errstr, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             threadlock.release()
             # Rename the outputs
             try:
                 shutil.move(
                     os.path.join(outputdir, 'stdin_fastqc.html'),
                     os.path.join(outputdir,
                                  '{}_fastqc.html'.format(sample.name)))
                 shutil.move(
                     os.path.join(outputdir, 'stdin_fastqc.zip'),
                     os.path.join(outputdir,
                                  '{}_fastqc.zip'.format(sample.name)))
             except IOError:
                 pass
         # Signal to qcqueue that job is done
         self.qcqueue.task_done()
Code example #28
    def validate_fastq(self):
        """
        Runs reformat.sh on the FASTQ files. If a CalledProcessError arises, do not proceed with the assembly of
        these files
        """
        printtime('Validating FASTQ files', self.start)
        validated_reads = list()
        for sample in self.metadata:
            # Tiny files can pass the validation tests - ensure that they don't
            size = os.path.getsize(sample.general.fastqfiles[0])
            if size >= 1000000:
                # Try to run reformat.sh on the reads - on any errors try to run repair.sh
                try:
                    out, err, cmd = bbtools.validate_reads(
                        forward_in=sample.general.fastqfiles[0],
                        returncmd=True)
                    write_to_logfile(out, err, self.logfile,
                                     sample.general.logout,
                                     sample.general.logerr, None, None)
                    # Add the sample to the list of samples with FASTQ files that pass this validation step
                    validated_reads.append(sample)
                except CalledProcessError:
                    # Set the file names for the reformatted and repaired files
                    outputfile1 = os.path.join(
                        sample.general.outputdirectory,
                        '{}_reformatted_R1.fastq.gz'.format(sample.name))
                    repair_file1 = os.path.join(
                        sample.general.outputdirectory,
                        '{}_repaired_R1.fastq.gz'.format(sample.name))
                    if len(sample.general.fastqfiles) == 2:
                        outputfile2 = os.path.join(
                            sample.general.outputdirectory,
                            '{}_reformatted_R2.fastq.gz'.format(sample.name))
                        repair_file2 = os.path.join(
                            sample.general.outputdirectory,
                            '{}_repaired_R2.fastq.gz'.format(sample.name))
                    else:
                        outputfile2 = str()
                        repair_file2 = str()
                    # Try to use reformat.sh to repair the reads - if this fails, discard the sample from the analyses
                    try:
                        printtime(
                            'Errors detected in FASTQ files for sample {sample}. Please check the following files'
                            ' for details {log} {logout} {logerr}. Using reformat.sh to attempt to repair issues'
                            .format(sample=sample.name,
                                    log=self.logfile,
                                    logout=sample.general.logout,
                                    logerr=sample.general.logerr), self.start)
                        if not os.path.isfile(outputfile1):
                            # Run reformat.sh
                            out, err, cmd = bbtools.reformat_reads(
                                forward_in=sample.general.fastqfiles[0],
                                forward_out=outputfile1,
                                returncmd=True)
                            write_to_logfile(out, err, self.logfile,
                                             sample.general.logout,
                                             sample.general.logerr, None, None)
                            # Run repair.sh (if necessary)
                            if outputfile2:
                                out, err, cmd = bbtools.repair_reads(
                                    forward_in=outputfile1,
                                    forward_out=repair_file1,
                                    returncmd=True)
                                write_to_logfile(out, err, self.logfile,
                                                 sample.general.logout,
                                                 sample.general.logerr, None,
                                                 None)
                        # Ensure that the output file(s) exist before declaring this a success
                        if os.path.isfile(outputfile1):
                            # Update the fastqfiles attribute to point to the repaired files
                            sample.general.fastqfiles = [
                                repair_file1, repair_file2
                            ] if repair_file2 else [outputfile1]
                            # Add the sample object to the list of samples passing the FASTQ validation step
                            validated_reads.append(sample)
                    except CalledProcessError:
                        # The file(s) can be created even if there is STDERR from reformat.sh
                        if os.path.isfile(outputfile1) and outputfile2:
                            try:
                                out, err, cmd = bbtools.repair_reads(
                                    forward_in=outputfile1,
                                    forward_out=repair_file1,
                                    returncmd=True)
                                write_to_logfile(out, err, self.logfile,
                                                 sample.general.logout,
                                                 sample.general.logerr, None,
                                                 None)
                                # Update the fastqfiles attribute to point to the repaired files
                                sample.general.fastqfiles = [repair_file1, repair_file2] if repair_file2 else \
                                    [repair_file1]
                                # Add the sample object to the list of samples passing the FASTQ validation step
                                validated_reads.append(sample)
                            except CalledProcessError:
                                # Write in the logs that there was an error detected in the FASTQ files
                                write_to_logfile(
                                    'An error was detected in the FASTQ files for sample {}. '
                                    'These files will not be processed further'
                                    .format(sample.name),
                                    'An error was detected in the FASTQ files for sample {}. '
                                    'These files will not be processed further'
                                    .format(sample.name), self.logfile,
                                    sample.general.logout,
                                    sample.general.logerr, None, None)
                                # Update metadata objects with error
                                self.error(sample, 'fastq_error')
                        else:
                            # Write in the logs that there was an error detected in the FASTQ files
                            write_to_logfile(
                                'An error was detected in the FASTQ files for sample {}. '
                                'These files will not be processed further'.
                                format(sample.name),
                                'An error was detected in the FASTQ files for sample {}. '
                                'These files will not be processed further'.
                                format(sample.name), self.logfile,
                                sample.general.logout, sample.general.logerr,
                                None, None)

                            # Update metadata objects with error
                            self.error(sample, 'fastq_error')
            else:
                # Update metadata objects with error
                self.error(sample, 'files_too_small')
        # Print the metadata to file
        metadataprinter.MetadataPrinter(self)
        # Overwrite self.metadata with objects that do not fail the validation
        self.metadata = validated_reads
Code example #29
 def createfastq(self):
     """Uses bcl2fastq to create .fastq files from a MiSeqRun"""
     # Initialise samplecount
     samplecount = 0
     # If the fastq destination folder is not provided, make the default value of :path/:miseqfoldername
     self.fastqdestination = self.fastqdestination if self.fastqdestination else self.path + self.miseqfoldername
     # Make the path
     make_path(self.fastqdestination)
     # Initialise variables for storing index information
     index = ''
     indexlength = int()
     # bcl2fastq requires an older version of the sample sheet, this recreates the required version
     # Create the new sample sheet
     with open('{}/SampleSheet_modified.csv'.format(self.fastqdestination),
               "w") as modifiedsamplesheet:
         # Write the required headings to the file
         modifiedsamplesheet.write(
             "FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject\n"
         )
         for strain in self.samples:
             # Create a combined index of index1-index2
             try:
                 strain.run.modifiedindex = '{}-{}'.format(
                     strain.run.index, strain.run.index2)
                 indexlength = 16
                 index = 'I8,I8'
             except KeyError:
                 strain.run.modifiedindex = strain.run.index
                 indexlength = 6
                 index = 'I6'
             # The list of items to print to each line of the modified sample sheet
             printlist = [
                 self.flowcell, '1', strain.name,
                 str(strain.run.SampleNumber), strain.run.modifiedindex,
                 strain.run.Description, 'N', 'NA',
                 strain.run.InvestigatorName, self.projectname
             ]
             modifiedsamplesheet.write('{}\n'.format(",".join(printlist)))
             samplecount += 1
     # Set :forward/reverse length to :header.forward/reverse length if the argument is not provided, or it's 'full',
      # otherwise use the supplied argument
     self.forwardlength = self.metadata.header.forwardlength if self.forwardlength.lower()\
         == 'full' else self.forwardlength
     # Set :reverselength to :header.reverselength
     self.reverselength = self.metadata.header.reverselength if self.reverselength.lower() \
         == 'full' else self.reverselength
      # The number of cycles required is the number of forward reads + the index (8) + the second index (8)
      # Also set the basemask variable as required
     if self.reverselength != '0':
         self.readsneeded = int(self.forwardlength) + int(
             self.reverselength) + indexlength
         basemask = "Y{}n*,{},Y{}n*".format(self.forwardlength, index,
                                            self.reverselength)
         nohup = "nohup make -j 16 > nohup.out"
     else:
         self.readsneeded = int(self.forwardlength) + indexlength
         basemask = "Y{}n*,{},n*".format(self.forwardlength, index)
         nohup = "nohup make -j 16 r1 > nohup.out"
     # Handle plurality appropriately
     samples = 'samples' if samplecount > 1 else 'sample'
     number = 'are' if samplecount > 1 else 'is'
     printtime(
         'There {} {} {} in this run. '
          'Running the fastq creation module with the following parameters:\n'
         'MiSeqPath: {},\n'
         'MiSeqFolder: {},\n'
         'Fastq destination: {},\n'
         'SampleSheet: {}'.format(
             number, samplecount, samples, self.miseqpath, self.miseqfolder,
             self.fastqdestination,
             '{}/SampleSheet_modified.csv'.format(self.fastqdestination)),
         self.start)
     # Count the number of completed cycles in the run of interest
     cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(
         self.miseqfolder))
     while len(cycles) < self.readsneeded:
         printtime(
             'Currently at {} cycles. Waiting until the MiSeq reaches cycle {}'
             .format(len(cycles), self.readsneeded), self.start)
         sleep(1800)
         cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(
             self.miseqfolder))
      # configureBclToFastq.pl requires :self.miseqfolder/Data/Intensities/BaseCalls/config.xml in order to work
     # When you download runs from BaseSpace, this file is not provided. There is an empty config.xml file that
     # can be populated with run-specific values and moved to the appropriate folder
     if not os.path.isfile('{}Data/Intensities/BaseCalls/config.xml'.format(
             self.miseqfolder)):
         self.configfilepopulator()
     # Define the bcl2fastq system call
     bclcall = "configureBclToFastq.pl --input-dir {}Data/Intensities/BaseCalls " \
               "--output-dir {} --force --sample-sheet {}/SampleSheet_modified.csv " \
               "--mismatches 1 --no-eamss --fastq-cluster-count 0 --compression none --use-bases-mask {}"\
         .format(self.miseqfolder, self.fastqdestination, self.fastqdestination, basemask)
     # Define the nohup system call
     nohupcall = "cd {} && {}".format(self.fastqdestination, nohup)
     if not os.path.isdir("{}/Project_{}".format(self.fastqdestination,
                                                 self.projectname)):
         # Call configureBclToFastq.pl
         printtime('Running bcl2fastq', self.start)
         # Run the commands
         threadlock = threading.Lock()
         outstr = ''
         outerr = ''
         out, err = run_subprocess(bclcall)
         outstr += out
          outerr += err
         out, err = run_subprocess(nohupcall)
         outstr += out
          outerr += err
         threadlock.acquire()
         write_to_logfile(bclcall, bclcall, self.logfile)
         write_to_logfile(nohupcall, nohupcall, self.logfile)
         write_to_logfile(outstr, outerr, self.logfile)
         threadlock.release()
     # Populate the metadata
     for sample in self.metadata.samples:
         sample.commands = GenObject()
         sample.commands.nohup = nohupcall
         sample.commands.bcl = bclcall
         sample.run.forwardlength = self.forwardlength
         sample.run.reverselength = self.reverselength
     # Copy the fastq files to a central folder so they can be processed
     self.fastqmover()
Code example #30
File: skesa.py Project: carden24/OLCTools
 def merge(self, sample):
     """
     Use bbmerge to merge paired FASTQ files for use in metagenomics pipelines. Create a report with the
     total number of reads, and the number of reads that could be paired
     :param sample: metadata sample object flagged as a metagenome
     """
     # Set the assembly file to 'NA' as assembly is not desirable for metagenomes
     sample.general.assemblyfile = 'NA'
     # Can only merge paired-end
     if len(sample.general.fastqfiles) == 2:
         outpath = os.path.join(sample.general.outputdirectory,
                                'merged_reads')
         make_path(outpath)
         # Merge path - keep all the merged FASTQ files in one directory
         merge_path = os.path.join(self.path, 'merged_reads')
         make_path(merge_path)
         # Set the name of the merged, and unmerged files
         sample.general.mergedreads = \
             os.path.join(merge_path, '{}_paired.fastq.gz'.format(sample.name))
         log = os.path.join(outpath, 'log')
         error = os.path.join(outpath, 'err')
         try:
             if not os.path.isfile(sample.general.mergedreads):
                 # Run the merging command
                 out, err, cmd = bbtools.bbmerge(
                     forward_in=sorted(
                         sample.general.trimmedcorrectedfastqfiles)[0],
                     merged_reads=sample.general.mergedreads,
                     mix=True,
                     returncmd=True,
                     threads=self.cpus)
                 write_to_logfile(out, err, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr, None, None)
                 with open(log, 'w') as log_file:
                     log_file.write(out)
                 with open(error, 'w') as error_file:
                     error_file.write(err)
         except (CalledProcessError, IndexError):
             delattr(sample.general, 'mergedreads')
         # Set the name of the report to store the metagenome file merging results
         report = os.path.join(self.reportpath, 'merged_metagenomes.csv')
         # Extract the total number of reads, and the number of reads that could be paired from the bbmerge
         # err stream
         num_reads, num_pairs = self.reads(error)
         # If the report doesn't exist, create it with the header and the results from the first sample
         if not os.path.isfile(report):
             with open(report, 'w') as report_file:
                 report_file.write(
                     'Sample,TotalReads,PairedReads\n{sample},{total},{paired}\n'
                     .format(sample=sample.name,
                             total=num_reads,
                             paired=num_pairs))
         # If the report exists, open it to determine which samples have already been added - useful if re-running
         # the analysis
         else:
             lines = list()
             with open(report, 'r') as report_file:
                 for line in report_file:
                     lines.append(line.split(',')[0])
             # Add the results to the report
             if sample.name not in lines:
                 with open(report, 'a+') as report_file:
                     report_file.write('{sample},{total},{paired}\n'.format(
                         sample=sample.name,
                         total=num_reads,
                         paired=num_pairs))