Example #1
 def __init__(self, args):
     """
     Initialises the variables required for this class
     :param args: list of arguments passed to the script
     """
     printtime('Welcome to the CFIA de novo bacterial assembly pipeline {}'
               .format(args.commit.decode('utf-8')),
               args.startingtime, '\033[1;94m')
     # Define variables from the arguments - there may be a more streamlined way to do this
     self.args = args
     self.path = os.path.join(args.sequencepath)
     self.reffilepath = os.path.join(args.referencefilepath)
     self.numreads = args.numreads
     self.preprocess = args.preprocess
     # Define the start time
     self.starttime = args.startingtime
     self.customsamplesheet = args.customsamplesheet
     if self.customsamplesheet:
         assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {}'\
             .format(self.customsamplesheet)
     self.basicassembly = args.basicassembly
     if not self.customsamplesheet and not os.path.isfile(
             os.path.join(self.path, 'SampleSheet.csv')):
         self.basicassembly = True
         printtime(
             'Could not find a sample sheet. Performing basic assembly (no run metadata captured)',
             self.starttime)
     # Use the argument for the number of threads to use, or default to the number of cpus in the system
     self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
     # Assertions to ensure that the provided variables are valid
     make_path(self.path)
     assert os.path.isdir(self.path), \
         'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
     self.reportpath = os.path.join(self.path, 'reports')
     assert os.path.isdir(self.reffilepath), 'Reference file path is not a valid directory {0!r:s}'\
         .format(self.reffilepath)
     self.commit = args.commit.decode('utf-8')
     self.homepath = args.homepath
     self.logfile = os.path.join(self.path, 'logfile')
     self.runinfo = str()
     self.pipeline = True
     self.qualityobject = MetadataObject()
     # Initialise the metadata object
     self.runmetadata = MetadataObject()
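Note: every example on this page leans on MetadataObject and GenObject from the OLC accessoryFunctions module. The snippets assume only dot-notation storage of arbitrary, nestable attributes; a minimal stand-in for that behaviour (illustrative only, not the real implementation) looks roughly like this:

class GenObject(object):
    """Dot-notation wrapper around a backing dictionary (datastore)"""
    def __init__(self, x=None):
        # Seed the datastore from an optional dictionary
        object.__setattr__(self, 'datastore', x if x else dict())

    def __getattr__(self, key):
        # Called only when normal attribute lookup fails
        try:
            return self.datastore[key]
        except KeyError:
            raise AttributeError(key)

    def __setattr__(self, key, value):
        self.datastore[key] = value


class MetadataObject(GenObject):
    # Top-level container; categories such as .general and .run are
    # attached at runtime as nested GenObjects
    pass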
Example #2
def test_sistr(variables):
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    # Set the destination folder
    outputdir = os.path.join(variables.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.run.outputdirectory = outputdir
    metadata.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
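Note: `method` and `variable_update()` are module-level helpers defined elsewhere in the test suite, and `variables` is supplied by a pytest fixture. A hypothetical sketch of such a fixture, providing only the sequencepath attribute this test reads:

import os

import pytest


@pytest.fixture()
def variables():
    # Hypothetical stand-in: the real fixture assembles the full pipeline
    # argument object; test_sistr only needs sequencepath
    class Variables(object):
        sequencepath = os.path.join('tests', 'testdata')
    return Variables()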
Example #3
 def __init__(self, inputobject):
     self.starttime = inputobject.starttime
     try:
         self.samples = inputobject.samples
     except AttributeError:
         self.samples = inputobject.runmetadata.samples
     try:
         self.completemetadata = inputobject.completemetadata
     except AttributeError:
         self.completemetadata = inputobject.runmetadata.samples
     self.path = inputobject.path
     try:
         self.analysescomplete = inputobject.analysescomplete
     except AttributeError:
         self.analysescomplete = True
     self.reportpath = inputobject.reportpath
     self.runmetadata = MetadataObject()
     try:
         self.runmetadata.samples = inputobject.runmetadata.samples
     except AttributeError:
         self.runmetadata.samples = inputobject.runmetadata
     try:
         self.portallog = inputobject.portallog
     except AttributeError:
         self.portallog = ''
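Note: the stacked try/except AttributeError blocks make each attribute of `inputobject` optional. For constant fallbacks, the same effect can be written more compactly with getattr, as in this behaviour-preserving sketch; the try/except form is still preferable when the fallback itself reads another attribute (as with self.samples above), because getattr evaluates its default eagerly:

class Example(object):
    def __init__(self, inputobject):
        # Equivalent to try/except AttributeError with a constant fallback
        self.portallog = getattr(inputobject, 'portallog', '')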
Example #4
 def __init__(self, inputobject):
     self.path = inputobject.path
     self.starttime = inputobject.starttime
     self.sequencepath = inputobject.sequencepath
     try:
         self.customsamplesheet = inputobject.customsamplesheet
         self.bcltofastq = inputobject.bcltofastq
         self.miseqpath = inputobject.miseqpath
         self.miseqfolder = inputobject.miseqfolder
         self.fastqdestination = inputobject.fastqdestination
         self.forwardlength = inputobject.forwardlength
         self.reverselength = inputobject.reverselength
         self.numreads = 2 if self.reverselength != 0 else 1
         self.homepath = inputobject.homepath
         self.commit = inputobject.commit
         self.copy = inputobject.copy
     except AttributeError:
         self.bcltofastq = False
     try:
         self.debug = inputobject.debug
     except AttributeError:
         self.debug = False
     try:
         self.portallog = inputobject.portallog
     except AttributeError:
         self.portallog = ''
     self.samples = MetadataObject()
     self.forward = str()
     self.reverse = str()
     self.index = str()
     self.header = dict()
     self.run = dict()
Example #5
 def basic(self):
     # Grab any .fastq files in the path
     fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
     # Extract the base name of the globbed name + path provided
     fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
     # Iterate through the names of the fastq files
     for fastqname in sorted(fastqnames):
         # Set the name
         metadata = MetadataObject()
         metadata.name = fastqname
         # Set the destination folder
         outputdir = os.path.join(self.path, fastqname)
         # Make the destination folder
         make_path(outputdir)
         # Get the fastq files specific to the fastqname
         specificfastq = glob(
             os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
         # Link the files to the output folder
         try:
             # Link the .gz files to :self.path/:filename
             list(map(
                 lambda x: os.symlink(
                     '../{}'.format(os.path.basename(x)),
                     '{}/{}'.format(outputdir, os.path.basename(x))),
                 specificfastq))
         # Except os errors
         except OSError as exception:
             # If there is an exception other than the file exists, raise it
             if exception.errno != errno.EEXIST:
                 raise
         # Initialise the general and run categories
         metadata.general = GenObject()
         metadata.run = GenObject()
         # Populate the .fastqfiles category of :self.metadata
         metadata.general.fastqfiles = [
             fastq for fastq in sorted(
                 glob(
                     os.path.join(outputdir, '{}*.fastq*'.format(
                         metadata.name)))) if 'trimmed' not in fastq
             and 'normalised' not in fastq and 'corrected' not in fastq
             and 'paired' not in fastq and 'unpaired' not in fastq
         ]
         # Add the output directory to the metadata
         metadata.general.outputdirectory = outputdir
         metadata.general.logout = os.path.join(
             self.path, metadata.name,
             '{}_log_out.txt'.format(metadata.name))
         metadata.general.logerr = os.path.join(
             self.path, metadata.name,
             '{}_log_err.txt'.format(metadata.name))
         # Append the metadata to the list of samples
         self.samples.append(metadata)
     # Grab metadata from previous runs
     previousmetadata = metadataReader.MetadataReader(self)
     # Update self.samples (if required)
     if previousmetadata.samples:
         self.samples = previousmetadata.samples
     # Run the read length method
     self.readlength()
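Note: `filer` and `make_path` come from accessoryFunctions; as used here, `filer` appears to collapse paired .fastq file names to their common base names, and `make_path` behaves like mkdir -p (both descriptions inferred from usage). The relative-symlink idiom in the try block can be factored into a small helper, sketched here for clarity (illustrative, not part of the original module):

import errno
import os


def relative_symlink(src, destination_dir):
    # Link src into destination_dir using a relative target, so the link
    # survives if the parent directory tree is moved
    try:
        os.symlink(os.path.join('..', os.path.basename(src)),
                   os.path.join(destination_dir, os.path.basename(src)))
    except OSError as exception:
        # Only ignore 'file already exists' errors
        if exception.errno != errno.EEXIST:
            raise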
Example #6
 def __init__(self, args, pipelinecommit, startingtime, scriptpath):
     """
     :param args: command line arguments
     :param pipelinecommit: pipeline commit or version
     :param startingtime: time the script was started
     :param scriptpath: home path of the script
     """
     # Initialise variables
     self.commit = str(pipelinecommit)
     self.starttime = startingtime
     self.homepath = scriptpath
     self.args = args
     # Define variables based on supplied arguments
     self.path = os.path.join(args.path, '')
     assert os.path.isdir(self.path), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     self.sequencepath = os.path.join(args.sequencepath, '')
     self.seqpath = self.sequencepath
     self.targetpath = os.path.join(args.targetpath, '')
     # ref file path is used to work with submodule code with a different naming scheme
     self.reffilepath = self.targetpath
     self.reportpath = os.path.join(self.path, 'reports')
     make_path(self.reportpath)
     assert os.path.isdir(self.targetpath), u'Target path is not a valid directory {0!r:s}' \
         .format(self.targetpath)
     self.bcltofastq = args.bcl2fastq
     self.miseqpath = args.miseqpath
     self.miseqfolder = args.miseqfolder
     self.fastqdestination = args.destinationfastq
     self.forwardlength = args.readlengthforward
     self.reverselength = args.readlengthreverse
     self.numreads = 2 if self.reverselength != 0 else 1
     self.customsamplesheet = args.customsamplesheet
     # Set the custom cutoff value
     self.cutoff = args.customcutoffs
     # Use the argument for the number of threads to use, or default to the number of cpus in the system
     self.cpus = int(args.numthreads if args.numthreads else multiprocessing.cpu_count())
     self.threads = int()
     self.runmetadata = MetadataObject()
     self.taxonomy = {'Escherichia': 'coli', 'Listeria': 'monocytogenes', 'Salmonella': 'enterica'}
     self.analysistype = 'GeneSippr'
     self.copy = args.copy
     self.pipeline = False
     self.forward = str()
     self.reverse = str()
     self.index = str()
     self.header = dict()
     self.rundata = dict()
     self.completed = list()
     self.incomplete = list()
     self.analysescomplete = False
     self.final = False
     self.sum = int()
     self.completemetadata = list()
     self.samplesheetpath = str()
     self.samples = list()
     self.logfile = os.path.join(self.path, 'log')
     self.reports = str()
     # Run the method
     self.main()
Example #7
 def createobject(self):
     # Grab any .fastq files in the path
     fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
     # Extract the base name of the globbed name + path provided
     fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
     # Iterate through the names of the fastq files
     for fastqname in sorted(fastqnames):
         # Set the name
         metadata = MetadataObject()
         metadata.name = fastqname
         # Set the destination folder
         outputdir = os.path.join(self.path, fastqname)
         # Make the destination folder
         make_path(outputdir)
         # Get the fastq files specific to the fastqname
         specificfastq = glob(
             os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
         # Make relative symlinks to the files in :self.path
         try:
             for fastq in specificfastq:
                 # Get the basename of the file
                 fastqfile = os.path.split(fastq)[-1]
                 # Set the destination fastq path as the base name plus the destination folder
                 destinationfastq = os.path.join(outputdir, fastqfile)
                 # Symlink the files
                 os.symlink('../{}'.format(fastqfile), destinationfastq)
         # Except os errors
         except OSError as exception:
             # If there is an exception other than the file exists, raise it
             if exception.errno != errno.EEXIST:
                 raise
         # Initialise the general and run categories
         metadata.general = GenObject()
         metadata.run = GenObject()
         # Populate the .fastqfiles category of :self.metadata
         metadata.general.fastqfiles = [
             fastq for fastq in glob(
                 os.path.join(outputdir, '{}*.fastq*'.format(fastqname)))
             if 'trimmed' not in fastq
         ]
         # Add the output directory to the metadata
         metadata.general.outputdirectory = outputdir
         metadata.run.outputdirectory = outputdir
         metadata.general.bestassemblyfile = True
         metadata.general.trimmedcorrectedfastqfiles = metadata.general.fastqfiles
         metadata.general.logout = os.path.join(
             metadata.general.outputdirectory, 'logout')
         metadata.general.logerr = os.path.join(
             metadata.general.outputdirectory, 'logerr')
         # Initialise an attribute to store commands
         metadata.commands = GenObject()
         # Append the metadata to the list of samples
         self.samples.append(metadata)
Example #8
 def alleleretriever(self):
     """
     Retrieve the required alleles from a file of all alleles, and create organism-specific allele files
     """
     logging.info('Retrieving alleles')
     # Index all the records in the allele file
     logging.info('Loading rMLST records')
     recorddict = SeqIO.index(self.allelefile, 'fasta')
     logging.info('Creating allele output files')
     # Create the organism-specific files of alleles
     for organism in sorted(self.alleledict):
         # Make an object to store information for each strain
         metadata = MetadataObject()
         metadata.organism = organism
         metadata.path = self.path
         metadata.outpath = os.path.join(self.path, 'outputalleles',
                                         organism, '')
         # Delete and recreate the output path - as the files are appended to each time, they will be too large if
         # this script is run more than once
         try:
             shutil.rmtree(metadata.outpath)
         except OSError:
             pass
         make_path(metadata.outpath)
         metadata.combined = os.path.join(metadata.outpath,
                                          'gdcs_alleles.fasta')
         metadata.allelefiles = list()
         with open(metadata.combined, 'w') as combined:
             for gene, alleles in sorted(self.alleledict[organism].items()):
                 # Open the file to append
                 allelefiles = os.path.join(metadata.outpath,
                                            '{}.tfa'.format(gene))
                 metadata.allelefiles.append(allelefiles)
                 with open(allelefiles, 'a') as allelefile:
                     # Write each allele record to the file
                     for allele in sorted(alleles):
                         # Skip adding alleles that are no longer in the database
                         try:
                             SeqIO.write(
                                 recorddict['{}_{}'.format(gene, allele)],
                                 allelefile, 'fasta')
                             SeqIO.write(
                                 recorddict['{}_{}'.format(gene, allele)],
                                 combined, 'fasta')
                         except KeyError:
                             pass
         # Add the populated metadata to the list
         self.samples.append(metadata)
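Note: Bio.SeqIO.index builds a dictionary-like view of the FASTA file without loading every record into memory, and the `recorddict['{}_{}'.format(gene, allele)]` lookups assume the records are named <gene>_<allele>. A small sketch of that access pattern (the file name and key below are hypothetical):

from Bio import SeqIO

# Lazily index the combined allele file; keys are the FASTA record IDs
recorddict = SeqIO.index('rMLST_combined.fasta', 'fasta')
# Records are assumed to follow the <gene>_<allele> naming convention
record = recorddict['BACT000001_2']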
Example #9
 def reader(self):
     import os
     import json
     from accessoryFunctions.accessoryFunctions import GenObject, MetadataObject
     for sample in self.metadata:
         metadatafile = '{}{}/{}_metadata.json'.format(
             self.path, sample.name, sample.name)
         if os.path.isfile(metadatafile):
             size = os.stat(metadatafile).st_size
             if size != 0:
                 try:
                     with open(metadatafile) as metadatareport:
                         jsondata = json.load(metadatareport)
                     # Create the metadata objects
                     metadata = MetadataObject()
                     # Initialise the metadata categories as GenObjects created using the appropriate key
                     for attr in jsondata:
                         if not isinstance(jsondata[attr], dict):
                             setattr(metadata, attr, jsondata[attr])
                         else:
                             setattr(metadata, attr,
                                     GenObject(jsondata[attr]))
                     # As files often need to be reanalysed after being moved, test to see if it is possible to
                     # use the metadata from the previous assembly
                     jsonfile = '{}/{}_metadata.json'.format(
                         metadata.general.outputdirectory, sample.name)
                     try:
                         # Open the metadata file to write; use 'w' rather than 'wb', since json.dump writes
                         # text in Python 3
                         with open(jsonfile, 'w') as metadatafile:
                             # Write the JSON dump of the sample metadata to the file
                             json.dump(sample.dump(),
                                       metadatafile,
                                       sort_keys=True,
                                       indent=4,
                                       separators=(',', ': '))
                         # Set the name
                         metadata.name = sample.name
                         self.samples.append(metadata)
                     except IOError:
                         self.samples.append(sample)
                 except ValueError:
                     self.samples.append(sample)
         else:
             self.samples.append(sample)
Example #10
 def rmlst(self):
     """
     Get the most up-to-date profiles and alleles from pubmlst. Note that you will need the necessary access token
     and secret for this to work
     """
     printtime('Downloading rMLST database', self.start)
     # Set the name of the file to be used to determine if the database download and set-up was successful
     completefile = os.path.join(self.databasepath, 'rMLST', 'complete')
     if not os.path.isfile(completefile):
         # Create an object to send to the rMLST download script
         args = MetadataObject()
         # Add the path and start time attributes
         args.path = self.databasepath
         args.start = self.start
         # Run the rMLST download
         get_rmlst.Get(args)
         # Create and populate the complete.txt file
         with open(completefile, 'w') as complete:
             complete.write('\n'.join(glob(os.path.join(self.databasepath, 'rMLST', '*'))))
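Note: the 'complete' file is a marker that makes the download idempotent: if a previous run finished, the entire block is skipped on subsequent runs. The same guard idiom written generically (illustrative only):

import os


def run_once(completefile, task):
    # Skip the task when a previous run left its completion marker behind
    if not os.path.isfile(completefile):
        task()
        with open(completefile, 'w') as complete:
            complete.write('complete\n')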
Example #11
 def __init__(self):
     from argparse import ArgumentParser
     from time import time
     # Parser for arguments
     parser = ArgumentParser(
         description='Performs ePCR using a supplied primer file. The primers must be in the format: '
                     '<name>\t<forward primer>\t<reverse primer>\t<max size allowed between primers>\n.'
                     'Sequence files must be stored in <path>/sequences'
     )
     parser.add_argument('path',
                         help='Specify path in which reports are to be stored')
     parser.add_argument('-s', '--sequencepath',
                         required=True,
                         help='Path to assembly files')
     parser.add_argument('-f', '--primerfile',
                         required=True,
                         help='The name and path of the file containing the primers')
     # Get the arguments into an object
     arguments = parser.parse_args()
     self.starttime = time()
     # Add trailing slashes to the path variables to ensure consistent formatting (os.path.join)
     self.path = os.path.join(arguments.path, '')
     self.sequencepath = os.path.join(arguments.sequencepath, '')
     self.primerfile = arguments.primerfile
     # Initialise variables
     self.runmetadata = MetadataObject()
     self.reffilepath = False
     self.analysistype = 'ePCR'
     self.reportpath = os.path.join(self.path, 'reports')
     make_path(self.reportpath)
     # Initialise metadata
     self.runmetadata.samples = self.setup()
     self.logfile = os.path.join(self.path, 'vtyper_logfile.txt')
     # Run the analyses
     Vtyper(self, self.analysistype)
     # Create a report
     self.reporter()
     # Print the metadata to file
     printtime('Printing metadata to file', self.starttime)
     metadataprinter.MetadataPrinter(self)
     # Print a bold, green exit statement
     print(u'\033[92m' + u'\033[1m' + u'\nElapsed Time: %0.2f seconds' % (time() - self.starttime) + u'\033[0m')
Example #12
 def __init__(self, inputobject, extension='fasta', light=False):
     # Create an object to mimic the command line arguments necessary for the script
     args = MetadataObject()
     args.path = inputobject.path
     args.sequencepath = inputobject.path
     args.databasepath = os.path.join(inputobject.reffilepath, 'clark')
     make_path(args.databasepath)
     args.clarkpath = os.path.dirname(which('CLARK'))
     args.clarkpath += '/../opt/clark/'
     args.cutoff = 0.005
     args.database = 'bacteria'
     args.rank = 'species'
     args.filter = False
     args.threads = inputobject.cpus
     args.runmetadata = inputobject.runmetadata
     args.clean_seqs = False
     args.reffilepath = inputobject.reffilepath
     args.runmetadata.extension = extension
     args.light = light
     # Run CLARK
     CLARK(args, inputobject.commit, inputobject.starttime,
           inputobject.homepath)
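Note: the MetadataObject here stands in for the argparse namespace that the CLARK wrapper normally receives from its own command line, letting one pipeline drive another programmatically. The standard library offers the same trick via argparse.Namespace (sketch; the attribute values are placeholders):

from argparse import Namespace

# Build a fake command-line namespace instead of parsing sys.argv
args = Namespace(path='/analysis', database='bacteria', rank='species',
                 cutoff=0.005, filter=False, light=False)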
Example #13
 def setup(self):
     """
     Set up the metadata object to be passed to Vtyper()
     """
     from glob import glob
     files = sorted(glob('{}*.fasta'.format(self.sequencepath)))
     samples = list()
     # Create the metadata for each file
     for fasta in files:
         # Create a metadata object to store all metadata associated with each strain
         metadata = MetadataObject()
         metadata.general = GenObject()
         metadata.commands = GenObject()
         # Set the name
         metadata.name = os.path.basename(fasta).split('.')[0]
         metadata.general.bestassemblyfile = fasta
         metadata.general.stx = True
         metadata.general.outputdirectory = self.path
         metadata.general.filenoext = fasta.split('.')[0]
         metadata.general.fastqfiles = list()
         samples.append(metadata)
     return samples
Example #14
 def __init__(self, inputobject):
     self.start = inputobject.starttime
     self.commit = inputobject.commit
     self.starttime = inputobject.starttime
     self.homepath = inputobject.homepath
     self.path = inputobject.path
     self.cpus = inputobject.cpus
     self.metadata = inputobject.runmetadata.samples
     self.runmetadata = MetadataObject()
     self.runmetadata.samples = list()
     self.reffilepath = inputobject.reffilepath
     self.reportpath = inputobject.reportpath
     self.logfile = inputobject.logfile
     self.analysistype = 'coregenome'
     self.cutoff = 90
     self.coregenomes = list()
     # Fields used for custom outfmt 6 BLAST output:
     self.fieldnames = ['query_id', 'subject_id', 'positives', 'mismatches', 'gaps',
                        'evalue', 'bit_score', 'subject_length', 'alignment_length',
                        'query_start', 'query_end', 'query_sequence',
                        'subject_start', 'subject_end', 'subject_sequence']
     # Run the analyses
     self.annotatedcore()
Example #15
 def mlst(self, genera={'Escherichia', 'Vibrio', 'Campylobacter', 'Listeria', 'Bacillus', 'Staphylococcus',
                        'Salmonella'}):
     """
     Download the necessary up-to-date MLST profiles and alleles
     """
     printtime('Downloading MLST databases', self.start)
     for genus in genera:
         # Create an object to pass to the get_mlst script
         args = MetadataObject()
         # Populate the object with the necessary attributes
         args.species = genus
         args.repository_url = 'http://pubmlst.org/data/dbases.xml'
         args.force_scheme_name = False
         args.path = os.path.join(self.databasepath, 'MLST', genus)
         # Create the name of the file to be used to determine if the database download and setup was successful
         completefile = os.path.join(args.path, 'complete')
         # Only download the files if the download was not previously successful
         if not os.path.isfile(completefile):
             # Run the download
             get_mlst.main(args)
             # Create and populate the complete.txt file
             with open(completefile, 'w') as complete:
                 complete.write('\n'.join(glob(os.path.join(args.path, '*'))))
Example #16
 def methodreporter(self):
     """
     Create final reports collating results from all the individual iterations through the method pipeline
     """
     # Ensure that the analyses are set to complete
     self.analysescomplete = True
     # Reset the report path to original value
     self.reportpath = os.path.join(self.path, 'reports')
     # Clear the runmetadata - it will be populated with all the metadata from completemetadata
     self.runmetadata = MetadataObject()
     self.runmetadata.samples = list()
     # As the samples were entered into self.completemetadata depending on when they passed the quality threshold,
     # this list is not ordered numerically/alphabetically like the original runmetadata. Reset the order.
     for strain in self.samples:
         for sample in self.completemetadata:
             if sample.name == strain:
                 # Append the sample to the ordered list of objects
                 self.runmetadata.samples.append(sample)
     # Create the reports
     self.reporter()
     self.genusspecific()
     self.sixteensreporter()
     self.gdcsreporter()
Example #17
    parser.add_argument('path', help='Specify input directory')
    parser.add_argument('-s',
                        '--sequencepath',
                        required=True,
                        help='Path of .fastq(.gz) files to process.')
    # Get the arguments into an object
    arguments = parser.parse_args()

    # Define the start time
    arguments.starttime = time.time()

    # Find the files
    fastas = sorted(glob(os.path.join(arguments.sequencepath, '*.fa*')))

    # Create a metadata object
    arguments.runmetadata = MetadataObject()
    arguments.runmetadata.samples = list()
    for fasta in fastas:
        metadata = MetadataObject()
        metadata.name = os.path.split(fasta)[1].split('.')[0]
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Set the destination folder
        outputdir = os.path.join(arguments.sequencepath, metadata.name)
        make_path(outputdir)
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.run.outputdirectory = outputdir
        metadata.general.bestassemblyfile = True
        # Initialise an attribute to store commands
Example #18
    def probefinder(self):
        """
        Find the longest probe sequences
        """
        logging.info('Finding and filtering probe sequences')
        for sample in self.samples:
            # A list to store the metadata object for each alignment
            sample.gene = list()
            for align in sample.alignedalleles:
                # Create an object to store all the information for each alignment file
                metadata = GenObject()
                metadata.name = os.path.splitext(os.path.basename(align))[0]
                metadata.alignmentfile = align
                # Create an alignment object from the alignment file
                try:
                    metadata.alignment = AlignIO.read(align, 'fasta')
                except ValueError:
                    # If a ValueError: Sequences must all be the same length is raised, pad the shorter sequences
                    # to be the length of the longest sequence
                    # https://stackoverflow.com/questions/32833230/biopython-alignio-valueerror-says-strings-must-be-same-length
                    records = SeqIO.parse(align, 'fasta')
                    # Make a copy, otherwise our generator is exhausted after calculating maxlen
                    records = list(records)
                    # Calculate the length of the longest sequence
                    maxlen = max(len(record.seq) for record in records)
                    # Pad sequences so that they all have the same length
                    for record in records:
                        if len(record.seq) != maxlen:
                            sequence = str(record.seq).ljust(maxlen, '.')
                            record.seq = Seq(sequence)
                    assert all(len(record.seq) == maxlen for record in records)
                    # Write to file and do alignment
                    metadata.alignmentfile = '{}_padded.tfa'.format(
                        os.path.splitext(align)[0])
                    with open(metadata.alignmentfile, 'w') as padded:
                        SeqIO.write(records, padded, 'fasta')
                    # Align the padded sequences
                    metadata.alignment = AlignIO.read(metadata.alignmentfile,
                                                      'fasta')

                metadata.summaryalign = AlignInfo.SummaryInfo(
                    metadata.alignment)
                # The dumb consensus is a very simple consensus sequence calculated from the alignment, using
                # the default parameters of threshold=.7 and ambiguous='X'
                consensus = metadata.summaryalign.dumb_consensus()
                metadata.consensus = str(consensus)
                # The position-specific scoring matrix (PSSM) stores the frequency of each base observed at each
                # location along the entire consensus sequence
                metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(
                    consensus)
                metadata.identity = list()
                # Find the prevalence of each base for every location along the sequence
                for line in metadata.pssm:
                    try:
                        bases = [
                            line['A'], line['C'], line['G'], line['T'],
                            line['-']
                        ]
                        # Calculate the frequency of the most common base - don't count gaps
                        metadata.identity.append(
                            float('{:.2f}'.format(
                                max(bases[:4]) / sum(bases) * 100)))
                    except KeyError:
                        bases = [line['A'], line['C'], line['G'], line['T']]
                        # Calculate the frequency of the most common base - don't count gaps
                        metadata.identity.append(
                            float('{:.2f}'.format(
                                max(bases) / sum(bases) * 100)))
                # List to store metadata objects
                metadata.windows = list()
                # Variable to store whether a suitable probe has been found for the current organism + gene pair.
                # As probe sizes are evaluated in descending order, the search can stop as soon as a probe is
                # found; any subsequent probe would be shorter than the one(s) already discovered
                passing = False
                # Create sliding windows of size self.max - self.min from the list of identities for each column
                # of the alignment
                for i in reversed(range(self.min, self.max + 1)):
                    if not passing:
                        windowdata = MetadataObject()
                        windowdata.size = i
                        windowdata.max = 0
                        # Initialise the minimum to the highest possible percent identity
                        windowdata.min = 100
                        windowdata.sliding = list()
                        # Create a counter to store the starting location of the window in the sequence
                        n = 0
                        # Create sliding windows from the range of sizes for the list of identities
                        windows = self.window(metadata.identity, i)
                        # Go through each window from the collection of sliding windows to determine which window(s)
                        # has (have) the best results
                        for window in windows:
                            # Create another object to store all the data for the window
                            slidingdata = MetadataObject()
                            # Only consider the window if every position has a percent identity greater than the cutoff
                            if min(window) > self.cutoff:
                                # Populate the object with the necessary variables
                                slidingdata.location = '{}:{}'.format(n, n + i)
                                slidingdata.min = min(window)
                                slidingdata.mean = float('{:.2f}'.format(
                                    numpy.mean(window)))
                                slidingdata.sequence = str(consensus[n:n + i])
                                # Track the best and worst window means: a higher windowdata.max (or lower
                                # windowdata.min) indicates a better (or worse) overall percent identity
                                windowdata.max = slidingdata.mean if slidingdata.mean >= windowdata.max \
                                    else windowdata.max
                                windowdata.min = slidingdata.mean if slidingdata.mean <= windowdata.min \
                                    else windowdata.min
                                # Add the object to the list of objects
                                windowdata.sliding.append(slidingdata)
                                passing = True
                            n += 1
                        # Add the window data to the list of windows
                        metadata.windows.append(windowdata)
                # Add the alignment metadata object to the sample's list of genes
                sample.gene.append(metadata)
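Note: the `self.window` helper called above is not shown in this example. A conventional fixed-size sliding window over a list, matching how its return value is consumed (assumed implementation):

def window(self, iterable, size):
    # Yield successive overlapping slices of length `size` from the list
    # of per-column percent identities
    for start in range(len(iterable) - size + 1):
        yield iterable[start:start + size]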
Example #19
        '-a',
        '--averagedepth',
        default=10,
        help='Supply an integer of the minimum mapping depth in order to return a positive result'
    )
    parser.add_argument(
        '-C',
        '--copy',
        action='store_true',
        help='Normally, the program will create symbolic links of the files into the sequence path; '
             'however, there are occasions when it is necessary to copy the files instead'
    )
    # Get the arguments into an object
    arguments = parser.parse_args()
    arguments.pipeline = False
    arguments.runmetadata.samples = MetadataObject()
    arguments.analysistype = 'genesippr'
    arguments.logfile = os.path.join(arguments.path, 'logfile')
    # Define the start time
    start = time.time()

    # Run the script
    GeneSippr(arguments, commit, start, homepath, arguments.analysistype,
              arguments.cutoff, arguments.pipeline, False)

    # Print a bold, green exit statement
    print('\033[92m' + '\033[1m' + "\nElapsed Time: %0.2f seconds" %
          (time.time() - start) + '\033[0m')
Example #20
 def __init__(self, args, pipelinecommit, startingtime, scriptpath):
     # Initialise variables
     self.commit = str(pipelinecommit)
     self.start = startingtime
     self.homepath = scriptpath
     # Define variables based on supplied arguments
     self.args = args
     self.path = os.path.join(args.path, '')
     assert os.path.isdir(
         self.path
     ), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     self.sequencepath = os.path.join(args.sequencepath, '')
     assert os.path.isdir(self.sequencepath), u'Supplied sequence path is not a valid directory {0!r:s}' \
         .format(self.sequencepath)
     self.databasepath = os.path.join(args.databasepath, '')
     assert os.path.isdir(self.databasepath), u'Supplied database path is not a valid directory {0!r:s}' \
         .format(self.databasepath)
     # There seems to be an issue with CLARK when running with a very high number of cores. Limit self.cpus to 1
     self.cpus = 1
     # Set variables from the arguments
     self.database = args.database
     self.rank = args.rank
     self.clarkpath = args.clarkpath
     self.cutoff = float(args.cutoff) * 100
     # Initialise variables for the analysis
     self.targetcall = str()
     self.classifycall = str()
     self.devnull = open(os.devnull, 'wb')
     self.filelist = os.path.join(self.path, 'sampleList.txt')
     self.reportlist = os.path.join(self.path, 'reportList.txt')
     self.abundancequeue = Queue()
     self.datapath = str()
     self.reportpath = os.path.join(self.path, 'reports')
     self.clean_seqs = args.clean_seqs
     self.light = args.light
     if self.clean_seqs:
         try:
             self.reffilepath = args.reffilepath
         except AttributeError:
             self.clean_seqs = False
     # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the metadata objects
     # and variables play nice
     try:
         if args.runmetadata:
             self.runmetadata = args.runmetadata
             self.extension = self.runmetadata.extension
             # Create the name of the final report
             self.report = os.path.join(
                 self.reportpath, 'abundance{}.xlsx'.format(self.extension))
             # Only re-run the CLARK analyses if the CLARK report doesn't already exist
             if not os.path.isfile(self.report):
                 printtime(
                     'Performing CLARK analysis on {} files'.format(
                         self.extension), self.start)
                 if self.extension != 'fastq':
                     for sample in self.runmetadata.samples:
                         sample.general.combined = sample.general.bestassemblyfile
                     # Run the pipeline
                     self.main()
                 else:
                     # Only perform FASTQ analyses if the sample is declared to be a metagenome
                     metagenome = False
                     for sample in self.runmetadata.samples:
                         try:
                             status = sample.run.Description
                         except KeyError:
                             status = 'unknown'
                         if status == 'metagenome':
                             metagenome = True
                     # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                     if metagenome:
                         fileprep.Fileprep(self)
                         # Run the pipeline
                         self.main()
                 # Clean up the files and create/delete attributes to be consistent with pipeline Metadata objects
                 for sample in self.runmetadata.samples:
                     if sample.general.bestassemblyfile != 'NA':
                         # Create a GenObject to store metadata when this script is run as part of the pipeline
                         clarkextension = 'clark{}'.format(self.extension)
                         setattr(sample, clarkextension, GenObject())
                         # Create a folder to store all the CLARK files
                         sample[clarkextension].outputpath = os.path.join(
                             sample.general.outputdirectory, 'CLARK')
                         make_path(sample[clarkextension].outputpath)
                         # Move the files to the CLARK folder
                         try:
                             move(sample.general.abundance,
                                  os.path.join(sample[clarkextension].outputpath,
                                               os.path.basename(sample.general.abundance)))
                             move(sample.general.classification,
                                  os.path.join(sample[clarkextension].outputpath,
                                               os.path.basename(sample.general.classification)))
                         except (KeyError, FileNotFoundError):
                             pass
                         # Set the CLARK-specific attributes
                         try:
                             sample[clarkextension].abundance = sample.general.abundance
                             sample[clarkextension].classification = sample.general.classification
                             sample[clarkextension].combined = sample.general.combined
                         except KeyError:
                             pass
                         if self.extension == 'fastq':
                             # Remove the combined .fastq files
                             try:
                                 # Handle the case where multiple combined files were created
                                 if isinstance(sample[clarkextension].combined, list):
                                     for combined_fastq in sample[clarkextension].combined:
                                         os.remove(combined_fastq)
                             except (OSError, KeyError):
                                 pass
                     # Remove the CLARK attributes from .general; explicit loops are used because a bare
                     # map() is lazy in Python 3 and its side effects would never run
                     for attribute in ['abundance', 'classification', 'combined']:
                         try:
                             delattr(sample.general, attribute)
                         except AttributeError:
                             pass
                     # Remove the text file lists of files and reports created by CLARK
                     for textfile in ['reportList.txt', 'sampleList.txt']:
                         try:
                             os.remove(os.path.join(self.path, textfile))
                         except OSError:
                             pass
         else:
             self.runmetadata = MetadataObject()
             self.report = os.path.join(self.reportpath, 'abundance.xlsx')
             # Create the objects
             self.objectprep()
             self.main()
     except AttributeError:
         self.runmetadata = MetadataObject()
         self.report = os.path.join(self.reportpath, 'abundance.xlsx')
         # Create the objects
         self.objectprep()
         self.main()
     # Optionally filter the .fastq reads based on taxonomic assignment
     if args.filter:
         filtermetagenome.PipelineInit(self)
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)
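Note: the clean-up section above uses explicit for loops rather than map() because, in Python 3, map() returns a lazy iterator; a map() invoked purely for its side effects never runs unless something consumes it. A quick demonstration:

attrs = ['abundance', 'classification', 'combined']
map(print, attrs)        # produces an iterator; nothing is printed
list(map(print, attrs))  # consuming the iterator runs the side effects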
Example #21
 def parsesamplesheet(self):
     """Parses the sample sheet (SampleSheet.csv) to determine certain values
     important for the creation of the assembly report"""
     # Open the sample sheet
     with open(self.samplesheet, "r") as samplesheet:
         # Iterate through the sample sheet
         samples, prev, header = False, 0, []
         for count, line in enumerate(samplesheet):
             # Remove new lines, and split on commas
             # line = line.decode('utf-8')  # Turn from bytes to string, since python3 is finicky.
             data = line.rstrip().split(",")
             if any(data):
                 if "[Settings]" in line:
                     samples = False
                 if not line.startswith("[") and not samples and not data == ['']:
                     # Grab any data not in the [Data] section
                     setattr(self.header, data[0].replace(" ", ""), "".join(data[1:]))
                 elif "[Data]" in line or "[Reads]" in line:
                     samples = True
                 elif samples and "Sample_ID" in line:
                     header.extend([
                         x.replace("_", "").replace(' ', "") for x in data
                     ])
                     prev = count
                 elif header:
                     # Try and replicate the Illumina rules to create file names from "Sample_Name"
                     samplename = samplenamer(data)
                     # Create an object for storing nested static variables
                     strainmetadata = MetadataObject()
                     # Set the sample name in the object
                     strainmetadata.name = samplename
                     # Add the header object to strainmetadata
                     # strainmetadata.__setattr__("run", GenObject(dict(self.header)))
                     strainmetadata.run = GenObject(
                         copy.copy(self.header.datastore))
                     # Create the run object, so it will be easier to populate the object (eg run.SampleName = ...
                     # instead of strainmetadata.run.SampleName = ...
                     run = strainmetadata.run
                     # Capture Sample_ID, Sample_Name, I7_Index_ID, index1, I5_Index_ID, index2, Sample_Project
                     for idx, item in enumerate(data):
                         setattr(run, header[idx], item if item else "NA")
                     # Add the sample number
                     run.SampleNumber = count - prev
                     # Create the 'General' category for strainmetadata
                     strainmetadata.general = GenObject({
                         'outputdirectory': os.path.join(self.path, samplename),
                         'pipelinecommit': self.commit
                     })
                     strainmetadata.general.logout = os.path.join(
                         self.path, samplename,
                         '{}_log_out.txt'.format(samplename))
                     strainmetadata.general.logerr = os.path.join(
                         self.path, samplename,
                         '{}_log_err.txt'.format(samplename))
                     # Append the strainmetadata object to the list of samples
                     self.samples.append(strainmetadata)
                 elif samples:
                     # The [Reads] section lists the forward read length, then the reverse read length
                     if 'forwardlength' not in self.header.datastore:
                         setattr(self.header, 'forwardlength', data[0])
                     else:
                         setattr(self.header, 'reverselength', data[0])
                     self.totalreads += int(data[0])
     self.date = self.header.Date if "Date" in self.header.datastore else self.date
     for sample in self.samples:
         if 'InvestigatorName' not in sample.run.datastore:
             sample.run.InvestigatorName = 'NA'
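Note: for reference, a minimal SampleSheet.csv of the shape this parser expects: key/value header lines, a [Reads] section giving the forward and then the reverse read length, and a [Data] section whose first row names the per-sample columns (the content below is illustrative only):

[Header]
Date,2017-01-01
Investigator Name,NA
[Reads]
251
251
[Data]
Sample_ID,Sample_Name,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project
2017-SEQ-0001,sample1,N701,TAAGGCGA,S501,TAGATCGC,project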
Example #22
 def __init__(self, args, pipelinecommit, startingtime, scriptpath,
              analysistype, cutoff, pipeline):
     """
     :param args: command line arguments
     :param pipelinecommit: pipeline commit or version
     :param startingtime: time the script was started
     :param scriptpath: home path of the script
     :param analysistype: name of the analysis being performed - allows the program to find databases
     :param cutoff: percent identity cutoff for matches
     :param pipeline: boolean of whether this script needs to run as part of a particular assembly pipeline
     """
     import multiprocessing
     # Initialise variables
     self.commit = str(pipelinecommit)
     self.starttime = startingtime
     self.homepath = scriptpath
     # Define variables based on supplied arguments
     self.path = os.path.join(args.path, '')
     assert os.path.isdir(self.path), \
         u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     try:
         self.sequencepath = os.path.join(args.sequencepath, '')
     except AttributeError:
         self.sequencepath = self.path
     assert os.path.isdir(self.sequencepath), u'Sequence path is not a valid directory {0!r:s}' \
         .format(self.sequencepath)
     try:
         self.targetpath = os.path.join(args.reffilepath)
     except AttributeError:
         self.targetpath = os.path.join(args.targetpath)
     self.reportpath = os.path.join(self.path, 'reports')
     assert os.path.isdir(self.targetpath), u'Target path is not a valid directory {0!r:s}' \
         .format(self.targetpath)
     try:
         self.bcltofastq = args.bcltofastq
     except AttributeError:
         self.bcltofastq = False
     try:
         self.miseqpath = args.miseqpath
     except AttributeError:
         self.miseqpath = str()
     try:
         self.miseqfolder = args.miseqfolder
     except AttributeError:
         self.miseqfolder = str()
     try:
         self.fastqdestination = args.fastqdestination
     except AttributeError:
         self.fastqdestination = str()
     try:
         self.forwardlength = args.forwardlength
     except AttributeError:
         self.forwardlength = 'full'
     try:
         self.reverselength = args.reverselength
     except AttributeError:
         self.reverselength = 'full'
     self.numreads = 2 if self.reverselength != 0 else 1
     self.customsamplesheet = args.customsamplesheet
     self.taxonomy = {
         'Escherichia': 'coli',
         'Listeria': 'monocytogenes',
         'Salmonella': 'enterica'
     }
     self.logfile = args.logfile
     # Set the custom cutoff value
     self.cutoff = float(cutoff)
     try:
         self.averagedepth = int(args.averagedepth)
     except AttributeError:
         self.averagedepth = 10
     try:
         self.copy = args.copy
     except AttributeError:
         self.copy = False
     self.pipeline = pipeline
     if not self.pipeline:
         self.runmetadata = MetadataObject()
         # Create the objects to be used in the analyses
         objects = Objectprep(self)
         objects.objectprep()
         self.runmetadata = objects.samples
     else:
         self.runmetadata = args.runmetadata
     # Use the argument for the number of threads to use, or default to the number of cpus in the system
     try:
         self.cpus = int(args.cpus)
     except AttributeError:
         self.cpus = multiprocessing.cpu_count()
     try:
         self.threads = int(self.cpus / len(self.runmetadata.samples)) \
             if self.cpus / len(self.runmetadata.samples) > 1 else 1
     except TypeError:
         self.threads = self.cpus
     self.analysistype = analysistype
     # Run the analyses
     self.runner()