def run_blast(self):
     """
     BLAST the alleles against the genomes
     """
     logging.info('BLASTing alleles against sequence files')
     for query_file in self.query_files:
         # Create a metadata object to store all the sample-specific information
         sample = MetadataObject()
         sample.alleles = GenObject()
         local_db = os.path.splitext(query_file)[0]
         sample.name = os.path.basename(local_db)
         # Set the name of the BLAST output file
         sample.alleles.blast_report = os.path.join(
             self.reportpath, '{seq_id}.tsv'.format(seq_id=sample.name))
         # Update the list of metadata objects with this sample
         self.runmetadata.samples.append(sample)
         self.blast_reports.append(sample.alleles.blast_report)
         # Run the appropriate BLAST command: BLASTn for nt; tBLASTn for aa against translated nt
         if self.amino_acid:
             blast = NcbitblastnCommandline(db=local_db,
                                            query=self.target_file,
                                            num_alignments=100000000,
                                            evalue=0.001,
                                            num_threads=self.cpus,
                                            task='tblastn',
                                            outfmt=self.outfmt,
                                            word_size=3,
                                            out=sample.alleles.blast_report)
         else:
             blast = NcbiblastnCommandline(db=local_db,
                                           query=self.target_file,
                                           num_alignments=100000000,
                                           evalue=0.001,
                                           num_threads=self.cpus,
                                           task='blastn',
                                           outfmt=self.outfmt,
                                           out=sample.alleles.blast_report)
         if not os.path.isfile(sample.alleles.blast_report):
             # Run BLAST - supply the record sequence as stdin, so BLAST doesn't look for an input file
             try:
                 blast()
              # BLAST can have issues with genomes that have very large contigs. Retry the
              # analysis, using the same BLASTn/tBLASTn command as above, with a single thread
              except ApplicationError:
                  os.remove(sample.alleles.blast_report)
                  blast.num_threads = 1
                  blast()
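A note on consuming these reports: each one is written in the tab-separated, 16-column custom outfmt 6 format defined in Example #8 below. A minimal, hypothetical sketch (not part of the original class) of reading such a report with csv.DictReader:

import csv

# Column names matching the custom '6 qseqid sseqid nident ...' format string
fieldnames = ['query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
              'evalue', 'bit_score', 'query_length', 'subject_length', 'alignment_length',
              'query_start', 'query_end', 'subject_start', 'subject_end',
              'query_sequence', 'subject_sequence']

def parse_blast_report(blast_report):
    # Yield one dictionary per BLAST hit
    with open(blast_report, 'r') as report:
        for row in csv.DictReader(report, fieldnames=fieldnames, dialect='excel-tab'):
            yield row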
Example #2
 def record_extraction(self):
     """
     Parse the input FASTA file, and create a dictionary of header: sequence for each entry
     """
     for record in SeqIO.parse(self.file, 'fasta'):
         metadata = MetadataObject()
         metadata.name = record.id
         metadata.records = record.seq.upper()
         # self.records[record.id] = record.seq.upper()
         self.samples.append(metadata)
 def file_list(self):
     """
     Create metadata objects for every .ab1 file in the supplied sequence path
     """
     # Glob and sort a list of all the paths to the .ab1 files
     file_list = sorted(glob(os.path.join(self.sequencepath, '*.ab1')),
                        reverse=True)
     for seq_file in file_list:
         # P19954_2019FCP-0000034876-4_VI0364_22 _1D06_M13-R17_E11_087.ab1
         file_name = os.path.splitext(os.path.basename(seq_file))[0]
         # Create a metadata object for each sample
         sample = MetadataObject()
         sample.name = file_name
         sample.filepath = seq_file
         self.samples.append(sample)
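Biopython can read the collected .ab1 trace files directly with its 'abi' parser; a brief illustration (hypothetical usage, not part of the original class) with the filepath attribute stored above:

from Bio import SeqIO

# Hypothetical: read the basecalled sequence from one of the stored trace files
record = SeqIO.read(sample.filepath, 'abi')
print(record.id, len(record.seq))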
 def query_prep(self):
     """
     Create metadata objects for each sample
     """
     logging.info('Preparing query files')
     # Find all the sequence files in the path
     fastas = sorted(glob(os.path.join(self.query_path, '*.fasta')))
     for fasta in fastas:
         name = os.path.splitext(os.path.basename(fasta))[0]
         if name != 'combinedtargets':
             # Create a metadata object for each sample
             metadata = MetadataObject()
             metadata.samples = list()
             # Populate the metadata object with the required attributes
             metadata.name = name
             metadata.general = GenObject()
             metadata.commands = GenObject()
             metadata.alleles = GenObject()
             metadata.alleles.outputdirectory = os.path.join(self.query_path, metadata.name)
             # Set the name of the BLAST output file
             metadata.alleles.blast_report = os.path.join(metadata.alleles.outputdirectory,
                                                          '{seq_id}.tsv'.format(seq_id=metadata.name))
             try:
                 os.remove(metadata.alleles.blast_report)
             except FileNotFoundError:
                 pass
             make_path(metadata.alleles.outputdirectory)
             metadata.general.bestassemblyfile = relative_symlink(src_file=fasta,
                                                                  output_dir=metadata.alleles.outputdirectory,
                                                                  export_output=True)
             metadata.samples.append(metadata)
             self.runmetadata.samples.append(metadata)
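relative_symlink is imported from the OLC accessory package and is not shown in this excerpt; a minimal, hypothetical equivalent, assuming it creates a relative symlink in output_dir and, with export_output=True, returns the path of the link:

import os

def relative_symlink_sketch(src_file, output_dir, export_output=False):
    # Create a relative symlink to src_file inside output_dir
    link_path = os.path.join(output_dir, os.path.basename(src_file))
    try:
        os.symlink(os.path.relpath(src_file, output_dir), link_path)
    except FileExistsError:
        pass
    if export_output:
        return link_path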
Example #6
def create_args():
    # Use an ArgumentParser object as a simple namespace to hold the test arguments
    arguments = ArgumentParser()
    arguments.sequencepath = test_sequences_path
    arguments.starttime = time()
    arguments.reportpath = os.path.join(arguments.sequencepath, 'reports')
    arguments.runmetadata = MetadataObject()
    # Create metadata objects for the samples
    arguments.runmetadata.samples = Filer.filer(arguments)
    return arguments
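Filer.filer is likewise imported from the OLC accessory package; hypothetically, it globs the FASTQ files in args.sequencepath and returns one metadata object per sample, along these lines:

import os
from glob import glob

def filer_sketch(args):
    # Hypothetical stand-in for Filer.filer: one MetadataObject (imported from the
    # same accessory package used above) per FASTQ sample name
    samples = list()
    for fastq in sorted(glob(os.path.join(args.sequencepath, '*.fastq*'))):
        sample = MetadataObject()
        sample.name = os.path.basename(fastq).split('_')[0]
        samples.append(sample)
    return samples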
Example #7
 def excelparse(self):
     """
     Parses input excel file, and creates objects with headers as keys, and cell data as values for each row
     """
     logging.info('Loading excel file')
     # A dictionary to store the parsed excel file in a more readable format
     nesteddictionary = dict()
      # Use pandas to read in the excel file, and subsequently convert the pandas data frame to a dictionary
      # (.to_dict()). Only read the first fourteen columns (usecols=range(14)), as later columns are not
      # relevant to this script
     dictionary = pandas.read_excel(self.file, usecols=range(14)).to_dict()
     # Iterate through the dictionary - each header from the excel file
     for header in dictionary:
         # Sample is the primary key, and value is the value of the cell for that primary key + header combination
         for sample, value in dictionary[header].items():
             # Update the dictionary with the new data
             try:
                 nesteddictionary[sample].update({header: value})
             # Create the nested dictionary if it hasn't been created yet
             except KeyError:
                 nesteddictionary[sample] = dict()
                 nesteddictionary[sample].update({header: value})
     # Create objects for each of the samples, rather than using a nested dictionary. It may have been possible to
     # skip the creation of the nested dictionary, and create the objects from the original dictionary, but there
     # seemed to be too many possible places for something to go wrong
     for line in nesteddictionary:
         # Create an object for each sample
         metadata = MetadataObject()
         # Set the name of the metadata to be the primary key for the sample from the excel file
         metadata.name = line
         # Find the headers and values for every sample
         for header, value in nesteddictionary[line].items():
              # Try/except for value.encode() - some of the values are of type int, so they cannot be encoded
             try:
                 # Create each attribute - use the header (in lowercase, and spaces removed) as the attribute name,
                 # and the value as the attribute value
                 setattr(metadata,
                         str(header).replace(' ', '').lower(), str(value))
             except TypeError:
                 setattr(metadata,
                         str(header).replace(' ', '').lower(), value)
         # Append the object to the list of objects
         self.metadata.append(metadata)
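For reference, DataFrame.to_dict() with the default orientation nests values as {column: {row_index: value}}, which is why the parser above iterates over headers first and then assembles the per-sample nested dictionary:

import pandas

frame = pandas.DataFrame({'Sample': ['A', 'B'], 'Genus': ['Listeria', 'Vibrio']})
print(frame.to_dict())
# {'Sample': {0: 'A', 1: 'B'}, 'Genus': {0: 'Listeria', 1: 'Vibrio'}}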
Example #8
def ultimatum(args):
    SetupLogging(debug=args.debug)
    # Create metadata objects for the samples
    args.runmetadata = MetadataObject()
    args.runmetadata.samples = Filer.filer(args)
    finder = Ultimatum(metadataobject=args.runmetadata.samples,
                       sequencepath=args.sequencepath,
                       reportpath=os.path.join(args.sequencepath, 'reports'),
                       primerfile=args.primerfile,
                       primer_format=args.primer_format,
                       mismatches=args.mismatches,
                       export_amplicons=args.export_amplicons)
    finder.main()
 def __init__(self, path, amino_acid):
     if path.startswith('~'):
         self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
     else:
         self.path = os.path.abspath(os.path.join(path))
     self.allele_path = os.path.join(self.path, 'alleles')
     self.aa_allele_path = os.path.join(self.path, 'aa_alleles')
     self.profile_path = os.path.join(self.path, 'profile')
     self.aa_profile_path = os.path.join(self.path, 'aa_profile')
     make_path(self.profile_path)
     self.profile_file = os.path.join(self.profile_path, 'profile.txt')
     self.aa_profile_file = os.path.join(self.aa_profile_path, 'aa_profile.txt')
     self.query_path = os.path.join(self.path, 'query')
     self.report_path = os.path.join(self.path, 'reports')
     self.aa_report_path = os.path.join(self.path, 'aa_reports')
     make_path(self.report_path)
     make_path(self.aa_report_path)
     novel_alleles = glob(os.path.join(self.report_path, '*.fasta'))
     for novel_allele in novel_alleles:
         os.remove(novel_allele)
     self.aa_notes_path = os.path.join(self.path, 'aa_notes')
     make_path(self.aa_notes_path)
     self.aa_profile_notes = os.path.join(self.aa_notes_path, 'aa_profile_notes.tsv')
     self.amino_acid = amino_acid
     if not self.amino_acid:
         self.combined_targets = os.path.join(self.allele_path, 'combinedtargets.fasta')
     else:
         self.combined_targets = os.path.join(self.aa_allele_path, 'combinedtargets.fasta')
     self.gene_names = list()
     self.runmetadata = MetadataObject()
     self.runmetadata.samples = list()
     self.cpus = multiprocessing.cpu_count() - 1
     self.profile_report = os.path.join(self.report_path, 'profiles.tsv')
     self.aa_profile_report = os.path.join(self.aa_report_path, 'aa_profiles.tsv')
     try:
         os.remove(self.profile_report)
     except FileNotFoundError:
         pass
     # Fields used for custom outfmt 6 BLAST output:
     self.fieldnames = ['query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
                        'evalue', 'bit_score', 'query_length', 'subject_length', 'alignment_length',
                        'query_start', 'query_end', 'subject_start', 'subject_end',
                        'query_sequence', 'subject_sequence']
     self.extended_fieldnames = self.fieldnames.copy()
     self.extended_fieldnames.insert(14, 'percent_match')
     self.outfmt = '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length ' \
                   'qstart qend sstart send qseq sseq'
     # A string of the header to use for formatting the profile file, and the report headers
     self.data = str()
     self.aa_allele_dict = dict()
     self.aa_nt_allele_link_dict = dict()
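Note that BLAST does not emit the percent_match column inserted into extended_fieldnames at index 14; presumably it is appended downstream, e.g. as the proportion of identical positions over the alignment length (a hypothetical calculation, not confirmed by this excerpt):

def percent_match(row):
    # Hypothetical: identical positions over alignment length, as a rounded percentage
    return float('{:.2f}'.format(int(row['identical']) / int(row['alignment_length']) * 100))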
Example #10
 def __init__(self, inputobject):
     # Define variables based on supplied arguments
     self.start = inputobject.start
     self.path = inputobject.path
     self.sequencepath = inputobject.sequencepath
     self.datapath = inputobject.datapath
     self.reportpath = inputobject.reportpath
     # Use the argument for the number of threads to use, or default to the number of cpus in the system
     self.cpus = inputobject.cpus
     # Set the cutoff to be a percent
     self.cutoff = inputobject.cutoff
     # Initialise a variable to hold the sample objects
      self.runmetadata = inputobject.runmetadata if inputobject.runmetadata else MetadataObject()
     # Initialise queues
     self.loadqueue = Queue()
     self.listqueue = Queue()
     self.filterqueue = Queue()
     self.devnull = open(os.devnull, 'wb')
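The Queue objects initialised above typically back a producer/consumer threading pattern elsewhere in the class; a generic sketch of that pattern (the worker body is a placeholder, not the original processing logic):

from queue import Queue
from threading import Thread

def _worker(load_queue):
    while True:
        sample = load_queue.get()
        # Process the sample here
        load_queue.task_done()

load_queue = Queue()
for _ in range(4):
    Thread(target=_worker, args=(load_queue,), daemon=True).start()
# Feed samples with load_queue.put(sample), then block until complete with load_queue.join()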
Example #11
def legacy(args):
    # Prep the args object to be used in the legacy script
    SetupLogging(debug=args.debug)
    args.reportpath = os.path.join(args.sequencepath, 'reports')
    args.runmetadata = MetadataObject()
    # Create metadata objects for the samples
    args.runmetadata.samples = Filer.filer(args)
    if args.analysistype == 'vtyper':
        # Perform vtx typing
        vtyper = Vtyper(inputobject=args,
                        analysistype='vtyper_legacy',
                        mismatches=args.mismatches)
        vtyper.vtyper()
    else:
        epcr = Custom(inputobject=args,
                      analysistype='custom_epcr',
                      primerfile=args.primerfile,
                      ampliconsize=args.maxampliconsize,
                      mismatches=args.mismatches,
                      primer_format=args.primer_format,
                      export_amplicons=args.export_amplicons)
        epcr.main()
Example #12
def identity(args):
    SetupLogging(debug=args.debug)
    # Create metadata objects for the samples
    args.runmetadata = MetadataObject()
    args.runmetadata.samples = Filer.filer(args)
    if args.analysistype == 'vtyper':
        epcr = VtyperIP(metadataobject=args.runmetadata.samples,
                        analysistype=args.analysistype,
                        reportpath=os.path.join(args.sequencepath, 'reports'))
        epcr.vtyper()
    else:
        epcr = CustomIP(metadataobject=args.runmetadata.samples,
                        sequencepath=args.sequencepath,
                        reportpath=os.path.join(args.sequencepath, 'reports'),
                        primerfile=args.primerfile,
                        min_amplicon_size=args.minampliconsize,
                        max_amplicon_size=args.maxampliconsize,
                        primer_format=args.primer_format,
                        mismatches=args.mismatches,
                        export_amplicons=args.export_amplicons,
                        contigbreaks=args.contigbreaks)
        epcr.main()
 def __init__(self, start, sequencepath, referencefilepath, scriptpath,
              debug):
     """
     
     :param start: 
     :param sequencepath: 
     :param referencefilepath: 
     :param scriptpath:
     """
     self.debug = debug
     SetupLogging(self.debug)
     logging.info('Welcome to the CFIA bacterial typing pipeline {}'.format(
         __version__))
     # Define variables from the arguments - there may be a more streamlined way to do this
     self.sequencepath = os.path.join(sequencepath)
     self.path = self.sequencepath
     self.targetpath = os.path.join(referencefilepath)
     self.reffilepath = self.targetpath
     # Define the start time
     self.starttime = start
     self.start = self.starttime
     # Use the argument for the number of threads to use, or default to the number of cpus in the system
     self.cpus = multiprocessing.cpu_count() - 1
     # Assertions to ensure that the provided variables are valid
     assert os.path.isdir(self.sequencepath), 'Supplied path location is not a valid directory {0!r:s}'\
         .format(self.sequencepath)
     self.reportpath = os.path.join(self.sequencepath, 'reports')
     assert os.path.isdir(self.targetpath), 'Reference file path is not a valid directory {0!r:s}'\
         .format(self.targetpath)
     self.commit = __version__
     self.homepath = scriptpath
     self.analysistype = 'assembly_typing'
     self.genus_specific = False
     self.logfile = os.path.join(self.sequencepath, 'logfile')
     self.pipeline = True
     # Initialise the metadata object
     self.metadata = list()
     self.runmetadata = MetadataObject()
Example #14
def test_sistr_seqsero():
    # method and var are fixtures defined elsewhere in the test module
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(var.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    metadata.general.trimmedcorrectedfastqfiles = [
        os.path.join(var.sequencepath, 'seqsero',
                     '2014-SEQ-1049_seqsero.fastq.gz')
    ]
    # Set the destination folder
    outputdir = os.path.join(var.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.general.logout = os.path.join(outputdir, 'out')
    metadata.general.logerr = os.path.join(outputdir, 'err')
    metadata.run.outputdirectory = outputdir
    metadata.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match in ('ERR586739', 'SAL_BA2732AA')
    method.seqsero()
    for sample in method.runmetadata.samples:
        assert sample.seqsero.predicted_serotype == '- 9:f,g,t:-'
    variable_update()
Example #15
def variables():
    v = MetadataObject()
    v.sequencepath = os.path.join(testpath, 'testdata')
    v.referencefilepath = os.path.join(v.sequencepath, 'databases')
    v.customsamplesheet = os.path.join(v.sequencepath, 'SampleSheet.csv')
    v.debug = True
    v.numreads = 2
    v.kmerrange = '21'
    v.preprocess = False
    v.basicassembly = True
    v.threads = multiprocessing.cpu_count()
    v.startingtime = time()
    v.commit = b''
    v.homepath = scriptpath
    return v
Example #16
    def probefinder(self):
        """
        Find the longest probe sequences
        """
        logging.info('Finding and filtering probe sequences')
        for sample in self.samples:
            # A list to store the metadata object for each alignment
            sample.gene = list()
            for align in sample.alignedalleles:
                # Create an object to store all the information for each alignment file
                metadata = GenObject()
                metadata.name = os.path.splitext(os.path.basename(align))[0]
                metadata.alignmentfile = align
                # Create an alignment object from the alignment file
                try:
                    metadata.alignment = AlignIO.read(align, 'fasta')
                except ValueError:
                    # If a ValueError: Sequences must all be the same length is raised, pad the shorter sequences
                    # to be the length of the longest sequence
                    # https://stackoverflow.com/q/32833230
                    records = SeqIO.parse(align, 'fasta')
                    # Make a copy, otherwise our generator is exhausted after calculating maxlen
                    records = list(records)
                    # Calculate the length of the longest sequence
                    maxlen = max(len(record.seq) for record in records)
                    # Pad sequences so that they all have the same length
                    for record in records:
                        if len(record.seq) != maxlen:
                            sequence = str(record.seq).ljust(maxlen, '.')
                            record.seq = Seq(sequence)
                    assert all(len(record.seq) == maxlen for record in records)
                    # Write the padded records to file
                    metadata.alignmentfile = '{}_padded.tfa'.format(
                        os.path.splitext(align)[0])
                    with open(metadata.alignmentfile, 'w') as padded:
                        SeqIO.write(records, padded, 'fasta')
                    # Read the padded sequences back in as an alignment
                    metadata.alignment = AlignIO.read(metadata.alignmentfile,
                                                      'fasta')

                metadata.summaryalign = AlignInfo.SummaryInfo(
                    metadata.alignment)
                # The dumb consensus is a very simple consensus sequence calculated from the alignment. Default
                # parameters of threshold=.7, and ambiguous='X' are used
                consensus = metadata.summaryalign.dumb_consensus()
                metadata.consensus = str(consensus)
                # The position-specific scoring matrix (PSSM) stores the frequency of each base observed at each
                # location along the entire consensus sequence
                metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(
                    consensus)
                metadata.identity = list()
                # Find the prevalence of each base for every location along the sequence
                for line in metadata.pssm:
                    try:
                        bases = [
                            line['A'], line['C'], line['G'], line['T'],
                            line['-']
                        ]
                        # Calculate the frequency of the most common base - don't count gaps
                        metadata.identity.append(
                            float('{:.2f}'.format(
                                max(bases[:4]) / sum(bases) * 100)))
                    except KeyError:
                        bases = [line['A'], line['C'], line['G'], line['T']]
                        # Calculate the frequency of the most common base - don't count gaps
                        metadata.identity.append(
                            float('{:.2f}'.format(
                                max(bases) / sum(bases) * 100)))
                # List to store metadata objects
                metadata.windows = list()
                # Variable to store whether a suitable probe has been found for the current organism + gene pair.
                # As the probe sizes are evaluated in descending size, as soon as a probe has been discovered, the
                # search for more probes can stop, and subsequent probes will be smaller than the one(s) already found
                passing = False
                # Create sliding windows of size self.max - self.min from the list of identities for each column
                # of the alignment
                for i in reversed(range(self.min, self.max + 1)):
                    if not passing:
                        windowdata = MetadataObject()
                        windowdata.size = i
                        windowdata.max = 0
                        # Initialise the minimum to 100 percent identity, so the first window updates it
                        windowdata.min = 100
                        windowdata.sliding = list()
                        # Create a counter to store the starting location of the window in the sequence
                        n = 0
                        # Create sliding windows from the range of sizes for the list of identities
                        windows = self.window(metadata.identity, i)
                        # Go through each window from the collection of sliding windows to determine which window(s)
                        # has (have) the best results
                        for window in windows:
                            # Create another object to store all the data for the window
                            slidingdata = MetadataObject()
                            # Only consider the window if every position has a percent identity greater than the cutoff
                            if min(window) > self.cutoff:
                                # Populate the object with the necessary variables
                                slidingdata.location = '{}:{}'.format(n, n + i)
                                slidingdata.min = min(window)
                                slidingdata.mean = float('{:.2f}'.format(
                                    numpy.mean(window)))
                                slidingdata.sequence = str(consensus[n:n + i])
                                # Create attributes for evaluating windows. A greater windowdata.max/lesser
                                # windowdata.min indicates a better/worse overall percent identity, respectively
                                windowdata.max = slidingdata.mean if slidingdata.mean >= windowdata.max \
                                    else windowdata.max
                                windowdata.min = slidingdata.mean if slidingdata.mean <= windowdata.min \
                                    else windowdata.min
                                # Add the object to the list of objects
                                windowdata.sliding.append(slidingdata)
                                passing = True
                            n += 1
                        # Add the object to the list of objects
                        metadata.windows.append(windowdata)
                # Add the object to the list of objects
                sample.gene.append(metadata)
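self.window, called above to create the sliding windows, is not included in this excerpt; a minimal generator consistent with how it is used (a list of per-position identities and a window size) would be:

def window(iterable, size):
    # Yield successive overlapping windows of the requested size
    values = list(iterable)
    for start in range(len(values) - size + 1):
        yield values[start:start + size]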
Example #17
 def __init__(self, inputobject, extension='fasta', light=False):
     # Create an object to mimic the command line arguments necessary for the script
     args = MetadataObject()
     args.path = inputobject.path
     args.sequencepath = inputobject.path
     args.databasepath = os.path.join(inputobject.reffilepath, 'clark')
     make_path(args.databasepath)
     args.clarkpath = os.path.dirname(which('CLARK'))
     args.clarkpath += '/../opt/clark/'
     args.cutoff = 0.005
     args.database = 'bacteria'
     args.rank = 'species'
     args.filter = False
     args.threads = inputobject.cpus
     args.runmetadata = inputobject.runmetadata
     args.clean_seqs = False
     args.reffilepath = inputobject.reffilepath
     args.extension = extension
     args.light = light
     # Run CLARK
     CLARK(args, inputobject.commit, inputobject.starttime,
           inputobject.homepath)
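Note that which returns None when CLARK is not on the PATH, in which case os.path.dirname raises a TypeError; a defensive variant (an assumption, not the original behaviour) could be:

import os
from shutil import which

clark_executable = which('CLARK')
if clark_executable is None:
    # Hypothetical guard; the original code assumes CLARK is installed
    raise SystemExit('Could not find the CLARK executable on the PATH')
clarkpath = os.path.join(os.path.dirname(clark_executable), '..', 'opt', 'clark')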
Example #18
 def __init__(self, args, pipelinecommit, startingtime, scriptpath):
     # Initialise variables
     self.commit = str(pipelinecommit)
     self.start = startingtime
     self.homepath = scriptpath
     # Define variables based on supplied arguments
     self.args = args
     self.path = os.path.join(args.path)
     assert os.path.isdir(
         self.path
     ), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     self.sequencepath = os.path.join(args.sequencepath, '')
     assert os.path.isdir(self.sequencepath), u'Supplied sequence path is not a valid directory {0!r:s}' \
         .format(self.sequencepath)
     self.databasepath = os.path.join(args.databasepath, '')
     assert os.path.isdir(self.databasepath), u'Supplied database path is not a valid directory {0!r:s}' \
         .format(self.databasepath)
      # There seems to be an issue with CLARK when running with a very high number of cores. Limit self.cpus to 4
      self.cpus = 4
     # Set variables from the arguments
     self.database = args.database
     self.rank = args.rank
     self.clarkpath = args.clarkpath
     self.cutoff = float(args.cutoff) * 100
     # Initialise variables for the analysis
     self.targetcall = str()
     self.classifycall = str()
     self.devnull = open(os.devnull, 'wb')
     self.filelist = os.path.join(self.path, 'sampleList.txt')
     self.reportlist = os.path.join(self.path, 'reportList.txt')
     self.abundancequeue = Queue()
     self.datapath = str()
     self.reportpath = os.path.join(self.path, 'reports')
     self.clean_seqs = args.clean_seqs
     self.light = args.light
     self.extension = args.extension
     if self.clean_seqs:
         try:
             self.reffilepath = args.reffilepath
         except AttributeError:
             self.clean_seqs = False
     # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the metadata objects
     # and variables play nice
     try:
         if args.runmetadata:
             self.runmetadata = args.runmetadata
             # Create the name of the final report
             self.report = os.path.join(
                 self.reportpath,
                 'abundance_{ft}.xlsx'.format(ft=self.extension))
              # Only re-run the CLARK analyses if the CLARK report doesn't exist
             if not os.path.isfile(self.report):
                 logging.info(
                     'Performing CLARK analysis on {ft} files'.format(
                         ft=self.extension))
                 if self.extension != 'fastq':
                     for sample in self.runmetadata.samples:
                         sample.general.combined = sample.general.bestassemblyfile
                     # Run the pipeline
                     self.main()
                 else:
                     # Only perform FASTQ analyses if the sample is declared to be a metagenome
                     metagenome = False
                     for sample in self.runmetadata.samples:
                         try:
                             status = sample.run.Description
                         except AttributeError:
                             status = 'unknown'
                         if status == 'metagenome':
                             metagenome = True
                     # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                     if metagenome:
                         fileprep.Fileprep(self)
                         # Run the pipeline
                         self.main()
                 # Clean up the files and create/delete attributes to be consistent with pipeline Metadata objects
                 for sample in self.runmetadata.samples:
                     # Create a GenObject to store metadata when this script is run as part of the pipeline
                     clarkextension = 'clark{}'.format(self.extension)
                     setattr(sample, clarkextension, GenObject())
                     # Create a folder to store all the CLARK files
                     sample[clarkextension].outputpath = os.path.join(
                         sample.general.outputdirectory, 'CLARK')
                     make_path(sample[clarkextension].outputpath)
                     if sample.general.bestassemblyfile != 'NA':
                         # Move the files to the CLARK folder
                         try:
                             move(
                                 sample.general.abundance,
                                 os.path.join(
                                     sample[clarkextension].outputpath,
                                     os.path.basename(
                                         sample.general.abundance)))
                             move(
                                 sample.general.classification,
                                 os.path.join(
                                     sample[clarkextension].outputpath,
                                     os.path.basename(
                                         sample.general.classification)))
                         except (AttributeError, FileNotFoundError):
                             pass
                         # Set the CLARK-specific attributes
                         try:
                             sample[
                                 clarkextension].abundance = sample.general.abundance
                             sample[
                                 clarkextension].classification = sample.general.classification
                             sample[
                                 clarkextension].combined = sample.general.combined
                         except AttributeError:
                             pass
                          if self.extension == 'fastq':
                              # Remove the combined .fastq file(s)
                              try:
                                  combined = sample[clarkextension].combined
                                  if isinstance(combined, list):
                                      for combined_file in combined:
                                          os.remove(combined_file)
                                  else:
                                      os.remove(combined)
                              except (OSError, AttributeError):
                                  pass
                      # Remove the list-of-samples and list-of-reports text files created by CLARK
                      try:
                          for clark_file in ['reportList.txt', 'sampleList.txt']:
                              os.remove(os.path.join(self.path, clark_file))
                      except OSError:
                          pass
         else:
             self.runmetadata = MetadataObject()
             self.report = os.path.join(self.reportpath, 'abundance.xlsx')
             # Create the objects
             self.objectprep()
             self.main()
     except AttributeError:
         self.runmetadata = MetadataObject()
         self.report = os.path.join(self.reportpath, 'abundance.xlsx')
         # Create the objects
         self.objectprep()
         # Set the run description to 'metagenome' in order to process the samples
         for sample in self.runmetadata.samples:
             sample.run.Description = 'metagenome'
         self.main()
     # Optionally filter the .fastq reads based on taxonomic assignment
     if args.filter:
         filtermetagenome.PipelineInit(self)
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)
class RunAssemble(object):
    def main(self):
        """
        Run the methods in the correct order
        """
        # Start the assembly
        self.helper()
        # Create the quality object
        self.create_quality_object()
        # Run the quality analyses
        self.quality()
        # Perform assembly
        self.assemble()
        # Perform genus-agnostic typing
        self.agnostictyping()
        # Perform typing
        self.typing()
        # Compress or remove all large, temporary files created by the pipeline
        if not self.debug:
            compress.Compress(self)
        metadataprinter.MetadataPrinter(inputobject=self)

    def helper(self):
        """Helper function for file creation (if desired), manipulation, quality assessment,
        and trimming as well as the assembly"""
        # Simple assembly without requiring accessory files (SampleSheet.csv, etc).
        if self.basicassembly:
            self.runmetadata = Basic(inputobject=self)
        else:
            # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml, and
            # RunInfo.xml files
            self.runinfo = os.path.join(self.path, 'RunInfo.xml')
            self.runmetadata = runMetadata.Metadata(passed=self)
            # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
            self.runmetadata.parseruninfo()
            # Extract PhiX mapping information from the run
            phi = phix.PhiX(inputobject=self)
            phi.main()
            # Record that no bclcall or nohup call was made in the metadata
            for sample in self.runmetadata.samples:
                sample.commands = GenObject()
                sample.commands.nohupcall = 'NA'
                sample.commands.bclcall = 'NA'
            # Move/link the FASTQ files to strain-specific working directories
            fastqmover.FastqMover(inputobject=self)
        # Print the metadata to file
        metadataprinter.MetadataPrinter(inputobject=self)

    def create_quality_object(self):
        """
        Create the quality object
        """
        self.qualityobject = quality.Quality(inputobject=self)

    def quality(self):
        """
        Creates quality objects and runs quality assessments and quality processes on the
        supplied sequences
        """
        # Validate that the FASTQ files are in the proper format, and that there are no issues, e.g. different
        # numbers of forward and reverse reads, read lengths longer than quality score lengths, or improper extensions
        if not self.debug:
            self.fastq_validate()
        # Run FastQC on the unprocessed fastq files
        self.fastqc_raw()
        # Perform quality trimming and FastQC on the trimmed files
        self.quality_trim()
        # Run FastQC on the trimmed files
        self.fastqc_trimmed()
        # Perform error correcting on the reads
        self.error_correct()
        # Detect contamination in the reads
        self.contamination_detection()
        # Run FastQC on the processed fastq files
        self.fastqc_trimmedcorrected()
        # Exit if only pre-processing of data is requested
        metadataprinter.MetadataPrinter(inputobject=self)
        if self.preprocess:
            logging.info('Pre-processing complete')
            quit()

    def fastq_validate(self):
        """
        Attempt to detect and fix issues with the FASTQ files
        """
        self.qualityobject.validate_fastq()
        metadataprinter.MetadataPrinter(inputobject=self)

    def fastqc_raw(self):
        """
        Run FastQC on the unprocessed FASTQ files
        """
        self.qualityobject.fastqcthreader(level='Raw')
        metadataprinter.MetadataPrinter(inputobject=self)

    def quality_trim(self):
        """
        Perform quality trimming and FastQC on the trimmed files
        """
        self.qualityobject.trimquality()
        metadataprinter.MetadataPrinter(inputobject=self)

    def fastqc_trimmed(self):
        """
        Run FastQC on the quality trimmed FASTQ files
        """
        self.qualityobject.fastqcthreader(level='Trimmed')
        metadataprinter.MetadataPrinter(inputobject=self)

    def error_correct(self):
        """
        Perform error correcting on the reads
        """
        self.qualityobject.error_correction()
        metadataprinter.MetadataPrinter(inputobject=self)

    def contamination_detection(self):
        """
        Calculate the levels of contamination in the reads
        """
        self.qualityobject.contamination_finder(report_path=self.reportpath,
                                                debug=self.debug)
        metadataprinter.MetadataPrinter(inputobject=self)

    def fastqc_trimmedcorrected(self):
        """
        Run FastQC on the processed fastq files
        """
        self.qualityobject.fastqcthreader(level='trimmedcorrected')
        metadataprinter.MetadataPrinter(inputobject=self)

    def assemble(self):
        """
        Assemble genomes and perform some basic quality analyses
        """
        # Assemble genomes
        self.assemble_genomes()
        # Calculate assembly metrics on raw assemblies
        self.evaluate_assemblies()
        # ORF detection
        self.prodigal()
        # CLARK analyses
        self.clark()

    def assemble_genomes(self):
        """
        Use skesa to assemble genomes
        """
        assembly = skesa.Skesa(inputobject=self)
        assembly.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def evaluate_assemblies(self):
        """
        Evaluate assemblies with Quast
        """
        qual = evaluate.AssemblyEvaluation(inputobject=self)
        qual.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def prodigal(self):
        """
        Use prodigal to detect open reading frames in the assemblies
        """
        prodigal.Prodigal(self)
        metadataprinter.MetadataPrinter(self)

    def clark(self):
        """
        Run CLARK metagenome analyses on the raw reads and assemblies if the system has adequate resources
        """
        # Run CLARK typing on the .fastq and .fasta files
        automateCLARK.PipelineInit(inputobject=self,
                                   extension='fasta',
                                   light=True)
        automateCLARK.PipelineInit(inputobject=self,
                                   extension='fastq',
                                   light=True)

    def agnostictyping(self):
        """
        Perform typing that does not require the genus of the organism to be known
        """
        # Run mash
        self.mash()
        # Run rMLST on assemblies
        self.rmlst_assembled()
        # Create reports summarising the run and sample qualities
        self.quality_report()
        # Run the 16S analyses
        self.sixteens()
        # Find genes of interest
        self.genesippr()
        # Resistance finding - raw reads
        self.ressippr()
        # Resistance finding - assemblies
        self.resfinder()
        # Run MOB-suite
        self.mob_suite()
        # Prophage detection
        self.prophages()
        # Univec contamination search
        self.univec()
        # Virulence
        self.virulence()
        # cgMLST
        self.cgmlst()

    def mash(self):
        """
        Run mash to determine closest refseq genome
        """
        mash.Mash(inputobject=self, analysistype='mash')
        metadataprinter.MetadataPrinter(inputobject=self)

    def rmlst_assembled(self):
        """
        Run rMLST analyses on assemblies
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'rmlst.csv')):
            rmlst = BLAST(args=self, analysistype='rmlst', cutoff=100)
            rmlst.seekr()
        else:
            parse = ReportParse(args=self, analysistype='rmlst')
            parse.report_parse()
        metadataprinter.MetadataPrinter(inputobject=self)

    def quality_report(self):
        """
        Create reports summarising the run and sample quality outputs
        """
        qual_report = reporter.Reporter(self)
        qual_report.run_quality_reporter()
        qual_report.sample_quality_report()

    def sixteens(self):
        """
        Run the 16S analyses
        """
        SixteensFull(args=self,
                     pipelinecommit=self.commit,
                     startingtime=self.starttime,
                     scriptpath=self.homepath,
                     analysistype='sixteens_full',
                     cutoff=0.95)
        metadataprinter.MetadataPrinter(inputobject=self)

    def genesippr(self):
        """
        Find genes of interest
        """
        GeneSippr(args=self,
                  pipelinecommit=self.commit,
                  startingtime=self.starttime,
                  scriptpath=self.homepath,
                  analysistype='genesippr',
                  cutoff=0.95,
                  pipeline=False,
                  revbait=False)
        metadataprinter.MetadataPrinter(inputobject=self)

    def mob_suite(self):
        """

        """
        mob = MobRecon(metadata=self.runmetadata.samples,
                       analysistype='mobrecon',
                       databasepath=self.reffilepath,
                       threads=self.cpus,
                       logfile=self.logfile,
                       reportpath=self.reportpath)
        mob.mob_recon()
        metadataprinter.MetadataPrinter(inputobject=self)

    def ressippr(self):
        """
        Resistance finding - raw reads
        """
        res = Resistance(args=self,
                         pipelinecommit=self.commit,
                         startingtime=self.starttime,
                         scriptpath=self.homepath,
                         analysistype='resfinder',
                         cutoff=0.7,
                         pipeline=False,
                         revbait=True)
        res.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def resfinder(self):
        """
        Resistance finding - assemblies
        """
        resfinder = BLAST(args=self, analysistype='resfinder_assembled')
        resfinder.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def prophages(self, cutoff=90):
        """
        Prophage detection
        :param cutoff: cutoff value to be used in the analyses
        """
        prophages = Prophages(args=self,
                              analysistype='prophages',
                              cutoff=cutoff,
                              unique=True)
        if not os.path.isfile(os.path.join(self.reportpath, 'prophages.csv')):
            prophages.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def univec(self):
        """
        Univec contamination search
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'univec.csv')):
            univec = Univec(args=self,
                            analysistype='univec',
                            cutoff=80,
                            unique=True)
            univec.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def virulence(self):
        """
        Virulence gene detection
        """
        vir = Virulence(args=self,
                        pipelinecommit=self.commit,
                        startingtime=self.starttime,
                        scriptpath=self.homepath,
                        analysistype='virulence',
                        cutoff=0.9,
                        pipeline=False,
                        revbait=True)
        if not os.path.isfile(os.path.join(self.reportpath, 'virulence.csv')):
            vir.reporter()
        metadataprinter.MetadataPrinter(inputobject=self)

    def cgmlst(self):
        """
        Run rMLST analyses on raw reads
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'cgmlst.csv')):
            cgmlst = KMAMLST(args=self,
                             pipeline=True,
                             analysistype='cgmlst',
                             cutoff=98,
                             kma_kwargs=' -cge -and')
            cgmlst.main()
        else:
            parse = ReportParse(args=self, analysistype='cgmlst')
            parse.report_parse()
        metadataprinter.MetadataPrinter(inputobject=self)

    def typing(self):
        """
        Perform analyses that use genera-specific databases
        """
        # Run modules and print metadata to file
        # MLST on assemblies
        self.mlst_assembled()
        # Assembly-based serotyping
        self.ec_typer()
        # Serotyping
        self.serosippr()
        # SeqSero
        self.seqsero()
        # Assembly-based vtyper
        self.legacy_vtyper()
        # Raw read verotoxin typing
        self.verotoxin()
        # Sistr
        self.sistr()
        # Calculate the presence/absence of GDCS
        self.run_gdcs()
        # Create a final summary report
        self.run_report()

    def mlst_assembled(self):
        """
        Run rMLST analyses on assemblies
        """
        if not os.path.isfile(os.path.join(self.reportpath, 'mlst.csv')):

            mlst = BLAST(args=self,
                         analysistype='mlst',
                         cutoff=100,
                         genus_specific=True)
            mlst.seekr()
        else:
            parse = ReportParse(args=self, analysistype='mlst')
            parse.report_parse()
        metadataprinter.MetadataPrinter(inputobject=self)

    def ec_typer(self):
        """
        Assembly-based serotyping
        """
        ec = ECTyper(metadata=self.runmetadata,
                     report_path=self.reportpath,
                     assembly_path=os.path.join(self.path, 'raw_assemblies'),
                     threads=self.cpus,
                     logfile=self.logfile)
        ec.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def serosippr(self):
        """
        Serotyping analyses
        """
        Serotype(args=self,
                 pipelinecommit=self.commit,
                 startingtime=self.starttime,
                 scriptpath=self.homepath,
                 analysistype='serosippr',
                 cutoff=0.90,
                 pipeline=True)
        metadataprinter.MetadataPrinter(inputobject=self)

    def seqsero(self):
        """
        Run SeqSero2 on Salmonella samples
        """
        seqsero = SeqSero(self)
        seqsero.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def legacy_vtyper(self):
        """
        Legacy vtyper - uses ePCR
        """
        legacy_vtyper = LegacyVtyper(inputobject=self,
                                     analysistype='legacy_vtyper',
                                     mismatches=2)
        legacy_vtyper.vtyper()
        metadataprinter.MetadataPrinter(inputobject=self)

    def verotoxin(self):
        """
        Raw read verotoxin typing
        """
        vero = Verotoxin(args=self,
                         pipeline=True,
                         analysistype='verotoxin',
                         cutoff=90)
        vero.main()

    def sistr(self):
        """
        Sistr
        """
        sistr_obj = sistr.Sistr(inputobject=self, analysistype='sistr')
        sistr_obj.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def run_gdcs(self):
        """
        Determine the presence of genomically-dispersed conserved sequences (genes from MLST, rMLST, and cgMLST
        analyses)
        """
        # Run the GDCS analysis
        gdcs = GDCS(inputobject=self)
        gdcs.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def run_report(self):
        """
        Create the final combinedMetadata report
        """
        run_report = reporter.Reporter(self)
        # Create the standard and legacy reports
        run_report.metadata_reporter()
        run_report.legacy_reporter()
        # Clean the large attributes from the metadata objects
        run_report.clean_object()

    def __init__(self, args):
        """
        Initialises the variables required for this class
        :param args: list of arguments passed to the script
        """
        self.debug = args.debug
        SetupLogging(self.debug)
        logging.info(
            'Welcome to the CFIA OLC Workflow for Bacterial Assembly and Typing (COWBAT) version {version}'
            .format(version=__version__))
        # Define variables from the arguments - there may be a more streamlined way to do this
        self.args = args
        if args.sequencepath.startswith('~'):
            self.path = os.path.abspath(
                os.path.expanduser(os.path.join(args.sequencepath)))
        else:
            self.path = os.path.abspath(os.path.join(args.sequencepath))
        self.sequencepath = self.path
        if args.referencefilepath.startswith('~'):
            self.reffilepath = os.path.expanduser(
                os.path.abspath(os.path.join(args.referencefilepath)))
        else:
            self.reffilepath = os.path.abspath(
                os.path.join(args.referencefilepath))
        self.numreads = args.numreads
        self.preprocess = args.preprocess
        # Define the start time
        self.starttime = args.startingtime
        if args.customsamplesheet:
            if args.customsamplesheet.startswith('~'):
                self.customsamplesheet = os.path.expanduser(
                    os.path.abspath(os.path.join(args.customsamplesheet)))
            else:
                self.customsamplesheet = os.path.abspath(
                    os.path.join(args.customsamplesheet))
        else:
            self.customsamplesheet = args.customsamplesheet
        if self.customsamplesheet:
            assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {css}' \
                .format(css=self.customsamplesheet)
        self.basicassembly = args.basicassembly
        if not self.customsamplesheet and not os.path.isfile(
                os.path.join(self.path, 'SampleSheet.csv')):
            self.basicassembly = True
            logging.warning(
                'Could not find a sample sheet. Performing basic assembly (no run metadata captured)'
            )
        # Use the argument for the number of threads to use, or default to the number of cpus in the system
        self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
        # Assertions to ensure that the provided variables are valid
        make_path(self.path)
        assert os.path.isdir(
            self.path
        ), 'Supplied path location is not a valid directory {0!r:s}'.format(
            self.path)
        self.reportpath = os.path.join(self.path, 'reports')
        make_path(self.reportpath)
        assert os.path.isdir(self.reffilepath), 'Reference file path is not a valid directory {0!r:s}' \
            .format(self.reffilepath)
        self.commit = __version__
        self.homepath = args.homepath
        self.logfile = os.path.join(self.path, 'logfile')
        self.runinfo = str()
        self.pipeline = True
        self.qualityobject = MetadataObject()
        # Initialise the metadata object
        self.runmetadata = MetadataObject()
 def __init__(self, args):
     """
     Initialises the variables required for this class
     :param args: list of arguments passed to the script
     """
     self.debug = args.debug
     SetupLogging(self.debug)
     logging.info(
         'Welcome to the CFIA OLC Workflow for Bacterial Assembly and Typing (COWBAT) version {version}'
         .format(version=__version__))
     # Define variables from the arguments - there may be a more streamlined way to do this
     self.args = args
     if args.sequencepath.startswith('~'):
         self.path = os.path.abspath(
             os.path.expanduser(os.path.join(args.sequencepath)))
     else:
         self.path = os.path.abspath(os.path.join(args.sequencepath))
     self.sequencepath = self.path
     if args.referencefilepath.startswith('~'):
         self.reffilepath = os.path.expanduser(
             os.path.abspath(os.path.join(args.referencefilepath)))
     else:
         self.reffilepath = os.path.abspath(
             os.path.join(args.referencefilepath))
     self.numreads = args.numreads
     self.preprocess = args.preprocess
     # Define the start time
     self.starttime = args.startingtime
     if args.customsamplesheet:
         if args.customsamplesheet.startswith('~'):
             self.customsamplesheet = os.path.expanduser(
                 os.path.abspath(os.path.join(self.customsamplesheet)))
         else:
             self.customsamplesheet = os.path.abspath(
                 os.path.join(args.customsamplesheet))
     else:
         self.customsamplesheet = args.customsamplesheet
     if self.customsamplesheet:
         assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {css}' \
             .format(css=self.customsamplesheet)
     self.basicassembly = args.basicassembly
     if not self.customsamplesheet and not os.path.isfile(
             os.path.join(self.path, 'SampleSheet.csv')):
         self.basicassembly = True
         logging.warning(
             'Could not find a sample sheet. Performing basic assembly (no run metadata captured)'
         )
     # Use the argument for the number of threads to use, or default to the number of cpus in the system
     self.cpus = (args.threads if args.threads
                  else multiprocessing.cpu_count() - 1)
     # Assertions to ensure that the provided variables are valid
     make_path(self.path)
     assert os.path.isdir(self.path), \
         'Supplied path location is not a valid directory {0!r:s}'.format(
             self.path)
     self.reportpath = os.path.join(self.path, 'reports')
     make_path(self.reportpath)
     assert os.path.isdir(self.reffilepath), 'Reference file path is not a valid directory {0!r:s}' \
         .format(self.reffilepath)
     self.commit = __version__
     self.homepath = args.homepath
     self.logfile = os.path.join(self.path, 'logfile')
     self.runinfo = str()
     self.pipeline = True
     self.qualityobject = MetadataObject()
     # Initialise the metadata object
     self.runmetadata = MetadataObject()
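A note on the path handling above: the constructors repeat the same '~'-expansion branches, and the order of the calls matters - os.path.abspath prepends the working directory, so it must run after os.path.expanduser or a leading '~' survives unexpanded. A minimal sketch of a shared helper (the name normalize_path is an assumption for illustration, not part of the pipeline):

import os


def normalize_path(raw_path):
    """
    Hypothetical helper: expand a leading '~' to the user's home directory,
    then resolve the result to an absolute path
    :param raw_path: path string supplied on the command line
    :return: absolute, user-expanded path
    """
    return os.path.abspath(os.path.expanduser(raw_path))


# Usage mirroring the constructor above:
# self.path = normalize_path(args.sequencepath)
# self.reffilepath = normalize_path(args.referencefilepath)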
 def __init__(self, path, fasta_path, records, amino_acid):
     if path.startswith('~'):
         self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
     else:
         self.path = os.path.abspath(os.path.join(path))
     if fasta_path.startswith('~'):
         self.fasta_path = os.path.abspath(
             os.path.expanduser(os.path.join(fasta_path)))
     else:
         self.fasta_path = os.path.abspath(os.path.join(fasta_path))
     self.working_path = os.path.join(self.path, 'strain_profiles')
     self.sequencepath = os.path.join(self.working_path, 'query')
     make_path(self.sequencepath)
     # Exclude the combined and custom target files; the original 'or' in
     # this condition was always true, so neither file was filtered out
     target_files = [
         fasta
         for fasta in sorted(glob(os.path.join(self.fasta_path, '*.fasta')))
         if os.path.basename(fasta) not in ('combinedtargets.fasta',
                                            'custom.tfa')
     ]
     self.query_files = list()
     # Create symlinks of the target files in the local path
     for target in target_files:
         try:
             query_file = os.path.join(
                 self.sequencepath,
                 os.path.basename(target).replace('.tfa', '.fasta'))
             self.query_files.append(query_file)
             os.symlink(target, query_file)
         except FileExistsError:
             pass
     self.targetpath = os.path.join(self.working_path, 'targets')
     make_path(self.targetpath)
     self.profilepath = os.path.join(self.working_path, 'sequence_profile')
     make_path(self.profilepath)
     self.profile_file = os.path.join(self.profilepath, 'profile.txt')
     self.target_file = os.path.join(self.targetpath,
                                     'combinedtargets.fasta')
     shutil.copyfile(src=os.path.join(self.path, 'alleles',
                                      'combinedtargets.fasta'),
                     dst=self.target_file)
     self.reportpath = os.path.join(self.working_path, 'reports')
     make_path(self.reportpath)
     self.strain_profile_path = os.path.join(self.working_path,
                                             'strain_profiles')
     make_path(self.strain_profile_path)
     self.profile_report = os.path.join(self.strain_profile_path,
                                        'profiles.tsv')
     self.cpus = multiprocessing.cpu_count() - 1
     self.starttime = time()
     self.start = self.starttime
     self.runmetadata = MetadataObject()
     self.runmetadata.samples = list()
     self.records = records
     # Set the BLAST program based on whether the analysis uses amino acid
     # (tBLASTn) or nucleotide (BLASTn) sequences
     self.amino_acid = amino_acid if amino_acid else None
     self.program = 'tblastn' if self.amino_acid else 'blastn'
     # Fields used for custom outfmt 6 BLAST output:
     self.fieldnames = [
         'query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
         'evalue', 'bit_score', 'query_length', 'subject_length',
         'alignment_length', 'query_start', 'query_end', 'subject_start',
         'subject_end', 'query_sequence', 'subject_sequence'
     ]
     self.extended_fieldnames = self.fieldnames.copy()
     self.extended_fieldnames.insert(14, 'percent_match')
     self.outfmt = '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length ' \
                   'qstart qend sstart send qseq sseq'
     self.blast_reports = list()
     self.profile_dict = dict()
     self.profile_data = dict()
     self.profile_set = list()
     self.sequence_profile = dict()
     self.profile_matches = dict()
     self.new_profiles = list()
     # Header string used to format both the profile file and the reports
     genes = '\t'.join(sorted(self.records))
     self.data = 'ST\t{genes}\n'.format(genes=genes.rstrip())
     self.gene_names = list()
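The fieldnames list above mirrors, column for column, the custom outfmt 6 specifiers in self.outfmt, so the tab-separated BLAST reports can be parsed directly with csv.DictReader. A minimal sketch (the fabricated report row and the percent_match formula are illustrative assumptions; the pipeline's exact calculation is not shown here):

import csv
import io

fieldnames = [
    'query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
    'evalue', 'bit_score', 'query_length', 'subject_length',
    'alignment_length', 'query_start', 'query_end', 'subject_start',
    'subject_end', 'query_sequence', 'subject_sequence'
]

# A single fabricated row stands in for a real BLAST .tsv report
report = io.StringIO(
    'allele_1\tcontig_7\t470\t5\t0\t0.0\t869\t475\t5000\t475\t1\t475\t'
    '1200\t1674\tATG...\tATG...\n')
for row in csv.DictReader(report, fieldnames=fieldnames, delimiter='\t'):
    # One plausible definition of percent_match: identical positions over
    # the alignment length
    percent_match = 100 * int(row['identical']) / int(row['alignment_length'])
    print(row['query_id'], row['subject_id'], round(percent_match, 2))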
Exemple #22
0
        def __init__(self):

            # Get the current version of the pipeline from git
            # Extract the path of the current script from the full path + file name
            homepath = os.path.split(os.path.abspath(__file__))[0]
            # Determine the version by changing to the directory containing
            # the script and returning the most recent git tag. Decode the
            # bytes returned by subprocess so the version string renders
            # cleanly in Python 3
            commit = subprocess.Popen(
                'cd {} && git tag | tail -n 1'.format(homepath),
                shell=True,
                stdout=subprocess.PIPE).communicate()[0].decode().rstrip()
            # Parser for arguments
            parser = ArgumentParser(
                description='Filter reads based on taxonomic assignment')
            parser.add_argument('-v',
                                '--version',
                                action='version',
                                version='%(prog)s commit {}'.format(commit))
            parser.add_argument('path', help='Specify path')
            parser.add_argument(
                '-t',
                '--threads',
                help='Number of threads. Default is the number of cpus in '
                     'the system')
            parser.add_argument('-s',
                                '--sequencepath',
                                required=True,
                                help='Path of .fastq(.gz) files to process.')
            parser.add_argument(
                '-d',
                '--datapath',
                required=True,
                help='Path of .csv files created by CLARK with read ID, '
                     'length, and assignment.')
            # type=float ensures a user-supplied cutoff is numeric; left as
            # a string, args.cutoff * 100 below would repeat the string 100
            # times rather than scale the value
            parser.add_argument(
                '-c',
                '--cutoff',
                type=float,
                default=0.01,
                help='Cutoff value for deciding which taxIDs to use when '
                     'sorting .fastq files. Defaults to 1 percent. Please '
                     'note that you must use a decimal format: enter 0.05 '
                     'to get a 5 percent cutoff value')
            parser.add_argument(
                '-x',
                '--taxids',
                help='NOT IMPLEMENTED: CSV of desired taxIDs from each sample')
            # Get the arguments into an object
            args = parser.parse_args()
            self.start = time()
            # Define variables based on supplied arguments
            self.path = os.path.join(args.path)
            assert os.path.isdir(self.path), \
                'Supplied path is not a valid directory {path}'.format(
                    path=self.path)
            self.sequencepath = os.path.join(args.sequencepath)
            assert os.path.isdir(self.sequencepath), 'Sequence location supplied is not a valid directory {seq_path}' \
                .format(seq_path=self.sequencepath)
            self.datapath = os.path.join(args.datapath)
            self.reportpath = os.path.join(self.path, 'reports')
            # Use the argument for the number of threads to use, or default to the number of cpus in the system
            self.cpus = (args.threads if args.threads
                         else multiprocessing.cpu_count())
            # Convert the decimal cutoff value to a percentage
            self.cutoff = args.cutoff * 100
            # Run the pipeline
            self.runmetadata = MetadataObject()
            genome = FilterGenome(self)
            genome.objectprep()
            logging.info('Filtering complete')
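Why type=float matters on --cutoff: without it, a value supplied on the command line arrives as a string, and args.cutoff * 100 repeats that string 100 times instead of scaling it; only the untouched numeric default of 0.01 behaved correctly. A minimal, self-contained demonstration (the parser and values here are illustrative only):

from argparse import ArgumentParser

parser = ArgumentParser(description='Demonstrate numeric coercion of --cutoff')
# With type=float, '-c 0.05' is parsed as the float 0.05
parser.add_argument('-c', '--cutoff', type=float, default=0.01)
args = parser.parse_args(['-c', '0.05'])

# 0.05 * 100 == 5.0 (a 5 percent cutoff); as the string '0.05',
# multiplication would have produced '0.050.050.05...' instead
print(args.cutoff * 100)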