Beispiel #1
0
 def __init__(self, allelepath, targetpath, reportpath, gene):
     """
     Resolve the supplied paths and initialise the attributes used by the allele attributer

     :param allelepath: Path to the folder containing the allele (.tfa) files
     :param targetpath: Path to the folder containing the target (.fasta) file
     :param reportpath: Path in which the 'reports' folder is to be created
     :param gene: Name of the gene; used as the prefix of the output allele files
     :raises IndexError: if no .fasta file is present in the target path
     """
     logging.info('Welcome to the CFIA Allele Attributer')
     # The user directory must be expanded BEFORE the path is made absolute: abspath prefixes the current working
     # directory to '~/...', after which expanduser no longer sees a leading '~'. expanduser leaves paths without
     # a leading '~' untouched, so no startswith check is required
     self.allelepath = os.path.abspath(
         os.path.expanduser(os.path.join(allelepath)))
     self.allelefiles = glob(os.path.join(self.allelepath, '*.tfa'))
     self.alleleset = set()
     self.targetpath = os.path.abspath(
         os.path.expanduser(os.path.join(targetpath)))
     # Use the first (alphabetically sorted) FASTA file in the target path
     targetfiles = sorted(glob(os.path.join(self.targetpath, '*.fasta')))
     if not targetfiles:
         raise IndexError(
             'Cannot find a .fasta target file in the supplied target path: {tp}'
             .format(tp=self.targetpath))
     self.targetfile = targetfiles[0]
     self.targetrecords = dict()
     # Reports are always written to a 'reports' sub-folder of the supplied report path
     self.reportpath = os.path.abspath(
         os.path.expanduser(os.path.join(reportpath, 'reports')))
     make_path(self.reportpath)
     self.gene = gene
     self.unaligned_alleles = os.path.join(
         self.reportpath,
         '{gene}_unaligned_alleles.fasta'.format(gene=self.gene))
     self.aligned_alleles = os.path.join(
         self.reportpath,
         '{gene}_aligned_alleles.fasta'.format(gene=self.gene))
     self.attributedalleles = list()
     self.complete = set()
Beispiel #2
0
 def listread(self):
     """
     Worker loop: for every sample taken from self.listqueue, write one text file per taxID listing the reads
     binned to that taxID, and record the names of the read list and filtered FASTQ files on the sample
     """
     while True:
         sample = self.listqueue.get()
         general = sample.general
         # Folder holding the per-taxID read lists and filtered FASTQ files
         sorted_dir = os.path.join(general.outputdirectory, 'sortedFastq')
         general.sortedfastqpath = sorted_dir
         make_path(sorted_dir)
         # Dictionaries of taxID: file name
         general.fastqlist = dict()
         general.filteredfastq = dict()
         for taxid in general.taxids:
             list_name = '{sn}_{taxid}.txt'.format(sn=sample.name, taxid=taxid)
             fastq_name = '{sn}_{taxid}.fastq.gz'.format(sn=sample.name,
                                                         taxid=taxid)
             # Text file storing the names of all the reads associated with the taxID
             general.fastqlist[taxid] = os.path.join(sorted_dir, list_name)
             # Compressed FASTQ file that will store the filtered reads
             general.filteredfastq[taxid] = os.path.join(sorted_dir, fastq_name)
             # Write the unique read names, one per line
             with open(general.fastqlist[taxid], 'w') as binned:
                 binned.write('\n'.join(set(sample[taxid].readlist)))
         self.listqueue.task_done()
Beispiel #3
0
    def getrmlsthelper(self):
        """
        Download the rMLST profile and alleles with rest_auth_class.REST (a Python class modified from
        https://github.com/kjolley/BIGSdb/tree/develop/scripts/test), and combine the downloaded allele files
        """

        printtime('Downloading {} alleles'.format(self.analysistype),
                  self.start)
        # Folder containing the current script; the secret file is expected alongside it
        homepath = os.path.dirname(os.path.abspath(__file__))
        # Folder to contain the new alleles and profile
        newfolder = os.path.join(self.path, self.analysistype)
        make_path(newfolder)
        # Create an object with the arguments required by the rest_auth_class script. The parser is instantiated so
        # that the settings are stored on this object only; assigning them onto the ArgumentParser class itself
        # would share them with every other user of the class
        args = ArgumentParser()
        args.secret_file = os.path.join(homepath, 'secret.txt')
        args.file_path = homepath
        args.output_path = newfolder
        args.start = self.start
        rmlst = rest_auth_class.REST(args)
        # Download the profile and alleles
        rmlst.main()

        # Get the new alleles into a list, and create the combinedAlleles file
        alleles = glob(os.path.join(newfolder, '*.tfa'))
        self.combinealleles(newfolder, alleles)
 def __init__(self, sequencepath, reportpath):
     """
     Resolve the sequence and report paths, and initialise the attributes used for virus typing

     :param sequencepath: Path to the folder containing the sequence files; must exist
     :param reportpath: Path to the folder in which reports are to be created
     """
     # Expand the user directory (if present), and convert the sequence path to an absolute path
     seq_path = os.path.join(sequencepath)
     if sequencepath.startswith('~'):
         seq_path = os.path.expanduser(seq_path)
     self.sequencepath = os.path.abspath(seq_path)
     seq_path_message = 'Cannot locate supplied sequence path: {seq_path}'.format(seq_path=self.sequencepath)
     assert os.path.isdir(self.sequencepath), seq_path_message
     # Treat the report path in the same fashion, and create it as required
     rep_path = os.path.join(reportpath)
     if reportpath.startswith('~'):
         rep_path = os.path.expanduser(rep_path)
     self.reportpath = os.path.abspath(rep_path)
     make_path(self.reportpath)
     assert os.path.isdir(self.reportpath), \
         'Could not create the requested report directory: {rep_path}'.format(rep_path=self.reportpath)
     # Containers populated by the analyses
     self.max_allele = 0
     self.samples = []
     self.allele_dict = {}
     self.record_dict = {}
     self.output_dict = {}
     self.new_alleles = {}
     self.primer_sequences = {}
     self.json_report = os.path.join(self.reportpath, 'virus_typer_outputs.json')
     # The accessory files are located in the same folder as this file
     self.homepath = os.path.dirname(os.path.abspath(__file__))
     self.forward_primers = os.path.join(self.homepath, 'forward_typing_primers.fasta')
     self.reverse_primers = os.path.join(self.homepath, 'reverse_typing_primers.fasta')
     self.allele_database = os.path.join(self.homepath, 'virus_typer_alleles.fasta')
 def query_prep(self):
     """
     Create a metadata object for every query FASTA file (other than combinedtargets.fasta) in self.query_path,
     and append it to self.runmetadata.samples
     """
     logging.info('Preparing query files')
     for fasta in sorted(glob(os.path.join(self.query_path, '*.fasta'))):
         name = os.path.splitext(os.path.basename(fasta))[0]
         # The combined targets file is not a query sequence
         if name == 'combinedtargets':
             continue
         # Create and populate the metadata object for the sample
         metadata = MetadataObject()
         metadata.samples = list()
         metadata.name = name
         metadata.general = GenObject()
         metadata.commands = GenObject()
         metadata.alleles = GenObject()
         # Each sample has its own output folder within the query path
         output_dir = os.path.join(self.query_path, metadata.name)
         metadata.alleles.outputdirectory = output_dir
         # Name of the BLAST output file
         report_name = '{seq_id}.tsv'.format(seq_id=metadata.name)
         metadata.alleles.blast_report = os.path.join(output_dir, report_name)
         # Remove any BLAST report remaining from a previous run
         try:
             os.remove(metadata.alleles.blast_report)
         except FileNotFoundError:
             pass
         make_path(output_dir)
         # Link the query file into the output folder
         metadata.general.bestassemblyfile = relative_symlink(
             src_file=fasta,
             output_dir=output_dir,
             export_output=True)
         metadata.samples.append(metadata)
         self.runmetadata.samples.append(metadata)
 def __init__(self, profile, names):
     """
     Resolve the profile and gene name files, and set the names of the output files

     :param profile: Path to the profile file; must exist
     :param names: Path to the file containing the gene names; must exist
     """
     logging.info('Welcome to profile reducer!')
     # Expand the user directory (if present), and convert the profile to an absolute path
     profile_file = os.path.join(profile)
     if profile.startswith('~'):
         profile_file = os.path.expanduser(profile_file)
     self.profile = os.path.abspath(profile_file)
     assert os.path.isfile(self.profile), f'Cannot find the supplied profile {self.profile}'
     # The outputs are written to a 'reports' folder beside the profile
     self.report_path = os.path.join(os.path.dirname(self.profile), 'reports')
     make_path(self.report_path)
     self.reduced_profile = os.path.join(self.report_path, 'profile.txt')
     self.notes_file = os.path.join(self.report_path, 'reducing_notes.txt')
     # Treat the gene names file in the same fashion as the profile
     name_file = os.path.join(names)
     if names.startswith('~'):
         name_file = os.path.expanduser(name_file)
     self.name_file = os.path.abspath(name_file)
     assert os.path.isfile(self.name_file), f'Cannot find the supplied file with gene names: {self.name_file}'
     # Containers populated by the analyses
     self.names = []
     self.profile_dict = {}
     self.allele_dict = {}
Beispiel #7
0
 def allelealigner(self):
     """
     Perform a multiple sequence alignment of the allele sequences

     Starts self.cpus daemon threads running self.alignthreads, queues one Clustal Omega command per allele file
     of every sample, and blocks until the queue has been processed
     """
     logging.info('Aligning alleles')
     # Create the threads for the analysis. The daemon flag is supplied to the constructor, as Thread.setDaemon()
     # is deprecated
     for _ in range(self.cpus):
         worker = Thread(target=self.alignthreads, args=(), daemon=True)
         worker.start()
     for sample in self.samples:
         sample.alignpath = os.path.join(self.path, 'alignedalleles')
         make_path(sample.alignpath)
         # Create a list to store the names of the aligned allele files
         sample.alignedalleles = list()
         for outputfile in sample.allelefiles:
             # The aligned file has the same name as the unaligned file, but is located in the alignment folder
             aligned = os.path.join(sample.alignpath,
                                    os.path.basename(outputfile))
             sample.alignedalleles.append(aligned)
             # Create the command line call
             clustalomega = ClustalOmegaCommandline(infile=outputfile,
                                                    outfile=aligned,
                                                    threads=4,
                                                    auto=True)
             sample.clustalomega = str(clustalomega)
             self.queue.put((sample, clustalomega, outputfile, aligned))
     # Wait for the worker threads to process every queued alignment
     self.queue.join()
 def __init__(self, path, outputpath, accessiontable, threads, sleeptime):
     """
     Resolve the working paths, validate the supplied arguments, and create the download queue

     :param path: Path to the folder containing the metadata table
     :param outputpath: Path to the folder in which downloads are to be stored. Defaults to a 'downloads' folder
     within path if not supplied
     :param accessiontable: Name of the pathogen metadata table within path; must exist
     :param threads: Maximum size of the download queue
     :param sleeptime: Number of seconds to sleep; if supplied, must be between 10 and 86400 (inclusive)
     """
     # expanduser leaves paths without a leading '~' untouched, so it can be applied unconditionally
     self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
     if outputpath:
         self.outputpath = os.path.abspath(
             os.path.expanduser(os.path.join(outputpath)))
     else:
         self.outputpath = os.path.join(self.path, 'downloads')
     make_path(self.outputpath)
     self.metadatatable = os.path.join(self.path, accessiontable)
     assert os.path.isfile(self.metadatatable), \
         'Cannot find supplied pathogen metadata table {at} in supplied path {sp}'.format(at=self.metadatatable,
                                                                                          sp=self.path)
     self.threads = threads
     self.sleeptime = sleeptime
     if self.sleeptime:
         # The limits are inclusive, as described by the messages: exactly 10 seconds and exactly 24 hours are
         # both acceptable
         assert self.sleeptime >= 10, 'Must sleep at least 10 seconds'
         assert self.sleeptime <= 86400, 'Cannot sleep for more than 24 hours'
     self.assembly_dict = dict()
     self.queue = Queue(maxsize=self.threads)
     logging.info('Starting pathogen assembly download using {at}'.format(
         at=self.metadatatable))
 def __init__(self, sequencepath, reportpath):
     """
     Resolve the sequence and report paths, and initialise the queues and attributes used by the analyses

     :param sequencepath: Path to the folder containing the sequence files; must exist
     :param reportpath: Path to the folder in which reports are to be created
     """
     # Expand the user directory (if present), and convert the sequence path to an absolute path
     seq_path = os.path.join(sequencepath)
     if sequencepath.startswith('~'):
         seq_path = os.path.expanduser(seq_path)
     self.path = os.path.abspath(seq_path)
     assert os.path.isdir(self.path), \
         'Cannot locate supplied sequence path: {seq_path}'.format(seq_path=self.path)
     self.sequencepath = self.path
     # Treat the report path in the same fashion, and create it as required
     rep_path = os.path.join(reportpath)
     if reportpath.startswith('~'):
         rep_path = os.path.expanduser(rep_path)
     self.reportpath = os.path.abspath(rep_path)
     make_path(self.reportpath)
     assert os.path.isdir(self.reportpath), \
         'Could not create the requested report directory: {rep_path}'.format(rep_path=self.reportpath)
     # The start time is defined for legacy code compatibility
     self.starttime = time.time()
     self.logfile = os.path.join(self.path, 'log')
     # Leave one CPU free; both queues are limited to the number of CPUs used
     self.cpus = multiprocessing.cpu_count() - 1
     self.sketchqueue = Queue(maxsize=self.cpus)
     self.mashqueue = Queue(maxsize=self.cpus)
     # Folder containing this file - will be used to find the necessary accessory files
     self.homepath = os.path.dirname(os.path.abspath(__file__))
     # NOTE(review): the reference sketch is read from a fixed network location rather than from
     # os.path.join(self.homepath, 'toxoplasma.msh')
     self.reference_mash_sketch_file = '/mnt/nas2/redmine/bio_requests/17874/reference_sequences/toxoplasma.msh'
     self.metadata = []
     self.output_dict = {}
     self.json_report = os.path.join(self.reportpath, 'para_typer_outputs.json')
    def objects(self):
        """
        Create the run metadata, link every assembly into the BestAssemblies folder, and set the attributes
        required by the downstream analyses

        :return: None
        """
        self.runmetadata = ObjectCreation(inputobject=self)
        assembly_dir = os.path.join(self.path, 'BestAssemblies')
        make_path(assembly_dir)
        for sample in self.runmetadata.samples:
            general = sample.general
            # The assemblies must be linked to the BestAssemblies folder for GenomeQAML
            relative_symlink(general.bestassemblyfile, assembly_dir)
            # The assembly is supplied in place of the FASTQ files for downstream analyses
            general.trimmedcorrectedfastqfiles = [general.bestassemblyfile]
Beispiel #11
0
 def __init__(self, spectra_path, filename, start_time, outputpath, classic,
              extensions):
     """
     Resolve the spectra folder and the Excel file with the renaming information, and create the output folder

     The Excel file is searched for at the supplied path, then in the spectra folder, and finally in the parent
     folder of the spectra folder

     :param spectra_path: Path to .spa/.spc files
     :param filename: Path to .xls(x) file with renaming information.
     :param start_time: Time the analyses started
     :param outputpath: Path to folder in which the renamed files are to be stored
     :param classic: BOOL whether to use the "classic" method of file renaming.
     :param extensions: Extensions of the files to rename (stored unchanged as self.extensions)
     """
     SetupLogging()
     # Define variables based on supplied arguments. expanduser leaves paths without a leading '~' untouched, so
     # it can be applied unconditionally
     self.spectra_path = os.path.abspath(
         os.path.expanduser(os.path.join(spectra_path)))
     assert os.path.isdir(self.spectra_path), 'Supplied sequence path is not a valid directory {0!r:s}'\
         .format(self.spectra_path)
     self.file = os.path.abspath(
         os.path.expanduser(os.path.join(filename)))
     # If the path to the file wasn't provided, check the spectra folder
     if not os.path.isfile(self.file):
         self.file = os.path.join(self.spectra_path, filename)
     # If the file still can't be found, check the parental folder of the spectra folder
     if not os.path.isfile(self.file):
         self.file = os.path.join(os.path.dirname(self.spectra_path),
                                  filename)
     self.start = start_time
     assert os.path.isfile(self.file), 'Cannot find the supplied Excel file ({0!r:s}) with the file information. ' \
                                       'Please ensure that this file is in the path, and there\'s no spelling ' \
                                       'mistakes'.format(self.file)
     # Set the output path
     self.outputpath = os.path.join(outputpath)
     # Create the output path as required
     make_path(self.outputpath)
     # Determine the naming scheme
     self.classic = classic
     # Variable for extensions of files to rename
     self.extensions = extensions
     # Create class variable
     self.metadata = list()
 def aa_allele_prep(self):
     """
     Load the amino acid allele database of every gene into self.aa_allele_dict, creating the database file of
     a gene with self.initialise_aa_alleles if it does not exist yet
     """
     # Create the amino acid allele database folder as required
     make_path(self.aa_allele_path)
     for gene in self.gene_names:
         # Use the first file matching the gene name; create the file if there are no matches
         matches = glob(os.path.join(self.aa_allele_path, f'{gene}*.*fa*'))
         if matches:
             allele_file = matches[0]
         else:
             allele_file = self.initialise_aa_alleles(gene=gene)
         # Store the sequence of every amino acid record in the database file
         self.aa_allele_dict.update(
             (record.id, str(record.seq))
             for record in SeqIO.parse(allele_file, 'fasta')
         )
 def __init__(self, path, amino_acid):
     """
     Set the names of the working folders and files beneath the supplied path, create the output folders, and
     remove the outputs of previous analyses

     :param path: Path to the folder containing the alleles, profile, and query folders
     :param amino_acid: BOOL whether the combined targets file is located in the amino acid allele folder rather
     than the nucleotide allele folder
     """
     # Expand the user directory (if present), and convert the path to an absolute path
     base_path = os.path.join(path)
     if path.startswith('~'):
         base_path = os.path.expanduser(base_path)
     self.path = os.path.abspath(base_path)
     # Allele and profile folders/files
     self.allele_path = os.path.join(self.path, 'alleles')
     self.aa_allele_path = os.path.join(self.path, 'aa_alleles')
     self.profile_path = os.path.join(self.path, 'profile')
     self.aa_profile_path = os.path.join(self.path, 'aa_profile')
     make_path(self.profile_path)
     self.profile_file = os.path.join(self.profile_path, 'profile.txt')
     self.aa_profile_file = os.path.join(self.aa_profile_path, 'aa_profile.txt')
     # Query and report folders
     self.query_path = os.path.join(self.path, 'query')
     self.report_path = os.path.join(self.path, 'reports')
     self.aa_report_path = os.path.join(self.path, 'aa_reports')
     for output_folder in (self.report_path, self.aa_report_path):
         make_path(output_folder)
     # Remove every FASTA file present in the report folder
     for novel_allele in glob(os.path.join(self.report_path, '*.fasta')):
         os.remove(novel_allele)
     self.aa_notes_path = os.path.join(self.path, 'aa_notes')
     make_path(self.aa_notes_path)
     self.aa_profile_notes = os.path.join(self.aa_notes_path, 'aa_profile_notes.tsv')
     # The combined targets file is located in the allele folder matching the molecule type
     self.amino_acid = amino_acid
     targets_folder = self.aa_allele_path if self.amino_acid else self.allele_path
     self.combined_targets = os.path.join(targets_folder, 'combinedtargets.fasta')
     self.gene_names = []
     self.runmetadata = MetadataObject()
     self.runmetadata.samples = []
     self.cpus = multiprocessing.cpu_count() - 1
     self.profile_report = os.path.join(self.report_path, 'profiles.tsv')
     self.aa_profile_report = os.path.join(self.aa_report_path, 'aa_profiles.tsv')
     # Remove the profile report of a previous analysis, if present
     try:
         os.remove(self.profile_report)
     except FileNotFoundError:
         pass
     # Fields used for custom outfmt 6 BLAST output:
     self.fieldnames = [
         'query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
         'evalue', 'bit_score', 'query_length', 'subject_length', 'alignment_length',
         'query_start', 'query_end', 'subject_start', 'subject_end',
         'query_sequence', 'subject_sequence',
     ]
     # The extended fields have the additional percent_match field immediately before the sequences
     self.extended_fieldnames = self.fieldnames[:14] + ['percent_match'] + self.fieldnames[14:]
     self.outfmt = (
         '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length '
         'qstart qend sstart send qseq sseq'
     )
     # A string of the header to use for formatting the profile file, and the report headers
     self.data = ''
     self.aa_allele_dict = {}
     self.aa_nt_allele_link_dict = {}
 def __init__(self, path, profile, one_based):
     """
     Locate the sequence files and (optionally) the profile file, and create the output folders

     :param path: Path to the folder containing the .fasta sequence files
     :param profile: If truthy, profile/profile.txt must exist within path; otherwise no profile file is used
     :param one_based: Stored unchanged as self.one_based
     """
     logging.info('Welcome to the allele translator!')
     # Expand the user directory (if present), and convert the path to an absolute path
     base_path = os.path.join(path)
     if path.startswith('~'):
         base_path = os.path.expanduser(base_path)
     self.path = os.path.abspath(base_path)
     if not profile:
         self.profile_file = None
     else:
         self.profile_file = os.path.join(self.path, 'profile', 'profile.txt')
         assert os.path.isfile(self.profile_file), \
             'Cannot locate the required profile file: {profile}. Please ' \
             'ensure that the file name and path of your file is correct'.format(profile=self.profile_file)
     self.one_based = one_based
     self.sequence_files = glob(os.path.join(self.path, '*.fasta'))
     # Folders for the translated alleles and the notes
     self.translated_path = os.path.join(self.path, 'aa_alleles')
     self.notes_path = os.path.join(self.path, 'notes')
     make_path(inpath=self.translated_path)
     make_path(inpath=self.notes_path)
     self.allele_dict = {}
     self.profile_data = {}
     self.allele_links = {}
     # Folder and files for the amino acid profile
     self.aa_profile_path = os.path.join(self.path, 'aa_profile')
     make_path(self.aa_profile_path)
     self.aa_profile_file = os.path.join(self.aa_profile_path, 'aa_profile.txt')
     self.gene_names = set()
     self.gene_name_file = os.path.join(self.aa_profile_path, 'gene_names.txt')
     self.aa_profile_data = {}
     self.profile_matches = {}
     self.aa_nt_profile_link_file = os.path.join(
         self.aa_profile_path, 'reports', 'aa_nt_profile_links.tsv')
Beispiel #15
0
 def __init__(self, inputobject, extension='fasta', light=False):
     """
     Build the arguments required by CLARK from the supplied object, and run CLARK

     :param inputobject: Object with the path, reffilepath, cpus, runmetadata, commit, starttime, and homepath
     attributes
     :param extension: Passed to CLARK as args.extension
     :param light: Passed to CLARK as args.light
     """
     # Object mimicking the command line arguments necessary for the script
     args = MetadataObject()
     # The sequences are located in the supplied path
     args.path = inputobject.path
     args.sequencepath = inputobject.path
     # The database is stored in the 'clark' folder of the reference file path
     args.databasepath = os.path.join(inputobject.reffilepath, 'clark')
     make_path(args.databasepath)
     # Path relative to the folder containing the CLARK executable
     args.clarkpath = os.path.dirname(which('CLARK')) + '/../opt/clark/'
     # Fixed analysis settings
     args.cutoff = 0.005
     args.database = 'bacteria'
     args.rank = 'species'
     args.filter = False
     args.threads = inputobject.cpus
     args.runmetadata = inputobject.runmetadata
     args.clean_seqs = False
     args.reffilepath = inputobject.reffilepath
     args.extension = extension
     args.light = light
     # Run CLARK
     CLARK(args,
           inputobject.commit,
           inputobject.starttime,
           inputobject.homepath)
Beispiel #16
0
 def __init__(self, path, targetfile, min_length, max_length, cutoff,
              perc_gc, blast, one_based):
     """
     Locate the target FASTA file, create the report folder, and store the supplied settings

     :param path: Path to the folder containing the target file
     :param targetfile: Name of the FASTA file within path; must exist
     :param min_length: Stored as self.min
     :param max_length: Stored as self.max
     :param cutoff: Stored as self.cutoff
     :param perc_gc: Stored as self.perc_gc
     :param blast: Stored as self.blast
     :param one_based: Stored as self.one_based
     """
     # Expand the user directory (if present), and convert the path to an absolute path
     base_path = os.path.join(path)
     if path.startswith('~'):
         base_path = os.path.expanduser(base_path)
     self.path = os.path.abspath(base_path)
     self.file = os.path.join(self.path, targetfile)
     missing_file = 'Cannot find the supplied FASTA file: {fn}'.format(fn=self.file)
     assert os.path.isfile(self.file), missing_file
     # Only the report folder is created here
     self.reportpath = os.path.join(self.path, 'reports')
     self.probepath = os.path.join(self.path, 'probes')
     make_path(self.reportpath)
     # Supplied settings
     self.min, self.max = min_length, max_length
     self.cutoff = cutoff
     self.perc_gc = perc_gc
     self.blast = blast
     self.one_based = one_based
     self.cpus = multiprocessing.cpu_count()
     self.queue = Queue()
     self.samples = []
Beispiel #17
0
def test_sistr_seqsero():
    """
    Run the sistr and seqsero analyses on a single sample (NC_003198.fasta), and check the cgMLST genome match
    and the predicted serotype
    """
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(var.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    metadata.general.trimmedcorrectedfastqfiles = [
        os.path.join(var.sequencepath, 'seqsero',
                     '2014-SEQ-1049_seqsero.fastq.gz')
    ]
    # Set the destination folder
    outputdir = os.path.join(var.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.general.logout = os.path.join(outputdir, 'out')
    metadata.general.logerr = os.path.join(outputdir, 'err')
    metadata.run.outputdirectory = outputdir
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        # Either of the two genomes is an acceptable match
        assert sample.sistr.cgmlst_genome_match in ('ERR586739', 'SAL_BA2732AA')
    method.seqsero()
    for sample in method.runmetadata.samples:
        assert sample.seqsero.predicted_serotype == '- 9:f,g,t:-'
    variable_update()
Beispiel #18
0
 def reports(self):
     """
     Create an Excel report from the abundance estimation

     For every sample, the rows of the abundance file with a 'Proportion_All(%)' value greater than self.cutoff
     are written to the report, sorted from highest to lowest count. For .fasta analyses, the total number of
     base pairs classified to each TaxID is calculated from the classification file, and added to the report
     """
     logging.info(
         'Creating CLARK report for {ft} files'.format(ft=self.extension))
     # Create a workbook to store the report. Using xlsxwriter rather than a simple csv format, as I want to be
     # able to have appropriately sized, multi-line cells
     workbook = xlsxwriter.Workbook(self.report)
     make_path(self.reportpath)
     # New worksheet to store the data
     worksheet = workbook.add_worksheet()
     # Add a bold format for header cells. Using a monotype font size 8
     bold = workbook.add_format({
         'bold': True,
         'font_name': 'Courier New',
         'font_size': 8
     })
     bold.set_align('center')
     # Format for data cells. Monotype, size 8, top vertically justified
     courier = workbook.add_format({
         'font_name': 'Courier New',
         'font_size': 8
     })
     courier.set_align('top')
     # Set the custom width for 5 and 6 to be 15
     worksheet.set_column(5, 5, 15)
     worksheet.set_column(6, 6, 20)
     # Initialise the position within the worksheet to be (0,0)
     row = 0
     col = 0
     # List of the headers to use
     headers = [
         'Strain', 'Name', 'TaxID', 'Lineage', 'Count', 'Proportion_All(%)',
         'Proportion_Classified(%)'
     ]
     # Add an additional header for .fasta analyses
     if self.extension == 'fasta':
         headers.insert(4, 'TotalBP')
     # Populate the headers
     for category in headers:
         # Write the data in the specified cell (row, col) using the bold format
         worksheet.write(row, col, category, bold)
         # Move to the next column to write the next category
         col += 1
     # Data starts in row 1
     row = 1
     # Initialise variables to hold the longest names; used in setting the column width
     longeststrain = 0
     longestname = 0
     longestlineage = 0
     # Extract all the taxonomic groups that pass the cutoff from the abundance file
     for sample in self.runmetadata.samples:
         # Every record starts at column 0
         col = 0
         # Write the strain name
         worksheet.write(row, col, sample.name, courier)
         col += 1
         # Initialise a list to store the species above the cutoff in the sample
         sample.general.passfilter = list()
         try:
             # Read the abundance file as dictionaries; the context manager ensures that the file is closed
             with open(sample.general.abundance) as abundancefile:
                 # Filter abundance to taxIDs with at least self.cutoff% of the total proportion
                 for result in DictReader(abundancefile):
                     # The UNKNOWN category doesn't contain a 'Lineage' column, and therefore, subsequent columns
                     # are shifted out of proper alignment, and do not contain the appropriate data
                     try:
                         if float(result['Proportion_All(%)']) > self.cutoff:
                             sample.general.passfilter.append(result)
                     except ValueError:
                         pass
             # Determine the longest name of all the strains, and use it to set the width of column 0
             if len(sample.name) > longeststrain:
                 longeststrain = len(sample.name)
                 worksheet.set_column(0, 0, longeststrain)
             # Sort the abundance results based on the highest count
             sortedabundance = sorted(sample.general.passfilter,
                                      key=lambda x: int(x['Count']),
                                      reverse=True)
             # Set of contigs from the classification file. For some reason, certain contigs are represented multiple
             # times in the classification file. As far as I can tell, these multiple representations are always
             # classified the same, and, therefore, should be treated as duplicates, and ignored
             contigset = set()
             for result in sortedabundance:
                 # Add the total number of base pairs classified for each TaxID. As only the total number of contigs
                 # classified as a particular TaxID are in the report, it can be misleading if a large number
                 # of small contigs are classified to a particular TaxID e.g. 56 contigs map to TaxID 28901, and 50
                 # contigs map to TaxID 630, however, added together, those 56 contigs are 4705838 bp, while the 50
                 # contigs added together are only 69602 bp. While this is unlikely a pure culture, only
                 # 69602 / (4705838 + 69602) = 1.5% of the total bp map to TaxID 630 compared to 45% of the contigs
                 if self.extension == 'fasta':
                     # Initialise a variable to store the total bp mapped to the TaxID
                     result['TotalBP'] = int()
                     # Read the classification file as dictionaries; closed once every contig has been processed
                     with open(sample.general.classification) as classificationfile:
                         # Read through each contig classification in the file
                         for contig in DictReader(classificationfile):
                             # Pull out each contig with a TaxID that matches the TaxID of the result of interest,
                             # and is not present in a set of contigs that have already been added
                             if result['TaxID'] == contig[
                                     ' Assignment'] and contig[
                                         'Object_ID'] not in contigset:
                                 # Increment the total bp mapping to the TaxID by the integer of each contig
                                 result['TotalBP'] += int(contig[' Length'])
                                 # Avoid duplicates by adding the contig name to the set of contigs
                                 contigset.add(contig['Object_ID'])
                 # Print the results to file
                 # Ignore the first header, as it is the strain name, which has already been added to the report
                 dictionaryheaders = headers[1:]
                 for header in dictionaryheaders:
                     data = result[header]
                     worksheet.write(row, col, data, courier)
                     col += 1
                     # Determine the longest name of all the matches, and use it to set the width of column 1
                     if len(result['Name']) > longestname:
                         longestname = len(result['Name'])
                         worksheet.set_column(1, 1, longestname)
                     # Do the same for the lineages
                     if len(result['Lineage']) > longestlineage:
                         longestlineage = len(result['Lineage'])
                         worksheet.set_column(3, 3, longestlineage)
                 # Increase the row
                 row += 1
                 # Set the column to 1
                 col = 1
         except (KeyError, AttributeError):
             # Increase the row
             row += 1
     # Close the workbook
     workbook.close()
Beispiel #19
0
def main(args):
    """
    Download the MLST profile and allele files for a single scheme listed in a repository index.

    The XML index at args.repository_url is parsed, every 'species' node is passed to getspeciesinfo, and
    exactly one match is required. For that match the profile file, one FASTA file per locus, a FASTA file
    holding all the loci, and a download log are written to args.path.
    :param args: argument object. The attributes read are path, genus, species, repository_url and
    force_scheme_name; args.genus is overwritten in place
    :return: None. The function returns early, after printing a message, when zero or several schemes match
    """
    # Create the path to store the schemes (if necessary)
    make_path(args.path)
    # Allow for Shigella to use the Escherichia MLST profile/alleles
    args.genus = args.genus if args.genus != 'Shigella' else 'Escherichia'
    # As there are multiple profiles for certain organisms, this dictionary has the schemes I use as values
    organismdictionary = {
        'Escherichia': 'Escherichia coli#1',
        'Vibrio': 'Vibrio parahaemolyticus',
        'Campylobacter': 'Campylobacter jejuni',
        'Listeria': 'Listeria monocytogenes',
        'Bacillus': 'Bacillus cereus',
        'Staphylococcus': "Staphylococcus aureus",
        'Salmonella': 'Salmonella enterica'
    }
    # Set the appropriate profile based on the dictionary key:value pairs
    # NOTE(review): the lookup key is args.species although the dictionary is keyed by genus and the result is
    # stored in args.genus - confirm that args.genus was not the intended key
    try:
        args.genus = organismdictionary[args.species]
    except (KeyError, AttributeError):
        pass
    with url.urlopen(args.repository_url) as docfile:
        doc = xml.parse(docfile)
        root = doc.childNodes[0]
        found_species = []
        for species_node in root.getElementsByTagName('species'):
            info = getspeciesinfo(species_node, args.genus,
                                  args.force_scheme_name)
            if info is not None:
                found_species.append(info)
    if not found_species:
        print("No species matched your query.")
        return
    if len(found_species) > 1:
        print(
            "The following {} species match your query, please be more specific:"
            .format(len(found_species)))
        # List every candidate before returning (the return used to sit inside this loop, so only the first
        # name was ever printed)
        for info in found_species:
            print(info.name)
        return
    # Output information for the single matching species
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_').replace('/', '_')
    species_all_fasta_filename = species_name_underscores + '.fasta'
    log_filename = "mlst_data_download_{}_{}.log".format(
        species_name_underscores, species_info.retrieved)
    profile_path = urlparse(species_info.profiles_url).path
    profile_filename = profile_path.split('/')[-1]
    # Context managers guarantee that both output files are closed even if a download fails part way through
    with open(os.path.join(args.path, species_all_fasta_filename), 'w') as species_all_fasta_file, \
            open(os.path.join(args.path, log_filename), 'w') as log_file:
        log_file.write(species_info.retrieved + '\n')
        log_file.write("definitions: {}\n".format(profile_filename))
        log_file.write("{} profiles\n".format(species_info.profiles_count))
        log_file.write("sourced from: {}\n\n".format(species_info.profiles_url))
        # Download the profile to a temporary file, and copy its contents to the output folder
        localfile, _ = url.urlretrieve(species_info.profiles_url)
        with open(localfile, 'r') as profile_doc:
            with open(os.path.join(args.path, profile_filename),
                      'w') as profile_file:
                profile_file.write(profile_doc.read())
        for locus in species_info.loci:
            locus_path = urlparse(locus.url).path
            locus_filename = locus_path.split('/')[-1]
            log_file.write("locus {}\n".format(locus.name))
            log_file.write(locus_filename + '\n')
            log_file.write("Sourced from {}\n\n".format(locus.url))
            # Download the locus, write it to its own file, and append it to the combined FASTA file
            local_locus_doc, _ = url.urlretrieve(locus.url)
            with open(local_locus_doc, 'r') as locus_doc:
                with open(os.path.join(args.path, locus_filename),
                          'w') as locus_file:
                    locus_fasta_content = locus_doc.read()
                    locus_file.write(locus_fasta_content)
                    species_all_fasta_file.write(locus_fasta_content)
        log_file.write("all loci: {}\n".format(species_all_fasta_filename))
    def main(self):
        """
        Run the profiling workflow on every sample created by self.query_prep().

        For each sample the alleles are prepared (allele_prep), the profile file is read (read_profile), the
        sample is BLASTed and parsed, and the resulting allele calls are passed through profile_alleles,
        match_profile, create_profile, sequence_typer and append_profiles. The amino acid flavour of each
        path/file attribute is used when self.amino_acid is truthy. For nucleotide analyses the sample is then
        translated, the same matching steps are repeated with the amino acid attributes, and clear_alleles is
        called on the nucleotide allele folder.
        """
        # Create metadata objects for all files in the query folder
        self.query_prep()
        for sample in self.runmetadata.samples:
            logging.warning('Processing sample {sn}'.format(sn=sample.name))
            # Prepare the alleles from the folder appropriate to the analysis type
            prep_path = self.aa_allele_path if self.amino_acid else self.allele_path
            records, gene_names, self.data = allele_prep(
                allele_path=prep_path,
                gene_names=self.gene_names,
                combined_targets=self.combined_targets,
                amino_acid=self.amino_acid)

            logging.info('Loading profile')
            profile_source = self.aa_profile_file if self.amino_acid else self.profile_file
            profile_data = read_profile(profile_file=profile_source)
            # BLAST the sample, then convert and parse the outputs
            self.blast_alleles(runmetadata=sample, amino_acid=self.amino_acid)
            parseable_blast_outputs(
                runmetadata=sample,
                fieldnames=self.fieldnames,
                extended_fieldnames=self.extended_fieldnames,
                records=records)
            sample = parse_results(
                runmetadata=sample,
                fieldnames=self.fieldnames,
                extended_fieldnames=self.extended_fieldnames,
                amino_acid=self.amino_acid,
                genome_query=True)
            # Select the allele and report folders used for profiling the alleles
            if self.amino_acid:
                typing_alleles, typing_reports = self.aa_allele_path, self.aa_report_path
            else:
                typing_alleles, typing_reports = self.allele_path, self.report_path
            profile_dict, profile_set = profile_alleles(
                runmetadata=sample,
                profile_dict={},
                profile_set=[],
                records=self.gene_names,
                novel_alleles=True,
                genome_query=True,
                allele_path=typing_alleles,
                report_path=typing_reports)
            profile_matches = match_profile(
                profile_data=profile_data,
                profile_dict=profile_dict,
                profile_matches={})
            profile_matches, profile_data, new_profiles = create_profile(
                profile_data=profile_data,
                profile_set=profile_set,
                new_profiles=[],
                profile_dict=profile_dict,
                profile_matches=profile_matches)
            # Select the report, profile file, and profile folder to update
            if self.amino_acid:
                typing_report = self.aa_profile_report
                typing_file = self.aa_profile_file
                typing_path = self.aa_profile_path
            else:
                typing_report = self.profile_report
                typing_file = self.profile_file
                typing_path = self.profile_path
            sample = sequence_typer(
                profile_report=typing_report,
                data=self.data,
                runmetadata=sample,
                profile_matches=profile_matches,
                profile_data=profile_data,
                update=True)
            append_profiles(
                new_profiles=new_profiles,
                profile_file=typing_file,
                data=self.data,
                novel_profiles=True,
                profile_path=typing_path,
                gene_names=self.gene_names)
            # The remaining steps only apply to nucleotide analyses
            if self.amino_acid:
                continue
            # Translate the sample, and repeat the profile matching with the amino acid files
            sample = self.translate(runmetadata=sample)
            self.aa_allele_prep()
            aa_profile_dict, aa_profile_set = self.aa_allele_match(
                runmetadata=sample,
                profile_dict={},
                profile_set=[],
                gene_names=gene_names)
            aa_profile_data = read_profile(profile_file=self.aa_profile_file)
            aa_profile_matches = match_profile(
                profile_data=aa_profile_data,
                profile_dict=aa_profile_dict,
                profile_matches={})
            aa_profile_matches, aa_profile_data, aa_new_profiles = create_profile(
                profile_data=aa_profile_data,
                profile_set=aa_profile_set,
                new_profiles=[],
                profile_dict=aa_profile_dict,
                profile_matches=aa_profile_matches)
            sample = sequence_typer(
                profile_report=self.aa_profile_report,
                data=self.data,
                runmetadata=sample,
                profile_matches=aa_profile_matches,
                profile_data=aa_profile_data,
                update=True,
                amino_acid=True)
            make_path(self.aa_profile_path)
            append_profiles(
                new_profiles=aa_new_profiles,
                profile_file=self.aa_profile_file,
                data=self.data,
                novel_profiles=True,
                profile_path=self.aa_profile_path,
                gene_names=self.gene_names)
            self.aa_notes(runmetadata=sample)
            # Clear the combined target files and the custom targets from the nucleotide allele folder
            combined_db = glob(os.path.join(self.allele_path, 'combinedtargets*'))
            custom_file = os.path.join(self.allele_path, 'custom.tfa')
            clear_alleles(combined_targets_db=combined_db,
                          custom_targets=custom_file)
 def __init__(self, args):
     """
     Initialises the variables required for this class
     :param args: object holding the arguments passed to the script. The attributes read are debug,
     sequencepath, referencefilepath, numreads, preprocess, startingtime, customsamplesheet, basicassembly,
     threads and homepath
     :raises AssertionError: if the custom sample sheet is supplied but is not a file, or if the sequence path
     or the reference file path is not a directory
     """
     self.debug = args.debug
     SetupLogging(self.debug)
     logging.info(
         'Welcome to the CFIA OLC Workflow for Bacterial Assembly and Typing (COWBAT) version {version}'
         .format(version=__version__))
     # Define variables from the arguments
     self.args = args
     # os.path.expanduser leaves paths that do not start with '~' untouched, so it is applied unconditionally.
     # It must run before os.path.abspath, as abspath would otherwise prefix the '~' with the working directory
     self.path = os.path.abspath(os.path.expanduser(args.sequencepath))
     self.sequencepath = self.path
     self.reffilepath = os.path.abspath(
         os.path.expanduser(args.referencefilepath))
     self.numreads = args.numreads
     self.preprocess = args.preprocess
     # Define the start time
     self.starttime = args.startingtime
     if args.customsamplesheet:
         # Expand the value from args (self.customsamplesheet does not exist yet at this point)
         self.customsamplesheet = os.path.abspath(
             os.path.expanduser(args.customsamplesheet))
     else:
         self.customsamplesheet = args.customsamplesheet
     if self.customsamplesheet:
         assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {css}' \
             .format(css=self.customsamplesheet)
     self.basicassembly = args.basicassembly
     # Without any sample sheet, fall back to a basic assembly
     if not self.customsamplesheet and not os.path.isfile(
             os.path.join(self.path, 'SampleSheet.csv')):
         self.basicassembly = True
         logging.warning(
             'Could not find a sample sheet. Performing basic assembly (no run metadata captured)'
         )
     # Use the argument for the number of threads to use, or default to one fewer than the number of cpus in
     # the system
     self.cpus = args.threads if args.threads else multiprocessing.cpu_count(
     ) - 1
     # Assertions to ensure that the provided variables are valid
     make_path(self.path)
     assert os.path.isdir(
         self.path
     ), 'Supplied path location is not a valid directory {0!r:s}'.format(
         self.path)
     self.reportpath = os.path.join(self.path, 'reports')
     make_path(self.reportpath)
     assert os.path.isdir(self.reffilepath), 'Reference file path is not a valid directory {0!r:s}' \
         .format(self.reffilepath)
     self.commit = __version__
     self.homepath = args.homepath
     self.logfile = os.path.join(self.path, 'logfile')
     self.runinfo = str()
     self.pipeline = True
     self.qualityobject = MetadataObject()
     # Initialise the metadata object
     self.runmetadata = MetadataObject()
 def __init__(self, path, fasta_path, records, amino_acid):
     """
     Create the 'strain_profiles' working folders under the supplied path, link the query FASTA files into the
     'query' folder, copy the combined targets file, and initialise the attributes used by the analyses
     :param path: base folder; must contain alleles/combinedtargets.fasta. A leading '~' is expanded
     :param fasta_path: folder containing the query .fasta files. A leading '~' is expanded
     :param records: iterable of names; sorted and tab-joined to build the header stored in self.data
     :param amino_acid: truthy to use tblastn, falsy to use blastn
     """
     # os.path.expanduser leaves paths that do not start with '~' untouched, so no startswith check is needed
     self.path = os.path.abspath(os.path.expanduser(path))
     self.fasta_path = os.path.abspath(os.path.expanduser(fasta_path))
     self.working_path = os.path.join(self.path, 'strain_profiles')
     self.sequencepath = os.path.join(self.working_path, 'query')
     make_path(self.sequencepath)
     # Target/database files must not be treated as queries. The previous test joined the two comparisons with
     # 'or', which is true for every file name, so nothing was ever excluded
     excluded_files = ('combinedtargets.fasta', 'custom.tfa')
     target_files = [
         fasta
         for fasta in sorted(glob(os.path.join(self.fasta_path, '*.fasta')))
         if os.path.basename(fasta) not in excluded_files
     ]
     self.query_files = list()
     # Create symlinks of the target files in the local path
     for target in target_files:
         query_file = os.path.join(
             self.sequencepath,
             os.path.basename(target).replace('.tfa', '.fasta'))
         # The query file is recorded whether or not the link already exists
         self.query_files.append(query_file)
         try:
             os.symlink(target, query_file)
         except FileExistsError:
             pass
     self.targetpath = os.path.join(self.working_path, 'targets')
     make_path(self.targetpath)
     self.profilepath = os.path.join(self.working_path, 'sequence_profile')
     make_path(self.profilepath)
     self.profile_file = os.path.join(self.profilepath, 'profile.txt')
     self.target_file = os.path.join(self.targetpath,
                                     'combinedtargets.fasta')
     # Copy the combined targets from the alleles folder to the working targets folder
     shutil.copyfile(src=os.path.join(self.path, 'alleles',
                                      'combinedtargets.fasta'),
                     dst=self.target_file)
     self.reportpath = os.path.join(self.working_path, 'reports')
     make_path(self.reportpath)
     self.strain_profile_path = os.path.join(self.working_path,
                                             'strain_profiles')
     make_path(self.strain_profile_path)
     self.profile_report = os.path.join(self.strain_profile_path,
                                        'profiles.tsv')
     self.cpus = multiprocessing.cpu_count() - 1
     self.starttime = time()
     self.start = self.starttime
     self.runmetadata = MetadataObject()
     self.runmetadata.samples = list()
     self.records = records
     # Falsy values are normalised to None; the BLAST program is chosen from the same flag
     if amino_acid:
         self.amino_acid = amino_acid
         self.program = 'tblastn'
     else:
         self.amino_acid = None
         self.program = 'blastn'
     # Fields used for custom outfmt 6 BLAST output:
     self.fieldnames = [
         'query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
         'evalue', 'bit_score', 'query_length', 'subject_length',
         'alignment_length', 'query_start', 'query_end', 'subject_start',
         'subject_end', 'query_sequence', 'subject_sequence'
     ]
     # The extended field names additionally hold 'percent_match', inserted before the sequence fields
     self.extended_fieldnames = self.fieldnames.copy()
     self.extended_fieldnames.insert(14, 'percent_match')
     self.outfmt = '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length ' \
                   'qstart qend sstart send qseq sseq'
     self.blast_reports = list()
     self.profile_dict = dict()
     self.profile_data = dict()
     self.profile_set = list()
     self.sequence_profile = dict()
     self.profile_matches = dict()
     self.new_profiles = list()
     # A string of the header to use for formatting the profile file, and the report headers
     genes = '\t'.join(sorted(self.records))
     self.data = 'ST\t{genes}\n'.format(genes=genes.rstrip())
     self.gene_names = list()
Beispiel #23
0
 def __init__(self, args, pipelinecommit, startingtime, scriptpath):
     """
     Configure the CLARK analyses from the supplied arguments, and run them. The analyses are launched from
     within this constructor.

     When args.runmetadata is present and truthy, the existing metadata is used: the analyses are only run if
     the abundance report does not exist yet, and the CLARK outputs of each sample are then moved to a 'CLARK'
     folder in the sample's output directory. Otherwise new metadata objects are created with self.objectprep()
     and the analyses are run directly.
     :param args: argument object. The attributes read are path, sequencepath, databasepath, database, rank,
     clarkpath, cutoff, clean_seqs, light, extension and filter, plus the optional reffilepath and runmetadata
     :param pipelinecommit: stored, as a string, in self.commit
     :param startingtime: stored in self.start
     :param scriptpath: stored in self.homepath
     :raises AssertionError: if the path, the sequence path, or the database path is not a directory
     """
     # Initialise variables
     self.commit = str(pipelinecommit)
     self.start = startingtime
     self.homepath = scriptpath
     # Define variables based on supplied arguments
     self.args = args
     self.path = os.path.join(args.path)
     assert os.path.isdir(
         self.path
     ), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     self.sequencepath = os.path.join(args.sequencepath, '')
     assert os.path.isdir(self.sequencepath), u'Supplied sequence path is not a valid directory {0!r:s}' \
         .format(self.sequencepath)
     self.databasepath = os.path.join(args.databasepath, '')
     assert os.path.isdir(self.databasepath), u'Supplied database path is not a valid directory {0!r:s}' \
         .format(self.databasepath)
     # There seems to be an issue with CLARK when running with a very high number of cores. Limit self.cpus to 4
     self.cpus = 4
     # Set variables from the arguments
     self.database = args.database
     self.rank = args.rank
     self.clarkpath = args.clarkpath
     self.cutoff = float(args.cutoff) * 100
     # Initialise variables for the analysis
     self.targetcall = str()
     self.classifycall = str()
     self.devnull = open(os.devnull, 'wb')
     self.filelist = os.path.join(self.path, 'sampleList.txt')
     self.reportlist = os.path.join(self.path, 'reportList.txt')
     self.abundancequeue = Queue()
     self.datapath = str()
     self.reportpath = os.path.join(self.path, 'reports')
     self.clean_seqs = args.clean_seqs
     self.light = args.light
     self.extension = args.extension
     if self.clean_seqs:
         # Cleaning the sequences is disabled if no reference file path was supplied
         try:
             self.reffilepath = args.reffilepath
         except AttributeError:
             self.clean_seqs = False
     # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the metadata objects
     # and variables play nice
     # NOTE(review): the except clause at the bottom is meant to detect a missing args.runmetadata, but it also
     # catches any AttributeError raised anywhere inside this try block, and then re-runs the stand-alone path
     try:
         if args.runmetadata:
             self.runmetadata = args.runmetadata
             # Create the name of the final report
             self.report = os.path.join(
                 self.reportpath,
                 'abundance_{ft}.xlsx'.format(ft=self.extension))
             # Only re-run the CLARK analyses if the CLARK report doesn't exist
             if not os.path.isfile(self.report):
                 logging.info(
                     'Performing CLARK analysis on {ft} files'.format(
                         ft=self.extension))
                 if self.extension != 'fastq':
                     for sample in self.runmetadata.samples:
                         sample.general.combined = sample.general.bestassemblyfile
                     # Run the pipeline
                     self.main()
                 else:
                     # Only perform FASTQ analyses if the sample is declared to be a metagenome
                     metagenome = False
                     for sample in self.runmetadata.samples:
                         try:
                             status = sample.run.Description
                         except AttributeError:
                             status = 'unknown'
                         if status == 'metagenome':
                             metagenome = True
                     # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                     if metagenome:
                         fileprep.Fileprep(self)
                         # Run the pipeline
                         self.main()
                 # Clean up the files and create/delete attributes to be consistent with pipeline Metadata objects
                 for sample in self.runmetadata.samples:
                     # Create a GenObject to store metadata when this script is run as part of the pipeline
                     clarkextension = 'clark{}'.format(self.extension)
                     setattr(sample, clarkextension, GenObject())
                     # Create a folder to store all the CLARK files
                     sample[clarkextension].outputpath = os.path.join(
                         sample.general.outputdirectory, 'CLARK')
                     make_path(sample[clarkextension].outputpath)
                     if sample.general.bestassemblyfile != 'NA':
                         # Move the files to the CLARK folder
                         try:
                             move(
                                 sample.general.abundance,
                                 os.path.join(
                                     sample[clarkextension].outputpath,
                                     os.path.basename(
                                         sample.general.abundance)))
                             move(
                                 sample.general.classification,
                                 os.path.join(
                                     sample[clarkextension].outputpath,
                                     os.path.basename(
                                         sample.general.classification)))
                         except (AttributeError, FileNotFoundError):
                             pass
                         # Set the CLARK-specific attributes
                         try:
                             sample[
                                 clarkextension].abundance = sample.general.abundance
                             sample[
                                 clarkextension].classification = sample.general.classification
                             sample[
                                 clarkextension].combined = sample.general.combined
                         except AttributeError:
                             pass
                         if self.extension == 'fastq':
                             # Remove the combined .fastq files
                             # NOTE(review): os.remove is called with the list itself, which raises a TypeError
                             # that is not caught below - confirm what is supposed to be deleted here
                             try:
                                 if type(sample[clarkextension].combined
                                         ) is list:
                                     os.remove(
                                         sample[clarkextension].combined)
                             except (OSError, AttributeError):
                                 pass
                     # Remove the text files lists of files and reports created by CLARK. An explicit loop is
                     # used because map() returns a lazy iterator in Python 3: the previous, unconsumed map()
                     # call never deleted anything. A file that is already gone is ignored
                     for listfile in ['reportList.txt', 'sampleList.txt']:
                         try:
                             os.remove(os.path.join(self.path, listfile))
                         except OSError:
                             pass
         else:
             self.runmetadata = MetadataObject()
             self.report = os.path.join(self.reportpath, 'abundance.xlsx')
             # Create the objects
             self.objectprep()
             self.main()
     except AttributeError:
         self.runmetadata = MetadataObject()
         self.report = os.path.join(self.reportpath, 'abundance.xlsx')
         # Create the objects
         self.objectprep()
         # Set the run description to 'metagenome' in order to process the samples
         for sample in self.runmetadata.samples:
             sample.run.Description = 'metagenome'
         self.main()
     # Optionally filter the .fastq reads based on taxonomic assignment
     if args.filter:
         filtermetagenome.PipelineInit(self)
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)
Beispiel #24
0
 def __init__(self,
              path,
              targetfile,
              analysis_type,
              fasta_path,
              genesippr,
              metadata_file,
              cutoff,
              amino_acid,
              one_based,
              target_alleles=True,
              allele_hashing=False):
     """
     Validate the supplied paths and options, create the 'reports' and 'alleles' folders, and initialise the
     attributes used by the allele finding analyses
     :param path: base folder of the analysis. A leading '~' is expanded
     :param targetfile: name of the target FASTA file, relative to path; it must exist
     :param analysis_type: stored in self.analysistype. The value 'remote' disables the use of fasta_path
     :param fasta_path: folder of FASTA files; only used when analysis_type is not 'remote'
     :param genesippr: stored in self.gensippr. When truthy, the metadata file path is built from path
     :param metadata_file: name of the metadata file, relative to path
     :param cutoff: stored unchanged in self.cutoff
     :param amino_acid: stored in self.amino_acid; falsy values are stored as None
     :param one_based: stored in self.one_based; cannot be combined with allele_hashing
     :param target_alleles: truthy to include the supplied target allele in the output allele file
     :param allele_hashing: truthy to use computed hashes of the allele sequence as identifiers
     :raises AssertionError: if the target file cannot be found
     :raises SystemExit: if both one_based and allele_hashing are requested
     """
     logging.info('Welcome to the CFIA Allele Finder (CAlF)')
     # Determine the path in which the sequence files are located. Allow for ~ expansion
     if path.startswith('~'):
         self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
     else:
         self.path = os.path.abspath(os.path.join(path))
     self.targetfile = os.path.join(self.path, targetfile)
     assert os.path.isfile(
         self.targetfile
     ), 'Cannot find the supplied FASTA file: {fn}'.format(
         fn=self.targetfile)
     self.reportpath = os.path.join(self.path, 'reports')
     self.allelepath = os.path.join(self.path, 'alleles')
     make_path(self.reportpath)
     make_path(self.allelepath)
     self.analysistype = analysis_type
     # A local FASTA path is only set when the analysis is not 'remote'
     if self.analysistype != 'remote':
         if fasta_path.startswith('~'):
             self.fasta_path = os.path.abspath(
                 os.path.expanduser(os.path.join(fasta_path)))
         else:
             self.fasta_path = os.path.abspath(os.path.join(fasta_path))
     else:
         self.fasta_path = None
     # Note that the attribute is spelled 'gensippr', while the parameter is 'genesippr'
     self.gensippr = genesippr
     if self.gensippr:
         # NOTE(review): this value is overwritten by the self.metadata_file = str() assignment further down,
         # so the supplied metadata_file currently has no effect - confirm whether that is intended
         self.metadata_file = os.path.join(self.path, metadata_file)
     self.cutoff = cutoff
     # If the supplied target allele is to be included in the output allele file, set self.target_alleles to 0 - this
     # will later be used to set the starting index of when iterating over the list of alleles (target allele is
     # stored at index 0)
     if target_alleles:
         self.target_alleles = 0
     else:
         self.target_alleles = 1
     # Set whether the allele identifiers will be generic (_0) or computed hashes of the allele sequence
     if allele_hashing:
         self.allele_hashing = True
     else:
         self.allele_hashing = False
     # Falsy values of amino_acid are normalised to None
     if amino_acid:
         self.amino_acid = amino_acid
     else:
         self.amino_acid = None
     self.one_based = one_based
     # Hashed identifiers and 1-based identifiers are mutually exclusive
     if self.one_based and self.allele_hashing:
         logging.error(
             'Only one of allele_hashing (-a), and 1-based (-o) may be specified'
         )
         raise SystemExit
     self.records = dict()
     self.record_parameters = dict()
     self.expect = dict()
     self.word_size = dict()
     self.filter_low_complexity = dict()
     self.blast_outputs = dict()
     self.alleleset = dict()
     self.illegal_alleleset = dict()
     self.strain_genera = dict()
     self.all_alleles = list()
     # The handle is stored on the instance, and is not closed in this method
     self.devnull = open(os.devnull, 'wb')
     self.cpus = multiprocessing.cpu_count()
     self.queue = Queue()
     # Fields used for custom outfmt 6 BLAST output:
     self.fieldnames = [
         'query_id', 'subject_id', 'positives', 'mismatches', 'gaps',
         'evalue', 'bit_score', 'subject_length', 'alignment_length',
         'query_start', 'query_end', 'subject_start', 'subject_end',
         'query_sequence', 'subject_sequence'
     ]
     self.outfmt = '6 qseqid sseqid positive mismatch gaps evalue bitscore slen length qstart qend sstart send ' \
                   'qseq sseq'
     # TODO self.strain_genera needs to be populated properly
     # This assignment replaces any metadata file path set above
     self.metadata_file = str()
     self.local_dict = dict()
     self.genera = str()
     self.gene_dict = dict()
     self.mismatches = dict()
     self.genus_alleles = dict()