Example 1
    def test_filter_dicts(self):
        '''Test _filter_dicts'''
        rf = report_filter.ReportFilter(min_ref_base_assembled=10, ignore_not_has_known_variant=True)
        ref_2_dict = {x: '.' for x in report.columns}
        ref_2_dict['pc_ident'] = 91.0
        ref_2_dict['ref_base_assembled'] = 10
        ref_2_dict['has_known_var'] = '0'
        ref_2_dict['flag'] = flag.Flag(27)
        ref_2_dict['var_type'] = '.'

        rf.report = {
            'ref1': {
                'ref1.scaff1': [
                    {'flag': flag.Flag(27), 'pc_ident': 91.0, 'ref_base_assembled': 9, 'known_var': '1', 'has_known_var': '1', 'var_type': 'SNP'},
                    {'flag': flag.Flag(27), 'pc_ident': 91.5, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '1', 'var_type': 'HET'},
                    {'flag': flag.Flag(27), 'pc_ident': 89.0, 'ref_base_assembled': 10, 'known_var': '1', 'has_known_var': '1', 'var_type': 'SNP'},
                    {'flag': flag.Flag(27), 'pc_ident': 90.0, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '0', 'var_type': 'SNP'},
                    {'flag': flag.Flag(27), 'pc_ident': 90.0, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '1', 'var_type': 'SNP'},
                ]
            },
            'ref2': {
                'ref2.scaff1': [
                    ref_2_dict
                ]
            },
            'ref3': {
                'ref3.scaff1': [
                    {'flag': flag.Flag(27), 'pc_ident': 84.0, 'ref_base_assembled': 10, 'known_var': '1', 'has_known_var': '0', 'var_type': 'SNP'},
                ]
            },
            'ref4': {
                'ref4.scaff1': [
                    {'flag': flag.Flag(64), 'pc_ident': '.', 'ref_base_assembled': '.', 'known_var': '.', 'has_known_var': '.', 'var_type': '.'},
                ]
            }
        }

        expected = {
            'ref1': {
                'ref1.scaff1': [
                    {'flag': flag.Flag(27), 'pc_ident': 91.5, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '1', 'var_type': 'HET'},
                    {'flag': flag.Flag(27), 'pc_ident': 90.0, 'ref_base_assembled': 11, 'known_var': '1', 'has_known_var': '1', 'var_type': 'SNP'},
                ]
            },
            'ref2': {
                'ref2.scaff1': [ref_2_dict]
            }
        }

        rf._filter_dicts()
        self.assertEqual(expected, rf.report)
Example 2
 def test_flag_passes_filter(self):
     '''Test _flag_passes_filter'''
     rf = report_filter.ReportFilter()
     exclude_flags = ['assembly_fail', 'ref_seq_choose_fail']
     f = flag.Flag()
     self.assertTrue(rf._flag_passes_filter(f, exclude_flags))
     f.add('assembled')
     self.assertTrue(rf._flag_passes_filter(f, exclude_flags))
     f = flag.Flag()
     f.add('assembly_fail')
     self.assertFalse(rf._flag_passes_filter(f, exclude_flags))
     f = flag.Flag()
     f.add('ref_seq_choose_fail')
     self.assertFalse(rf._flag_passes_filter(f, exclude_flags))
Example 3
 def test_has(self):
     '''Test has'''
     for x in flag.flags_in_order:
         f = flag.Flag(0)
         self.assertFalse(f.has(x))
         f.add(x)
         self.assertTrue(f.has(x))
Example 4
 def test_line2dict(self):
     '''Test _line2dict'''
     line = '\t'.join([
         'gene1', '187', '42', '3', '750', '750', '98.93', 'SNP', 'SYN',
         '.', '66', '66', 'A', 'gene1.scaffold.1', '1047', '67', '67', 'C',
         '42', 'A', '22,20'
     ])
     s = summary.Summary('out', filenames=['spam', 'eggs'])
     expected = {
         'gene': 'gene1',
         'flag': flag.Flag(187),
         'reads': 42,
         'cluster': '3',
         'gene_len': 750,
         'assembled': 750,
         'pc_ident': 98.93,
         'var_type': 'SNP',
         'var_effect': 'SYN',
         'new_aa': '.',
         'gene_start': 66,
         'gene_end': 66,
         'gene_nt': 'A',
         'scaffold': 'gene1.scaffold.1',
         'scaff_len': 1047,
         'scaff_start': 67,
         'scaff_end': 67,
         'scaff_nt': 'C',
         'read_depth': 42,
         'alt_bases': 'A',
         'ref_alt_depth': '22,20'
     }
     self.assertEqual(s._line2dict(line), expected)
Example 5
 def test_add(self):
     '''Test add'''
     f = flag.Flag()
     expected = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047]
     for i in range(len(flag.flags_in_order)):
         f.add(flag.flags_in_order[i])
         self.assertEqual(f.to_number(), expected[i])
Example 6
    def test_to_summary_number(self):
        '''Test _to_summary_number'''
        s = summary.Summary('out', filenames=['spam', 'eggs'])
        tests = [
            (0, 0),
            (64, 0),
            (7, 1),
            (259, 1),
            (15, 2),
            (27, 3),
        ]

        for t in tests:
            l = [{'flag': flag.Flag(t[0]), 'assembled': 42, 'pc_ident': 99}]
            self.assertEqual(s._to_summary_number(l), t[1])

        l = [{'flag': flag.Flag(27), 'assembled': 42, 'pc_ident': 89}]
        self.assertEqual(s._to_summary_number(l), 0)
Example 7
 def _line2dict(self, line):
     data = line.rstrip().split('\t')
     d = {columns[i]: data[i] for i in range(len(data))}
     d['flag'] = flag.Flag(int(d['flag']))
     for key in int_columns:
         try:
             d[key] = int(d[key])
         except:
             assert d[key] == '.'
     try:
         d['pc_ident'] = float(d['pc_ident'])
     except:
         assert d['pc_ident'] == '.'
     return d
Example 8
    def test_to_long_str(self):
        '''Test to_long_str'''
        f = flag.Flag(13)
        expected = '\n'.join([
            '[X] gene_assembled',
            '[ ] gene_assembled_into_one_contig',
            '[X] gene_region_assembled_twice',
            '[X] complete_orf',
            '[ ] unique_contig',
            '[ ] scaffold_graph_bad',
            '[ ] assembly_fail',
            '[ ] variants_suggest_collapsed_repeat',
            '[ ] hit_both_strands',
        ])

        self.assertEqual(expected, f.to_long_string())
Example 9
    def line2dict(cls, line, filename=None):
        '''Takes a report line string as input. Returns a dict of column name -> value in line'''
        data = line.rstrip().split('\t')
        if len(data) != len(report.columns):
            if filename is not None:
                filename_message = 'Error reading ariba summary file "' + filename + '". '
            else:
                filename_message = ''
            raise Error(
                filename_message +
                'Wrong number of columns in the following line. Expected ' +
                str(len(report.columns)) + ' but got ' + str(len(data)) +
                '\n' + line)

        d = {report.columns[i]: data[i] for i in range(len(data))}
        try:
            d['flag'] = flag.Flag(int(d['flag']))
        except:
            raise Error('Error getting flag in the following line. Got "' +
                        d['flag'] + '" for the flag.\n' + line)

        for key in int_columns:
            try:
                d[key] = int(d[key])
            except:
                assert d[key] == '.'

        for key in float_columns:
            try:
                d[key] = float(d[key])
            except:
                assert d[key] == '.'

        if d['var_description'] == '.':
            d['var_group'] = '.'
        else:
            try:
                d['var_group'] = d['var_description'].split(':')[4]
            except:
                raise Error(
                    'Error getting variant group from the following line:\n' +
                    line)

        return d
Example 10
    def test_line2dict(self):
        '''Test _line2dict'''
        line = 'ariba_refname\trefname\t1\t0\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:1:0:A14T:var_group1:ref has wild type, foo bar\tsome free text'

        expected = {
            'ariba_ref_name': 'ariba_refname',
            'ref_name': 'refname',
            'gene': '1',
            'var_only': '0',
            'flag': flag.Flag(19),
            'reads': 78,
            'cluster': 'cluster',
            'ref_len': 120,
            'ref_base_assembled': 120,
            'pc_ident': 98.33,
            'ctg': 'ctg_name',
            'ctg_len': 279,
            'ctg_cov': '24.4',
            'known_var': '1',
            'var_type': 'SNP',
            'var_seq_type': 'n',
            'known_var_change': 'A14T',
            'has_known_var': '1',
            'ref_ctg_change': 'A14T',
            'ref_ctg_effect': 'SNP',
            'ref_start': 13,
            'ref_end': 13,
            'ref_nt': 'A',
            'ctg_start': 84,
            'ctg_end': 84,
            'ctg_nt': 'T',
            'smtls_total_depth': '17',
            'smtls_nts': 'T',
            'smtls_nts_depth': '17',
            'var_description':
            'noncoding1:1:0:A14T:var_group1:ref has wild type, foo bar',
            'var_group': 'var_group1',
            'free_text': 'some free text'
        }

        self.assertEqual(summary_cluster.SummaryCluster.line2dict(line),
                         expected)
Example 11
    def test_report_line_to_dict(self):
        line = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t999\t23.2\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text'
        expected = {
            'ariba_ref_name': 'ariba_cluster1',
            'ref_name': 'cluster1',
            'gene': '0',
            'var_only': '0',
            'flag': flag.Flag(27),
            'reads': 10000,
            'cluster': 'cluster1',
            'ref_len': 1000,
            'ref_base_assembled': 999,
            'pc_ident': 99.42,
            'ctg': 'cluster1.scaffold.1',
            'ctg_len': 999,
            'ctg_cov': 23.2,
            'known_var': '1',
            'var_type': 'SNP',
            'var_seq_type': 'n',
            'known_var_change': 'C42T',
            'has_known_var': '0',
            'ref_ctg_change': '.',
            'ref_ctg_effect': '.',
            'ref_start': 42,
            'ref_end': 42,
            'ref_nt': 'C',
            'ctg_start': 142,
            'ctg_end': 142,
            'ctg_nt': 'C',
            'smtls_total_depth': '500',
            'smtls_nts': '.',
            'smtls_nts_depth': '500',
            'var_description': 'a:n:C42T:id1:foo',
            'free_text': 'free text',
        }

        self.assertEqual(expected,
                         report_filter.ReportFilter._report_line_to_dict(line))

        bad_line = '\t'.join(line.split('\t')[:-1])
        self.assertEqual(
            None, report_filter.ReportFilter._report_line_to_dict(bad_line))
Example 12
def report_to_resistance_dict(infile):
    '''Takes the final ariba report.tsv file and extracts resistance calls,
    returning a dict of drug name -> list of mutations.
    Each "mutation" in the list is a tuple of (gene name, mutation).
    The mutation is of the form X42Y, or "Incomplete_gene" for katG and
    pncA when they are not complete.
    This all assumes that the reference data are in the "correct" form,
    where the variant descriptions in the var_description column of the
    TSV file end with a comma-separated list of the drug names'''
    complete_genes = {'katG': 'Isoniazid', 'pncA': 'Pyrazinamide'}
    res_calls = {}
    incomplete_genes = set()
    with open(infile) as f:
        reader = csv.DictReader(f, delimiter='\t')
        for d in reader:
            if d['ref_name'] in complete_genes and d['gene'] == '1':
                gene_flag = flag.Flag(int(d['flag']))  # separate name, so the open file handle f is not shadowed
                if not gene_flag.has('complete_gene'):
                    incomplete_genes.add(d['ref_name'])

            if d['has_known_var'] == '1':
                if 'Original mutation' in d['var_description']:
                    drugs = d['var_description'].split(':')[-1].split(
                        '.')[0].split()[-1].split(',')
                    change = d['var_description'].split()[-1]
                else:
                    drugs = d['var_description'].split()[-1].split(',')
                    change = d['known_var_change']
                for drug in drugs:
                    if drug not in res_calls:
                        res_calls[drug] = []
                    res_calls[drug].append((d['ref_name'], change))

    for gene in incomplete_genes:
        drug = complete_genes[gene]
        if drug not in res_calls:
            res_calls[drug] = []
        res_calls[drug].append((gene, 'Incomplete_gene'))

    return res_calls
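
A sketch of the shape of the dictionary this function returns, using only the gene -> drug pairs named in complete_genes above. The entries are illustrative placeholders, not real ARIBA output:

# Illustrative only: drug name -> list of (gene name, mutation) tuples,
# as built by report_to_resistance_dict above. Values are placeholders.
example_res_calls = {
    'Isoniazid': [('katG', 'Incomplete_gene')],
    'Pyrazinamide': [('pncA', 'Incomplete_gene')],
}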
Example 13
    def _report_line_to_dict(cls, line):
        '''Takes report line string as input. Returns a dict of column name -> value in line'''
        data = line.split('\t')
        if len(data) != len(report.columns):
            return None

        d = dict(zip(report.columns, data))
        for key in report.int_columns:
            try:
                d[key] = int(d[key])
            except:
                assert d[key] == '.'

        for key in report.float_columns:
            try:
                d[key] = float(d[key])
            except:
                assert d[key] == '.'

        d['flag'] = flag.Flag(int(d['flag']))
        return d
Example 14
    def test_dict_to_report_line(self):
        '''Test _dict_to_report_line'''
        report_dict = {
            'ariba_ref_name': 'ariba_cluster1',
            'ref_name': 'cluster1',
            'gene': '0',
            'var_only': '0',
            'flag': flag.Flag(27),
            'reads': 10000,
            'cluster': 'cluster1',
            'ref_len': 1000,
            'ref_base_assembled': 999,
            'pc_ident': 99.42,
            'ctg': 'cluster1.scaffold.1',
            'ctg_len': 1300,
            'ctg_cov': 42.4,
            'known_var': '1',
            'var_type': 'SNP',
            'var_seq_type': 'n',
            'known_var_change': 'C42T',
            'has_known_var': '0',
            'ref_ctg_change': '.',
            'ref_ctg_effect': '.',
            'ref_start': 42,
            'ref_end': 42,
            'ref_nt': 'C',
            'ctg_start': 142,
            'ctg_end': 142,
            'ctg_nt': 'C',
            'smtls_total_depth': '500',
            'smtls_nts': '.',
            'smtls_nts_depth': '500',
            'var_description': 'a:n:C42T:id1:foo',
            'free_text': 'free text',
        }

        expected = 'ariba_cluster1\tcluster1\t0\t0\t27\t10000\tcluster1\t1000\t999\t99.42\tcluster1.scaffold.1\t1300\t42.4\t1\tSNP\tn\tC42T\t0\t.\t.\t42\t42\tC\t142\t142\tC\t500\t.\t500\ta:n:C42T:id1:foo\tfree text'
        self.assertEqual(
            expected,
            report_filter.ReportFilter._dict_to_report_line(report_dict))
Example 15
    def test_to_cluster_summary_assembled(self):
        '''Test _to_cluster_summary_assembled'''
        line = 'ariba_refname\trefname\t0\t0\t19\t78\tcluster\t120\t100\t98.33\tctg_name\t279\t24.4\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\tT\t17\tnoncoding1:0:0:A14T:id1:ref has wild type, foo bar\tsome free text'
        data_dict = summary_cluster.SummaryCluster.line2dict(line)

        tests = [
            ('0', 0, 'partial'),
            ('0', 64, 'no'),
            ('0', 1024, 'no'),
            ('0', 1, 'fragmented'),
            ('0', 3, 'yes_nonunique'),
            ('0', 19, 'yes'),
            ('0', 23, 'yes_nonunique'),
            ('0', 51, 'yes_nonunique'),
            ('0', 147, 'yes_nonunique'),
            ('0', 275, 'yes_nonunique'),
            ('1', 0, 'partial'),
            ('1', 64, 'no'),
            ('1', 1024, 'no'),
            ('1', 1, 'fragmented'),
            ('1', 11, 'yes_nonunique'),
            ('1', 27, 'yes'),
            ('1', 29, 'fragmented'),
            ('1', 59, 'yes_nonunique'),
            ('1', 155, 'yes_nonunique'),
            ('1', 283, 'yes_nonunique'),
        ]

        for gene, f, expected in tests:
            cluster = summary_cluster.SummaryCluster()
            data_dict['gene'] = gene
            data_dict['flag'] = flag.Flag(f)
            cluster.add_data_dict(data_dict)
            self.assertEqual(expected, cluster._to_cluster_summary_assembled())
            if expected == 'partial':
                original_number = cluster.data[0]['ref_base_assembled']
                cluster.data[0]['ref_base_assembled'] = 0
                self.assertEqual('no', cluster._to_cluster_summary_assembled())
                cluster.data[0]['ref_base_assembled'] = original_number
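
The flag numbers in the test table above are easier to read once decoded into their set bits. Assuming the bit ordering shown in the to_comma_separated_string test elsewhere in this collection (bit 0 = assembled, bit 1 = assembled_into_one_contig, bit 3 = complete_gene, bit 4 = unique_contig), a quick decode of a few of the values looks like this; it is purely an illustration, not part of ARIBA:

# Decode a few of the flag numbers used above into their set bit positions.
for n in (19, 27, 64):
    print(n, [i for i in range(11) if n & (1 << i)])
# 19 -> [0, 1, 4]     assembled + one contig + unique contig  => 'yes' when gene == '0'
# 27 -> [0, 1, 3, 4]  additionally complete_gene              => 'yes' when gene == '1'
# 64 -> [6]           only bit 6 set                          => 'no'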
Example 16
    def run(self):
        f_in = pyfastaq.utils.open_file_read(self.infile)
        f_out = pyfastaq.utils.open_file_write(self.outfile)
        flag_index = None

        for line in f_in:
            fields = line.rstrip().split('\t')  # columns are tab-separated and may contain spaces

            if flag_index is None:
                try:
                    flag_index = fields.index('flag')
                except:
                    raise Error(
                        '"flag" column not found in first line of file ' +
                        self.infile + '. Cannot continue')
            else:
                f = flag.Flag(int(fields[flag_index]))
                fields[flag_index] = f.to_comma_separated_string()

            print(*fields, sep='\t', file=f_out)

        f_in.close()
        f_out.close()
Example 17
    def __init__(
        self,
        root_dir,
        name,
        assembly_kmer=0,
        assembler='velvet',
        max_insert=1000,
        min_scaff_depth=10,
        nucmer_min_id=90,
        nucmer_min_len=50,
        nucmer_breaklen=50,
        sspace_k=20,
        reads_insert=500,
        sspace_sd=0.4,
        threads=1,
        bcf_min_dp=10,
        bcf_min_dv=5,
        bcf_min_dv_over_dp=0.3,
        bcf_min_qual=20,
        assembled_threshold=0.95,
        unique_threshold=0.03,
        verbose=False,
        bcftools_exe='bcftools',
        gapfiller_exe='GapFiller.pl',
        samtools_exe='samtools',
        bowtie2_exe='bowtie2',
        bowtie2_preset='very-sensitive-local',
        smalt_exe='smalt',
        spades_exe='spades.py',
        sspace_exe='SSPACE_Basic_v2.0.pl',
        velvet_exe='velvet',  # prefix of velvet{g,h}
        spades_other=None,
        clean=1,
    ):

        self.root_dir = os.path.abspath(root_dir)
        if not os.path.exists(self.root_dir):
            raise Error('Directory ' + self.root_dir +
                        ' not found. Cannot continue')

        self.name = name
        self.reads1 = os.path.join(self.root_dir, 'reads_1.fq')
        self.reads2 = os.path.join(self.root_dir, 'reads_2.fq')
        self.gene_fa = os.path.join(self.root_dir, 'gene.fa')
        self.genes_fa = os.path.join(self.root_dir, 'genes.fa')
        self.gene_bam = os.path.join(self.root_dir, 'gene.reads_mapped.bam')

        for fname in [self.reads1, self.reads2, self.genes_fa]:
            if not os.path.exists(fname):
                raise Error('File ' + fname + ' not found. Cannot continue')

        self.max_insert = max_insert
        self.min_scaff_depth = min_scaff_depth

        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen
        self.assembly_vs_gene_coords = os.path.join(self.root_dir,
                                                    'assembly_vs_gene.coords')

        self.bcf_min_dp = bcf_min_dp
        self.bcf_min_dv = bcf_min_dv
        self.bcf_min_dv_over_dp = bcf_min_dv_over_dp
        self.bcf_min_qual = bcf_min_qual

        self._set_assembly_kmer(assembly_kmer)
        self.assembler = assembler
        assert self.assembler in ['velvet', 'spades']
        self.spades_exe = spades_exe
        self.spades_other = spades_other

        self.bcftools_exe = bcftools_exe

        self.sspace_exe = shutil.which(sspace_exe)
        if self.sspace_exe is None:
            self.gapfiller_exe = None
        else:
            self.sspace_exe = os.path.realpath(
                self.sspace_exe)  # otherwise sspace dies loading packages
            self.gapfiller_exe = shutil.which(gapfiller_exe)
            if self.gapfiller_exe is not None:
                self.gapfiller_exe = os.path.realpath(
                    self.gapfiller_exe
                )  # otherwise gapfiller dies loading packages

        self.samtools_exe = samtools_exe
        self.smalt_exe = smalt_exe
        self.bowtie2_exe = bowtie2_exe
        self.bowtie2_preset = bowtie2_preset

        if self.assembler == 'velvet':
            self.velveth = velvet_exe + 'h'
            self.velvetg = velvet_exe + 'g'

        self.sspace_k = sspace_k
        self.reads_insert = reads_insert
        self.sspace_sd = sspace_sd

        self.threads = threads
        self.verbose = verbose
        self.assembled_threshold = assembled_threshold
        self.unique_threshold = unique_threshold
        self.status_flag = flag.Flag()
        self.flag_file = os.path.join(self.root_dir, 'flag')
        self.clean = clean

        self.assembly_dir = os.path.join(self.root_dir, 'Assembly')
        try:
            os.mkdir(self.assembly_dir)
        except:
            raise Error('Error mkdir ' + self.assembly_dir)
        self.assembler_dir = os.path.join(self.assembly_dir, 'Assemble')
        self.assembly_contigs = os.path.join(self.assembly_dir, 'contigs.fa')
        self.scaffold_dir = os.path.join(self.assembly_dir, 'Scaffold')
        self.scaffolder_scaffolds = os.path.join(self.assembly_dir,
                                                 'scaffolds.fa')
        self.gapfill_dir = os.path.join(self.assembly_dir, 'Gapfill')
        self.gapfilled_scaffolds = os.path.join(self.assembly_dir,
                                                'scaffolds.gapfilled.fa')
        self.final_assembly_fa = os.path.join(self.root_dir, 'assembly.fa')
        self.final_assembly_bam = os.path.join(self.root_dir,
                                               'assembly.reads_mapped.bam')
        self.final_assembly_read_depths = os.path.join(
            self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
        self.final_assembly_vcf = os.path.join(
            self.root_dir, 'assembly.reads_mapped.bam.vcf')
        self.final_assembly = {}
        self.mummer_variants = {}
        self.variant_depths = {}
        self.percent_identities = {}
Example 18
 def test_to_comma_separated_string(self):
     '''Test to_comma_separated_string'''
     f = flag.Flag(27)
     expected = 'assembled,assembled_into_one_contig,complete_gene,unique_contig'
     self.assertEqual(expected, f.to_comma_separated_string())
Example 19
 def test_init_and_to_number(self):
     '''Test __init__ and to_number'''
     for i in range(512):
         f = flag.Flag(i)
         self.assertEqual(f.to_number(), i)
Example 20
 def test_str(self):
     '''Test __str__'''
     for i in range(512):
         f = flag.Flag(i)
         self.assertEqual(str(f), str(i))
Example 21
 def test_set_flag(self):
     '''Test set_flag'''
     for i in range(512):
         f = flag.Flag()
         f.set_flag(i)
         self.assertEqual(f.to_number(), i)
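
Taken together, the Flag tests above (test_has, test_add, test_set_flag, test_init_and_to_number, test_str, test_to_comma_separated_string) describe a simple bit-flag container: each name in flags_in_order maps to the next power of two, which is why test_add expects 1, 3, 7, 15, and so on. The following is a minimal sketch consistent with that behaviour. It is not the real ariba.flag module, and the flag names below are placeholders (the tests in this collection show two different generations of names), so treat the list as an assumption:

# Minimal sketch of a bit-flag class matching the behaviour exercised by the tests above.
# Not ARIBA code; FLAGS_IN_ORDER is a hypothetical placeholder list.
FLAGS_IN_ORDER = [
    'assembled',
    'assembled_into_one_contig',
    'region_assembled_twice',   # assumed name for bit 2; not shown in the tests above
    'complete_gene',
    'unique_contig',
]

class MiniFlag:
    def __init__(self, n=0):
        self.set_flag(n)

    def set_flag(self, n):
        '''Overwrite the current state with the bits of the integer n.'''
        self.flags = {name: bool(n & (1 << i)) for i, name in enumerate(FLAGS_IN_ORDER)}

    def add(self, name):
        self.flags[name] = True

    def has(self, name):
        return self.flags[name]

    def to_number(self):
        return sum(1 << i for i, name in enumerate(FLAGS_IN_ORDER) if self.flags[name])

    def to_comma_separated_string(self):
        return ','.join(name for name in FLAGS_IN_ORDER if self.flags[name])

    def __str__(self):
        return str(self.to_number())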
Example 22
    def __init__(
            self,
            root_dir,
            name,
            refdata,
            all_ref_seqs_fasta=None,
            total_reads=None,
            total_reads_bases=None,
            fail_file=None,
            read_store=None,
            reference_names=None,
            logfile=None,
            assembly_coverage=50,
            assembly_kmer=21,
            assembler='fermilite',
            max_insert=1000,
            min_scaff_depth=10,
            nucmer_min_id=90,
            nucmer_min_len=20,
            nucmer_breaklen=200,
            reads_insert=500,
            sspace_k=20,
            sspace_sd=0.4,
            threads=1,
            assembled_threshold=0.95,
            min_var_read_depth=5,
            min_second_var_read_depth=2,
            max_allele_freq=0.90,
            unique_threshold=0.03,
            max_gene_nt_extend=30,
            spades_mode="rna",  #["rna","wgs"]
            spades_options=None,
            clean=True,
            extern_progs=None,
            random_seed=42,
            threads_total=1):
        self.root_dir = os.path.abspath(root_dir)
        self.read_store = read_store
        self.refdata = refdata
        self.name = name
        self.fail_file = fail_file
        self.reference_fa = os.path.join(self.root_dir, 'reference.fa')
        self.reference_names = reference_names
        self.all_reads1 = os.path.join(self.root_dir, 'reads_1.fq')
        self.all_reads2 = os.path.join(self.root_dir, 'reads_2.fq')
        self.references_fa = os.path.join(self.root_dir, 'references.fa')

        if os.path.exists(self.root_dir):
            self._input_files_exist()

        self.total_reads = total_reads
        self.total_reads_bases = total_reads_bases
        self.logfile = logfile
        self.assembly_coverage = assembly_coverage
        self.assembly_kmer = assembly_kmer
        self.assembler = assembler
        self.sspace_k = sspace_k
        self.sspace_sd = sspace_sd
        self.reads_insert = reads_insert
        self.spades_mode = spades_mode
        self.spades_options = spades_options

        self.reads_for_assembly1 = os.path.join(self.root_dir,
                                                'reads_for_assembly_1.fq')
        self.reads_for_assembly2 = os.path.join(self.root_dir,
                                                'reads_for_assembly_2.fq')

        self.ref_sequence = None

        self.max_insert = max_insert
        self.min_scaff_depth = min_scaff_depth

        self.nucmer_min_id = nucmer_min_id
        self.nucmer_min_len = nucmer_min_len
        self.nucmer_breaklen = nucmer_breaklen

        self.min_var_read_depth = min_var_read_depth
        self.min_second_var_read_depth = min_second_var_read_depth
        self.max_allele_freq = max_allele_freq

        self.threads = threads
        self.assembled_threshold = assembled_threshold
        self.unique_threshold = unique_threshold
        self.max_gene_nt_extend = max_gene_nt_extend
        self.status_flag = flag.Flag()
        self.clean = clean

        self.threads_total = threads_total
        self.remaining_clusters = None

        self.assembly_dir = os.path.join(self.root_dir, 'Assembly')
        self.final_assembly_fa = os.path.join(self.root_dir, 'assembly.fa')
        self.final_assembly_bam = os.path.join(self.root_dir,
                                               'assembly.reads_mapped.bam')
        self.final_assembly_read_depths = os.path.join(
            self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
        self.final_assembly_vcf = os.path.join(
            self.root_dir, 'assembly.reads_mapped.bam.vcf')
        self.samtools_vars_prefix = self.final_assembly_bam
        self.assembly_compare = None
        self.variants_from_samtools = {}
        self.assembly_compare_prefix = os.path.join(self.root_dir,
                                                    'assembly_compare')

        self.mummer_variants = {}
        self.variant_depths = {}
        self.percent_identities = {}

        # The log filehandle self.log_fh is set at the start of the run() method.
        # Lots of other methods use self.log_fh. But for unit testing, run() isn't
        # run. So we need to set this to something for unit testing.
        # On the other hand, setting it here breaks a real run of ARIBA because
        # multiprocessing complains with the error:
        # TypeError: cannot serialize '_io.TextIOWrapper' object.
        # Hence the following two lines...
        if unittest:
            self.log_fh = sys.stdout
        else:
            atexit.register(self._atexit)
            self.log_fh = None

        if extern_progs is None:
            self.extern_progs = external_progs.ExternalProgs()
        else:
            self.extern_progs = extern_progs

        if all_ref_seqs_fasta is None:
            self.all_refs_fasta = self.references_fa
        else:
            self.all_refs_fasta = os.path.abspath(all_ref_seqs_fasta)

        self.random_seed = random_seed
        wanted_signals = [
            signal.SIGABRT, signal.SIGINT, signal.SIGSEGV, signal.SIGTERM
        ]
        for s in wanted_signals:
            signal.signal(s, self._receive_signal)