def test_load_ref_data_from_dir(self):
    '''Check what Clusters._load_reference_data_from_dir returns for the test directory.

    Asserts on the sequences, genetic code and metadata of the returned
    reference data object, and on the returned cluster-name mapping.
    '''
    input_dir = os.path.join(data_dir, 'clusters_load_ref_data_from_dir')
    got_refdata, got_clusters = clusters.Clusters._load_reference_data_from_dir(input_dir)

    self.assertEqual(
        {
            'variants_only1': pyfastaq.sequences.Fasta('variants_only1', 'atggcgtgcgatgaataa'),
            'presabs1': pyfastaq.sequences.Fasta('presabs1', 'atgatgatgagcccggcgatggaaggcggctag'),
            'noncoding1': pyfastaq.sequences.Fasta('noncoding1', 'ACGTA'),
        },
        got_refdata.sequences,
    )
    self.assertEqual(11, got_refdata.genetic_code)

    presabs1_meta = sequence_metadata.SequenceMetadata(
        'presabs1\t1\t0\t.\t.\tpresabs1 description')
    variants_only1_meta = sequence_metadata.SequenceMetadata(
        'variants_only1\t1\t1\tC2I\t.\tdescription of variants_only1 C2I')
    noncoding1_meta = sequence_metadata.SequenceMetadata(
        'noncoding1\t0\t0\t.\t.\t.')

    wanted_metadata = {
        'presabs1': {
            'seq_type': 'p',
            'variant_only': False,
            '.': {presabs1_meta},
            'n': {},
            'p': {},
        },
        'variants_only1': {
            'seq_type': 'p',
            'variant_only': True,
            '.': set(),
            'n': {},
            'p': {1: {variants_only1_meta}},
        },
        'noncoding1': {
            'seq_type': 'n',
            'variant_only': False,
            '.': {noncoding1_meta},
            'n': {},
            'p': {},
        },
    }
    self.assertEqual(wanted_metadata, got_refdata.metadata)

    wanted_clusters = {
        '0': {'presabs1'},
        '1': {'variants_only1'},
        '2': {'noncoding1'},
    }
    self.assertEqual(wanted_clusters, got_clusters)
def test_init_ok(self):
    '''Construct ReferenceData from good input, first without and then with the optional files.

    The first construction checks metadata, sequences, and that the rename
    dict and extra parameters are empty.  The second passes rename_file and
    parameters_file and checks that both are picked up.
    '''
    fasta_path = os.path.join(data_dir, 'reference_data_init_ok.in.fa')
    tsv_path = os.path.join(data_dir, 'reference_data_init_ok.in.tsv')
    gene1_meta = sequence_metadata.SequenceMetadata(
        'gene1\t1\t0\tR2S\t.\tconfers killer rabbit resistance')
    gene2_meta = sequence_metadata.SequenceMetadata(
        "gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability")
    wanted_metadata = {
        'gene1': {
            'seq_type': 'p',
            'variant_only': False,
            'n': {},
            'p': {1: {gene1_meta}},
            '.': set(),
        },
        'gene2': {
            'seq_type': 'p',
            'variant_only': False,
            'n': {},
            'p': {41: {gene2_meta}},
            '.': set(),
        },
    }

    # Minimal construction: only fasta and tsv input.
    plain = reference_data.ReferenceData([fasta_path], [tsv_path])
    self.assertEqual(wanted_metadata, plain.metadata)
    wanted_seqs = {
        'gene1': pyfastaq.sequences.Fasta('gene1', 'CATCGTCGTCTATCGTCGTCCTAG'),
        'gene2': pyfastaq.sequences.Fasta('gene2', 'AAAAACCCCGGGGTTTT'),
    }
    self.assertEqual(wanted_seqs, plain.sequences)
    self.assertEqual({}, plain.ariba_to_original_name)
    self.assertEqual({}, plain.extra_parameters)

    # Same input plus the optional rename and parameters files.
    rename_path = os.path.join(data_dir, 'reference_data_init_ok.rename.tsv')
    params_path = os.path.join(data_dir, 'reference_data_init_ok.params.json')
    with_extras = reference_data.ReferenceData(
        [fasta_path],
        [tsv_path],
        rename_file=rename_path,
        parameters_file=params_path,
    )
    self.assertEqual(
        {'gene1': 'original_gene1', 'gene2': 'original_gene2'},
        with_extras.ariba_to_original_name,
    )
    self.assertEqual({'foo': 'bar', 'spam': 'eggs'}, with_extras.extra_parameters)
def test_get_variants_variants_only(self):
    '''Check AssemblyVariants.get_variants for the 'variants_only' reference sequence.

    The metadata TSV is written to a temporary file in the current
    directory, loaded into ReferenceData, and then removed.  Removal is done
    in a ``finally`` clause so that the file does not linger when writing it
    or loading the reference data raises.
    '''
    meta1 = sequence_metadata.SequenceMetadata(
        'variants_only\t1\t0\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)')
    meta2 = sequence_metadata.SequenceMetadata(
        'variants_only\t1\t0\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)')
    meta3 = sequence_metadata.SequenceMetadata(
        'variants_only\t1\t0\tD4E\tid1\tref has variant type E (GAA=E, GAC=D)')

    metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv'
    fasta_in = os.path.join(
        data_dir, 'assembly_variants_test_get_variants_variants_only.fa')
    try:
        with open(metadata_tsv, 'w') as f:
            for meta in (meta1, meta2, meta3):
                print(meta, file=f)
        refdata = reference_data.ReferenceData([fasta_in], [metadata_tsv])
    finally:
        # Guard with exists(): open() itself may have failed, and an
        # unlink error here would mask the original exception.
        if os.path.exists(metadata_tsv):
            os.unlink(metadata_tsv)

    nucmer_snp_file = os.path.join(
        data_dir, 'assembly_variants_test_get_variants_variants_only.snps')
    v2 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
    v3 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))

    ctg_nucmer_coords = {
        'contig1': [pyfastaq.intervals.Interval(0, 41)],
        'contig2': [pyfastaq.intervals.Interval(10, 41)],
    }
    ref_nucmer_coords = {
        'contig1': [pyfastaq.intervals.Interval(0, 41)],
        'contig2': [pyfastaq.intervals.Interval(10, 41)],
    }

    expected = {
        'contig1': [
            (4, 'p', 'A5D', 'NONSYN', [v2, v3], set(), set()),
            (None, 'p', None, None, None, {meta1}, set()),
            (None, 'p', None, None, None, {meta3}, set()),
        ],
        'contig2': [(None, 'p', None, None, None, {meta3}, set())],
    }

    a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
    got = a_variants.get_variants(
        'variants_only', ctg_nucmer_coords, ref_nucmer_coords)
    self.assertEqual(expected, got)
def test_one_var_one_ctg_noncdg(self):
    '''Check AssemblyVariants._get_one_variant_for_one_contig_non_coding one variant at a time.'''
    fasta_path = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.fa')
    tsv_path = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_noncdg.tsv')
    refdata = reference_data.ReferenceData([fasta_path], [tsv_path])
    ref_sequence_name = 'non_coding'
    known_variants = refdata.metadata[ref_sequence_name]

    snp_t2a = pymummer.variant.Variant(
        pymummer.snp.Snp('2\tT\tA\t2\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

    # ref has A at position 3, which is variant type. This gives contig the
    # wild type C. Shouldn't report
    snp_a3c = pymummer.variant.Variant(
        pymummer.snp.Snp('3\tA\tC\t3\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

    # ref has T at position 5, which is wild type. This gives contig variant
    # type A. Should report
    snp_t5a = pymummer.variant.Variant(
        pymummer.snp.Snp('5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))

    meta_c3a = sequence_metadata.SequenceMetadata(
        'non_coding\t0\t0\tC3A\tid1\tref has variant type A')
    meta_t5a = sequence_metadata.SequenceMetadata(
        'non_coding\t0\t0\tT5A\tid1\tref has wild type T')

    # The three lists are parallel: entry i of each belongs to the same case.
    mummer_variants = [snp_t2a, snp_a3c, snp_t5a]
    wanted_tuples = [
        (1, 'n', 'T2A', 'SNP', [snp_t2a], set(), set()),
        None,
        (4, 'n', 'T5A', 'SNP', [snp_t5a], {meta_t5a}, set()),
    ]
    wanted_used = [
        set(),
        {meta_c3a},
        {meta_t5a},
    ]
    assert len(mummer_variants) == len(wanted_tuples) == len(wanted_used)

    for variant, want_tuple, want_used in zip(mummer_variants, wanted_tuples, wanted_used):
        got_tuple, got_used = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_non_coding(
            known_variants, variant)
        self.assertEqual(want_tuple, got_tuple)
        self.assertEqual(want_used, got_used)
def _load_metadata_tsv(cls, filename, metadata_dict):
    '''Load one metadata TSV file into metadata_dict and return the dict.

    Each line is parsed with sequence_metadata.SequenceMetadata.  Lines that
    fail to parse are reported on stderr and skipped.  For every sequence
    name a sub-dict is kept with keys 'seq_type', 'variant_only', 'n', 'p'
    and '.'.  Metadata without a variant is added to the '.' set; metadata
    with a variant is added to the set stored under its seq_type and the
    variant's position.

    Args:
        filename: file to read, or None.
        metadata_dict: dict to update in place.

    Returns:
        metadata_dict after updating.  NOTE: when filename is None a new
        empty dict is returned rather than metadata_dict.

    Raises:
        Error: if a line's seq_type or variant_only disagrees with what was
            already stored for the same sequence name.
    '''
    if filename is None:
        return {}

    f = pyfastaq.utils.open_file_read(filename)
    try:
        for line in f:
            try:
                metadata = sequence_metadata.SequenceMetadata(line)
            except Exception:
                # Best-effort loading: report and skip unparsable lines, but
                # let KeyboardInterrupt/SystemExit propagate.
                print('Problem with this line of metadata, which will be ignored:', line.rstrip(), file=sys.stderr)
                continue

            if metadata.name not in metadata_dict:
                metadata_dict[metadata.name] = {
                    'seq_type': metadata.seq_type,
                    'variant_only': metadata.variant_only,
                    'n': {},
                    'p': {},
                    '.': set(),
                }

            entry = metadata_dict[metadata.name]
            if metadata.seq_type != entry['seq_type'] or metadata.variant_only != entry['variant_only']:
                raise Error('Inconsistent metadata for sequence ' + metadata.name + '. Cannot continue')

            if metadata.variant is None:
                entry['.'].add(metadata)
            else:
                entry[metadata.seq_type].setdefault(metadata.variant.position, set()).add(metadata)
    finally:
        # Close the handle even when the inconsistency Error is raised.
        pyfastaq.utils.close(f)

    return metadata_dict
def test_all_non_wild_type_variants(self):
    '''Check ReferenceData.all_non_wild_type_variants for each sequence and for an unknown name.'''
    tsv_path = os.path.join(
        data_dir, 'reference_data_test_all_non_wild_type_variants.tsv')
    fasta_path = os.path.join(
        data_dir, 'reference_data_test_all_non_wild_type_variants.ref.fa')
    refdata = reference_data.ReferenceData([fasta_path], [tsv_path])

    make_meta = sequence_metadata.SequenceMetadata
    v1 = make_meta('var_only_gene\t1\t1\tP3Q\t.\tref has wild type P')
    v2 = make_meta('var_only_gene\t1\t1\tG4I\t.\tref has wild type F')
    v3 = make_meta('var_only_gene\t1\t1\tI5V\t.\tref has variant V instead of I')
    v4 = make_meta('var_only_gene\t1\t1\tF6I\t.\tref has wild type F')
    p1 = make_meta('presence_absence_gene\t1\t0\tN2I\t.\tref has wild type N')
    p2 = make_meta('presence_absence_gene\t1\t0\tA4G\t.\tref has variant G instead of A')
    n1 = make_meta('non_coding\t0\t0\tA2C\t.\tref has wild type A')
    n2 = make_meta('non_coding\t0\t0\tC4T\t.\tref has variant T instead of C')

    # (sequence name, expected result) in the order they are checked.
    cases = [
        ('var_only_gene', {'n': {}, 'p': {2: {v1}, 3: {v2}, 4: {v3}, 5: {v4}}}),
        ('presence_absence_gene', {'n': {}, 'p': {1: {p1}, 3: {p2}}}),
        ('non_coding', {'n': {1: {n1}, 3: {n2}}, 'p': {}}),
        ('not_a_known_sequence', {'n': {}, 'p': {}}),
    ]
    for seq_name, wanted in cases:
        self.assertEqual(wanted, refdata.all_non_wild_type_variants(seq_name))
def test_rename_metadata_set(self):
    '''Check ReferenceData._rename_metadata_set gives every metadata object the new name.'''
    make_meta = sequence_metadata.SequenceMetadata
    original = {
        make_meta('foo 1\t1\t0\t.\t.\tdescription'),
        make_meta('foo 1\t1\t0\tI42L\t.\tspam eggs'),
    }
    wanted = {
        make_meta('new_name\t1\t0\t.\t.\tdescription'),
        make_meta('new_name\t1\t0\tI42L\t.\tspam eggs'),
    }
    renamed = reference_data.ReferenceData._rename_metadata_set(original, 'new_name')
    self.assertEqual(wanted, renamed)
def test_load_input_check_seq_names_ok(self):
    '''Check ReferenceData._load_input_files_and_check_seq_names with two good fasta/metadata file pairs.'''
    suffixes = ['1', '2']
    fasta_files = []
    for suffix in suffixes:
        fasta_files.append(os.path.join(
            data_dir, 'reference_data_load_input_check_seq_names.good.fa.' + suffix))
    metadata_files = []
    for suffix in suffixes:
        metadata_files.append(os.path.join(
            data_dir, 'reference_data_load_input_check_seq_names.good.csv.' + suffix))

    wanted_seqs = {
        'seq1': pyfastaq.sequences.Fasta('seq1', 'ACGT'),
        'seq2': pyfastaq.sequences.Fasta('seq2', 'TTTT'),
    }
    seq1_meta = sequence_metadata.SequenceMetadata('seq1\t0\t0\tA1G\t.\tfree text')
    seq2_meta = sequence_metadata.SequenceMetadata('seq2\t0\t0\t.\t.\tspam eggs')
    wanted_meta = {
        'seq1': {
            'seq_type': 'n',
            'variant_only': False,
            'n': {0: {seq1_meta}},
            'p': {},
            '.': set(),
        },
        'seq2': {
            'seq_type': 'n',
            'variant_only': False,
            'n': {},
            'p': {},
            '.': {seq2_meta},
        },
    }

    got_seqs, got_meta = reference_data.ReferenceData._load_input_files_and_check_seq_names(
        fasta_files, metadata_files)
    self.assertEqual(wanted_seqs, got_seqs)
    self.assertEqual(wanted_meta, got_meta)
def test_init_on_good_input(self):
    '''Parse one well-formed metadata line and check every attribute read from it.'''
    parsed = sequence_metadata.SequenceMetadata(
        'gene\t1\t0\tI42L\tid\tspam spam wonderful spam')

    # Top-level fields.
    self.assertEqual(parsed.name, 'gene')
    self.assertEqual(parsed.seq_type, 'p')
    self.assertEqual(parsed.variant_only, False)

    # Fields of the parsed variant.
    variant = parsed.variant
    self.assertEqual(variant.wild_value, 'I')
    self.assertEqual(variant.variant_value, 'L')
    self.assertEqual(variant.identifier, 'id')

    self.assertEqual(parsed.free_text, 'spam spam wonderful spam')
def test_str(self):
    '''Check that str() of a SequenceMetadata reproduces the line it was built from.'''
    round_trip_lines = (
        'gene1\t1\t1\tA42G\tid1\tspam',
        'gene2\t0\t0\t.\t.\t.',
        'gene3\t0\t0\t.\t.\teggs',
        'gene4\t1\t0\tI42K\tid\tthis mutation kills tardigrades',
    )
    for original in round_trip_lines:
        rebuilt = str(sequence_metadata.SequenceMetadata(original))
        self.assertEqual(original, rebuilt)
def test_load_all_metadata_tsvs(self):
    '''Check ReferenceData._load_all_metadata_tsvs merges two TSV files into one dict.'''
    tsv_prefix = 'reference_data_load_all_metadata_tsvs.'
    input_files = []
    for number in ['1', '2']:
        input_files.append(os.path.join(data_dir, tsv_prefix + number + '.tsv'))

    gene1_a42g = sequence_metadata.SequenceMetadata(
        'gene1\t0\t0\tA42G\t.\tfree text')
    gene1_g13t = sequence_metadata.SequenceMetadata(
        'gene1\t0\t0\tG13T\t.\tconfers killer rabbit resistance')
    gene2_i42l = sequence_metadata.SequenceMetadata(
        "gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability")

    wanted = {
        'gene1': {
            'seq_type': 'n',
            'variant_only': False,
            'n': {12: {gene1_g13t}, 41: {gene1_a42g}},
            'p': {},
            '.': set(),
        },
        'gene2': {
            'seq_type': 'p',
            'variant_only': False,
            'n': {},
            'p': {41: {gene2_i42l}},
            '.': set(),
        },
    }
    loaded = reference_data.ReferenceData._load_all_metadata_tsvs(input_files)
    self.assertEqual(wanted, loaded)
def test_to_string(self):
    '''Check SequenceMetadata.to_string joins the fields with the requested separator.'''
    field_rows = [
        ('gene1', '0', '0', 'A42G', 'id1', 'spam'),
        ('gene2', '0', '0', '.', '.', '.'),
        ('gene3', '0', '0', '.', '.', 'eggs'),
        ('gene4', '1', '0', 'I42K', 'id', 'this mutation kills tardigrades'),
    ]
    separators = ('_', '\t')
    for fields in field_rows:
        # The object is always built from the tab-separated form.
        meta = sequence_metadata.SequenceMetadata('\t'.join(fields))
        for sep in separators:
            self.assertEqual(sep.join(fields), meta.to_string(separator=sep))
def test_init_fails_on_bad_lines(self):
    '''Check that SequenceMetadata raises the expected error for each malformed line.'''
    # Lines with the wrong number of columns.
    wrong_column_count = [
        'only one column. There can NOT be only one\n',
        'two\tcolumns is not enough\n',
        'three\tcolumns\tis still not enough\n',
        'four\tcolumns\tis\talso not enough\n',
        'five\tcolumns\tis\talso\tnot enough\n',
        'seven\tcolumns\tis\tone\tmore\tthan\nwe want',
    ]
    for bad_line in wrong_column_count:
        self.assertRaises(
            sequence_metadata.Error,
            sequence_metadata.SequenceMetadata,
            bad_line,
        )

    # Six-column lines paired with the error each one is expected to raise.
    bad_value_cases = [
        ('gene\tx\t0\t.\t.\tfoo\n', sequence_metadata.Error),
        ('gene\t1\t2\t.\t.\tfoo\n', sequence_metadata.Error),
        ('gene\t1\t1\tI42\t.\tfoo\n', sequence_variant.Error),
    ]
    for bad_line, wanted_error in bad_value_cases:
        self.assertRaises(
            wanted_error,
            sequence_metadata.SequenceMetadata,
            bad_line,
        )
def test_has_variant(self):
    '''Check SequenceMetadata.has_variant against one fixed sequence.'''
    # translation: MYC*
    seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA')

    # (metadata line, expected has_variant result)
    cases = [
        ('gene1\t0\t0\t.\t.\t.', False),
        ('gene1\t0\t0\tA2T\t.\t,', True),
        ('gene1\t0\t0\tT2A\t.\t.', False),
        ('gene1\t1\t0\tI2Y\t.\t.', True),
        ('gene1\t1\t0\tY2I\t.\t.', False),
    ]
    for meta_line, wanted in cases:
        meta = sequence_metadata.SequenceMetadata(meta_line)
        self.assertEqual(wanted, meta.has_variant(seq))
def test_get_remaining_known_ref_variants_amino_acids(self):
    '''Check AssemblyVariants._get_remaining_known_ref_variants with amino acid variants.'''
    make_meta = sequence_metadata.SequenceMetadata
    ref_var1 = make_meta('gene1\t1\t0\tD2E\tid1\tfoo bar')
    ref_var2 = make_meta('gene1\t1\t0\tD3E\tid1\tfoo bar baz')
    ref_var3 = make_meta('gene1\t1\t0\tD3I\tid1\tfoo bar baz spam')
    ref_var4 = make_meta('gene1\t1\t0\tD10E\tid1\tfoo bar baz spam egg')
    ref_var5 = make_meta('gene1\t1\t0\tD14E\tid1\tfoo bar baz spam egg chips')
    ref_var6 = make_meta('gene1\t1\t0\tD15E\tid1\tfoo bar baz spam egg chips')
    ref_var7 = make_meta('gene1\t1\t0\tD40E\tid1\tfoo bar baz spam egg chips')

    known_ref_variants = {
        1: {ref_var1},
        2: {ref_var2, ref_var3},
        9: {ref_var4},
        13: {ref_var5},
        14: {ref_var6},
        39: {ref_var7},
    }
    already_used = {ref_var3, ref_var5}
    nucmer_coords = [
        pyfastaq.intervals.Interval(6, 25),
        pyfastaq.intervals.Interval(30, 100),
    ]

    wanted = [
        (None, 'p', None, None, None, {ref_var2}, set()),
        (None, 'p', None, None, None, {ref_var6}, set()),
    ]
    got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(
        known_ref_variants, already_used, nucmer_coords)
    self.assertEqual(wanted, got)
def test_get_remaining_known_ref_variants_nucleotides(self):
    '''Check AssemblyVariants._get_remaining_known_ref_variants with nucleotide variants.'''
    make_meta = sequence_metadata.SequenceMetadata
    ref_var1 = make_meta('gene1\t0\t0\tA2C\tid1\tfoo bar')
    ref_var2 = make_meta('gene1\t0\t0\tA3C\tid1\tfoo bar baz')
    ref_var3 = make_meta('gene1\t0\t0\tA3T\tid1\tfoo bar baz spam')
    ref_var4 = make_meta('gene1\t0\t0\tA10C\tid1\tfoo bar baz spam egg')
    ref_var5 = make_meta('gene1\t0\t0\tA14C\tid1\tfoo bar baz spam egg chips')
    ref_var6 = make_meta('gene1\t0\t0\tA15C\tid1\tfoo bar baz spam egg chips')
    ref_var7 = make_meta('gene1\t0\t0\tA40C\tid1\tfoo bar baz spam egg chips')

    known_ref_variants = {
        1: {ref_var1},
        2: {ref_var2, ref_var3},
        9: {ref_var4},
        13: {ref_var5},
        14: {ref_var6},
        39: {ref_var7},
    }
    already_used = {ref_var3, ref_var5}
    nucmer_coords = [
        pyfastaq.intervals.Interval(2, 13),
        pyfastaq.intervals.Interval(30, 100),
    ]

    wanted = [
        (None, 'n', None, None, None, {ref_var2}, set()),
        (None, 'n', None, None, None, {ref_var4}, set()),
        (None, 'n', None, None, None, {ref_var7}, set()),
    ]
    got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(
        known_ref_variants, already_used, nucmer_coords)
    self.assertEqual(wanted, got)
def test_rename_sequences(self):
    '''Check ReferenceData.rename_sequences: the written file, then the updated object.

    The output file is compared with the expected file and removed in a
    ``finally`` clause, so a failed comparison does not leave the temporary
    file behind.  After that the metadata, sequences and rename dict of the
    ReferenceData object are checked.
    '''
    fasta_in = os.path.join(data_dir, 'reference_data_rename_sequences.fa')
    tsv_in = os.path.join(data_dir, 'reference_data_rename_sequences_metadata.tsv')
    refdata = reference_data.ReferenceData([fasta_in], [tsv_in])
    tmp_out = 'tmp.test_rename_sequences.out'
    refdata.rename_sequences(tmp_out)
    expected_file = os.path.join(
        data_dir, 'reference_data_test_rename_sequences.out')
    try:
        self.assertTrue(filecmp.cmp(expected_file, tmp_out, shallow=False))
    finally:
        os.unlink(tmp_out)

    # One row per expected sequence: (name, metadata line, sequence).
    # Every expected metadata entry has the same shape and differs only in
    # its name and in the single metadata object in the '.' set.
    expected_rows = [
        ('noncoding1',
         'noncoding1\t0\t0\t.\t.\toriginal name "noncoding1 blah"',
         'AAAA'),
        ('pres_abs1_1',
         'pres_abs1_1\t0\t0\t.\t.\toriginal name "pres_abs1 foo bar spam eggs"',
         'ACGT'),
        ('pres_abs1',
         'pres_abs1\t0\t0\t.\t.\toriginal name "pres\'abs1"',
         'CCCC'),
        ('pres_abs2',
         'pres_abs2\t0\t0\t.\t.\toriginal name "pres_abs2"',
         'TTTT'),
        ('pres_abs3',
         'pres_abs3\t0\t0\t.\t.\toriginal name "pres!abs3"',
         'GGGG'),
        ('var_only1_2',
         'var_only1_2\t0\t0\t.\t.\toriginal name "var_only1 hello"',
         'AAAA'),
        ('var_only1',
         'var_only1\t0\t0\t.\t.\toriginal name "var,only1"',
         'GGGG'),
        ('var_only1_1',
         'var_only1_1\t0\t0\t.\t.\toriginal name "var:only1 boo"',
         'CCCC'),
        ('var_only2',
         'var_only2\t0\t0\t.\t.\toriginal name "var_only2"',
         'TTTT'),
    ]

    expected_meta = {
        name: {
            'seq_type': 'n',
            'variant_only': False,
            'n': {},
            'p': {},
            '.': {sequence_metadata.SequenceMetadata(meta_line)},
        }
        for name, meta_line, _ in expected_rows
    }
    self.maxDiff = None
    # Compare the key sets first so a missing or extra name gives a short diff.
    self.assertEqual(set(expected_meta.keys()), set(refdata.metadata.keys()))
    self.assertEqual(expected_meta, refdata.metadata)

    expected_seqs_dict = {
        name: pyfastaq.sequences.Fasta(name, seq)
        for name, _, seq in expected_rows
    }
    self.assertEqual(expected_seqs_dict, refdata.sequences)

    expected_rename_dict = {
        'pres!abs3': 'pres_abs3',
        'pres\'abs1': 'pres_abs1',
        'pres_abs1': 'pres_abs1_1',
        'var,only1': 'var_only1',
        'var:only1': 'var_only1_1',
        'var_only1': 'var_only1_2',
    }
    self.assertEqual(expected_rename_dict, refdata.rename_dict)
def test_rename_names_in_metadata(self):
    '''Check ReferenceData._rename_names_in_metadata renames gene1 and leaves gene2 untouched.'''
    make_meta = sequence_metadata.SequenceMetadata

    # Metadata under the original names.
    meta1 = make_meta('gene1\t0\t0\tA42G\t.\tfree text')
    meta2 = make_meta('gene1\t0\t0\tA42T\t.\tfree text2')
    meta3 = make_meta('gene1\t0\t0\t.\t.\tfree text3')
    meta4 = make_meta('gene1\t0\t0\tG13T\t.\tconfers killer rabbit resistance')
    meta5 = make_meta("gene2\t1\t0\tI42L\t.\tremoves tardigrade's space-living capability")

    # The gene1 metadata as it should look after renaming.
    meta1_renamed = make_meta('new_gene1\t0\t0\tA42G\t.\tfree text')
    meta2_renamed = make_meta('new_gene1\t0\t0\tA42T\t.\tfree text2')
    meta3_renamed = make_meta('new_gene1\t0\t0\t.\t.\tfree text3')
    meta4_renamed = make_meta('new_gene1\t0\t0\tG13T\t.\tconfers killer rabbit resistance')

    before = {
        'gene1': {
            'n': {12: {meta4}, 41: {meta1, meta2}},
            'p': {},
            '.': {meta3},
        },
        'gene2': {
            'n': {},
            'p': {41: {meta5}},
            '.': set(),
        },
    }
    wanted = {
        'new_gene1': {
            'n': {12: {meta4_renamed}, 41: {meta1_renamed, meta2_renamed}},
            'p': {},
            '.': {meta3_renamed},
        },
        'gene2': {
            'n': {},
            'p': {41: {meta5}},
            '.': set(),
        },
    }

    got = reference_data.ReferenceData._rename_names_in_metadata(
        before, {'gene1': 'new_gene1'})
    self.assertEqual(wanted, got)
def test_one_var_one_ctg_cdg(self):
    '''Check AssemblyVariants._get_one_variant_for_one_contig_coding one case at a time.

    Cases 0-5 are single SNPs; cases 6-10 are indels built by extending a
    first SNP with update_indel.
    '''
    fasta_path = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_cdg.fa')
    tsv_path = os.path.join(data_dir, 'assembly_variants_one_var_one_ctg_cdg.tsv')
    refdata = reference_data.ReferenceData([fasta_path], [tsv_path])
    ref_sequence_name = 'presence_absence'
    ref_sequence = refdata.sequence(ref_sequence_name)
    known_variants = refdata.metadata[ref_sequence_name]

    def build_variant(first_snp_line, *extra_snp_lines):
        # Make a Variant from the first SNP line, then extend it with each
        # extra line, asserting that every update_indel call succeeds.
        variant = pymummer.variant.Variant(pymummer.snp.Snp(first_snp_line))
        for snp_line in extra_snp_lines:
            self.assertTrue(variant.update_indel(pymummer.snp.Snp(snp_line)))
        return variant

    v0 = build_variant('6\tT\tA\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
    v1 = build_variant('9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
    v2 = build_variant('18\tG\tT\t18\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
    v3 = build_variant('21\tC\tT\t21\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
    v4 = build_variant('7\tA\tT\t7\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
    v5 = build_variant('12\tA\tC\t11\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')
    v6 = build_variant(
        '4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
    )
    v7 = build_variant(
        '4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '4\t.\tA\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
    )
    v8 = build_variant(
        '4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
    )
    v9 = build_variant(
        '4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '7\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '8\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '9\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
    )
    v10 = build_variant(
        '4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '4\t.\tT\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
        '4\t.\tT\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig',
    )

    # Each case passes a one-element list of variants to the method.
    mummer_variants = [[v] for v in (v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10)]

    meta0 = sequence_metadata.SequenceMetadata(
        'presence_absence\t1\t0\tD2E\tid1\tref has wild type D (GAT=D, GAA=E)')
    meta4 = sequence_metadata.SequenceMetadata(
        'presence_absence\t1\t0\tS3R\tid1\tref has variant type R (AGA=R, AGT=S)')

    wanted_tuples = [
        (1, 'p', 'D2E', 'NONSYN', [v0], {meta0}, set()),        # 0
        None,                                                   # 1
        (5, 'p', 'M6I', 'NONSYN', [v2], set(), set()),          # 2
        (6, 'p', '.', 'SYN', [v3], set(), set()),               # 3
        (2, 'p', 'R3trunc', 'TRUNC', [v4], set(), {meta4}),     # 4
        None,                                                   # 5
        (1, 'p', 'D2fs', 'FSHIFT', [v6], set(), {meta0}),       # 6
        (1, 'p', 'D2fs', 'FSHIFT', [v7], set(), {meta0}),       # 7
        (1, 'p', 'D2del', 'DEL', [v8], set(), {meta0}),         # 8
        (1, 'p', 'D2_R3del', 'DEL', [v9], set(), {meta0}),      # 9
        (1, 'p', 'D2_R3insI', 'INS', [v10], set(), {meta0}),    # 10
    ]

    wanted_used = [
        known_variants['p'][1],  # 0
        known_variants['p'][2],  # 1
        set(),                   # 2
        set(),                   # 3
        known_variants['p'][2],  # 4
        known_variants['p'][3],  # 5
        known_variants['p'][1],  # 6
        known_variants['p'][1],  # 7
        known_variants['p'][1],  # 8
        known_variants['p'][1],  # 9
        known_variants['p'][1],  # 10
    ]

    assert len(mummer_variants) == len(wanted_tuples) == len(wanted_used)

    for variants, want_tuple, want_used in zip(mummer_variants, wanted_tuples, wanted_used):
        got_tuple, got_used = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_coding(
            ref_sequence, known_variants, variants)
        self.assertEqual(want_tuple, got_tuple)
        self.assertEqual(want_used, got_used)
def test_get_variants_presence_absence(self):
    '''Check AssemblyVariants.get_variants for the 'presence_absence' reference sequence.

    The metadata TSV is written to a temporary file in the current
    directory, loaded into ReferenceData, and then removed.  Removal is done
    in a ``finally`` clause so that the file does not linger when writing it
    or loading the reference data raises.
    '''
    meta1 = sequence_metadata.SequenceMetadata(
        'presence_absence\t1\t0\tD2E\tid1\tref has wild type D, contig has var (GAT=D, GAA=E)')
    meta2 = sequence_metadata.SequenceMetadata(
        'presence_absence\t1\t0\tS3R\tid1\tref has variant type R, contig has wild (AGA=R, AGT=S)')
    meta3 = sequence_metadata.SequenceMetadata(
        'presence_absence\t1\t0\tD4E\tid1\tref has variant type E, contig has var (GAA=E, GAC=D)')
    meta4 = sequence_metadata.SequenceMetadata(
        'presence_absence\t1\t0\tA5D\tid1\tref has wild type A, contig has var (GCG=A, GAC=D)')
    meta5 = sequence_metadata.SequenceMetadata(
        'presence_absence\t1\t0\tR13S\tid1\tref and qry have wild type')

    metadata_tsv = 'tmp.test_get_variants_presence_absence.metadata.tsv'
    fasta_in = os.path.join(
        data_dir, 'assembly_variants_test_get_variants_presence_absence.fa')
    try:
        with open(metadata_tsv, 'w') as f:
            for meta in (meta1, meta2, meta3, meta4, meta5):
                print(meta, file=f)
        refdata = reference_data.ReferenceData([fasta_in], [metadata_tsv])
    finally:
        # Guard with exists(): open() itself may have failed, and an
        # unlink error here would mask the original exception.
        if os.path.exists(metadata_tsv):
            os.unlink(metadata_tsv)

    nucmer_snp_file = os.path.join(
        data_dir, 'assembly_variants_test_get_variants_presence_absence.snps')
    v2 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
    v3 = pymummer.variant.Variant(
        pymummer.snp.Snp(
            '15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))

    ref_nucmer_coords = {
        'contig1': [pyfastaq.intervals.Interval(0, 30)],
        'contig2': [pyfastaq.intervals.Interval(10, 41)],
    }
    ctg_nucmer_coords = {
        'contig1': [pyfastaq.intervals.Interval(0, 30)],
        'contig2': [pyfastaq.intervals.Interval(10, 41)],
    }

    expected = {
        'contig1': [
            (4, 'p', 'A5D', 'NONSYN', [v2, v3], {meta4}, set()),
            (None, 'p', None, None, None, {meta1}, set()),
            (None, 'p', None, None, None, {meta3}, set()),
        ],
        'contig2': [
            (None, 'p', None, None, None, {meta3}, set()),
            (None, 'p', None, None, None, {meta4}, set()),
            (None, 'p', None, None, None, {meta5}, set()),
        ],
    }

    a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
    got = a_variants.get_variants(
        'presence_absence', ctg_nucmer_coords, ref_nucmer_coords)
    self.assertEqual(expected, got)