def test_variants_to_tsv_lines_noncoding(self):
    '''Check _variants_to_tsv_lines output for nucleotide variants on a gapped alignment'''
    aligned_bases = {
        'seq1': 'ATG---GCTAATTAG',
        'seq2': 'ATG---GCTAATTAG',
        'seq3': 'ATGTAT---AATTAG',
        'seq4': 'ATGTGTTGTAATTAG',
        'seq5': 'ATGTTTGATAATTAG',
    }
    padded = {
        name: pyfastaq.sequences.Fasta(name, bases)
        for name, bases in aligned_bases.items()
    }
    converter = aln_to_metadata.AlnToMetadata
    unpadded = converter._make_unpadded_seqs(padded)
    insertion_coords = converter._make_unpadded_insertion_coords(padded)

    # Each variant is supplied against a single sequence; the expected lines
    # below list it against several sequences, with the wild-type base and
    # position differing between sequences.
    known_variants = {
        'seq1': [(sequence_variant.Variant('n', 'C5T', 'id1'), 'description 1')],
        'seq5': [(sequence_variant.Variant('n', 'A5T', 'id2'), 'description 2')],
    }

    expected = [
        'seq1\t0\t1\tC5T\tid1\tdescription 1',
        'seq2\t0\t1\tC5T\tid1\tdescription 1',
        'seq4\t0\t1\tG8T\tid1\tdescription 1',
        'seq5\t0\t1\tA8T\tid1\tdescription 1',
        'seq5\t0\t1\tA5T\tid2\tdescription 2',
        'seq3\t0\t1\tA5T\tid2\tdescription 2',
        'seq4\t0\t1\tG5T\tid2\tdescription 2',
    ]

    got = converter._variants_to_tsv_lines(
        known_variants, unpadded, padded, insertion_coords, False, True
    )
    self.assertEqual(expected, got)
def test_variants_to_tsv_lines_coding(self):
    '''Check _variants_to_tsv_lines output for amino acid variants on a gapped alignment'''
    aligned_bases = {
        'seq1': 'ATG---GCTAATTAG',  # M-AN*
        'seq2': 'ATG---GCTAATTAG',  # M-AN*
        'seq3': 'ATGTTT---AATTAG',  # MF-N*
        'seq4': 'ATGTTTTGTAATTAG',  # MFCN*
        'seq5': 'ATGTTTGATAATTAG',  # MFDN*
    }
    padded = {
        name: pyfastaq.sequences.Fasta(name, bases)
        for name, bases in aligned_bases.items()
    }
    converter = aln_to_metadata.AlnToMetadata
    unpadded = converter._make_unpadded_seqs(padded)
    insertion_coords = converter._make_unpadded_insertion_coords(padded)

    # Each variant is supplied against a single sequence; the expected lines
    # below list it against several sequences, with the wild-type residue and
    # position differing between sequences.
    known_variants = {
        'seq1': [(sequence_variant.Variant('p', 'A2D', 'id1'), 'description 1')],
        'seq5': [(sequence_variant.Variant('p', 'F2E', 'id2'), 'description 2')],
    }

    expected = [
        'seq1\t1\t0\tA2D\tid1\tdescription 1',
        'seq2\t1\t0\tA2D\tid1\tdescription 1',
        'seq4\t1\t0\tC3D\tid1\tdescription 1',
        'seq5\t1\t0\tA3D\tid1\tdescription 1',
        'seq5\t1\t0\tF2E\tid2\tdescription 2',
        'seq3\t1\t0\tF2E\tid2\tdescription 2',
        'seq4\t1\t0\tF2E\tid2\tdescription 2',
    ]

    got = converter._variants_to_tsv_lines(
        known_variants, unpadded, padded, insertion_coords, True, False
    )
    self.assertEqual(expected, got)
def test_variant_ids_are_unique(self):
    '''_variant_ids_are_unique is true for distinct ids and raises Error on a repeated id'''
    def entry(change, var_id, text):
        # One (Variant, description) pair, as stored in the variants dict.
        return (sequence_variant.Variant('p', change, var_id), text)

    check_unique = aln_to_metadata.AlnToMetadata._variant_ids_are_unique

    variants = {
        'seq1': [entry('L2M', 'id1', 'description1')],
        'seq2': [entry('L2M', 'id2', 'description2')],
    }
    self.assertTrue(check_unique(variants))

    # Add a second variant that re-uses 'id1': this is expected to raise.
    variants['seq2'].append(entry('I3K', 'id1', 'description3'))
    with self.assertRaises(aln_to_metadata.Error):
        self.assertTrue(check_unique(variants))
def test_has_variant(self):
    '''Variant.has_variant reports whether a sequence carries the variant value'''
    seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA')  # translation: MYC*

    # (variant type, change, expected result of has_variant)
    cases = (
        ('n', 'A2T', True),
        ('n', 'T2A', False),
        ('p', 'I2Y', True),
        ('p', 'Y2I', False),
    )
    checks = [
        (sequence_variant.Variant(var_type, change, '.'), wanted)
        for var_type, change, wanted in cases
    ]

    for variant, wanted in checks:
        self.assertEqual(wanted, variant.has_variant(seq))
def test_load_vars_file_good_file(self):
    '''_load_vars_file on a well-formed file returns {sequence name: [(Variant, description), ...]}'''
    def protein_variant(change, var_id):
        return sequence_variant.Variant('p', change, var_id)

    expected = {
        'seq1': [
            (protein_variant('A42B', 'id1'), 'description 1'),
        ],
        'seq2': [
            (protein_variant('C43D', 'id2'), 'description 2'),
            (protein_variant('E100F', 'id3'), 'description 3'),
        ],
    }

    infile = os.path.join(data_dir, 'aln_to_metadata_load_vars_file_good.tsv')
    got = aln_to_metadata.AlnToMetadata._load_vars_file(infile, True)
    self.assertEqual(expected, got)
def test_init_str(self):
    '''str() of a Variant gives the same upper-case text whatever the case of the input'''
    for text in ('I42K', 'i42k', 'I42k', 'i42K'):
        variant = sequence_variant.Variant('p', text, '.')
        self.assertEqual('I42K', str(variant))
def test_check_variants_match_sequences(self):
    '''_check_variants_match_sequences accepts consistent variants and raises Error otherwise'''
    seqs = {
        name: pyfastaq.sequences.Fasta(name, bases)
        for name, bases in (
            ('seq1', 'ATGCTTTAG'),
            ('seq2', 'ATGCTTCTTTAG'),
            ('seq3', 'ATG---TAG'),
        )
    }
    check = aln_to_metadata.AlnToMetadata._check_variants_match_sequences

    def single_variant(seq_name, change):
        # Variants dict holding exactly one protein variant for one sequence.
        return {seq_name: [(sequence_variant.Variant('p', change, 'id1'), 'description1')]}

    # Accepted: the second residue of seq1 appears as either the wild or the
    # variant value.
    for change in ('L2M', 'M2L'):
        self.assertTrue(check(seqs, single_variant('seq1', change), True))

    # Rejected: neither value of A2M is found in seq1, and seq4 is not in seqs.
    for seq_name in ('seq1', 'seq4'):
        with self.assertRaises(aln_to_metadata.Error):
            self.assertTrue(check(seqs, single_variant(seq_name, 'A2M'), True))
def test_sanity_check_against_seq_translate(self):
    '''sanity_check_against_seq with translate_seq=True compares against the translated sequence'''
    seq = 'AGTACGACGTAC'  # translates to STTY

    # (variant string, expected sanity check result)
    cases = (
        ('S1X', True),
        ('x1s', True),
        ('a1y', False),
        ('x5y', False),
    )
    for change, wanted in cases:
        variant = sequence_variant.Variant('p', change, '.')
        result = variant.sanity_check_against_seq(seq, translate_seq=True)
        self.assertEqual(wanted, result)
def test_sanity_check_against_seq_no_translate(self):
    '''sanity_check_against_seq called without translate_seq checks the sequence as given'''
    seq = 'BrissSpecialStvff'

    # (variant string, expected sanity check result)
    cases = (
        ('I3K', True),
        ('K3I', True),
        ('A2b', False),
        ('x1000y', False),
    )
    for change, wanted in cases:
        variant = sequence_variant.Variant('p', change, '.')
        result = variant.sanity_check_against_seq(seq)
        self.assertEqual(wanted, result)
def test_init_ok(self):
    '''Variant init parses position, wild value, variant value and identifier'''
    inputs = (
        ('I42K', '.'),
        ('i42k', 'id1'),
        ('I42k', 'id2'),
        ('i42K', 'id3'),
    )
    for text, identifier in inputs:
        parsed = sequence_variant.Variant('p', text, identifier)

        # Position 42 in the string is stored as 41.
        self.assertEqual(41, parsed.position)
        self.assertEqual('I', parsed.wild_value)
        self.assertEqual('K', parsed.variant_value)

        # An identifier of '.' is stored as None; anything else is kept.
        if identifier != '.':
            self.assertEqual(identifier, parsed.identifier)
        else:
            self.assertIsNone(parsed.identifier)
def test_init_fails_on_bad_variant_strings(self):
    '''Variant init raises sequence_variant.Error for malformed variant strings'''
    malformed = ('x', 'x1', '1x', '1x1', 'I42K43', 'I-1K')
    for text in malformed:
        with self.assertRaises(sequence_variant.Error):
            sequence_variant.Variant('p', text, '.')
def _get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variants_list):
    '''Builds the variant tuple for one amino acid change and the known variants it used.

    Returns a pair (var_tuple, used_known_variants):
      - var_tuple is (position, 'p', variant string, effect, mummer_variants_list,
        matching known variants, other known variants at the position), or None
        when the change is NONSYN and its variant value equals the wild value of
        a known variant at that position.
      - used_known_variants is the set of known variants placed in var_tuple;
        it is empty when var_tuple is None or the position has no known variants.
    '''
    effect, change_string, position = AssemblyVariants._get_variant_effect(mummer_variants_list, ref_sequence)

    position_is_known = refdata_var_dict is not None and position in refdata_var_dict['p']
    if not position_is_known:
        # this variant is not at a known position in the reference
        unknown_position_tuple = (
            position,
            'p',
            change_string,
            effect,
            mummer_variants_list,
            set(),
            set()
        )
        return unknown_position_tuple, set()

    # this variant is at the same position as a known variant in the reference
    if effect == 'NONSYN':
        called = sequence_variant.Variant('p', change_string, '.')
        known_here = set(refdata_var_dict['p'][called.position])
        matching = {
            known for known in known_here
            if called.variant_value == known.variant.variant_value
        }
        same_as_wild = {
            known for known in known_here
            if called.variant_value == known.variant.wild_value
        }
        others = known_here - matching
    else:
        matching = set()
        same_as_wild = set()
        # Not copied: this is the same object that is stored in refdata_var_dict.
        others = refdata_var_dict['p'][position]

    if len(same_as_wild) != 0:
        return None, set()

    known_position_tuple = (
        position,
        'p',
        change_string,
        effect,
        mummer_variants_list,
        matching,
        others
    )
    used = set()
    used.update(matching, others)
    return known_position_tuple, used
def _load_vars_file(cls, vars_file, refs_are_coding):
    '''Loads a tab-delimited variants file.

    Each line must have exactly four tab-separated columns:
    reference name, variant, identifier, description.

    vars_file: name of the file to read (opened with pyfastaq.utils.open_file_read).
    refs_are_coding: if true, variants are made with type 'p', otherwise 'n'.

    Returns a dict of reference name -> list of (Variant, description) tuples,
    in file order.

    Raises Error if a line cannot be split into four columns or a Variant
    cannot be made from it. The original exception is chained as the cause.
    '''
    var_type = 'p' if refs_are_coding else 'n'
    variants = {}
    f = pyfastaq.utils.open_file_read(vars_file)

    # try/finally so the file is closed on every exit path, including a
    # failure while reading from the file itself
    try:
        for line in f:
            try:
                ref_name, variant, identifier, description = line.rstrip().split('\t')
                variant = sequence_variant.Variant(var_type, variant, identifier)
            except Exception as err:
                # Exception, not a bare except: KeyboardInterrupt and SystemExit
                # must not be reported as a bad line
                raise Error('Error in this line of variants file:\n' + line) from err

            variants.setdefault(ref_name, []).append((variant, description))
    finally:
        pyfastaq.utils.close(f)

    return variants
def __init__(self, line):
    '''Parses one tab-delimited line into this object's attributes.

    The line must have exactly six tab-separated columns:
    name, sequence type, variant only, variant, variant id, free text.

    Sets:
      self.name, self.free_text: columns one and six, as given.
      self.seq_type: 'n' if column two is '0', 'p' if it is '1'.
      self.variant_only: True if column three is '1', False if it is '0'.
      self.variant: None if column four is '.', otherwise a
        sequence_variant.Variant made from seq_type, columns four and five.

    Raises Error if the line does not split into six columns, or if column
    two or three is not '0' or '1'.
    '''
    try:
        self.name, seq_type, var_only, variant, variant_id, self.free_text = line.rstrip().split('\t')
    except Exception as err:
        # Exception, not a bare except: KeyboardInterrupt and SystemExit
        # must not be reported as a parse error
        raise Error('Error parsing line of file:\n' + line) from err

    if seq_type not in {'0', '1'}:
        raise Error('Error. Second column must be "0" or "1". Cannot continue. Line was:\n' + line)
    self.seq_type = 'n' if seq_type == '0' else 'p'

    if var_only not in {'0', '1'}:
        raise Error('Error. Third column must be "0" or "1". Cannot continue. Line was:\n' + line)
    self.variant_only = var_only == '1'

    if variant == '.':
        self.variant = None
    else:
        self.variant = sequence_variant.Variant(self.seq_type, variant, variant_id)
def test_nucleotide_range(self):
    '''nucleotide_range gives the (start, end) nucleotide coordinates covered by a variant'''
    # (variant type, variant string, expected range)
    cases = (
        ('n', 'A2T', (1, 1)),
        ('p', 'I42L', (123, 125)),
    )
    for var_type, change, wanted in cases:
        variant = sequence_variant.Variant(var_type, change, '.')
        self.assertEqual(wanted, variant.nucleotide_range())