def test_init(self):
    """Build a SequenceCollection from each fixture list and from ``[]``.

    The test makes no assertions; it passes when none of the constructor
    calls raises.
    """
    for seqs in (self.seqs1, self.seqs2, self.seqs3, []):
        SequenceCollection(seqs)
def test_init_validate(self):
    """Check construction with ``validate=True`` on good and bad input."""
    # self.seqs1 is accepted with validation switched on (built twice, as
    # the test has always done).
    for _ in range(2):
        SequenceCollection(self.seqs1, validate=True)

    # self.invalid_s1 must be rejected when validation is requested.
    with self.assertRaises(SequenceCollectionError):
        SequenceCollection(self.invalid_s1, validate=True)
def test_init_validate(self):
    """Exercise the ``validate=True`` constructor path."""
    valid_input = self.seqs1
    SequenceCollection(valid_input, validate=True)
    SequenceCollection(valid_input, validate=True)

    # Validating self.invalid_s1 is expected to raise.
    with self.assertRaises(SequenceCollectionError):
        SequenceCollection(self.invalid_s1, validate=True)
def test_degap(self):
    """Compare ``degap()`` on the a1 and a2 fixtures with expected output."""
    dna_records = (
        ('d1', 'ACCGTTGG'),
        ('d2', 'TTACCGGTGGCC'),
        ('d3', 'ACCGTTGC'),
    )
    dna_expected = SequenceCollection(
        [DNA(residues, metadata={'id': seq_id})
         for seq_id, residues in dna_records])
    self.assertEqual(self.a1.degap(), dna_expected)

    rna_records = (
        ('r1', 'UUAU'),
        ('r2', 'ACGUU'),
    )
    rna_expected = SequenceCollection(
        [RNA(residues, metadata={'id': seq_id})
         for seq_id, residues in rna_records])
    self.assertEqual(self.a2.degap(), rna_expected)
def test_fastq_to_sequence_collection(self):
    """Read every valid file with every kwargs set and compare the result.

    Each entry of ``self.valid_configurations`` supplies the files to
    read, the reader kwargs to try, and the components from which the
    expected collection is rebuilt.
    """
    for valid_files, kwargs, components in self.valid_configurations:
        for fp in valid_files:
            for reader_kwargs in kwargs:
                _drop_kwargs(reader_kwargs, 'seq_num')
                seq_cls = reader_kwargs.get('constructor', Sequence)

                # Can't use partials for this because the read
                # function below can't operate on partials
                ctor_kwargs = {}
                if hasattr(seq_cls, 'lowercase'):
                    ctor_kwargs['lowercase'] = 'introns'
                    reader_kwargs['lowercase'] = 'introns'

                records = []
                for c in components:
                    records.append(seq_cls(
                        c[2],
                        metadata={'id': c[0], 'description': c[1]},
                        positional_metadata={
                            'quality': np.array(c[3], np.uint8)},
                        **ctor_kwargs))
                expected = SequenceCollection(records)

                observed = _fastq_to_sequence_collection(
                    fp, **reader_kwargs)
                self.assertEqual(observed, expected)
def test_valid_files(self):
    """Parse each valid file and compare with a hand-built collection.

    Every component mapping supplies the sequence, its quality scores and
    the metadata fields copied onto the expected sequence.
    """
    metadata_keys = ('id', 'machine_name', 'run_number', 'lane_number',
                     'tile_number', 'x', 'y', 'index', 'read_number')

    for valid, kwargs, components in self.valid_files:
        for kwarg in kwargs:
            _drop_kwargs(kwarg, 'seq_num')
            seq_cls = kwarg.get('constructor', Sequence)

            records = []
            for c in components:
                residues = c['sequence']
                metadata = {key: c[key] for key in metadata_keys}
                quality = np.array(c['quality'], dtype=np.uint8)
                records.append(seq_cls(
                    residues,
                    metadata=metadata,
                    positional_metadata={'quality': quality}))
            expected = SequenceCollection(records)

            observed = _qseq_to_sequence_collection(valid, **kwarg)
            self.assertEqual(observed, expected)
def test_k_word_frequencies(self):
    """Compare ``k_word_frequencies()`` with hand-computed frequencies."""
    # k=1 on the s1 fixture
    first = defaultdict(
        float, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.})
    second = defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.})
    self.assertEqual(self.s1.k_word_frequencies(k=1), [first, second])

    # k=3, non-overlapping words
    first = defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.})
    second = defaultdict(float, {'TTG': 1 / 1.})
    self.assertEqual(
        self.s1.k_word_frequencies(k=3, overlapping=False),
        [first, second])

    # an empty collection gives an empty list
    self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    # Test to ensure floating point precision bug isn't present. See the
    # tests for BiologicalSequence.k_word_frequencies for more details.
    homopolymers = SequenceCollection(
        [RNA('C' * 10, id='s1'), RNA('G' * 10, id='s2')])
    expected = [defaultdict(float, {'C': 1.0}),
                defaultdict(float, {'G': 1.0})]
    self.assertEqual(homopolymers.k_word_frequencies(1), expected)
def test_ne(self):
    """Exercise the ``!=`` operator on SequenceCollection objects."""
    class FakeSequenceCollection(SequenceCollection):
        pass

    # a collection never differs from itself, but differs from another
    # fixture
    self.assertFalse(self.s1 != self.s1)
    self.assertTrue(self.s1 != self.s2)

    # SequenceCollections with different number of sequences are not equal
    shorter = SequenceCollection([self.d1])
    self.assertTrue(self.s1 != shorter)

    # SequenceCollections of different types are not equal
    same_seqs = [self.d1, self.d3]
    self.assertTrue(self.s4 != FakeSequenceCollection(same_seqs))
    self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

    # SequenceCollections with different sequences are not equal
    other_content = SequenceCollection([self.d1, self.r1])
    self.assertTrue(self.s1 != other_content)
def test_init_fail_no_id(self):
    """A sequence without an 'id' in its metadata must be rejected."""
    no_id_seq = Sequence('ACGTACGT')
    expected_msg = ("'id' must be included in the "
                    "sequence metadata")
    with six.assertRaisesRegex(self, SequenceCollectionError,
                               expected_msg):
        SequenceCollection([no_id_seq])
def test_fastq_to_sequence_collection(self):
    """Read every valid file with every kwargs set and compare the result.

    ``lowercase='introns'`` is always passed both to the sequence
    constructor used for the expected value and to the reader.
    """
    for valid_files, kwargs, components in self.valid_configurations:
        for fp in valid_files:
            for reader_kwargs in kwargs:
                _drop_kwargs(reader_kwargs, 'seq_num')
                seq_cls = reader_kwargs.get('constructor', Sequence)

                ctor_kwargs = {'lowercase': 'introns'}
                reader_kwargs['lowercase'] = 'introns'

                records = []
                for c in components:
                    metadata = {'id': c[0], 'description': c[1]}
                    positional = {'quality': np.array(c[3], np.uint8)}
                    records.append(seq_cls(
                        c[2],
                        metadata=metadata,
                        positional_metadata=positional,
                        **ctor_kwargs))
                expected = SequenceCollection(records)

                observed = _fastq_to_sequence_collection(
                    fp, **reader_kwargs)
                self.assertEqual(observed, expected)
def test_degap(self):
    """Compare ``degap()`` on the s2 fixture with the expected collection."""
    records = (
        ('r1', 'GAUUACA'),
        ('r2', 'UUG'),
        ('r3', 'UUGCC'),
    )
    expected = SequenceCollection(
        [RNA(residues, metadata={'id': seq_id})
         for seq_id, residues in records])
    self.assertEqual(self.s2.degap(), expected)
def setUp(self):
    """Create the sequences, sequence lists and collections used by tests."""
    # DNA fixtures, plus lower-case spellings carrying the same ids
    self.d1 = DNASequence('GATTACA', id="d1")
    self.d2 = DNASequence('TTG', id="d2")
    self.d3 = DNASequence('GTATACA', id="d3")
    self.d1_lower = DNASequence('gattaca', id="d1")
    self.d2_lower = DNASequence('ttg', id="d2")
    self.d3_lower = DNASequence('gtataca', id="d3")

    # RNA fixtures
    self.r1 = RNASequence('GAUUACA', id="r1")
    self.r2 = RNASequence('UUG', id="r2")
    self.r3 = RNASequence('U-----UGCC--', id="r3")

    # DNA sequence containing an 'X'; wrapped in self.invalid_s1 below
    self.i1 = DNASequence('GATXACA', id="i1")

    # lists of sequence objects
    self.seqs1 = [self.d1, self.d2]
    self.seqs1_lower = [self.d1_lower, self.d2_lower]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2
    self.seqs4 = [self.d1, self.d3]

    # the same data as (id, sequence) tuples
    self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
    self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                    ('r3', 'U-----UGCC--')]
    self.seqs3_t = self.seqs1_t + self.seqs2_t

    # collections, built in the same order as the attribute names listed
    collections = (
        ('s1', self.seqs1),
        ('s1_lower', self.seqs1_lower),
        ('s2', self.seqs2),
        ('s3', self.seqs3),
        ('s4', self.seqs4),
        ('empty', []),
        ('invalid_s1', [self.i1]),
    )
    for attr, seqs in collections:
        setattr(self, attr, SequenceCollection(seqs))
def setUp(self):
    """Create the DNA/RNA sequences and the collections used by tests."""
    def with_id(seq_cls, residues, seq_id):
        # every fixture sequence carries only an 'id' metadata entry
        return seq_cls(residues, metadata={'id': seq_id})

    self.d1 = with_id(DNA, 'GATTACA', "d1")
    self.d2 = with_id(DNA, 'TTG', "d2")
    self.d3 = with_id(DNA, 'GTATACA', "d3")
    self.r1 = with_id(RNA, 'GAUUACA', "r1")
    self.r2 = with_id(RNA, 'UUG', "r2")
    self.r3 = with_id(RNA, 'U-----UGCC--', "r3")

    # lists of sequence objects
    self.seqs1 = [self.d1, self.d2]
    self.seqs2 = [self.r1, self.r2, self.r3]
    self.seqs3 = self.seqs1 + self.seqs2
    self.seqs4 = [self.d1, self.d3]

    # one collection per list, plus an empty collection
    collections = (
        ('s1', self.seqs1),
        ('s2', self.seqs2),
        ('s3', self.seqs3),
        ('s4', self.seqs4),
        ('empty', []),
    )
    for attr, seqs in collections:
        setattr(self, attr, SequenceCollection(seqs))
def test_ne(self):
    """Exercise the ``!=`` operator on SequenceCollection objects."""
    class FakeSequenceCollection(SequenceCollection):
        pass

    # a collection never differs from itself, but differs from another
    # fixture
    self.assertFalse(self.s1 != self.s1)
    self.assertTrue(self.s1 != self.s2)

    # SequenceCollections with different number of sequences are not equal
    shorter = SequenceCollection([self.d1])
    self.assertTrue(self.s1 != shorter)

    # SequenceCollections of different types are not equal
    same_seqs = [self.d1, self.d2]
    self.assertTrue(self.s1 != FakeSequenceCollection(same_seqs))
    self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

    # SequenceCollections with different sequences are not equal
    other_content = SequenceCollection([self.d1, self.r1])
    self.assertTrue(self.s1 != other_content)
def test_generator_to_fasta_sequence_lowercase_exception(self):
    """Writing a plain Sequence with ``lowercase`` set must raise.

    The writer is expected to raise AttributeError with a message saying
    that class Sequence does not support lowercase functionality.
    """
    plain_seq = Sequence('ACgt', metadata={'id': ''})
    expected_msg = ("lowercase specified but class Sequence does not "
                    "support lowercase functionality")

    fh = io.StringIO()
    with six.assertRaisesRegex(self, AttributeError, expected_msg):
        _generator_to_fasta(SequenceCollection([plain_seq]), fh,
                            lowercase='introns')
    fh.close()
def test_eq(self):
    """Exercise the ``==`` operator on SequenceCollection objects."""
    class FakeSequenceCollection(SequenceCollection):
        pass

    # a collection equals itself and differs from another fixture
    self.assertTrue(self.s1 == self.s1)
    self.assertFalse(self.s1 == self.s2)

    # different objects can be equal
    rebuilt = SequenceCollection([self.d1, self.d2])
    self.assertTrue(self.s1 == rebuilt)
    self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

    # SequenceCollections with different number of sequences are not equal
    shorter = SequenceCollection([self.d1])
    self.assertFalse(self.s1 == shorter)

    # SequenceCollections of different types are not equal
    same_seqs = [self.d1, self.d3]
    self.assertFalse(self.s4 == FakeSequenceCollection(same_seqs))
    self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

    # SequenceCollections with different sequences are not equal
    other_content = SequenceCollection([self.d1, self.r1])
    self.assertFalse(self.s1 == other_content)
def filter_seqs(seqs, remove_ids):
    """
    Return the sequences whose ID is not listed in ``remove_ids``.

    :type seqs: skbio.SequenceCollection
    :param seqs: The sequences to be filtered
    :type remove_ids: set
    :param remove_ids: A set of sequence IDs to remove from the sequence
        data
    :rtype: SequenceCollection
    :return: A new collection holding, in their original order, the
        sequences whose ``id`` attribute is not in ``remove_ids``
    """
    kept = []
    for seq in seqs:
        if seq.id in remove_ids:
            continue
        kept.append(seq)
    return SequenceCollection(kept)
def test_valid_files(self):
    """Parse each valid file and compare with a hand-built collection."""
    for valid, kwargs, components in self.valid_files:
        for kwarg in kwargs:
            _drop_kwargs(kwarg, 'seq_num')
            seq_cls = kwarg.get('constructor', BiologicalSequence)

            records = []
            for c in components:
                records.append(seq_cls(c[1], id=c[0], quality=c[2]))
            expected = SequenceCollection(records)

            observed = _qseq_to_sequence_collection(valid, **kwarg)

            # TODO remove when #656 is resolved
            self.assertEqual(observed, expected)
            for obs_seq, exp_seq in zip(observed, expected):
                self.assertTrue(obs_seq.equals(exp_seq))
def test_distances(self):
    """Check ``distances()`` with hamming and with a constant function."""
    ids = ['d1', 'd2']
    collection = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])

    hamming_expected = DistanceMatrix([[0, 0.25], [0.25, 0]], ids)
    self.assertEqual(collection.distances(hamming), hamming_expected)

    # alt distance function provided
    def dumb_distance(s1, s2):
        return 42.

    constant_expected = DistanceMatrix(
        [[0, 42.], [42., 0]], ['d1', 'd2'])
    self.assertEqual(collection.distances(dumb_distance),
                     constant_expected)
def test_update_ids_ids_parameter(self):
    """``update_ids(ids=...)`` assigns the supplied ids in order."""
    # 3 seqs
    new_ids = ('abc', 'def', 'ghi')
    residues = ('GAUUACA', 'UUG', 'U-----UGCC--')
    exp_sc = SequenceCollection(
        [RNA(seq, id=new_id) for seq, new_id in zip(residues, new_ids)])
    exp_id_map = dict(zip(new_ids, ('r1', 'r2', 'r3')))

    obs_sc, obs_id_map = self.s2.update_ids(ids=new_ids)
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids(ids=[])
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_update_ids_default_behavior(self):
    """``update_ids()`` without arguments is expected to number the ids."""
    # 3 seqs
    new_ids = ('1', '2', '3')
    residues = ('GAUUACA', 'UUG', 'U-----UGCC--')
    exp_sc = SequenceCollection(
        [RNA(seq, id=new_id) for seq, new_id in zip(residues, new_ids)])
    exp_id_map = dict(zip(new_ids, ('r1', 'r2', 'r3')))

    obs_sc, obs_id_map = self.s2.update_ids()
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids()
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_update_ids_prefix(self):
    """``update_ids(prefix=...)`` is expected to prepend the prefix."""
    # 3 seqs
    new_ids = ('abc1', 'abc2', 'abc3')
    residues = ('GAUUACA', 'UUG', 'U-----UGCC--')
    exp_sc = SequenceCollection(
        [RNA(seq, id=new_id) for seq, new_id in zip(residues, new_ids)])
    exp_id_map = dict(zip(new_ids, ('r1', 'r2', 'r3')))

    obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_sequence_collection_to_fastq_kwargs_passed(self):
    """Write each collection with each kwargs set and compare with a file.

    Every entry of ``self.valid_files`` pairs the components used to build
    the collection with (writer kwargs, expected file path) tuples.
    """
    for components, kwargs_expected_fp in self.valid_files:
        for kwargs, expected_fp in kwargs_expected_fp:
            seqs = [
                NucleotideSequence(c[2], id=c[0], description=c[1],
                                   quality=c[3])
                for c in components
            ]
            collection = SequenceCollection(seqs)

            out = StringIO()
            _sequence_collection_to_fastq(collection, out, **kwargs)
            observed = out.getvalue()
            out.close()

            # NOTE(review): the 'U' open mode was removed in Python 3.11;
            # confirm which interpreters this suite must support.
            with open(expected_fp, 'U') as expected_fh:
                expected = expected_fh.read()

            self.assertEqual(observed, expected)
def test_update_ids_fn_parameter(self):
    """``update_ids(fn=...)`` is expected to derive new ids via ``fn``."""
    def append_42(ids):
        return [id_ + '-42' for id_ in ids]

    # 3 seqs
    new_ids = ('r1-42', 'r2-42', 'r3-42')
    residues = ('GAUUACA', 'UUG', 'U-----UGCC--')
    exp_sc = SequenceCollection(
        [RNA(seq, id=new_id) for seq, new_id in zip(residues, new_ids)])
    exp_id_map = dict(zip(new_ids, ('r1', 'r2', 'r3')))

    obs_sc, obs_id_map = self.s2.update_ids(fn=append_42)
    self._assert_sequence_collections_equal(obs_sc, exp_sc)
    self.assertEqual(obs_id_map, exp_id_map)

    # empty
    obs_sc, obs_id_map = self.empty.update_ids(fn=append_42)
    self._assert_sequence_collections_equal(obs_sc, self.empty)
    self.assertEqual(obs_id_map, {})
def test_sequence_collection_to_fastq_kwargs_passed(self):
    """Write each collection with each kwargs set and compare with a file.

    ``lowercase='introns'`` is used both when building the DNA sequences
    and, by updating the kwargs dict in place, when calling the writer.
    """
    for components, kwargs_expected_fp in self.valid_files:
        for kwargs, expected_fp in kwargs_expected_fp:
            seqs = []
            for c in components:
                seqs.append(DNA(
                    c[2],
                    metadata={'id': c[0], 'description': c[1]},
                    positional_metadata={'quality': c[3]},
                    lowercase='introns'))
            collection = SequenceCollection(seqs)

            out = StringIO()
            kwargs['lowercase'] = 'introns'
            _sequence_collection_to_fastq(collection, out, **kwargs)
            observed = out.getvalue()
            out.close()

            # NOTE(review): the 'U' open mode was removed in Python 3.11;
            # confirm which interpreters this suite must support.
            with open(expected_fp, 'U') as expected_fh:
                expected = expected_fh.read()

            self.assertEqual(observed, expected)
def generateReference(assay_list):
    """Collect every amplicon sequence of the given assays as a reference.

    For an assay with a truthy ``AND`` attribute, each ``Target`` operand
    contributes its amplicons; operands of any other type are skipped.
    Otherwise the amplicons of ``assay.target`` are used.

    Each sequence id is built independently from the assay name, followed
    by ``_<gene_name>`` (AND operands only) and ``_<variant_name>`` when
    those values are non-empty. Earlier revisions kept appending to one
    running name, so later amplicons inherited the suffixes of earlier
    ones, and an amplicon without a variant name reused the previous
    amplicon's id.

    :param assay_list: iterable of assay objects exposing ``name``,
        ``AND`` and (when ``AND`` is falsy) ``target.amplicons``
    :return: ``skbio.SequenceCollection`` of ``skbio.DNA`` sequences, in
        the order the amplicons are encountered
    """
    from skbio import DNA
    from skbio import SequenceCollection

    def _with_suffix(base, suffix):
        # Append "_<suffix>" only when the suffix is non-empty.
        return base + "_%s" % suffix if suffix else base

    reference = []
    for assay in assay_list:
        if assay.AND:
            for operand in assay.AND:
                if not isinstance(operand, Target):
                    continue
                target_name = _with_suffix(assay.name, operand.gene_name)
                for amplicon in operand.amplicons:
                    name = _with_suffix(target_name, amplicon.variant_name)
                    # Same metadata-dict call form as the non-AND branch.
                    reference.append(DNA(amplicon.sequence, {'id': name}))
        else:
            for amplicon in assay.target.amplicons:
                name = _with_suffix(assay.name, amplicon.variant_name)
                reference.append(DNA(amplicon.sequence, {'id': name}))
    return SequenceCollection(reference)
def test_kmer_frequencies(self):
    """Compare ``kmer_frequencies()`` with hand-computed values."""
    # absolute counts, k=3, non-overlapping
    counts = [Counter({'GAT': 1, 'TAC': 1}), Counter({'TTG': 1})]
    self.assertEqual(
        self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
        counts)

    # relative frequencies, k=1
    first = defaultdict(
        float, {'A': 3 / 7., 'C': 1 / 7., 'G': 1 / 7., 'T': 2 / 7.})
    second = defaultdict(float, {'G': 1 / 3., 'T': 2 / 3.})
    self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                     [first, second])

    # relative frequencies, k=3, non-overlapping
    first = defaultdict(float, {'GAT': 1 / 2., 'TAC': 1 / 2.})
    second = defaultdict(float, {'TTG': 1 / 1.})
    self.assertEqual(
        self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
        [first, second])

    # an empty collection gives an empty list
    self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

    # Test to ensure floating point precision bug isn't present. See the
    # tests for Sequence.kmer_frequencies for more details.
    homopolymers = SequenceCollection([
        RNA('C' * 10, metadata={'id': 's1'}),
        RNA('G' * 10, metadata={'id': 's2'}),
    ])
    expected = [defaultdict(float, {'C': 1.0}),
                defaultdict(float, {'G': 1.0})]
    self.assertEqual(homopolymers.kmer_frequencies(1, relative=True),
                     expected)
def test_distances(self):
    """Check ``distances()`` with hamming and with a constant function."""
    ids = ['d1', 'd2']
    collection = SequenceCollection([
        DNA("ACGT", metadata={'id': "d1"}),
        DNA("ACGG", metadata={'id': "d2"}),
    ])

    hamming_expected = DistanceMatrix([[0, 0.25], [0.25, 0]], ids)

    # hamming is applied to the sequences' ``values`` attributes
    def h(a, b):
        return hamming(a.values, b.values)

    self.assertEqual(collection.distances(h), hamming_expected)

    # alt distance function provided
    def dumb_distance(s1, s2):
        return 42.

    constant_expected = DistanceMatrix(
        [[0, 42.], [42., 0]], ['d1', 'd2'])
    self.assertEqual(collection.distances(dumb_distance),
                     constant_expected)
def setUp(self):
    """Run the parent setUp, then add the empty-file case."""
    super(TestQSeqToSequenceCollection, self).setUp()

    # (file path, reader kwargs to try, expected components)
    empty_case = (
        get_data_path('empty'),
        [{}, {'variant': 'sanger'}],
        SequenceCollection([]),
    )
    self.valid_files += [empty_case]
def setUp(self):
    """Build the sequences, generators and expectations for writer tests.

    Attributes set here:

    * ``bio_seq1`` .. ``bio_seq3``, ``dna_seq``, ``rna_seq``, ``prot_seq``:
      individual sequences with quality scores.
    * ``seq_coll`` / ``align``: a SequenceCollection and an Alignment built
      from the same three sequences.
    * ``mixed_qual_score_gen``: a generator yielding one sequence with and
      one without quality scores.
    * ``objs_fps``: tuples of (generator, writer kwargs, expected fasta
      path, expected qual path).
    * ``invalid_objs``: tuples of (generator, writer kwargs, expected
      exception type, expected message regexp).
    """
    self.bio_seq1 = DNA(
        'ACGT-acgt.',
        metadata={'id': 'seq1', 'description': 'desc1'},
        positional_metadata={
            'quality': [10, 20, 30, 10, 0, 0, 0, 255, 1, 255]},
        lowercase='introns')
    self.bio_seq2 = DNA(
        'A',
        metadata={'id': ' \n \nseq \t2 '},
        positional_metadata={'quality': [42]},
        lowercase='introns')
    self.bio_seq3 = RNA(
        'AACGGuA',
        metadata={'description': 'desc3'},
        positional_metadata={'quality': [0, 0, 0, 0, 0, 0, 0]},
        lowercase='introns')
    self.dna_seq = DNA(
        'ACGTTGCAccGG',
        positional_metadata={
            'quality': [55, 10, 0, 99, 1, 1, 8, 77, 40, 10, 10, 0]},
        lowercase='introns')
    self.rna_seq = RNA(
        'ACGUU',
        positional_metadata={'quality': [10, 9, 8, 7, 6]},
        lowercase='introns')
    self.prot_seq = Protein(
        'pQqqqPPQQQ',
        metadata={'id': 'proteinseq',
                  'description': "\ndetailed\ndescription \t\twith "
                                 " new\n\nlines\n\n\n"},
        positional_metadata={
            'quality': [42, 42, 255, 255, 42, 42, 42, 42, 42, 43]},
        lowercase='introns')

    seqs = [
        RNA('UUUU',
            metadata={'id': 's\te\tq\t1', 'description': 'desc\n1'},
            positional_metadata={'quality': [1234, 0, 0, 2]},
            lowercase='introns'),
        Sequence(
            'CATC',
            metadata={'id': 's\te\tq\t2', 'description': 'desc\n2'},
            positional_metadata={'quality': [1, 11, 111, 11112]}),
        Protein('sits',
                metadata={'id': 's\te\tq\t3', 'description': 'desc\n3'},
                positional_metadata={
                    'quality': [12345, 678909, 999999, 4242424242]},
                validate=False)
    ]
    self.seq_coll = SequenceCollection(seqs)
    self.align = Alignment(seqs)

    def empty_gen():
        # A bare return ends the generator without yielding anything.
        # Raising StopIteration here would surface as RuntimeError on
        # Python 3.7+ (PEP 479). The unreachable yield is what makes this
        # function a generator.
        return
        yield

    def single_seq_gen():
        yield self.bio_seq1

    # generate sequences with descriptions containing newlines (to test
    # description_newline_replacement)
    def newline_description_gen():
        yield self.prot_seq
        yield DNA('AGGAGAATA',
                  metadata={'id': 'foo', 'description': '\n\n\n\n'},
                  positional_metadata={'quality': range(9)},
                  lowercase='introns')

    # generate sequences with ids containing whitespace (to test
    # id_whitespace_replacement)
    def whitespace_id_gen():
        yield self.bio_seq2
        yield RNA('UA',
                  metadata={'id': '\n\t \t', 'description': 'a\nb'},
                  positional_metadata={'quality': [1000, 1]})

    # multiple sequences of mixed types, lengths, and metadata. lengths are
    # chosen to exercise various splitting cases when testing max_width,
    # including exercising the different splitting algorithms used for
    # sequence data vs. quality scores
    def multi_seq_gen():
        for seq in (self.bio_seq1, self.bio_seq2, self.bio_seq3,
                    self.dna_seq, self.rna_seq, self.prot_seq):
            yield seq

    # can be serialized if no qual file is provided, else it should raise
    # an error because one seq has qual scores and the other doesn't
    def mixed_qual_score_gen():
        missing_qual_seq = DNA(
            'AAAAT',
            metadata={'id': 'da,dadadada', 'description': '10 hours'},
            lowercase='introns')
        for seq in self.bio_seq1, missing_qual_seq:
            yield seq

    self.mixed_qual_score_gen = mixed_qual_score_gen()

    # store sequence generator to serialize, writer kwargs (if any), and
    # fasta and qual filepaths of expected results
    valid_cases = [
        (empty_gen(), {}, 'empty', 'empty'),

        (single_seq_gen(), {'lowercase': 'introns'},
         'fasta_single_seq', 'qual_single_seq'),

        # no splitting of sequence or qual data across lines b/c max_width
        # is sufficiently large
        (single_seq_gen(), {'max_width': 32, 'lowercase': 'introns'},
         'fasta_single_seq', 'qual_single_seq'),

        # splitting algorithm for sequence and qual scores is different;
        # make sure individual qual scores aren't split across lines even
        # if they exceed max_width
        (single_seq_gen(), {'max_width': 1, 'lowercase': 'introns'},
         'fasta_max_width_1', 'qual_max_width_1'),

        (multi_seq_gen(), {'lowercase': 'introns'},
         'fasta_multi_seq', 'qual_multi_seq'),

        (multi_seq_gen(), {'max_width': 5, 'lowercase': 'introns'},
         'fasta_max_width_5', 'qual_max_width_5'),

        (newline_description_gen(),
         {'description_newline_replacement': ':-)',
          'lowercase': 'introns'},
         'fasta_description_newline_replacement_multi_char',
         'qual_description_newline_replacement_multi_char'),

        (newline_description_gen(),
         {'description_newline_replacement': '',
          'lowercase': 'introns'},
         'fasta_description_newline_replacement_empty_str',
         'qual_description_newline_replacement_empty_str',),

        (newline_description_gen(),
         {'description_newline_replacement': None,
          'lowercase': 'introns'},
         'fasta_description_newline_replacement_none',
         'qual_description_newline_replacement_none'),

        (whitespace_id_gen(),
         {'id_whitespace_replacement': '>:o'},
         'fasta_id_whitespace_replacement_multi_char',
         'qual_id_whitespace_replacement_multi_char'),

        (whitespace_id_gen(),
         {'id_whitespace_replacement': ''},
         'fasta_id_whitespace_replacement_empty_str',
         'qual_id_whitespace_replacement_empty_str'),

        (whitespace_id_gen(),
         {'id_whitespace_replacement': None},
         'fasta_id_whitespace_replacement_none',
         'qual_id_whitespace_replacement_none'),
    ]
    # resolve the bare fixture names to data-file paths
    self.objs_fps = [
        (gen, writer_kwargs, get_data_path(fasta_name),
         get_data_path(qual_name))
        for gen, writer_kwargs, fasta_name, qual_name in valid_cases
    ]

    def blank_seq_gen():
        for seq in self.bio_seq1, Sequence(''):
            yield seq

    # generators or parameter combos that cannot be written in fasta
    # format, paired with kwargs (if any), error type, and expected error
    # message regexp
    self.invalid_objs = [
        (blank_seq_gen(), {}, ValueError, '2nd.*empty'),
        (single_seq_gen(), {'max_width': 0}, ValueError, 'max_width=0'),
        (multi_seq_gen(), {'id_whitespace_replacement': '-\n_'},
         ValueError, 'Newline character'),
        (multi_seq_gen(), {'description_newline_replacement': '-.-\n'},
         ValueError, 'Newline character'),
        (mixed_qual_score_gen(), {'qual': io.StringIO()}, ValueError,
         '2nd sequence.*does not have quality scores')
    ]