def test_k_word_frequencies(self):
        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for BiologicalSequence.k_word_frequencies for more details.
        sc = SequenceCollection([RNA('C' * 10, id='s1'),
                                 RNA('G' * 10, id='s2')])
        self.assertEqual(sc.k_word_frequencies(1),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])
Beispiel #2
0
    def test_k_word_frequencies(self):
        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for BiologicalSequence.k_word_frequencies for more details.
        sc = SequenceCollection(
            [RNA('C' * 10, id='s1'),
             RNA('G' * 10, id='s2')])
        self.assertEqual(
            sc.k_word_frequencies(1),
            [defaultdict(float, {'C': 1.0}),
             defaultdict(float, {'G': 1.0})])
Beispiel #3
0
 def test_init(self):
     """Initialization functions as expected with varied input types
     """
     SequenceCollection(self.seqs1)
     SequenceCollection(self.seqs2)
     SequenceCollection(self.seqs3)
     SequenceCollection([])
Beispiel #4
0
 def test_init_validate(self):
     SequenceCollection(self.seqs1, validate=True)
     SequenceCollection(self.seqs1, validate=True)
     # can't validate self.seqs2 as a DNASequence
     self.assertRaises(SequenceCollectionError,
                       SequenceCollection,
                       self.invalid_s1,
                       validate=True)
Beispiel #5
0
 def test_init_validate(self):
     """initialization with validation functions as expected
     """
     SequenceCollection(self.seqs1, validate=True)
     SequenceCollection(self.seqs1, validate=True)
     # can't validate self.seqs2 as a DNASequence
     self.assertRaises(SequenceCollectionError, SequenceCollection,
                       self.invalid_s1, validate=True)
def extract_seq_ids(data, fmt='fasta', variant=None):
    """
    Given FASTQ-format data (string), parse out only the
    sequence IDs and return.
    """
    fh = StringIO(data)
    if fmt == 'fastq':
        sc = SequenceCollection.read(fh, format=fmt, variant=variant)
    else:
        sc = SequenceCollection.read(fh, format=fmt)
    return frozenset(entry.id for entry in sc)
Beispiel #7
0
def extract_seq_ids(data, fmt='fasta', variant=None):
    """
    Given FASTQ-format data (string), parse out only the
    sequence IDs and return.
    """
    fh = StringIO(data)
    if fmt == 'fastq':
        sc = SequenceCollection.read(fh, format=fmt, variant=variant)
    else:
        sc = SequenceCollection.read(fh, format=fmt)
    return frozenset(entry.id for entry in sc)
Beispiel #8
0
    def test_degap(self):
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)
Beispiel #9
0
    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace(".", "").replace("-", "")) for id_, seq in self.seqs1_t]
        expected = SequenceCollection.from_fasta_records(expected, DNASequence)
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = [(id_, seq.replace(".", "").replace("-", "")) for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.a2.degap()
        self.assertEqual(actual, expected)
Beispiel #10
0
    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25], [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42.], [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)
Beispiel #11
0
    def test_degap(self):
        expected = SequenceCollection([
            DNA('ACCGTTGG', metadata={'id': "d1"}),
            DNA('TTACCGGTGGCC', metadata={'id': "d2"}),
            DNA('ACCGTTGC', metadata={'id': "d3"})
        ])
        actual = self.a1.degap()
        self.assertEqual(actual, expected)

        expected = SequenceCollection([
            RNA('UUAU', metadata={'id': "r1"}),
            RNA('ACGUU', metadata={'id': "r2"})
        ])
        actual = self.a2.degap()
        self.assertEqual(actual, expected)
Beispiel #12
0
    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)
Beispiel #13
0
    def test_ne(self):
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertTrue(self.s4 != FakeSequenceCollection([self.d1, self.d3]))
        self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1, self.r1]))
Beispiel #14
0
def convert_phylip(infile, outfile, format):
    seqs = SequenceCollection.read(
        infile, format='phylip',
        data_parser=phylip.relaxed_ids
    )

    seqs.write(outfile, format=format)
Beispiel #15
0
 def test_init_fail_no_id(self):
     seq = Sequence('ACGTACGT')
     with six.assertRaisesRegex(
             self, SequenceCollectionError,
             "'id' must be included in the sequence "
             "metadata"):
         SequenceCollection([seq])
Beispiel #16
0
    def test_fastq_to_sequence_collection(self):
        for valid_files, kwargs, components in self.valid_configurations:
            for valid in valid_files:
                for observed_kwargs in kwargs:
                    _drop_kwargs(observed_kwargs, 'seq_num')
                    constructor = observed_kwargs.get('constructor', Sequence)

                    expected_kwargs = {}
                    expected_kwargs['lowercase'] = 'introns'
                    observed_kwargs['lowercase'] = 'introns'

                    expected = SequenceCollection([
                        constructor(c[2],
                                    metadata={
                                        'id': c[0],
                                        'description': c[1]
                                    },
                                    positional_metadata={
                                        'quality': np.array(c[3], np.uint8)
                                    },
                                    **expected_kwargs) for c in components
                    ])

                    observed = _fastq_to_sequence_collection(
                        valid, **observed_kwargs)
                    self.assertEqual(observed, expected)
Beispiel #17
0
    def test_valid_files(self):
        for valid, kwargs, components in self.valid_files:
            for kwarg in kwargs:
                _drop_kwargs(kwarg, 'seq_num')
                constructor = kwarg.get('constructor', Sequence)
                expected = SequenceCollection([
                    constructor(c['sequence'],
                                metadata={
                                    'id': c['id'],
                                    'machine_name': c['machine_name'],
                                    'run_number': c['run_number'],
                                    'lane_number': c['lane_number'],
                                    'tile_number': c['tile_number'],
                                    'x': c['x'],
                                    'y': c['y'],
                                    'index': c['index'],
                                    'read_number': c['read_number']
                                },
                                positional_metadata={
                                    'quality':
                                    np.array(c['quality'], dtype=np.uint8)
                                }) for c in components
                ])

                observed = _qseq_to_sequence_collection(valid, **kwarg)
                self.assertEqual(observed, expected)
Beispiel #18
0
    def test_fastq_to_sequence_collection(self):
        for valid_files, kwargs, components in self.valid_configurations:
            for valid in valid_files:
                for observed_kwargs in kwargs:
                    _drop_kwargs(observed_kwargs, 'seq_num')
                    constructor = observed_kwargs.get('constructor', Sequence)

                    # Can't use partials for this because the read
                    # function below can't operate on partials
                    expected_kwargs = {}
                    if hasattr(constructor, 'lowercase'):
                        expected_kwargs['lowercase'] = 'introns'
                        observed_kwargs['lowercase'] = 'introns'

                    expected = SequenceCollection(
                        [constructor(
                            c[2], metadata={'id': c[0], 'description': c[1]},
                            positional_metadata={'quality': np.array(c[3],
                                                 np.uint8)},
                            **expected_kwargs)
                         for c in components])

                    observed = _fastq_to_sequence_collection(valid,
                                                             **observed_kwargs)
                    self.assertEqual(observed, expected)
def main():
    args = handle_program_options()

    if osp.isfile(args.out_dir):
        print("--out_dir (-o) option must be a valid directory and not a file",
              file=sys.stderr)
        sys.exit(1)

    # will fail gracefully if dir exists
    skbu.create_dir(args.out_dir)

    metagenomes = []
    if args.metagenome_id is not None:
        metagenomes.append(args.metagenome_id)
    elif args.metagenome_file is not None:
        metagenomes.extend(parse_metagenome_file(args.metagenome_file))
        
    if args.verbose:
        msg = 'Processing requested for {} metagenome(s) found in: {}'
        print(msg.format(len(metagenomes), args.metagenome_file))

    # MG-RAST stage.file ids for downloading
    derep_passed = '150.1'
    screen_passed = '299.1'

    for mg_id in metagenomes:
        if args.verbose:
            print('Processing metagenome: {}'.format(mg_id))
            print('\tDownloading: Dereplication Passed...', end='')
            sys.stdout.flush()
        derepp_rsp = mgapi.mgrast_request('download', mg_id,
                                          {'file': derep_passed},
                                          auth_key=args.auth_key)
        derepp_sc = SequenceCollection.read(StringIO(derepp_rsp.text),
                                            format='fastq',
                                            variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(derepp_sc)))
            print('\tDownloading: Screen Passed...', end='')
            sys.stdout.flush()
        screenp_rsp = mgapi.mgrast_request('download', mg_id,
                                           {'file': screen_passed},
                                           auth_key=args.auth_key)
        screenp_ids = extract_seq_ids(screenp_rsp.text, fmt='fastq',
                                      variant='illumina1.8')
        if args.verbose:
            print('{} sequences'.format(len(screenp_ids)))

        # filter dereplication passed with IDs from screen passed
        failed_screen = filter_seqs(derepp_sc, screenp_ids)
        if args.verbose:
            nsp = len(screenp_ids)
            print('\tRemoved {} sequences from Dereplication Passed'.format(nsp))
            print('\tleaving {} sequences'.format(len(failed_screen)))

        out_fp = osp.join(args.out_dir, mg_id + '_screen_failed.fastq')
        failed_screen.write(out_fp, format='fastq', variant='illumina1.8')
        if args.verbose:
            print('Sequence data written to: ' + out_fp)
Beispiel #20
0
 def test_make_mini_otu_files(self):
     os.system("mkdir tmp")
     self.extension_seqs = SequenceCollection.read(self.extension_seqs)
     result = _make_mini_otu_files(self.key_node,
                                   self.extension_genus_dic_few,
                                   self.extension_seqs)
     os.system("rm -r tmp")
     self.assertEqual(result, """>P1\nTTAAAAAA\n""")
Beispiel #21
0
 def test_degap(self):
     expected = SequenceCollection([
         RNA('GAUUACA', metadata={'id': "r1"}),
         RNA('UUG', metadata={'id': "r2"}),
         RNA('UUGCC', metadata={'id': "r3"})
     ])
     actual = self.s2.degap()
     self.assertEqual(actual, expected)
 def test_make_mini_otu_files(self):
     os.system("mkdir tmp")
     self.extension_seqs = SequenceCollection.read(self.extension_seqs)
     result = _make_mini_otu_files(self.key_node,
                                   self.extension_genus_dic_few,
                                   self.extension_seqs)
     os.system("rm -r tmp")
     self.assertEqual(result, """>P1\nTTAAAAAA\n""")
Beispiel #23
0
    def setUp(self):
        self.d1 = DNA('GATTACA', metadata={'id': "d1"})
        self.d2 = DNA('TTG', metadata={'id': "d2"})
        self.d3 = DNA('GTATACA', metadata={'id': "d3"})
        self.r1 = RNA('GAUUACA', metadata={'id': "r1"})
        self.r2 = RNA('UUG', metadata={'id': "r2"})
        self.r3 = RNA('U-----UGCC--', metadata={'id': "r3"})

        self.seqs1 = [self.d1, self.d2]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.s1 = SequenceCollection(self.seqs1)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])
Beispiel #24
0
    def test_ne(self):
        """inequality operator functions as expected
        """
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 !=
                        SequenceCollection([self.d1, self.r1]))
Beispiel #25
0
 def test_generator_to_fasta_sequence_lowercase_exception(self):
     seq = Sequence('ACgt', metadata={'id': ''})
     fh = io.StringIO()
     with six.assertRaisesRegex(self, AttributeError,
                                "lowercase specified but class Sequence "
                                "does not support lowercase "
                                "functionality"):
         _generator_to_fasta(SequenceCollection([seq]), fh,
                             lowercase='introns')
     fh.close()
Beispiel #26
0
    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", metadata={'id': "d1"}),
                                 DNA("ACGG", metadata={'id': "d2"})])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])

        def h(s1, s2):
            return hamming(s1.values, s2.values)
        actual = s1.distances(h)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)
Beispiel #27
0
    def test_eq(self):
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass

        # SequenceCollections of different types are not equal
        self.assertFalse(self.s4 == FakeSequenceCollection([self.d1, self.d3]))
        self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))
Beispiel #28
0
    def setUp(self):
        """Initialize values to be used in tests
        """
        self.d1 = DNASequence("GATTACA", id="d1")
        self.d2 = DNASequence("TTG", id="d2")
        self.d1_lower = DNASequence("gattaca", id="d1")
        self.d2_lower = DNASequence("ttg", id="d2")
        self.r1 = RNASequence("GAUUACA", id="r1")
        self.r2 = RNASequence("UUG", id="r2")
        self.r3 = RNASequence("U-----UGCC--", id="r3")

        self.i1 = DNASequence("GATXACA", id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        self.seqs1_t = [("d1", "GATTACA"), ("d2", "TTG")]
        self.seqs2_t = [("r1", "GAUUACA"), ("r2", "UUG"), ("r3", "U-----UGCC--")]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])
    def setUp(self):
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d3 = DNASequence('GTATACA', id="d3")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.d3_lower = DNASequence('gtataca', id="d3")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])
Beispiel #30
0
    def setUp(self):
        self.d1 = DNA('GATTACA', metadata={'id': "d1"})
        self.d2 = DNA('TTG', metadata={'id': "d2"})
        self.d3 = DNA('GTATACA', metadata={'id': "d3"})
        self.r1 = RNA('GAUUACA', metadata={'id': "r1"})
        self.r2 = RNA('UUG', metadata={'id': "r2"})
        self.r3 = RNA('U-----UGCC--', metadata={'id': "r3"})

        self.seqs1 = [self.d1, self.d2]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.s1 = SequenceCollection(self.seqs1)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])
Beispiel #31
0
    def test_kmer_frequencies(self):
        expected1 = Counter({'GAT': 1, 'TAC': 1})
        expected2 = Counter({'TTG': 1})
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
            [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                         [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
            [expected1, expected2])

        self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Sequence.kmer_frequencies for more details.
        sc = SequenceCollection([
            RNA('C' * 10, metadata={'id': 's1'}),
            RNA('G' * 10, metadata={'id': 's2'})
        ])
        self.assertEqual(
            sc.kmer_frequencies(1, relative=True),
            [defaultdict(float, {'C': 1.0}),
             defaultdict(float, {'G': 1.0})])
Beispiel #32
0
    def test_distances(self):
        s1 = SequenceCollection([
            DNA("ACGT", metadata={'id': "d1"}),
            DNA("ACGG", metadata={'id': "d2"})
        ])
        expected = [[0, 0.25], [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])

        def h(s1, s2):
            return hamming(s1.values, s2.values)

        actual = s1.distances(h)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.

        expected = [[0, 42.], [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)
Beispiel #33
0
def filter_seqs(seqs, remove_ids):
    """
    Given a collections of sequences and a set of IDs to remove,
    return all sequences that do not match those IDs.

    :type seqs: skbio.SequenceCollection
    :param seqs: The sequences to be filtered
    :type remove_ids: set
    :param remove_ids: A set of sequence IDs to remove from the sequence data
    :rtype: SequenceCollection
    """
    return SequenceCollection(
        [seq for seq in seqs if seq.id not in remove_ids])
Beispiel #34
0
    def test_valid_files(self):
        for valid, kwargs, components in self.valid_files:
            for kwarg in kwargs:
                _drop_kwargs(kwarg, 'seq_num')
                constructor = kwarg.get('constructor', BiologicalSequence)
                expected = SequenceCollection([constructor(c[1], id=c[0],
                                               quality=c[2]) for c in
                                               components])

                observed = _qseq_to_sequence_collection(valid, **kwarg)
                # TODO remove when #656 is resolved
                self.assertEqual(observed, expected)
                for o, e in zip(observed, expected):
                    self.assertTrue(o.equals(e))
Beispiel #35
0
    def test_kmer_frequencies(self):
        expected1 = Counter({'GAT': 1, 'TAC': 1})
        expected2 = Counter({'TTG': 1})
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
            [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                         [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
            [expected1, expected2])

        self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Sequence.kmer_frequencies for more details.
        sc = SequenceCollection([RNA('C' * 10, metadata={'id': 's1'}),
                                 RNA('G' * 10, metadata={'id': 's2'})])
        self.assertEqual(sc.kmer_frequencies(1, relative=True),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])
Beispiel #36
0
    def test_update_ids_default_behavior(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="1"),
            RNA('UUG', id="2"),
            RNA('U-----UGCC--', id="3")
        ])
        exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids()
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids()
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})
Beispiel #37
0
    def test_update_ids_prefix(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc1"),
            RNA('UUG', id="abc2"),
            RNA('U-----UGCC--', id="abc3")
        ])
        exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})
Beispiel #38
0
    def test_update_ids_ids_parameter(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc"),
            RNA('UUG', id="def"),
            RNA('U-----UGCC--', id="ghi")
        ])
        exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(ids=[])
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})
Beispiel #39
0
    def test_sequence_collection_to_fastq_kwargs_passed(self):
        for components, kwargs_expected_fp in self.valid_files:
            for kwargs, expected_fp in kwargs_expected_fp:
                obj = SequenceCollection([
                    NucleotideSequence(c[2],
                                       id=c[0],
                                       description=c[1],
                                       quality=c[3]) for c in components
                ])

                fh = StringIO()
                _sequence_collection_to_fastq(obj, fh, **kwargs)
                observed = fh.getvalue()
                fh.close()

                with open(expected_fp, 'U') as f:
                    expected = f.read()

                self.assertEqual(observed, expected)
Beispiel #40
0
    def test_update_ids_fn_parameter(self):
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="r1-42"),
            RNA('UUG', id="r2-42"),
            RNA('U-----UGCC--', id="r3-42")
        ])
        exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})
Beispiel #41
0
    def test_sequence_collection_to_fastq_kwargs_passed(self):
        for components, kwargs_expected_fp in self.valid_files:
            for kwargs, expected_fp in kwargs_expected_fp:
                obj = SequenceCollection([
                    DNA(c[2], metadata={'id': c[0], 'description': c[1]},
                        positional_metadata={'quality': c[3]},
                        lowercase='introns')
                    for c in components])

                fh = StringIO()
                kwargs['lowercase'] = 'introns'
                _sequence_collection_to_fastq(obj, fh, **kwargs)
                observed = fh.getvalue()
                fh.close()

                with open(expected_fp, 'U') as f:
                    expected = f.read()

                self.assertEqual(observed, expected)
Beispiel #42
0
def generateReference(assay_list):
    from skbio import DNA
    from skbio import SequenceCollection
    reference = []
    for assay in assay_list:
        name = assay.name
        if assay.AND:
            for operand in assay.AND:
                if isinstance(operand, Target):
                    name = name + "_%s" % operand.gene_name if operand.gene_name else name
                    for amplicon in operand.amplicons:
                        name = name + "_%s" % amplicon.variant_name if amplicon.variant_name else name
                        seq = DNA(amplicon.sequence, id=name)
                        reference.append(seq)
        else:
            for amplicon in assay.target.amplicons:
                name = assay.name + "_%s" % amplicon.variant_name if amplicon.variant_name else name
                seq = DNA(amplicon.sequence, {'id': name})
                reference.append(seq)
    return SequenceCollection(reference)
Beispiel #43
0
    log_choices = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    parser.add_argument(
        '--log-level', '-l', default="INFO", choices=log_choices,
        help="Set logging level. Default is info."
    )

    return parser

if __name__ == '__main__':
    parser = get_argument_parser()
    args = parser.parse_args()

    level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(level=level)

    sequences = SequenceCollection.read(args.infile, format=args.format)

    if args.parallel == 0 and len(sequences) > 16:
        pool_size = multiprocessing.cpu_count()
    else:
        pool_size = 1

    dmatrix = create_distance_matrix(sequences, d2.distance, pool_size,
                                     statistic=d2.d2_neighbourhood_dna)

    print(dmatrix)
    phylo_tree = nj(dmatrix)
    print(phylo_tree.ascii_art())
    phylo_tree.write(args.outfile, format=args.target)
Beispiel #44
0
class SequenceCollectionTests(TestCase):
    def setUp(self):
        self.d1 = DNA('GATTACA', metadata={'id': "d1"})
        self.d2 = DNA('TTG', metadata={'id': "d2"})
        self.d3 = DNA('GTATACA', metadata={'id': "d3"})
        self.r1 = RNA('GAUUACA', metadata={'id': "r1"})
        self.r2 = RNA('UUG', metadata={'id': "r2"})
        self.r3 = RNA('U-----UGCC--', metadata={'id': "r3"})

        self.seqs1 = [self.d1, self.d2]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.s1 = SequenceCollection(self.seqs1)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

    def test_init(self):
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        # sequences with overlapping ids
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_fail_no_id(self):
        seq = Sequence('ACGTACGT')
        with six.assertRaisesRegex(self, SequenceCollectionError,
                                   "'id' must be included in the sequence "
                                   "metadata"):
            SequenceCollection([seq])

    def test_contains(self):
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertFalse(self.s4 == FakeSequenceCollection([self.d1, self.d3]))
        self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s4 != FakeSequenceCollection([self.d1, self.d3]))
        self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 !=
                        SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_kmer_frequencies(self):
        expected1 = Counter({'GAT': 1, 'TAC': 1})
        expected2 = Counter({'TTG': 1})
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=False),
            [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.kmer_frequencies(k=1, relative=True),
                         [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(
            self.s1.kmer_frequencies(k=3, overlap=False, relative=True),
            [expected1, expected2])

        self.assertEqual(self.empty.kmer_frequencies(k=1, relative=True), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for Sequence.kmer_frequencies for more details.
        sc = SequenceCollection([RNA('C' * 10, metadata={'id': 's1'}),
                                 RNA('G' * 10, metadata={'id': 's2'})])
        self.assertEqual(sc.kmer_frequencies(1, relative=True),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])

    def test_str(self):
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", metadata={'id': "d1"}),
                                 DNA("ACGG", metadata={'id': "d2"})])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])

        def h(s1, s2):
            return hamming(s1.values, s2.values)
        actual = s1.distances(h)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        expected = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "r1"}),
            RNA('UUG', metadata={'id': "r2"}),
            RNA('UUGCC', metadata={'id': "r3"})])
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def test_update_ids_default_behavior(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "1"}),
            RNA('UUG', metadata={'id': "2"}),
            RNA('U-----UGCC--', metadata={'id': "3"})
        ])
        exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids()
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids()
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_prefix(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "abc1"}),
            RNA('UUG', metadata={'id': "abc2"}),
            RNA('U-----UGCC--', metadata={'id': "abc3"})
        ])
        exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_func_parameter(self):
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "r1-42"}),
            RNA('UUG', metadata={'id': "r2-42"}),
            RNA('U-----UGCC--', metadata={'id': "r3-42"})
        ])
        exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(func=append_42)
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(func=append_42)
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_ids_parameter(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', metadata={'id': "abc"}),
            RNA('UUG', metadata={'id': "def"}),
            RNA('U-----UGCC--', metadata={'id': "ghi"})
        ])
        exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(ids=[])
        self.assertEqual(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_sequence_attributes_propagated(self):
        # 1 seq
        exp_sc = Alignment([
            DNA('ACGT', metadata={'id': "abc", 'description': 'desc'},
                positional_metadata={'quality': range(4)})
        ])
        exp_id_map = {'abc': 'seq1'}

        obj = Alignment([
            DNA('ACGT', metadata={'id': "seq1", 'description': 'desc'},
                positional_metadata={'quality': range(4)})
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc',))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # 2 seqs
        exp_sc = Alignment([
            DNA('ACGT', metadata={'id': "abc", 'description': 'desc1'},
                positional_metadata={'quality': range(4)}),
            DNA('TGCA', metadata={'id': "def", 'description': 'desc2'},
                positional_metadata={'quality': range(4)[::-1]})
        ])
        exp_id_map = {'abc': 'seq1', 'def': 'seq2'}

        obj = Alignment([
            DNA('ACGT', metadata={'id': "seq1", 'description': 'desc1'},
                positional_metadata={'quality': (0, 1, 2, 3)}),
            DNA('TGCA', metadata={'id': "seq2", 'description': 'desc2'},
                positional_metadata={'quality': (3, 2, 1, 0)})
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc', 'def'))
        self.assertEqual(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

    def test_update_ids_invalid_parameter_combos(self):
        with six.assertRaisesRegex(self, SequenceCollectionError,
                                   'ids and func'):
            self.s1.update_ids(func=lambda e: e, ids=['foo', 'bar'])

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(ids=['foo', 'bar'], prefix='abc')

        with six.assertRaisesRegex(self, SequenceCollectionError, 'prefix'):
            self.s1.update_ids(func=lambda e: e, prefix='abc')

    def test_update_ids_invalid_ids(self):
        # incorrect number of new ids
        with six.assertRaisesRegex(self, SequenceCollectionError, '3 != 2'):
            self.s1.update_ids(ids=['foo', 'bar', 'baz'])
        with six.assertRaisesRegex(self, SequenceCollectionError, '4 != 2'):
            self.s1.update_ids(func=lambda e: ['foo', 'bar', 'baz', 'abc'])

        # duplicates
        with six.assertRaisesRegex(self, SequenceCollectionError, 'foo'):
            self.s2.update_ids(ids=['foo', 'bar', 'foo'])
        with six.assertRaisesRegex(self, SequenceCollectionError, 'bar'):
            self.s2.update_ids(func=lambda e: ['foo', 'bar', 'bar'])

    def test_is_empty(self):
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_iteritems(self):
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.metadata['id'], s) for s in self.s1])

    def test_sequence_count(self):
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])
from qiime_default_reference import get_template_alignment, get_reference_sequences

from skbio import SequenceCollection

gapped_sequences = [(s.id, str(s)) for s in SequenceCollection.read(get_template_alignment())][:500]

sequences = [(s.id, str(s)) for s in SequenceCollection.read(get_reference_sequences())][:500]

motif_1 = "GGTGCAAGCCGGTGGAAACA"
Beispiel #46
0
class SequenceCollectionTests(TestCase):

    """Tests of the SequenceCollection class """

    def setUp(self):
        """Initialize values to be used in tests
        """
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2

        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        """Initialization functions as expected with varied input types
        """
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        """initialization with sequences with overlapping ids fails
        """
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        """initialization with validation functions as expected
        """
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs1, validate=True)
        # can't validate self.seqs2 as a DNASequence
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_from_fasta_records(self):
        """Initialization from list of tuples functions as expected
        """
        SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
        SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
        SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)

    def test_contains(self):
        """in operator functions as expected
        """
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        """equality operator functions as expected
        """
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertFalse(self.s1 == FakeSequenceCollection([self.d1, self.d2]))
        self.assertFalse(self.s1 == Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        """getitem functions as expected
        """
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        """iter functions as expected
        """
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        """len functions as expected
        """
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        """inequality operator functions as expected
        """
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s1 != FakeSequenceCollection([self.d1, self.d2]))
        self.assertTrue(self.s1 != Alignment([self.d1, self.d2]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 !=
                        SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        """repr functions as expected
        """
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        """reversed functions as expected
        """
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_k_word_frequencies(self):
        """k_word_frequencies functions as expected
        """
        expected1 = defaultdict(int)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(int)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        expected1 = defaultdict(int)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(int)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

    def test_str(self):
        """str functions as expected
        """
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        """distances functions as expected
        """
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        """distribution_stats functions as expected
        """
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        """degap functions as expected
        """
        expected = [(id_, seq.replace('.', '').replace('-', ''))
                    for id_, seq in self.seqs2_t]
        expected = SequenceCollection.from_fasta_records(expected, RNASequence)
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        """getseq functions asexpected
        """
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        """ids functions as expected
        """
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def test_int_map(self):
        """int_map functions as expected
        """
        expected1 = {"1": self.d1, "2": self.d2}
        expected2 = {"1": "d1", "2": "d2"}
        self.assertEqual(self.s1.int_map(), (expected1, expected2))

        expected1 = {"h-1": self.d1, "h-2": self.d2}
        expected2 = {"h-1": "d1", "h-2": "d2"}
        self.assertEqual(self.s1.int_map(prefix='h-'), (expected1, expected2))

    def test_is_empty(self):
        """is_empty functions as expected
        """
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        """is_valid functions as expected
        """
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        """iteritems functions as expected
        """
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.id, s) for s in self.s1])

    def test_lower(self):
        """lower functions as expected
        """
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        """num_seqs functions as expected
        """
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        """sequence_lengths functions as expected
        """
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_to_fasta(self):
        """to_fasta functions as expected
        """
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(self.s1.to_fasta(), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(self.s2.to_fasta(), exp2)

    def test_toFasta(self):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            exp = ">d1\nGATTACA\n>d2\nTTG\n"
            self.assertEqual(self.s1.toFasta(), exp)

    def test_upper(self):
        """upper functions as expected
        """
        self.assertEqual(self.s1_lower.upper(), self.s1)
Beispiel #47
0
 def test_from_fasta_records(self):
     """Initialization from list of tuples functions as expected
     """
     SequenceCollection.from_fasta_records(self.seqs1_t, DNASequence)
     SequenceCollection.from_fasta_records(self.seqs2_t, RNASequence)
     SequenceCollection.from_fasta_records(self.seqs3_t, NucleotideSequence)
class SequenceCollectionTests(TestCase):
    def setUp(self):
        self.d1 = DNASequence('GATTACA', id="d1")
        self.d2 = DNASequence('TTG', id="d2")
        self.d3 = DNASequence('GTATACA', id="d3")
        self.d1_lower = DNASequence('gattaca', id="d1")
        self.d2_lower = DNASequence('ttg', id="d2")
        self.d3_lower = DNASequence('gtataca', id="d3")
        self.r1 = RNASequence('GAUUACA', id="r1")
        self.r2 = RNASequence('UUG', id="r2")
        self.r3 = RNASequence('U-----UGCC--', id="r3")

        self.i1 = DNASequence('GATXACA', id="i1")

        self.seqs1 = [self.d1, self.d2]
        self.seqs1_lower = [self.d1_lower, self.d2_lower]
        self.seqs2 = [self.r1, self.r2, self.r3]
        self.seqs3 = self.seqs1 + self.seqs2
        self.seqs4 = [self.d1, self.d3]

        self.seqs1_t = [('d1', 'GATTACA'), ('d2', 'TTG')]
        self.seqs2_t = [('r1', 'GAUUACA'), ('r2', 'UUG'),
                        ('r3', 'U-----UGCC--')]
        self.seqs3_t = self.seqs1_t + self.seqs2_t

        self.s1 = SequenceCollection(self.seqs1)
        self.s1_lower = SequenceCollection(self.seqs1_lower)
        self.s2 = SequenceCollection(self.seqs2)
        self.s3 = SequenceCollection(self.seqs3)
        self.s4 = SequenceCollection(self.seqs4)
        self.empty = SequenceCollection([])

        self.invalid_s1 = SequenceCollection([self.i1])

    def test_init(self):
        SequenceCollection(self.seqs1)
        SequenceCollection(self.seqs2)
        SequenceCollection(self.seqs3)
        SequenceCollection([])

    def test_init_fail(self):
        # sequences with overlapping ids
        s1 = [self.d1, self.d1]
        self.assertRaises(SequenceCollectionError, SequenceCollection, s1)

    def test_init_validate(self):
        SequenceCollection(self.seqs1, validate=True)
        SequenceCollection(self.seqs1, validate=True)
        # can't validate self.seqs2 as a DNASequence
        self.assertRaises(SequenceCollectionError, SequenceCollection,
                          self.invalid_s1, validate=True)

    def test_contains(self):
        self.assertTrue('d1' in self.s1)
        self.assertTrue('r2' in self.s2)
        self.assertFalse('r2' in self.s1)

    def test_eq(self):
        self.assertTrue(self.s1 == self.s1)
        self.assertFalse(self.s1 == self.s2)

        # different objects can be equal
        self.assertTrue(self.s1 == SequenceCollection([self.d1, self.d2]))
        self.assertTrue(SequenceCollection([self.d1, self.d2]) == self.s1)

        # SequenceCollections with different number of sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertFalse(self.s4 == FakeSequenceCollection([self.d1, self.d3]))
        self.assertFalse(self.s4 == Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertFalse(self.s1 == SequenceCollection([self.d1, self.r1]))

    def test_getitem(self):
        self.assertEqual(self.s1[0], self.d1)
        self.assertEqual(self.s1[1], self.d2)
        self.assertEqual(self.s2[0], self.r1)
        self.assertEqual(self.s2[1], self.r2)

        self.assertRaises(IndexError, self.empty.__getitem__, 0)
        self.assertRaises(KeyError, self.empty.__getitem__, '0')

    def test_iter(self):
        s1_iter = iter(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_len(self):
        self.assertEqual(len(self.s1), 2)
        self.assertEqual(len(self.s2), 3)
        self.assertEqual(len(self.s3), 5)
        self.assertEqual(len(self.empty), 0)

    def test_ne(self):
        self.assertFalse(self.s1 != self.s1)
        self.assertTrue(self.s1 != self.s2)

        # SequenceCollections with different number of sequences are not equal
        self.assertTrue(self.s1 != SequenceCollection([self.d1]))

        class FakeSequenceCollection(SequenceCollection):
            pass
        # SequenceCollections of different types are not equal
        self.assertTrue(self.s4 != FakeSequenceCollection([self.d1, self.d3]))
        self.assertTrue(self.s4 != Alignment([self.d1, self.d3]))

        # SequenceCollections with different sequences are not equal
        self.assertTrue(self.s1 !=
                        SequenceCollection([self.d1, self.r1]))

    def test_repr(self):
        self.assertEqual(repr(self.s1),
                         "<SequenceCollection: n=2; "
                         "mean +/- std length=5.00 +/- 2.00>")
        self.assertEqual(repr(self.s2),
                         "<SequenceCollection: n=3; "
                         "mean +/- std length=7.33 +/- 3.68>")
        self.assertEqual(repr(self.s3),
                         "<SequenceCollection: n=5; "
                         "mean +/- std length=6.40 +/- 3.32>")
        self.assertEqual(repr(self.empty),
                         "<SequenceCollection: n=0; "
                         "mean +/- std length=0.00 +/- 0.00>")

    def test_reversed(self):
        s1_iter = reversed(self.s1)
        count = 0
        for actual, expected in zip(s1_iter, self.seqs1[::-1]):
            count += 1
            self.assertEqual(actual, expected)
        self.assertEqual(count, len(self.seqs1))
        self.assertRaises(StopIteration, lambda: next(s1_iter))

    def test_k_word_frequencies(self):
        expected1 = defaultdict(float)
        expected1['A'] = 3 / 7.
        expected1['C'] = 1 / 7.
        expected1['G'] = 1 / 7.
        expected1['T'] = 2 / 7.
        expected2 = defaultdict(float)
        expected2['G'] = 1 / 3.
        expected2['T'] = 2 / 3.
        self.assertEqual(self.s1.k_word_frequencies(k=1),
                         [expected1, expected2])

        expected1 = defaultdict(float)
        expected1['GAT'] = 1 / 2.
        expected1['TAC'] = 1 / 2.
        expected2 = defaultdict(float)
        expected2['TTG'] = 1 / 1.
        self.assertEqual(self.s1.k_word_frequencies(k=3, overlapping=False),
                         [expected1, expected2])

        self.assertEqual(self.empty.k_word_frequencies(k=1), [])

        # Test to ensure floating point precision bug isn't present. See the
        # tests for BiologicalSequence.k_word_frequencies for more details.
        sc = SequenceCollection([RNA('C' * 10, id='s1'),
                                 RNA('G' * 10, id='s2')])
        self.assertEqual(sc.k_word_frequencies(1),
                         [defaultdict(float, {'C': 1.0}),
                          defaultdict(float, {'G': 1.0})])

    def test_str(self):
        exp1 = ">d1\nGATTACA\n>d2\nTTG\n"
        self.assertEqual(str(self.s1), exp1)
        exp2 = ">r1\nGAUUACA\n>r2\nUUG\n>r3\nU-----UGCC--\n"
        self.assertEqual(str(self.s2), exp2)
        exp4 = ""
        self.assertEqual(str(self.empty), exp4)

    def test_distances(self):
        s1 = SequenceCollection([DNA("ACGT", "d1"), DNA("ACGG", "d2")])
        expected = [[0, 0.25],
                    [0.25, 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(hamming)
        self.assertEqual(actual, expected)

        # alt distance function provided
        def dumb_distance(s1, s2):
            return 42.
        expected = [[0, 42.],
                    [42., 0]]
        expected = DistanceMatrix(expected, ['d1', 'd2'])
        actual = s1.distances(dumb_distance)
        self.assertEqual(actual, expected)

    def test_distribution_stats(self):
        actual1 = self.s1.distribution_stats()
        self.assertEqual(actual1[0], 2)
        self.assertAlmostEqual(actual1[1], 5.0, 3)
        self.assertAlmostEqual(actual1[2], 2.0, 3)

        actual2 = self.s2.distribution_stats()
        self.assertEqual(actual2[0], 3)
        self.assertAlmostEqual(actual2[1], 7.333, 3)
        self.assertAlmostEqual(actual2[2], 3.682, 3)

        actual3 = self.s3.distribution_stats()
        self.assertEqual(actual3[0], 5)
        self.assertAlmostEqual(actual3[1], 6.400, 3)
        self.assertAlmostEqual(actual3[2], 3.323, 3)

        actual4 = self.empty.distribution_stats()
        self.assertEqual(actual4[0], 0)
        self.assertEqual(actual4[1], 0.0)
        self.assertEqual(actual4[2], 0.0)

    def test_degap(self):
        expected = SequenceCollection([
            RNASequence('GAUUACA', id="r1"),
            RNASequence('UUG', id="r2"),
            RNASequence('UUGCC', id="r3")])
        actual = self.s2.degap()
        self.assertEqual(actual, expected)

    def test_get_seq(self):
        self.assertEqual(self.s1.get_seq('d1'), self.d1)
        self.assertEqual(self.s1.get_seq('d2'), self.d2)

    def test_ids(self):
        self.assertEqual(self.s1.ids(), ['d1', 'd2'])
        self.assertEqual(self.s2.ids(), ['r1', 'r2', 'r3'])
        self.assertEqual(self.s3.ids(),
                         ['d1', 'd2', 'r1', 'r2', 'r3'])
        self.assertEqual(self.empty.ids(), [])

    def _assert_sequence_collections_equal(self, observed, expected):
        """Compare SequenceCollections strictly."""
        # TODO remove this custom equality testing code when SequenceCollection
        # has an equals method (part of #656). We need this method to include
        # IDs in the comparison (not part of SequenceCollection.__eq__).
        self.assertEqual(observed, expected)
        for obs_seq, exp_seq in zip(observed, expected):
            self.assertTrue(obs_seq.equals(exp_seq))

    def test_update_ids_default_behavior(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="1"),
            RNA('UUG', id="2"),
            RNA('U-----UGCC--', id="3")
        ])
        exp_id_map = {'1': 'r1', '2': 'r2', '3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids()
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids()
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_prefix(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc1"),
            RNA('UUG', id="abc2"),
            RNA('U-----UGCC--', id="abc3")
        ])
        exp_id_map = {'abc1': 'r1', 'abc2': 'r2', 'abc3': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(prefix='abc')
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_fn_parameter(self):
        def append_42(ids):
            return [id_ + '-42' for id_ in ids]

        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="r1-42"),
            RNA('UUG', id="r2-42"),
            RNA('U-----UGCC--', id="r3-42")
        ])
        exp_id_map = {'r1-42': 'r1', 'r2-42': 'r2', 'r3-42': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(fn=append_42)
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_ids_parameter(self):
        # 3 seqs
        exp_sc = SequenceCollection([
            RNA('GAUUACA', id="abc"),
            RNA('UUG', id="def"),
            RNA('U-----UGCC--', id="ghi")
        ])
        exp_id_map = {'abc': 'r1', 'def': 'r2', 'ghi': 'r3'}
        obs_sc, obs_id_map = self.s2.update_ids(ids=('abc', 'def', 'ghi'))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # empty
        obs_sc, obs_id_map = self.empty.update_ids(ids=[])
        self._assert_sequence_collections_equal(obs_sc, self.empty)
        self.assertEqual(obs_id_map, {})

    def test_update_ids_sequence_attributes_propagated(self):
        # 1 seq
        exp_sc = Alignment([
            DNA('ACGT', id="abc", description='desc', quality=range(4))
        ])
        exp_id_map = {'abc': 'seq1'}

        obj = Alignment([
            DNA('ACGT', id="seq1", description='desc', quality=range(4))
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc',))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

        # 2 seqs
        exp_sc = Alignment([
            DNA('ACGT', id="abc", description='desc1', quality=range(4)),
            DNA('TGCA', id="def", description='desc2', quality=range(4)[::-1])
        ])
        exp_id_map = {'abc': 'seq1', 'def': 'seq2'}

        obj = Alignment([
            DNA('ACGT', id="seq1", description='desc1', quality=(0, 1, 2, 3)),
            DNA('TGCA', id="seq2", description='desc2', quality=(3, 2, 1, 0))
        ])

        obs_sc, obs_id_map = obj.update_ids(ids=('abc', 'def'))
        self._assert_sequence_collections_equal(obs_sc, exp_sc)
        self.assertEqual(obs_id_map, exp_id_map)

    def test_update_ids_invalid_parameter_combos(self):
        with self.assertRaisesRegexp(SequenceCollectionError, 'ids and fn'):
            self.s1.update_ids(fn=lambda e: e, ids=['foo', 'bar'])

        with self.assertRaisesRegexp(SequenceCollectionError, 'prefix'):
            self.s1.update_ids(ids=['foo', 'bar'], prefix='abc')

        with self.assertRaisesRegexp(SequenceCollectionError, 'prefix'):
            self.s1.update_ids(fn=lambda e: e, prefix='abc')

    def test_update_ids_invalid_ids(self):
        # incorrect number of new ids
        with self.assertRaisesRegexp(SequenceCollectionError, '3 != 2'):
            self.s1.update_ids(ids=['foo', 'bar', 'baz'])
        with self.assertRaisesRegexp(SequenceCollectionError, '4 != 2'):
            self.s1.update_ids(fn=lambda e: ['foo', 'bar', 'baz', 'abc'])

        # duplicates
        with self.assertRaisesRegexp(SequenceCollectionError, 'foo'):
            self.s2.update_ids(ids=['foo', 'bar', 'foo'])
        with self.assertRaisesRegexp(SequenceCollectionError, 'bar'):
            self.s2.update_ids(fn=lambda e: ['foo', 'bar', 'bar'])

    def test_is_empty(self):
        self.assertFalse(self.s1.is_empty())
        self.assertFalse(self.s2.is_empty())
        self.assertFalse(self.s3.is_empty())

        self.assertTrue(self.empty.is_empty())

    def test_is_valid(self):
        self.assertTrue(self.s1.is_valid())
        self.assertTrue(self.s2.is_valid())
        self.assertTrue(self.s3.is_valid())
        self.assertTrue(self.empty.is_valid())

        self.assertFalse(self.invalid_s1.is_valid())

    def test_iteritems(self):
        self.assertEqual(list(self.s1.iteritems()),
                         [(s.id, s) for s in self.s1])

    def test_lower(self):
        self.assertEqual(self.s1.lower(), self.s1_lower)

    def test_sequence_count(self):
        self.assertEqual(self.s1.sequence_count(), 2)
        self.assertEqual(self.s2.sequence_count(), 3)
        self.assertEqual(self.s3.sequence_count(), 5)
        self.assertEqual(self.empty.sequence_count(), 0)

    def test_sequence_lengths(self):
        self.assertEqual(self.s1.sequence_lengths(), [7, 3])
        self.assertEqual(self.s2.sequence_lengths(), [7, 3, 12])
        self.assertEqual(self.s3.sequence_lengths(), [7, 3, 7, 3, 12])
        self.assertEqual(self.empty.sequence_lengths(), [])

    def test_upper(self):
        self.assertEqual(self.s1_lower.upper(), self.s1)