Esempio n. 1
0
    def test_split_by_label_length_issuer_separated_only_uses_given_label_list(self, split_mock):
        """Split by label length with ``separate_issuers=True`` and ``label_list_idx='default'``.

        Every utterance first gets a label list with idx ``'some-idx'`` set on
        it. The test then checks which utterances end up in each returned
        subset and which arguments were handed to ``split_mock``.

        ``split_mock`` is presumably injected by a ``mock.patch`` decorator
        that is not visible in this snippet - confirm against the full file.
        """
        proportions = {'train': 0.5, 'test': 0.5}
        # Snapshot taken before the call so the expectation does not depend on
        # what the splitter does with the dict it receives.
        expected_proportions = dict(proportions)

        corpus = resources.create_multi_label_corpus()
        splitter = splitting.Splitter(corpus, random_seed=INITIAL_SEED)

        for utterance in corpus.utterances.values():
            extra_labels = annotations.LabelList.create_single(
                'another label',
                idx='some-idx',
            )
            utterance.set_label_list(extra_labels)

        # The mocked split hands back ids 'spk-1'..'spk-3' per subset.
        split_mock.return_value = {
            'train': ['spk-1', 'spk-2'],
            'test': ['spk-3'],
        }

        res = splitter.split_by_label_length(
            proportions,
            label_list_idx='default',
            separate_issuers=True,
        )

        expected_utterances = {
            'train': {'utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'},
            'test': {'utt-6', 'utt-7', 'utt-8'},
        }
        for subset_name, utt_ids in expected_utterances.items():
            assert res[subset_name].utterances.keys() == utt_ids

        # Presumably these lengths stem from the 'default' label list only,
        # as the test name states - the corpus fixture is not visible here.
        expected_lengths = {
            'spk-1': {'length': 32},
            'spk-2': {'length': 33},
            'spk-3': {'length': 32},
        }
        split_mock.assert_called_with(
            expected_lengths,
            expected_proportions,
            seed=mock.ANY,
        )
    def test_split_by_number_of_utterances_seed(self):
        """Two splits created with ``random_seed=15`` hold the same utterance ids.

        Each run builds its own corpus and its own ``Splitter``; afterwards
        the 'train' and 'test' subsets of both runs are compared by utterance
        id.
        """
        seed = 15
        outcomes = []

        for _ in range(2):
            corpus = resources.create_multi_label_corpus()
            splitter = splitting.Splitter(corpus, random_seed=seed)
            outcomes.append(
                splitter.split_by_number_of_utterances({'train': 0.6, 'test': 0.2})
            )

        first, second = outcomes

        for subset_name in ('train', 'test'):
            first_ids = set(first[subset_name].utterances.keys())
            second_ids = set(second[subset_name].utterances.keys())
            assert first_ids == second_ids
    def test_split_by_proportionally_distribute_labels_by_number_seed(self):
        """Same seed, same result for ``split_by_proportionally_distribute_labels``.

        The split is executed twice with ``use_lengths=False`` and
        ``random_seed=15``, each time on a newly created corpus. The 'train'
        and 'test' subsets of both runs must contain identical utterance ids.
        """

        def run_split():
            # One complete, independent run: new corpus, new splitter.
            corpus = resources.create_multi_label_corpus()
            splitter = splitting.Splitter(corpus, random_seed=15)
            return splitter.split_by_proportionally_distribute_labels(
                {'train': 0.6, 'test': 0.2},
                use_lengths=False,
            )

        res1 = run_split()
        res2 = run_split()

        train_ids_1 = set(res1['train'].utterances.keys())
        train_ids_2 = set(res2['train'].utterances.keys())
        assert train_ids_1 == train_ids_2

        test_ids_1 = set(res1['test'].utterances.keys())
        test_ids_2 = set(res2['test'].utterances.keys())
        assert test_ids_1 == test_ids_2
Esempio n. 4
0
    def test_split_by_label_length_only_uses_given_label_list(self, split_mock):
        """Split by label length with ``label_list_idx='default'``.

        A label list with idx ``'some-idx'`` is set on every utterance before
        splitting. The test then verifies the utterance ids in each returned
        subset and the arguments passed to ``split_mock``.

        ``split_mock`` is presumably injected by a ``mock.patch`` decorator
        that is not visible in this snippet - confirm against the full file.
        """
        proportions = {'train': 0.6, 'test': 0.2, 'dev': 0.2}
        # Snapshot taken before the call so the expectation does not depend on
        # what the splitter does with the dict it receives.
        expected_proportions = dict(proportions)

        corpus = resources.create_multi_label_corpus()
        splitter = splitting.Splitter(corpus, random_seed=INITIAL_SEED)

        for utterance in corpus.utterances.values():
            extra_labels = annotations.LabelList.create_single(
                'another label',
                idx='some-idx',
            )
            utterance.set_label_list(extra_labels)

        # NOTE(review): 'utt-3' is listed in both 'train' and 'test'; the
        # assertions below mirror that - confirm the overlap is intentional.
        mocked_assignment = {
            'train': ['utt-1', 'utt-3'],
            'test': ['utt-3', 'utt-4'],
            'dev': ['utt-5', 'utt-6'],
        }
        split_mock.return_value = mocked_assignment

        res = splitter.split_by_label_length(proportions, label_list_idx='default')

        expected_utterances = {
            'train': {'utt-1', 'utt-3'},
            'test': {'utt-3', 'utt-4'},
            'dev': {'utt-5', 'utt-6'},
        }
        for subset_name, utt_ids in expected_utterances.items():
            assert res[subset_name].utterances.keys() == utt_ids

        # Presumably these lengths stem from the 'default' label list only,
        # as the test name states - the corpus fixture is not visible here.
        expected_lengths = {
            'utt-1': {'length': 16},
            'utt-2': {'length': 16},
            'utt-3': {'length': 11},
            'utt-4': {'length': 16},
            'utt-5': {'length': 6},
            'utt-6': {'length': 16},
            'utt-7': {'length': 11},
            'utt-8': {'length': 5},
        }
        split_mock.assert_called_with(
            expected_lengths,
            expected_proportions,
            seed=mock.ANY,
        )
def splitter():
    """Return a ``Splitter`` for the corpus from ``resources.create_multi_label_corpus()``.

    No ``random_seed`` is passed to the ``Splitter`` here.
    """
    return splitting.Splitter(resources.create_multi_label_corpus())
Esempio n. 6
0
 def setUp(self):
     """Create the multi-label corpus and a ``Splitter`` on top of it."""
     multi_label_corpus = resources.create_multi_label_corpus()
     self.corpus = multi_label_corpus
     self.splitter = splitting.Splitter(multi_label_corpus)
Esempio n. 7
0
def splitter():
    """Return a ``Splitter`` seeded with ``INITIAL_SEED`` for the multi-label corpus."""
    return splitting.Splitter(
        resources.create_multi_label_corpus(),
        random_seed=INITIAL_SEED,
    )