コード例 #1
0
def test_get_identifiers_randomly_splitted():
    """Check that twelve identifiers split 0.3333 / 0.6666 give parts of 4 and 8.

    The two parts together must still contain twelve distinct identifiers.
    """
    all_ids = list('abcdefghijvt')
    ratios = {'a': 0.3333, 'b': 0.6666}

    res = utils.split_identifiers(identifiers=all_ids, proportions=ratios)

    first, second = res['a'], res['b']
    assert len(first) == 4
    assert len(second) == 8
    # Concatenating and de-duplicating must give back all twelve identifiers.
    assert len(set(first + second)) == 12
コード例 #2
0
ファイル: test_utils.py プロジェクト: xjc90s/audiomate
def test_split_identifiers():
    """Check the seeded split of ten shuffled identifiers into parts a, b and c.

    With ``seed=220`` the content of every part is expected to be fixed,
    regardless of the order the identifiers are passed in.
    """
    identifiers = list(map(str, range(10)))
    random.shuffle(identifiers)
    proportions = dict(a=0.3, b=0.4, c=0.3)

    result = utils.split_identifiers(identifiers, proportions, seed=220)

    expected = {
        'a': ['0', '1', '6'],
        'b': ['2', '4', '5', '8'],
        'c': ['3', '7', '9'],
    }
    for part, members in expected.items():
        assert len(result[part]) == len(members)
        assert sorted(result[part]) == members
コード例 #3
0
def run_split_identifiers():
    """Call ``utils.split_identifiers`` on 10000 integer identifiers.

    The identifiers are split 0.2 / 0.2 / 0.6 and the result is discarded.
    """
    ids = [*range(10000)]
    ratios = {'a': 0.2, 'b': 0.2, 'c': 0.6}
    utils.split_identifiers(ids, ratios)
コード例 #4
0
    def split_by_number_of_utterances(self,
                                      proportions=None,
                                      separate_issuers=False):
        """
        Split the corpus into subsets with the given number of utterances.
        The corpus is split into len(proportions) parts, so the number of utterances is
        distributed according to the proportions.

        Args:
            proportions (dict): A dictionary containing the relative size of the target subsets.
                                The key is an identifier for the subset.
                                ``None`` (the default) is treated as an empty dictionary.
            separate_issuers (bool): If True it makes sure that all utterances of an issuer are in the same subset.
                                     In that case whole issuers are distributed, weighted by
                                     their number of utterances.

        Returns:
            (dict): A dictionary containing the subsets with the identifier from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> corpus.num_utterances
            100
            >>> subsets = spl.split_by_number_of_utterances(proportions={
            ...     "train" : 0.6,
            ...     "dev" : 0.2,
            ...     "test" : 0.2
            ... })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
            >>> subsets['train'].num_utterances
            60
            >>> subsets['test'].num_utterances
            20
        """

        # Use a fresh dict per call instead of a mutable default that would be
        # shared between calls.
        if proportions is None:
            proportions = {}

        if separate_issuers:
            # Group the utterance ids by the issuer they belong to.
            issuer_utts = collections.defaultdict(list)

            for utterance in self.corpus.utterances.values():
                issuer_utts[utterance.issuer.idx].append(utterance.idx)

            # The weight of an issuer is its number of utterances.
            issuer_utt_count = {
                issuer_idx: {
                    'count': len(utt_idxs)
                }
                for issuer_idx, utt_idxs in issuer_utts.items()
            }

            # Split the issuers with their utterance count as weight
            issuer_splits = utils.get_identifiers_splitted_by_weights(
                issuer_utt_count, proportions=proportions)

            # Collect utterances of all issuers per split
            splits = collections.defaultdict(list)

            for split_idx, issuer_ids in issuer_splits.items():
                for issuer_idx in issuer_ids:
                    splits[split_idx].extend(issuer_utts[issuer_idx])
        else:
            # Sort before shuffling so the shuffle starts from the same order
            # on every call.
            utterance_idxs = sorted(self.corpus.utterances.keys())
            self.rand.shuffle(utterance_idxs)
            splits = utils.split_identifiers(identifiers=utterance_idxs,
                                             proportions=proportions)

        return self._subviews_from_utterance_splits(splits)