def test_get_identifiers_randomly_splitted():
    res = utils.split_identifiers(
        identifiers=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'v', 't'],
        proportions={
            'a': 0.3333,
            'b': 0.6666
        }
    )

    assert len(res['a']) == 4
    assert len(res['b']) == 8
    assert len(set(res['a'] + res['b'])) == 12
def test_split_identifiers():
    identifiers = [str(x) for x in range(10)]
    random.shuffle(identifiers)

    proportions = {'a': 0.3, 'b': 0.4, 'c': 0.3}

    result = utils.split_identifiers(identifiers, proportions, seed=220)

    assert len(result['a']) == 3
    assert sorted(result['a']) == ['0', '1', '6']

    assert len(result['b']) == 4
    assert sorted(result['b']) == ['2', '4', '5', '8']

    assert len(result['c']) == 3
    assert sorted(result['c']) == ['3', '7', '9']
def run_split_identifiers():
    identifiers = list(range(10000))
    proportions = {'a': 0.2, 'b': 0.2, 'c': 0.6}

    utils.split_identifiers(identifiers, proportions)
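# Illustrative sketch only -- NOT audiomate's actual utils.split_identifiers
# implementation. It shows one way a proportional split consistent with the
# subset sizes asserted above could work: shuffle the identifiers with an
# optional seed, then slice the list into contiguous chunks sized by the
# proportions, letting the last subset absorb any rounding remainder. The
# helper name `_split_identifiers_sketch` is hypothetical; the exact members
# returned for a given seed will differ from the real implementation.
import random


def _split_identifiers_sketch(identifiers, proportions, seed=None):
    rng = random.Random(seed)
    shuffled = list(identifiers)
    rng.shuffle(shuffled)

    result = {}
    start = 0
    keys = list(proportions.keys())

    for i, key in enumerate(keys):
        if i == len(keys) - 1:
            # The last subset gets the remainder so every identifier is assigned.
            end = len(shuffled)
        else:
            end = start + int(round(proportions[key] * len(shuffled)))

        result[key] = shuffled[start:end]
        start = end

    return result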
def split_by_number_of_utterances(self, proportions={}, separate_issuers=False):
    """
    Split the corpus into subsets with the given number of utterances.

    The corpus gets split into ``len(proportions)`` parts, so that the
    utterances are distributed according to the given proportions.

    Args:
        proportions (dict): A dictionary containing the relative size of
                            the target subsets. The key is an identifier
                            for the subset.
        separate_issuers (bool): If True, makes sure that all utterances
                                 of an issuer end up in the same subset.

    Returns:
        (dict): A dictionary containing the subsets with the identifier
                from the input as key.

    Example::

        >>> spl = Splitter(corpus)
        >>> corpus.num_utterances
        100
        >>> subsets = spl.split_by_number_of_utterances(proportions={
        >>>     "train": 0.6,
        >>>     "dev": 0.2,
        >>>     "test": 0.2
        >>> })
        >>> print(subsets)
        {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
         'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
         'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
        >>> subsets['train'].num_utterances
        60
        >>> subsets['test'].num_utterances
        20
    """
    if separate_issuers:
        # Count the number of utterances per issuer
        issuer_utt_count = collections.defaultdict(int)
        issuer_utts = collections.defaultdict(list)

        for utterance in self.corpus.utterances.values():
            issuer_utt_count[utterance.issuer.idx] += 1
            issuer_utts[utterance.issuer.idx].append(utterance.idx)

        issuer_utt_count = {
            k: {'count': int(v)}
            for k, v in issuer_utt_count.items()
        }

        # Split with the utterance count per issuer as weight
        issuer_splits = utils.get_identifiers_splitted_by_weights(
            issuer_utt_count, proportions=proportions)

        # Collect the utterances of all issuers per split
        splits = collections.defaultdict(list)

        for split_idx, issuer_ids in issuer_splits.items():
            for issuer_idx in issuer_ids:
                splits[split_idx].extend(issuer_utts[issuer_idx])
    else:
        utterance_idxs = sorted(list(self.corpus.utterances.keys()))
        self.rand.shuffle(utterance_idxs)

        splits = utils.split_identifiers(identifiers=utterance_idxs,
                                         proportions=proportions)

    return self._subviews_from_utterance_splits(splits)
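# Illustrative sketch only -- NOT the actual utils.get_identifiers_splitted_by_weights.
# It shows one plausible way to distribute weighted identifiers (here: issuers
# weighted by their utterance count) over subsets: compute a target weight per
# subset from the proportions, then greedily assign the heaviest identifiers to
# whichever subset is currently furthest below its target. The helper name
# `_weighted_split_sketch` is hypothetical.
def _weighted_split_sketch(weights, proportions):
    # weights: {identifier: {'count': n, ...}}, proportions: {subset: relative size}
    total = sum(sum(w.values()) for w in weights.values())
    prop_sum = sum(proportions.values())
    targets = {k: total * v / prop_sum for k, v in proportions.items()}

    assigned = {k: 0 for k in proportions}
    splits = {k: [] for k in proportions}

    # Assign the heaviest identifiers first to keep the subsets balanced.
    for idx, w in sorted(weights.items(), key=lambda kv: -sum(kv[1].values())):
        # Pick the subset with the largest remaining gap to its target weight.
        target_key = max(targets, key=lambda k: targets[k] - assigned[k])
        splits[target_key].append(idx)
        assigned[target_key] += sum(w.values())

    return splits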