def test_merge_prepared_data(self):
  prep_list = [
    (PreparedDataList([
      PreparedData(0, 1, "", "", "", 0, "0,1,2", 0, 0, "", 0, ""),
    ]), SymbolIdDict({
      0: (0, "a"),
      1: (0, "b"),
      2: (0, "c"),
    })),
    (PreparedDataList([
      PreparedData(0, 2, "", "", "", 0, "0,1,2", 0, 0, "", 0, ""),
    ]), SymbolIdDict({
      0: (0, "b"),
      1: (0, "a"),
      2: (0, "d"),
    })),
  ]

  res, conv = merge_prepared_data(prep_list)

  self.assertEqual(6, len(conv))
  # ids 0 and 1 are special symbols introduced by the merge; their expected
  # values were left as "todo" placeholders
  self.assertEqual("todo", conv.get_symbol(0))
  self.assertEqual("todo", conv.get_symbol(1))
  self.assertEqual("a", conv.get_symbol(2))
  self.assertEqual("b", conv.get_symbol(3))
  self.assertEqual("c", conv.get_symbol(4))
  self.assertEqual("d", conv.get_symbol(5))
  self.assertEqual(2, len(res))
  self.assertEqual(0, res[0].i)
  self.assertEqual(1, res[1].i)
  self.assertEqual(1, res[0].entry_id)
  self.assertEqual(2, res[1].entry_id)
  self.assertEqual("2,3,4", res[0].serialized_symbol_ids)
  self.assertEqual("3,2,5", res[1].serialized_symbol_ids)
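# A minimal sketch (not the library implementation) of the remapping the test
# above exercises: the two symbol tables are merged into one, and each
# serialized id string is rewritten against the merged table. _sketch_remap
# and its parameters are hypothetical illustrations only.
def _sketch_remap(serialized_ids: str, old_id_to_symbol: Dict[int, str],
                  merged_symbol_to_id: Dict[str, int]) -> str:
  old_ids = [int(x) for x in serialized_ids.split(",")]
  return ",".join(str(merged_symbol_to_id[old_id_to_symbol[i]]) for i in old_ids)

# e.g. with the first table {0: "a", 1: "b", 2: "c"} and a merged table
# {"a": 2, "b": 3, "c": 4}, "0,1,2" becomes "2,3,4", matching the assertion above.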
def __add(existing_set: PreparedDataList, restset: PreparedDataList, symbols: SymbolIdDict,
          func: Callable[..., OrderedSet[int]], **kwargs) -> Tuple[PreparedDataList, PreparedDataList]:
  # func is a selection callback: it is invoked with the keyword arguments
  # speaker_available, speaker_available_dict, speaker_existing and
  # speaker_existing_dict plus **kwargs, and returns the selected entry ids
  logger = getLogger(__name__)
  new_set = existing_set  # note: the existing set is extended in place
  new_restset = PreparedDataList()
  available_speaker_data = get_speaker_wise(restset)
  existing_speaker_data = get_speaker_wise(existing_set)
  for speaker_id, speaker_available in available_speaker_data.items():
    speaker_existing = existing_speaker_data[speaker_id] if speaker_id in existing_speaker_data else PreparedDataList()
    speaker_existing_dict = prep_data_list_to_dict_with_symbols(speaker_existing, symbols)
    speaker_available_dict = prep_data_list_to_dict_with_symbols(speaker_available, symbols)
    selected_keys = func(
      speaker_available=speaker_available,
      speaker_available_dict=speaker_available_dict,
      speaker_existing=speaker_existing,
      speaker_existing_dict=speaker_existing_dict,
      **kwargs,
    )
    not_selected_keys = set(speaker_available_dict.keys()).difference(selected_keys)
    selected_data = select_entities_from_prep_data(selected_keys, speaker_available)
    not_selected_data = select_entities_from_prep_data(not_selected_keys, speaker_available)
    assert len(selected_data) + len(not_selected_data) == len(speaker_available)
    if len(selected_data) == 0:
      logger.warning(
        f"The part in the destination set for speaker with id {speaker_id} is empty! There are a total of {len(speaker_available)} entries for that speaker.")
    if len(not_selected_data) == 0:
      logger.warning(
        f"The part in the rest set for speaker with id {speaker_id} is empty! There are a total of {len(speaker_available)} entries for that speaker.")
    new_set.extend(selected_data)
    new_restset.extend(not_selected_data)
    logger.info(
      f"Took {len(selected_data)}/{len(speaker_available)} utterances from speaker {speaker_id} "
      f"({selected_data.get_total_duration_s() / 60:.2f}min/{selected_data.get_total_duration_s() / 60 / 60:.2f}h).")
  return new_set, new_restset
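# A hypothetical selector compatible with __add's callback contract, shown only
# as a sketch: it receives the per-speaker keyword arguments listed above and
# returns the entry ids to move into the destination set. Neither take_first_n
# nor the parameter n exist in this module.
def take_first_n(speaker_available, speaker_available_dict, speaker_existing,
                 speaker_existing_dict, n: int, **kwargs) -> OrderedSet:
  return OrderedSet(list(speaker_available_dict.keys())[:n])

# usage sketch: new_set, new_rest = __add(trainset, restset, symbols, take_first_n, n=10)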
def test_sort_prep_data_list(self):
  l = PreparedDataList([
    self.get_dummy_prep_data(entry_id=2),
    self.get_dummy_prep_data(entry_id=1),
    self.get_dummy_prep_data(entry_id=3),
  ])

  l.sort(key=PreparedDataList._get_key_for_sorting, reverse=False)

  self.assertEqual(1, l[0].entry_id)
  self.assertEqual(2, l[1].entry_id)
  self.assertEqual(3, l[2].entry_id)
def _get_speaker_occ_stats(speaker_order: List[str], speakers: SpeakersDict, trainset: PreparedDataList,
                           valset: PreparedDataList, testset: PreparedDataList, restset: PreparedDataList):
  # each speaker name is wrapped in a single-element list so the occurrence
  # helpers, which expect one symbol list per utterance, can be reused here
  trn_speakers = [[speakers.get_speaker(x.speaker_id)] for x in trainset.items()]
  val_speakers = [[speakers.get_speaker(x.speaker_id)] for x in valset.items()]
  tst_speakers = [[speakers.get_speaker(x.speaker_id)] for x in testset.items()]
  rst_speakers = [[speakers.get_speaker(x.speaker_id)] for x in restset.items()]

  utterances_count_df = get_occ_df_of_all_symbols(
    symbols=speaker_order,
    data_trn=trn_speakers,
    data_val=val_speakers,
    data_tst=tst_speakers,
    data_rst=rst_speakers,
  )
  utterances_count_df.columns = [
    'SPEAKER_NAME', 'TRAIN_UTTERANCES_COUNT', 'VAL_UTTERANCES_COUNT',
    'TEST_UTTERANCES_COUNT', 'REST_UTTERANCES_COUNT', 'TOTAL_UTTERANCES_COUNT'
  ]
  print(utterances_count_df)

  utterances_percent_df = get_rel_occ_df_of_all_symbols(utterances_count_df)
  utterances_percent_df.columns = [
    'SPEAKER_NAME', 'TRAIN_UTTERANCES_PERCENT', 'VAL_UTTERANCES_PERCENT',
    'TEST_UTTERANCES_PERCENT', 'REST_UTTERANCES_PERCENT'
  ]
  print(utterances_percent_df)

  utterances_distribution_percent_df = get_dist_among_other_symbols_df_of_all_symbols(
    occs_df=utterances_count_df,
    data_trn=trn_speakers,
    data_val=val_speakers,
    data_tst=tst_speakers,
    data_rst=rst_speakers,
  )
  utterances_distribution_percent_df.columns = [
    'SPEAKER_NAME', 'TRAIN_UTTERANCES_DISTRIBUTION_PERCENT', 'VAL_UTTERANCES_DISTRIBUTION_PERCENT',
    'TEST_UTTERANCES_DISTRIBUTION_PERCENT', 'REST_UTTERANCES_DISTRIBUTION_PERCENT',
    'TOTAL_UTTERANCES_DISTRIBUTION_PERCENT'
  ]
  print(utterances_distribution_percent_df)

  return utterances_count_df, utterances_percent_df, utterances_distribution_percent_df
def test_split_prepared_data(self):
  data = PreparedDataList([
    self.get_dummy_prep_data(i=0),
    self.get_dummy_prep_data(i=1),
    self.get_dummy_prep_data(i=2),
    self.get_dummy_prep_data(i=3),
    self.get_dummy_prep_data(i=4),
    self.get_dummy_prep_data(i=5),
    self.get_dummy_prep_data(i=6),
    self.get_dummy_prep_data(i=7),
    self.get_dummy_prep_data(i=8),
    self.get_dummy_prep_data(i=9),
    self.get_dummy_prep_data(i=10),
    self.get_dummy_prep_data(i=11),
  ])

  train, test, val = split_prepared_data_train_test_val(
    data, test_size=1 / 6, validation_size=2 / 6, seed=0, shuffle=False)

  self.assertEqual(2, len(test))
  self.assertEqual(4, len(val))
  self.assertEqual(6, len(train))
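# Worked arithmetic for the split above (assuming the sizes are interpreted as
# fractions of the whole set): with 12 entries, test_size=1/6 yields 2 test
# entries and validation_size=2/6 yields 4 validation entries, leaving 6 for
# training; shuffle=False keeps the original order.
total = 12
n_test = int(total * 1 / 6)       # 2
n_val = int(total * 2 / 6)        # 4
n_train = total - n_test - n_val  # 6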
def get_total_set(trainset: PreparedDataList, valset: PreparedDataList, testset: PreparedDataList,
                  restset: PreparedDataList) -> PreparedDataList:
  total_set = PreparedDataList()
  total_set.extend(trainset)
  total_set.extend(testset)
  total_set.extend(valset)
  total_set.extend(restset)
  return total_set
def add_rest(existing_set: PreparedDataList, restset: PreparedDataList,
             symbols: SymbolIdDict) -> Tuple[PreparedDataList, PreparedDataList]:
  # moves everything from the rest set into the existing set
  new_set = existing_set
  new_restset = PreparedDataList()
  new_set.extend(restset)
  return new_set, new_restset
def add_n_divergent_random_seconds(existing_set: PreparedDataList, restset: PreparedDataList,
                                   symbols: SymbolIdDict, seed: int, seconds: float,
                                   n: int) -> List[Tuple[PreparedDataList, PreparedDataList]]:
  logger = getLogger(__name__)
  new_datasets: List[Tuple[PreparedDataList, PreparedDataList]] = []
  available_speaker_data = get_speaker_wise(restset)
  for speaker_id, speaker_available in available_speaker_data.items():
    speaker_available_dict = prep_data_list_to_dict_with_symbols(speaker_available, symbols)
    speaker_avail_durations_s = prep_data_list_to_dict_with_durations_s(speaker_available)
    selected_list_of_keys = n_divergent_random_seconds(
      n=n,
      seconds=seconds,
      durations_s=speaker_avail_durations_s,
      data=speaker_available_dict,
      seed=seed,
    )
    for i, selected_keys in enumerate(selected_list_of_keys):
      not_selected_keys = set(speaker_available_dict.keys()).difference(selected_keys)
      selected_data = select_entities_from_prep_data(selected_keys, speaker_available)
      not_selected_data = select_entities_from_prep_data(not_selected_keys, speaker_available)
      assert len(selected_data) + len(not_selected_data) == len(speaker_available)
      new_set = PreparedDataList(existing_set + selected_data)
      new_restset = PreparedDataList(not_selected_data)
      new_datasets.append((new_set, new_restset))
      logger.info(
        f"{i + 1}/{n}: Took {len(selected_data)}/{len(speaker_available)} utterances from speaker {speaker_id} "
        f"({selected_data.get_total_duration_s() / 60:.2f}min/{selected_data.get_total_duration_s() / 60 / 60:.2f}h).")
  return new_datasets
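# A simplified stand-in for n_divergent_random_seconds, shown only to convey
# the idea (the real algorithm may differ, e.g. in how it maximizes divergence
# between the subsets): draw n randomly ordered subsets, each filled until the
# summed duration reaches the requested budget. _sketch_n_random_seconds is
# hypothetical.
def _sketch_n_random_seconds(durations_s: Dict[int, float], seconds: float,
                             seed: int, n: int) -> List[Set[int]]:
  import random
  results: List[Set[int]] = []
  for i in range(n):
    rng = random.Random(seed + i)  # a different seed per subset -> divergent picks
    keys = list(durations_s.keys())
    rng.shuffle(keys)
    picked: Set[int] = set()
    total = 0.0
    for k in keys:
      if total + durations_s[k] > seconds:
        break
      picked.add(k)
      total += durations_s[k]
    results.append(picked)
  return results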
def select_entities_from_prep_data(keys: Set[int], select_from: PreparedDataList) -> PreparedDataList:
  # every requested key must exist among the entry ids of the source list
  assert keys.issubset(select_from.get_entry_ids())
  res = PreparedDataList(item for item in select_from.items() if item.entry_id in keys)
  return res
def dict_with_symbols_to_prep_data_list(d: Dict[int, List[int]], select_from: PreparedDataList) -> PreparedDataList:
  res = PreparedDataList(x for x in select_from.items() if x.entry_id in d)
  return res
def prep_data_list_to_dict_with_durations_s(l: PreparedDataList) -> OrderedDictType[int, float]:
  res = OrderedDict({x.entry_id: x.duration_s for x in l.items()})
  return res
def prep_data_list_to_dict_with_symbols(l: PreparedDataList, symbols: SymbolIdDict) -> OrderedDictType[int, List[str]]:
  res = OrderedDict({x.entry_id: symbols.get_symbols(x.serialized_symbol_ids) for x in l.items()})
  return res
def prep_data_list_to_dict_with_symbol_ids(l: PreparedDataList) -> OrderedDictType[int, List[int]]:
  res = OrderedDict({x.entry_id: deserialize_list(x.serialized_symbol_ids) for x in l.items()})
  return res
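# deserialize_list is assumed to parse a comma-separated id string such as
# "0,1,2" into [0, 1, 2]; a minimal equivalent for illustration (hypothetical
# helper, not part of this module):
def _sketch_deserialize_list(serialized: str) -> List[int]:
  return [int(x) for x in serialized.split(",")]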
def merge_prep_data_lists(l1: PreparedDataList, l2: PreparedDataList) -> PreparedDataList:
  res = PreparedDataList(l1)
  res.extend(l2)
  return res
def _get_ngram_stats_df_core(symbol_order: List[str], symbols: SymbolIdDict, trainset: PreparedDataList,
                             valset: PreparedDataList, testset: PreparedDataList, restset: PreparedDataList,
                             n: int, logger: Logger):
  logger.info(f"Get {n}-grams...")
  trn_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in trainset.items()]
  val_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in valset.items()]
  tst_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in testset.items()]
  rst_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in restset.items()]

  trn_ngrams = [get_ngrams(x, n=n) for x in trn_symbols]
  val_ngrams = [get_ngrams(x, n=n) for x in val_symbols]
  tst_ngrams = [get_ngrams(x, n=n) for x in tst_symbols]
  rst_ngrams = [get_ngrams(x, n=n) for x in rst_symbols]

  logger.info("Get stats...")
  occurrences_count_df = get_occ_df_of_all_symbols(
    symbols=symbol_order,
    data_trn=trn_ngrams,
    data_val=val_ngrams,
    data_tst=tst_ngrams,
    data_rst=rst_ngrams,
  )
  occurrences_count_df.columns = [
    FIRST_COL_NAME, 'TRAIN_OCCURRENCES_COUNT', 'VAL_OCCURRENCES_COUNT',
    'TEST_OCCURRENCES_COUNT', 'REST_OCCURRENCES_COUNT', 'TOTAL_OCCURRENCES_COUNT'
  ]
  print(occurrences_count_df)

  occurrences_percent_df = get_rel_occ_df_of_all_symbols(occurrences_count_df)
  occurrences_percent_df.columns = [
    FIRST_COL_NAME, 'TRAIN_OCCURRENCES_PERCENT', 'VAL_OCCURRENCES_PERCENT',
    'TEST_OCCURRENCES_PERCENT', 'REST_OCCURRENCES_PERCENT'
  ]
  print(occurrences_percent_df)

  occurrences_distribution_percent_df = get_dist_among_other_symbols_df_of_all_symbols(
    occs_df=occurrences_count_df,
    data_trn=trn_ngrams,
    data_val=val_ngrams,
    data_tst=tst_ngrams,
    data_rst=rst_ngrams,
  )
  occurrences_distribution_percent_df.columns = [
    FIRST_COL_NAME, 'TRAIN_OCCURRENCES_DISTRIBUTION_PERCENT', 'VAL_OCCURRENCES_DISTRIBUTION_PERCENT',
    'TEST_OCCURRENCES_DISTRIBUTION_PERCENT', 'REST_OCCURRENCES_DISTRIBUTION_PERCENT',
    'TOTAL_OCCURRENCES_DISTRIBUTION_PERCENT'
  ]
  print(occurrences_distribution_percent_df)

  utterance_occurrences_count_df = get_utter_occ_df_of_all_symbols(
    symbols=symbol_order,
    data_trn=trn_ngrams,
    data_val=val_ngrams,
    data_tst=tst_ngrams,
    data_rst=rst_ngrams,
  )
  utterance_occurrences_count_df.columns = [
    FIRST_COL_NAME, 'TRAIN_UTTERANCE_OCCURRENCES_COUNT', 'VAL_UTTERANCE_OCCURRENCES_COUNT',
    'TEST_UTTERANCE_OCCURRENCES_COUNT', 'REST_UTTERANCE_OCCURRENCES_COUNT',
    'TOTAL_UTTERANCE_OCCURRENCES_COUNT'
  ]
  print(utterance_occurrences_count_df)

  utterance_occurrences_percent_df = get_rel_utter_occ_df_of_all_symbols(utterance_occurrences_count_df)
  utterance_occurrences_percent_df.columns = [
    FIRST_COL_NAME, 'TRAIN_UTTERANCE_OCCURRENCES_PERCENT', 'VAL_UTTERANCE_OCCURRENCES_PERCENT',
    'TEST_UTTERANCE_OCCURRENCES_PERCENT', 'REST_UTTERANCE_OCCURRENCES_PERCENT'
  ]
  print(utterance_occurrences_percent_df)

  uniform_occurrences_count_df = get_uniform_distr_df_for_occs(
    symbols=symbol_order,
    occ_df=occurrences_count_df,
  )
  uniform_occurrences_count_df.columns = [
    FIRST_COL_NAME, 'TRAIN_UNIFORM_OCCURRENCES_COUNT', 'VAL_UNIFORM_OCCURRENCES_COUNT',
    'TEST_UNIFORM_OCCURRENCES_COUNT', 'REST_UNIFORM_OCCURRENCES_COUNT',
    'TOTAL_UNIFORM_OCCURRENCES_COUNT'
  ]
  print(uniform_occurrences_count_df)

  uniform_occurrences_percent_df = get_rel_uniform_distr_df_for_occs(symbols=symbol_order)
  uniform_occurrences_percent_df.columns = [FIRST_COL_NAME, 'UNIFORM_OCCURRENCES_PERCENT']
  print(uniform_occurrences_percent_df)

  return (occurrences_count_df, occurrences_percent_df, occurrences_distribution_percent_df,
          utterance_occurrences_count_df, utterance_occurrences_percent_df,
          uniform_occurrences_count_df, uniform_occurrences_percent_df)
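# get_ngrams is assumed to produce the overlapping n-grams of a symbol
# sequence, e.g. for n=2: ["a", "b", "c"] -> [("a", "b"), ("b", "c")].
# A minimal sketch of that assumed behavior (hypothetical helper):
def _sketch_get_ngrams(symbols: List[str], n: int) -> List[Tuple[str, ...]]:
  return [tuple(symbols[i:i + n]) for i in range(len(symbols) - n + 1)]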
def prepare_core(merge_data: MergedDataset) -> PreparedDataList:
  restset = PreparedDataList.init_from_merged_ds(merge_data)
  return restset