コード例 #1
0
    def test_merge_prepared_data(self):
        """merge_prepared_data merges two datasets into one shared symbol-id space."""
        first = (
            PreparedDataList([
                PreparedData(0, 1, "", "", "", 0, "0,1,2", 0, 0, "", 0, ""),
            ]),
            SymbolIdDict({
                0: (0, "a"),
                1: (0, "b"),
                2: (0, "c"),
            }),
        )
        second = (
            PreparedDataList([
                PreparedData(0, 2, "", "", "", 0, "0,1,2", 0, 0, "", 0, ""),
            ]),
            SymbolIdDict({
                0: (0, "b"),
                1: (0, "a"),
                2: (0, "d"),
            }),
        )

        merged, id_dict = merge_prepared_data([first, second])

        # Two placeholder entries plus the union {a, b, c, d} of both symbol dicts.
        self.assertEqual(6, len(id_dict))
        expected_symbols = ["todo", "todo", "a", "b", "c", "d"]
        for symbol_id, expected in enumerate(expected_symbols):
            self.assertEqual(expected, id_dict.get_symbol(symbol_id))

        self.assertEqual(2, len(merged))
        self.assertEqual(0, merged[0].i)
        self.assertEqual(1, merged[1].i)
        self.assertEqual(1, merged[0].entry_id)
        self.assertEqual(2, merged[1].entry_id)
        # Serialized symbol ids must be remapped into the merged id space.
        self.assertEqual("2,3,4", merged[0].serialized_symbol_ids)
        self.assertEqual("3,2,5", merged[1].serialized_symbol_ids)
コード例 #2
0
def __add(existing_set: PreparedDataList, restset: PreparedDataList,
          symbols: SymbolIdDict,
          func: Callable[[OrderedDictType[int, List[str]], SymbolIdDict],
                         OrderedSet[int]],
          **kwargs) -> Tuple[PreparedDataList, PreparedDataList]:
    """Move utterances chosen by *func* from *restset* into *existing_set*.

    For every speaker present in *restset*, *func* selects a subset of that
    speaker's available entry ids; the selected entries are appended to the
    destination set and the remainder forms the new rest set.

    Returns a tuple ``(new_set, new_restset)``.
    """
    logger = getLogger(__name__)
    # NOTE(review): new_set aliases existing_set, so the caller's list is
    # mutated in place by the extend() below — confirm this is intended.
    new_set = existing_set
    new_restset = PreparedDataList()

    available_speaker_data = get_speaker_wise(restset)
    existing_speaker_data = get_speaker_wise(existing_set)

    # Selection is performed independently per speaker.
    for speaker_id, speaker_available in available_speaker_data.items():
        # Speakers with no entries in the destination set get an empty list.
        speaker_existing = existing_speaker_data[
            speaker_id] if speaker_id in existing_speaker_data else PreparedDataList(
            )

        speaker_existing_dict = prep_data_list_to_dict_with_symbols(
            speaker_existing, symbols)
        speaker_available_dict = prep_data_list_to_dict_with_symbols(
            speaker_available, symbols)

        # NOTE(review): func is invoked with keyword arguments, which does not
        # match the positional Callable annotation above — the annotation
        # looks stale; verify against the selection functions actually passed.
        selected_keys = func(speaker_available=speaker_available,
                             speaker_available_dict=speaker_available_dict,
                             speaker_existing=speaker_existing,
                             speaker_existing_dict=speaker_existing_dict,
                             **kwargs)

        not_selected_keys = set(
            speaker_available_dict.keys()).difference(selected_keys)
        selected_data = select_entities_from_prep_data(selected_keys,
                                                       speaker_available)
        not_selected_data = select_entities_from_prep_data(
            not_selected_keys, speaker_available)
        # Every available entry must land in exactly one of the two partitions.
        assert len(selected_data) + len(not_selected_data) == len(
            speaker_available)

        if len(selected_data) == 0:
            logger.warning(
                f"The part in the destination set for speaker with id {speaker_id} is empty! There exist a total of {len(speaker_available)} entries for that speaker."
            )

        if len(not_selected_data) == 0:
            logger.warning(
                f"The part in rest set for speaker with id {speaker_id} is empty! There exist a total of {len(speaker_available)} entries for that speaker."
            )

        new_set.extend(selected_data)
        new_restset.extend(not_selected_data)

        logger.info(
            f"Took {len(selected_data)}/{len(speaker_available)} utterances from speaker {speaker_id} ({selected_data.get_total_duration_s()/60:.2f}min/{selected_data.get_total_duration_s()/60/60:.2f}h)."
        )

    return new_set, new_restset
コード例 #3
0
    def test_sort_prep_data_list(self):
        """Sorting with _get_key_for_sorting orders entries by entry_id ascending."""
        data_list = PreparedDataList([
            self.get_dummy_prep_data(entry_id=entry_id)
            for entry_id in (2, 1, 3)
        ])

        data_list.sort(key=PreparedDataList._get_key_for_sorting, reverse=False)

        for position, expected_id in enumerate((1, 2, 3)):
            self.assertEqual(expected_id, data_list[position].entry_id)
コード例 #4
0
def _get_speaker_occ_stats(speaker_order: List[str], speakers: SpeakersDict,
                           trainset: PreparedDataList,
                           valset: PreparedDataList, testset: PreparedDataList,
                           restset: PreparedDataList):
    """Print and return per-speaker utterance statistics over the four splits.

    Returns a tuple of three dataframes: absolute counts, relative percentages
    and the distribution of each speaker's utterances across the splits.
    """

    def speaker_names(dataset: PreparedDataList) -> List[List[str]]:
        # One single-element list per utterance, as the occ helpers expect.
        return [[speakers.get_speaker(entry.speaker_id)]
                for entry in dataset.items()]

    trn_speakers = speaker_names(trainset)
    val_speakers = speaker_names(valset)
    tst_speakers = speaker_names(testset)
    rst_speakers = speaker_names(restset)

    # Absolute utterance counts per speaker and split.
    utterances_count_df = get_occ_df_of_all_symbols(
        symbols=speaker_order,
        data_trn=trn_speakers,
        data_val=val_speakers,
        data_tst=tst_speakers,
        data_rst=rst_speakers,
    )
    utterances_count_df.columns = [
        'SPEAKER_NAME', 'TRAIN_UTTERANCES_COUNT', 'VAL_UTTERANCES_COUNT',
        'TEST_UTTERANCES_COUNT', 'REST_UTTERANCES_COUNT',
        'TOTAL_UTTERANCES_COUNT'
    ]
    print(utterances_count_df)

    # Counts expressed as percentages.
    utterances_percent_df = get_rel_occ_df_of_all_symbols(utterances_count_df)
    utterances_percent_df.columns = [
        'SPEAKER_NAME', 'TRAIN_UTTERANCES_PERCENT', 'VAL_UTTERANCES_PERCENT',
        'TEST_UTTERANCES_PERCENT', 'REST_UTTERANCES_PERCENT'
    ]
    print(utterances_percent_df)

    # How each speaker's utterances are distributed across the splits.
    utterances_distribution_percent_df = get_dist_among_other_symbols_df_of_all_symbols(
        occs_df=utterances_count_df,
        data_trn=trn_speakers,
        data_val=val_speakers,
        data_tst=tst_speakers,
        data_rst=rst_speakers,
    )
    utterances_distribution_percent_df.columns = [
        'SPEAKER_NAME', 'TRAIN_UTTERANCES_DISTRIBUTION_PERCENT',
        'VAL_UTTERANCES_DISTRIBUTION_PERCENT',
        'TEST_UTTERANCES_DISTRIBUTION_PERCENT',
        'REST_UTTERANCES_DISTRIBUTION_PERCENT',
        'TOTAL_UTTERANCES_DISTRIBUTION_PERCENT'
    ]
    print(utterances_distribution_percent_df)

    return utterances_count_df, utterances_percent_df, utterances_distribution_percent_df
コード例 #5
0
    def test_split_prepared_data(self):
        """A 12-entry list splits into 1/6 test, 2/6 validation and the rest train."""
        data = PreparedDataList(
            [self.get_dummy_prep_data(i=index) for index in range(12)])

        train, test, val = split_prepared_data_train_test_val(
            data,
            test_size=1 / 6,
            validation_size=2 / 6,
            seed=0,
            shuffle=False)

        # 12 * 1/6 = 2, 12 * 2/6 = 4, remainder 6.
        self.assertEqual(2, len(test))
        self.assertEqual(4, len(val))
        self.assertEqual(6, len(train))
コード例 #6
0
def get_total_set(trainset: PreparedDataList, valset: PreparedDataList, testset: PreparedDataList, restset: PreparedDataList) -> PreparedDataList:
  """Concatenate all four partitions (train, test, val, rest order) into one list."""
  combined = [*trainset, *testset, *valset, *restset]
  return PreparedDataList(combined)
コード例 #7
0
def add_rest(
        existing_set: PreparedDataList, restset: PreparedDataList,
        symbols: SymbolIdDict) -> Tuple[PreparedDataList, PreparedDataList]:
    """Append every remaining utterance to *existing_set* (mutated in place).

    Returns the grown set together with a fresh, empty rest set. *symbols* is
    accepted only for signature compatibility with the other add_* strategies.
    """
    existing_set.extend(restset)
    return existing_set, PreparedDataList()
コード例 #8
0
def add_n_divergent_random_seconds(
        existing_set: PreparedDataList, restset: PreparedDataList,
        symbols: SymbolIdDict, seed: int, seconds: float,
        n: int) -> List[Tuple[PreparedDataList, PreparedDataList]]:
    """Build *n* alternative (set, rest) splits, each taking about *seconds*
    of randomly chosen audio per speaker from *restset*.

    Each returned tuple pairs a copy of *existing_set* extended with one
    random selection against the rest set containing everything not selected.
    """
    logger = getLogger(__name__)

    new_datasets: List[Tuple[PreparedDataList, PreparedDataList]] = []

    available_speaker_data = get_speaker_wise(restset)

    # NOTE(review): one dataset tuple is appended per (speaker, variant) pair,
    # so with more than one speaker in restset the result has len > n and each
    # tuple covers a single speaker only — confirm a single-speaker restset is
    # assumed here.
    for speaker_id, speaker_available in available_speaker_data.items():
        speaker_available_dict = prep_data_list_to_dict_with_symbols(
            speaker_available, symbols)
        speaker_avail_durations_s = prep_data_list_to_dict_with_durations_s(
            speaker_available)

        # n independent random selections of ~`seconds` total duration each.
        selected_list_of_keys = n_divergent_random_seconds(
            n=n,
            seconds=seconds,
            durations_s=speaker_avail_durations_s,
            data=speaker_available_dict,
            seed=seed,
        )

        for i, k in enumerate(selected_list_of_keys):
            not_selected_keys = set(
                speaker_available_dict.keys()).difference(k)
            selected_data = select_entities_from_prep_data(
                k, speaker_available)
            not_selected_data = select_entities_from_prep_data(
                not_selected_keys, speaker_available)
            # Every available entry must land in exactly one partition.
            assert len(selected_data) + len(not_selected_data) == len(
                speaker_available)

            # existing_set + selected_data builds a new list, so existing_set
            # itself is never mutated across variants.
            new_set = PreparedDataList(existing_set + selected_data)
            new_restset = PreparedDataList(not_selected_data)
            new_datasets.append((new_set, new_restset))

            logger.info(
                f"{i+1}/{n}: Took {len(selected_data)}/{len(speaker_available)} utterances from speaker {speaker_id} ({selected_data.get_total_duration_s()/60:.2f}min/{selected_data.get_total_duration_s()/60/60:.2f}h)."
            )

    return new_datasets
コード例 #9
0
def select_entities_from_prep_data(keys: Set[int], select_from: PreparedDataList) -> PreparedDataList:
  """Return the entries of *select_from* whose entry_id is in *keys*.

  Every key must refer to an existing entry id.
  """
  assert keys.issubset(select_from.get_entry_ids())
  selected = [entry for entry in select_from.items() if entry.entry_id in keys]
  return PreparedDataList(selected)
コード例 #10
0
def dict_with_symbols_to_prep_data_list(d: Dict[int, List[int]], select_from: PreparedDataList):
  """Return (as a plain list) the entries of *select_from* whose entry_id is a key of *d*."""
  return [entry for entry in select_from.items() if entry.entry_id in d]
コード例 #11
0
def prep_data_list_to_dict_with_durations_s(l: PreparedDataList) -> OrderedDictType[int, float]:
  """Map entry_id -> duration in seconds, preserving the list's order."""
  durations = OrderedDict()
  for entry in l.items():
    durations[entry.entry_id] = entry.duration_s
  return durations
コード例 #12
0
def prep_data_list_to_dict_with_symbols(l: PreparedDataList, symbols: SymbolIdDict) -> OrderedDictType[int, List[str]]:
  """Map entry_id -> the entry's symbol list (looked up via *symbols*), preserving order."""
  pairs = ((entry.entry_id, symbols.get_symbols(entry.serialized_symbol_ids))
           for entry in l.items())
  return OrderedDict(pairs)
コード例 #13
0
def prep_data_list_to_dict_with_symbol_ids(l: PreparedDataList) -> OrderedDictType[int, List[int]]:
  """Map entry_id -> deserialized symbol-id list, preserving the list's order."""
  id_lists = OrderedDict()
  for entry in l.items():
    id_lists[entry.entry_id] = deserialize_list(entry.serialized_symbol_ids)
  return id_lists
コード例 #14
0
def merge_prep_data_lists(l1: PreparedDataList, l2: PreparedDataList) -> PreparedDataList:
  """Return a new list with l1's entries followed by l2's; neither input is modified."""
  combined = list(l1)
  combined.extend(l2)
  return PreparedDataList(combined)
コード例 #15
0
def _get_ngram_stats_df_core(symbol_order: List[str], symbols: SymbolIdDict,
                             trainset: PreparedDataList,
                             valset: PreparedDataList,
                             testset: PreparedDataList,
                             restset: PreparedDataList, n: int,
                             logger: Logger):
    """Compute and print n-gram occurrence statistics over the four splits.

    Returns seven dataframes: absolute occurrence counts, relative
    percentages, per-symbol distribution across splits, utterance-level
    occurrence counts and percentages, and the uniform-distribution
    counts and percentages used as a comparison baseline.
    """
    logger.info(f"Get {n}-grams...")
    # Resolve each utterance's serialized symbol ids back to symbol strings.
    trn_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in trainset.items()
    ]
    val_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in valset.items()
    ]
    tst_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in testset.items()
    ]
    rst_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in restset.items()
    ]

    # Despite the "_one_gram" names these hold n-grams for the given n.
    trn_symbols_one_gram = [get_ngrams(x, n=n) for x in trn_symbols]
    val_symbols_one_gram = [get_ngrams(x, n=n) for x in val_symbols]
    tst_symbols_one_gram = [get_ngrams(x, n=n) for x in tst_symbols]
    rst_symbols_one_gram = [get_ngrams(x, n=n) for x in rst_symbols]
    logger.info("Get stats...")

    # Absolute occurrence counts per symbol and split.
    occurences_count_df = get_occ_df_of_all_symbols(
        symbols=symbol_order,
        data_trn=trn_symbols_one_gram,
        data_val=val_symbols_one_gram,
        data_tst=tst_symbols_one_gram,
        data_rst=rst_symbols_one_gram,
    )
    occurences_count_df.columns = [
        FIRST_COL_NAME, 'TRAIN_OCCURRENCES_COUNT', 'VAL_OCCURRENCES_COUNT',
        'TEST_OCCURRENCES_COUNT', 'REST_OCCURRENCES_COUNT',
        'TOTAL_OCCURRENCES_COUNT'
    ]
    print(occurences_count_df)

    # Counts expressed as percentages.
    occurrences_percent_df = get_rel_occ_df_of_all_symbols(occurences_count_df)
    occurrences_percent_df.columns = [
        FIRST_COL_NAME, 'TRAIN_OCCURRENCES_PERCENT', 'VAL_OCCURRENCES_PERCENT',
        'TEST_OCCURRENCES_PERCENT', 'REST_OCCURRENCES_PERCENT'
    ]
    print(occurrences_percent_df)

    # How each symbol's occurrences distribute across the splits.
    occurrences_distribution_percent_df = get_dist_among_other_symbols_df_of_all_symbols(
        occs_df=occurences_count_df,
        data_trn=trn_symbols_one_gram,
        data_val=val_symbols_one_gram,
        data_tst=tst_symbols_one_gram,
        data_rst=rst_symbols_one_gram,
    )
    occurrences_distribution_percent_df.columns = [
        FIRST_COL_NAME, 'TRAIN_OCCURRENCES_DISTRIBUTION_PERCENT',
        'VAL_OCCURRENCES_DISTRIBUTION_PERCENT',
        'TEST_OCCURRENCES_DISTRIBUTION_PERCENT',
        'REST_OCCURRENCES_DISTRIBUTION_PERCENT',
        'TOTAL_OCCURRENCES_DISTRIBUTION_PERCENT'
    ]
    print(occurrences_distribution_percent_df)

    # Utterance-level counts: in how many utterances each symbol appears.
    utterance_occurrences_count_df = get_utter_occ_df_of_all_symbols(
        symbols=symbol_order,
        data_trn=trn_symbols_one_gram,
        data_val=val_symbols_one_gram,
        data_tst=tst_symbols_one_gram,
        data_rst=rst_symbols_one_gram,
    )
    utterance_occurrences_count_df.columns = [
        FIRST_COL_NAME, 'TRAIN_UTTERANCE_OCCURRENCES_COUNT',
        'VAL_UTTERANCE_OCCURRENCES_COUNT', 'TEST_UTTERANCE_OCCURRENCES_COUNT',
        'REST_UTTERANCE_OCCURRENCES_COUNT', 'TOTAL_UTTERANCE_OCCURRENCES_COUNT'
    ]
    print(utterance_occurrences_count_df)

    utterance_occurrences_percent_df = get_rel_utter_occ_df_of_all_symbols(
        utterance_occurrences_count_df)
    utterance_occurrences_percent_df.columns = [
        FIRST_COL_NAME, 'TRAIN_UTTERANCE_OCCURRENCES_PERCENT',
        'VAL_UTTERANCE_OCCURRENCES_PERCENT',
        'TEST_UTTERANCE_OCCURRENCES_PERCENT',
        'REST_UTTERANCE_OCCURRENCES_PERCENT'
    ]
    print(utterance_occurrences_percent_df)

    # Uniform-distribution baseline for comparison with the observed counts.
    uniform_occurrences_count_df = get_uniform_distr_df_for_occs(
        symbols=symbol_order,
        occ_df=occurences_count_df,
    )
    uniform_occurrences_count_df.columns = [
        FIRST_COL_NAME, 'TRAIN_UNIFORM_OCCURRENCES_COUNT',
        'VAL_UNIFORM_OCCURRENCES_COUNT', 'TEST_UNIFORM_OCCURRENCES_COUNT',
        'REST_UNIFORM_OCCURRENCES_COUNT', 'TOTAL_UNIFORM_OCCURRENCES_COUNT'
    ]
    print(uniform_occurrences_count_df)

    # NOTE(review): unlike the count variant above, no occurrence dataframe
    # is passed here — confirm the helper really needs only the symbol list.
    uniform_occurrences_percent_df = get_rel_uniform_distr_df_for_occs(
        symbols=symbol_order, )
    uniform_occurrences_percent_df.columns = [
        FIRST_COL_NAME, 'UNIFORM_OCCURRENCES_PERCENT'
    ]
    print(uniform_occurrences_percent_df)

    return occurences_count_df, occurrences_percent_df, occurrences_distribution_percent_df, utterance_occurrences_count_df, utterance_occurrences_percent_df, uniform_occurrences_count_df, uniform_occurrences_percent_df
コード例 #16
0
def prepare_core(merge_data: MergedDataset) -> PreparedDataList:
    """Wrap the merged dataset into a PreparedDataList; every entry starts in the rest set."""
    return PreparedDataList.init_from_merged_ds(merge_data)