def update_symbols(data: MergedDataset, symbols: SymbolIdDict) -> SymbolIdDict:
  new_symbols: Set[str] = {
    x for y in data.items()
    for x in symbols.get_symbols(y.serialized_symbol_ids)
  }
  new_symbol_ids = SymbolIdDict.init_from_symbols_with_pad(
    new_symbols, pad_symbol=DEFAULT_PADDING_SYMBOL)
  if new_symbol_ids.get_all_symbols() != symbols.get_all_symbols():
    for entry in data.items():
      original_symbols = symbols.get_symbols(entry.serialized_symbol_ids)
      entry.serialized_symbol_ids = new_symbol_ids.get_serialized_ids(
        original_symbols)
  return new_symbol_ids
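# --- Illustrative sketch (not part of the module) ---
# update_symbols above rebuilds a compact symbol-id mapping after a dataset
# has been subset, so ids of symbols that are no longer used are reclaimed.
# The toy below shows the same remapping idea with plain dicts; every name in
# it is hypothetical.
def _remap_ids_sketch(entries, id_to_symbol):
  used = {id_to_symbol[i] for ids in entries for i in ids}
  new_symbol_to_id = {s: i for i, s in enumerate(sorted(used))}
  remapped = [[new_symbol_to_id[id_to_symbol[i]] for i in ids] for ids in entries]
  return remapped, new_symbol_to_id

# _remap_ids_sketch([[0, 2], [2, 3]], {0: "a", 1: "b", 2: "c", 3: "d"})
# -> ([[0, 1], [1, 2]], {'a': 0, 'c': 1, 'd': 2})  # "b" was unused and is dropped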
def sents_convert_to_ipa(sentences: SentenceList, text_symbols: SymbolIdDict, ignore_tones: bool, ignore_arcs: bool, mode: Optional[EngToIpaMode], consider_ipa_annotations: bool, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
  sents_new_symbols = []
  for sentence in sentences.items(True):
    if sentence.lang == Language.ENG and mode is None:
      msg = "Please specify the IPA conversion mode."
      logger.error(msg)
      raise ValueError(msg)
    new_symbols, new_accent_ids = symbols_to_ipa(
      symbols=text_symbols.get_symbols(sentence.serialized_symbols),
      lang=sentence.lang,
      accent_ids=deserialize_list(sentence.serialized_accents),
      ignore_arcs=ignore_arcs,
      ignore_tones=ignore_tones,
      mode=mode,
      replace_unknown_with=DEFAULT_PADDING_SYMBOL,
      consider_ipa_annotations=consider_ipa_annotations,
      logger=logger,
    )
    assert len(new_symbols) == len(new_accent_ids)
    sentence.lang = Language.IPA
    sentence.serialized_accents = serialize_list(new_accent_ids)
    sents_new_symbols.append(new_symbols)
    assert len(sentence.get_accent_ids()) == len(new_symbols)

  return update_symbols_and_text(sentences, sents_new_symbols)
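# --- Hedged sketch (assumption, not the actual text_utils implementation) ---
# serialize_list/deserialize_list are assumed to round-trip a list of int ids
# through a delimited string, roughly like this:
def _serialize_list_sketch(ids):
  return ",".join(str(i) for i in ids)

def _deserialize_list_sketch(serialized):
  return [int(x) for x in serialized.split(",")] if serialized else []

# _deserialize_list_sketch(_serialize_list_sketch([4, 0, 7]))  # -> [4, 0, 7]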
def get_formatted(self, symbol_id_dict: SymbolIdDict, accent_id_dict: AccentsDict, pairs_per_line=170, space_length=0):
  return get_formatted_core(
    sent_id=self.sent_id,
    symbols=symbol_id_dict.get_symbols(self.serialized_symbols),
    accent_ids=self.get_accent_ids(),
    accent_id_dict=accent_id_dict,
    space_length=space_length,
    max_pairs_per_line=pairs_per_line,
  )
def sents_map(sentences: SentenceList, text_symbols: SymbolIdDict, symbols_map: SymbolsMap, ignore_arcs: bool, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
  sents_new_symbols = []
  result = SentenceList()
  new_sent_id = 0

  ipa_settings = IPAExtractionSettings(
    ignore_tones=False,
    ignore_arcs=ignore_arcs,
    replace_unknown_ipa_by=DEFAULT_PADDING_SYMBOL,
  )

  for sentence in sentences.items():
    symbols = text_symbols.get_symbols(sentence.serialized_symbols)
    accent_ids = deserialize_list(sentence.serialized_accents)

    mapped_symbols = symbols_map.apply_to_symbols(symbols)
    text = SymbolIdDict.symbols_to_text(mapped_symbols)
    # An empty resulting text causes no problems here.
    sents = text_to_sentences(
      text=text,
      lang=sentence.lang,
      logger=logger,
    )

    for new_sent_text in sents:
      new_symbols = text_to_symbols(
        new_sent_text,
        lang=sentence.lang,
        ipa_settings=ipa_settings,
        logger=logger,
      )

      # Per-symbol accent alignment is lost after mapping and re-splitting,
      # so the first accent id is broadcast to all new symbols.
      if len(accent_ids) > 0:
        new_accent_ids = [accent_ids[0]] * len(new_symbols)
      else:
        new_accent_ids = []
      assert len(new_accent_ids) == len(new_symbols)

      new_sent_id += 1
      tmp = Sentence(
        sent_id=new_sent_id,
        text=new_sent_text,
        lang=sentence.lang,
        # Not strictly correct, but the closest value currently available.
        orig_lang=sentence.orig_lang,
        original_text=sentence.original_text,
        serialized_accents=serialize_list(new_accent_ids),
        serialized_symbols="",
      )
      sents_new_symbols.append(new_symbols)

      assert len(tmp.get_accent_ids()) == len(new_symbols)
      result.append(tmp)

  return update_symbols_and_text(result, sents_new_symbols)
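# --- Toy illustration (made-up values) of the accent broadcast in sents_map ---
# With new_symbols = ["h", "i", "!"] and accent_ids = [2, 2, 5], the expression
# [accent_ids[0]] * len(new_symbols) evaluates to [2, 2, 2]: the first accent
# of the source sentence is applied uniformly to each newly split sentence.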
def get_ngram_rarity(data: PreparedDataList, corpus: PreparedDataList, symbols: SymbolIdDict, ngram: int) -> OrderedDictType[int, float]:
  data_symbols_dict = OrderedDict({
    x.entry_id: symbols.get_symbols(x.serialized_symbol_ids)
    for x in data.items()
  })
  corpus_symbols_dict = OrderedDict({
    x.entry_id: symbols.get_symbols(x.serialized_symbol_ids)
    for x in corpus.items()
  })

  rarity = get_rarity_ngrams(
    data=data_symbols_dict,
    corpus=corpus_symbols_dict,
    n_gram=ngram,
    ignore_symbols=None,
  )
  return rarity
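# --- Illustrative sketch (hypothetical helper, not the real get_rarity_ngrams) ---
# The rarity idea: score each utterance by the corpus frequency of its
# n-grams; a lower mean frequency means a rarer utterance.
from collections import Counter

def _mean_ngram_frequency_sketch(utterance, corpus, n):
  def ngrams(symbols):
    return [tuple(symbols[i:i + n]) for i in range(len(symbols) - n + 1)]
  counts = Counter(g for u in corpus for g in ngrams(u))
  total = sum(counts.values())
  grams = ngrams(utterance)
  if not grams or total == 0:
    return 0.0
  return sum(counts[g] for g in grams) / (len(grams) * total)

# _mean_ngram_frequency_sketch(["a", "b"], [["a", "b", "c"], ["a", "b"]], 2)
# -> 0.666...  (the bigram ('a', 'b') accounts for 2 of the 3 corpus bigrams)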
@classmethod
def from_sentences(cls, sentences: SentenceList, accents: AccentsDict, symbols: SymbolIdDict):
  res = cls()
  for sentence in sentences.items():
    infer_sent = InferSentence(
      sent_id=sentence.sent_id,
      symbols=symbols.get_symbols(sentence.serialized_symbols),
      accents=accents.get_accents(sentence.serialized_accents),
      original_text=sentence.original_text,
    )
    assert len(infer_sent.symbols) == len(infer_sent.accents)
    res.append(infer_sent)
  return res
def sents_accent_template(sentences: SentenceList, text_symbols: SymbolIdDict, accent_ids: AccentsDict) -> AccentedSymbolList:
  res = AccentedSymbolList()
  for i, sent in enumerate(sentences.items()):
    symbols = text_symbols.get_symbols(sent.serialized_symbols)
    accents = accent_ids.get_accents(sent.serialized_accents)
    for j, (symbol, accent) in enumerate(zip(symbols, accents)):
      accented_symbol = AccentedSymbol(
        position=f"{i}-{j}",
        symbol=symbol,
        accent=accent,
      )
      res.append(accented_symbol)
  return res
def sents_normalize(sentences: SentenceList, text_symbols: SymbolIdDict, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
  # TODO: maybe log when unknown symbols were encountered.
  sents_new_symbols = []
  for sentence in sentences.items():
    new_symbols, new_accent_ids = symbols_normalize(
      symbols=text_symbols.get_symbols(sentence.serialized_symbols),
      lang=sentence.lang,
      accent_ids=deserialize_list(sentence.serialized_accents),
      logger=logger,
    )
    # TODO: check whether normalization produced new sentences and split them.
    sentence.serialized_accents = serialize_list(new_accent_ids)
    sents_new_symbols.append(new_symbols)

  return update_symbols_and_text(sentences, sents_new_symbols)
def get_ngram_stats_df(symbols: SymbolIdDict, trainset: PreparedDataList, valset: PreparedDataList, testset: PreparedDataList, restset: PreparedDataList, n: int, logger: Logger):
  total_set = get_total_set(trainset, valset, testset, restset)
  logger.info(f"Getting all {n}-gram stats...")
  tot_symbols = [
    symbols.get_symbols(x.serialized_symbol_ids)
    for x in total_set.items()
  ]
  tot_symbols_ngrams = [get_ngrams(x, n=n) for x in tot_symbols]
  symbol_order = list(sorted({x for y in tot_symbols_ngrams for x in y}))

  ngram_stats = _get_ngram_stats_df_core(
    symbol_order=symbol_order,
    symbols=symbols,
    trainset=trainset,
    valset=valset,
    testset=testset,
    restset=restset,
    n=n,
    logger=logger,
  )
  occurrences_count_df, occurrences_percent_df, occurrences_distribution_percent_df, utterance_occurrences_count_df, utterance_occurrences_percent_df, uniform_occurrences_count_df, uniform_occurrences_percent_df = ngram_stats

  symbol_dfs = [
    occurrences_count_df,
    occurrences_percent_df,
    occurrences_distribution_percent_df,
    utterance_occurrences_count_df,
    utterance_occurrences_percent_df,
    uniform_occurrences_count_df,
    uniform_occurrences_percent_df,
  ]

  # Drop the repeated key column from all but the first frame before joining.
  for i in range(1, len(symbol_dfs)):
    symbol_dfs[i] = symbol_dfs[i].loc[:, symbol_dfs[i].columns != FIRST_COL_NAME]

  symbol_stats = pd.concat(
    symbol_dfs,
    axis=1,
    join='inner',
  )

  # symbol_stats = symbol_stats.round(decimals=2)
  symbol_stats = symbol_stats.sort_values(by='TOTAL_OCCURRENCES_COUNT', ascending=False)
  print(symbol_stats)
  return symbol_stats
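# --- Minimal pandas sketch of the column-merging step in get_ngram_stats_df ---
# Each stats frame repeats the key column (FIRST_COL_NAME), so it is dropped
# from all but the first frame before the axis-1 inner join. Toy values only;
# this helper is not called anywhere in the module.
def _concat_stats_sketch():
  a = pd.DataFrame({"SYMBOL": ["a", "b"], "COUNT": [3, 1]})
  b = pd.DataFrame({"SYMBOL": ["a", "b"], "PERCENT": [75.0, 25.0]})
  merged = pd.concat([a, b.loc[:, b.columns != "SYMBOL"]], axis=1, join='inner')
  return merged  # columns: SYMBOL, COUNT, PERCENT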
def log_stats(data: MergedDataset, symbols: SymbolIdDict, accent_ids: AccentsDict, speakers: SpeakersDict, logger: Logger):
  logger.info(
    f"Speakers ({len(speakers)}): {', '.join(sorted(speakers.get_all_speakers()))}")
  logger.info(
    f"Symbols ({len(symbols)}): {' '.join(sorted(symbols.get_all_symbols()))}")
  logger.info(
    f"Accents ({len(accent_ids)}): {', '.join(sorted(accent_ids.get_all_accents()))}")
  logger.info(
    f"Entries ({len(data)}): {data.get_total_duration_s() / 60:.2f}m")
  symbol_counter = get_counter(
    [symbols.get_symbols(x.serialized_symbol_ids) for x in data.items()])
  logger.info(symbol_counter)
def prep_data_list_to_dict_with_symbols(l: PreparedDataList, symbols: SymbolIdDict) -> OrderedDictType[int, List[str]]:
  res = OrderedDict({
    x.entry_id: symbols.get_symbols(x.serialized_symbol_ids)
    for x in l.items()
  })
  return res
def get_formatted_v2(self, symbol_id_dict: SymbolIdDict):
  return get_formatted_core_v2(
    sent_id=self.sent_id,
    symbols=symbol_id_dict.get_symbols(self.serialized_symbols),
    original_text=self.original_text,
  )
def _get_ngram_stats_df_core(symbol_order: List[str], symbols: SymbolIdDict, trainset: PreparedDataList, valset: PreparedDataList, testset: PreparedDataList, restset: PreparedDataList, n: int, logger: Logger):
  logger.info(f"Get {n}-grams...")
  trn_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in trainset.items()]
  val_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in valset.items()]
  tst_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in testset.items()]
  rst_symbols = [symbols.get_symbols(x.serialized_symbol_ids) for x in restset.items()]

  trn_symbols_ngrams = [get_ngrams(x, n=n) for x in trn_symbols]
  val_symbols_ngrams = [get_ngrams(x, n=n) for x in val_symbols]
  tst_symbols_ngrams = [get_ngrams(x, n=n) for x in tst_symbols]
  rst_symbols_ngrams = [get_ngrams(x, n=n) for x in rst_symbols]

  logger.info("Get stats...")
  occurrences_count_df = get_occ_df_of_all_symbols(
    symbols=symbol_order,
    data_trn=trn_symbols_ngrams,
    data_val=val_symbols_ngrams,
    data_tst=tst_symbols_ngrams,
    data_rst=rst_symbols_ngrams,
  )
  occurrences_count_df.columns = [
    FIRST_COL_NAME,
    'TRAIN_OCCURRENCES_COUNT',
    'VAL_OCCURRENCES_COUNT',
    'TEST_OCCURRENCES_COUNT',
    'REST_OCCURRENCES_COUNT',
    'TOTAL_OCCURRENCES_COUNT',
  ]
  print(occurrences_count_df)

  occurrences_percent_df = get_rel_occ_df_of_all_symbols(occurrences_count_df)
  occurrences_percent_df.columns = [
    FIRST_COL_NAME,
    'TRAIN_OCCURRENCES_PERCENT',
    'VAL_OCCURRENCES_PERCENT',
    'TEST_OCCURRENCES_PERCENT',
    'REST_OCCURRENCES_PERCENT',
  ]
  print(occurrences_percent_df)

  occurrences_distribution_percent_df = get_dist_among_other_symbols_df_of_all_symbols(
    occs_df=occurrences_count_df,
    data_trn=trn_symbols_ngrams,
    data_val=val_symbols_ngrams,
    data_tst=tst_symbols_ngrams,
    data_rst=rst_symbols_ngrams,
  )
  occurrences_distribution_percent_df.columns = [
    FIRST_COL_NAME,
    'TRAIN_OCCURRENCES_DISTRIBUTION_PERCENT',
    'VAL_OCCURRENCES_DISTRIBUTION_PERCENT',
    'TEST_OCCURRENCES_DISTRIBUTION_PERCENT',
    'REST_OCCURRENCES_DISTRIBUTION_PERCENT',
    'TOTAL_OCCURRENCES_DISTRIBUTION_PERCENT',
  ]
  print(occurrences_distribution_percent_df)

  utterance_occurrences_count_df = get_utter_occ_df_of_all_symbols(
    symbols=symbol_order,
    data_trn=trn_symbols_ngrams,
    data_val=val_symbols_ngrams,
    data_tst=tst_symbols_ngrams,
    data_rst=rst_symbols_ngrams,
  )
  utterance_occurrences_count_df.columns = [
    FIRST_COL_NAME,
    'TRAIN_UTTERANCE_OCCURRENCES_COUNT',
    'VAL_UTTERANCE_OCCURRENCES_COUNT',
    'TEST_UTTERANCE_OCCURRENCES_COUNT',
    'REST_UTTERANCE_OCCURRENCES_COUNT',
    'TOTAL_UTTERANCE_OCCURRENCES_COUNT',
  ]
  print(utterance_occurrences_count_df)

  utterance_occurrences_percent_df = get_rel_utter_occ_df_of_all_symbols(
    utterance_occurrences_count_df)
  utterance_occurrences_percent_df.columns = [
    FIRST_COL_NAME,
    'TRAIN_UTTERANCE_OCCURRENCES_PERCENT',
    'VAL_UTTERANCE_OCCURRENCES_PERCENT',
    'TEST_UTTERANCE_OCCURRENCES_PERCENT',
    'REST_UTTERANCE_OCCURRENCES_PERCENT',
  ]
  print(utterance_occurrences_percent_df)

  uniform_occurrences_count_df = get_uniform_distr_df_for_occs(
    symbols=symbol_order,
    occ_df=occurrences_count_df,
  )
  uniform_occurrences_count_df.columns = [
    FIRST_COL_NAME,
    'TRAIN_UNIFORM_OCCURRENCES_COUNT',
    'VAL_UNIFORM_OCCURRENCES_COUNT',
    'TEST_UNIFORM_OCCURRENCES_COUNT',
    'REST_UNIFORM_OCCURRENCES_COUNT',
    'TOTAL_UNIFORM_OCCURRENCES_COUNT',
  ]
  print(uniform_occurrences_count_df)

  uniform_occurrences_percent_df = get_rel_uniform_distr_df_for_occs(
    symbols=symbol_order,
  )
  uniform_occurrences_percent_df.columns = [FIRST_COL_NAME, 'UNIFORM_OCCURRENCES_PERCENT']
  print(uniform_occurrences_percent_df)

  return occurrences_count_df, occurrences_percent_df, occurrences_distribution_percent_df, utterance_occurrences_count_df, utterance_occurrences_percent_df, uniform_occurrences_count_df, uniform_occurrences_percent_df
def int_set_to_symbols(symbol_ids: Optional[Set[int]], symbols: SymbolIdDict) -> Optional[Set[str]]:
  if symbol_ids is None:
    return None
  ignore_symbols = set(symbols.get_symbols(list(symbol_ids)))
  return ignore_symbols