Example #1
def update_symbols(data: MergedDataset, symbols: SymbolIdDict) -> SymbolIdDict:
    new_symbols: Set[str] = {
        x
        for y in data.items()
        for x in symbols.get_symbols(y.serialized_symbol_ids)
    }
    new_symbol_ids = SymbolIdDict.init_from_symbols_with_pad(
        new_symbols, pad_symbol=DEFAULT_PADDING_SYMBOL)
    if new_symbol_ids.get_all_symbols() != symbols.get_all_symbols():
        for entry in data.items():
            original_symbols = symbols.get_symbols(entry.serialized_symbol_ids)
            entry.serialized_symbol_ids = new_symbol_ids.get_serialized_ids(
                original_symbols)
    return new_symbol_ids
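
A minimal usage sketch. The filtering criterion, the MergedDataset constructor, and the speaker_id field are assumptions for illustration, not part of the example above:

# Hypothetical: after filtering a dataset, rebuild the symbol dict so that
# only symbols which still occur keep an id.
filtered = MergedDataset([e for e in data.items() if e.speaker_id == 0])  # assumed constructor/field
new_symbol_ids = update_symbols(filtered, symbols)
# entries in `filtered` now carry serialized ids valid under `new_symbol_ids`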
Example #2
def sents_convert_to_ipa(sentences: SentenceList, text_symbols: SymbolIdDict,
                         ignore_tones: bool, ignore_arcs: bool,
                         mode: Optional[EngToIpaMode],
                         consider_ipa_annotations: bool,
                         logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:

  sents_new_symbols = []
  for sentence in sentences.items(True):
    if sentence.lang == Language.ENG and mode is None:
      msg = "Please specify the IPA conversion mode for English sentences."
      logger.error(msg)
      raise ValueError(msg)
    new_symbols, new_accent_ids = symbols_to_ipa(
      symbols=text_symbols.get_symbols(sentence.serialized_symbols),
      lang=sentence.lang,
      accent_ids=deserialize_list(sentence.serialized_accents),
      ignore_arcs=ignore_arcs,
      ignore_tones=ignore_tones,
      mode=mode,
      replace_unknown_with=DEFAULT_PADDING_SYMBOL,
      consider_ipa_annotations=consider_ipa_annotations,
      logger=logger,
    )
    assert len(new_symbols) == len(new_accent_ids)
    sentence.lang = Language.IPA
    sentence.serialized_accents = serialize_list(new_accent_ids)
    sents_new_symbols.append(new_symbols)
    assert len(sentence.get_accent_ids()) == len(new_symbols)

  return update_symbols_and_text(sentences, sents_new_symbols)
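
A hedged call sketch; the EngToIpaMode member name is an assumption:

import logging
logger = logging.getLogger("ipa")
# Hypothetical: convert a sentence list to IPA, dropping tones and arcs.
new_symbol_ids, sentences = sents_convert_to_ipa(
  sentences=sentences,
  text_symbols=text_symbols,
  ignore_tones=True,
  ignore_arcs=True,
  mode=EngToIpaMode.EPITRAN,  # assumption: an existing enum member
  consider_ipa_annotations=False,
  logger=logger,
)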
Example #3
def get_formatted(self, symbol_id_dict: SymbolIdDict, accent_id_dict: AccentsDict,
                  pairs_per_line: int = 170, space_length: int = 0):
  return get_formatted_core(
    sent_id=self.sent_id,
    symbols=symbol_id_dict.get_symbols(self.serialized_symbols),
    accent_ids=self.get_accent_ids(),
    accent_id_dict=accent_id_dict,
    space_length=space_length,
    max_pairs_per_line=pairs_per_line
  )
Example #4
def sents_map(sentences: SentenceList, text_symbols: SymbolIdDict,
              symbols_map: SymbolsMap, ignore_arcs: bool,
              logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
  sents_new_symbols = []
  result = SentenceList()
  new_sent_id = 0

  ipa_settings = IPAExtractionSettings(
    ignore_tones=False,
    ignore_arcs=ignore_arcs,
    replace_unknown_ipa_by=DEFAULT_PADDING_SYMBOL,
  )

  for sentence in sentences.items():
    symbols = text_symbols.get_symbols(sentence.serialized_symbols)
    accent_ids = deserialize_list(sentence.serialized_accents)

    mapped_symbols = symbols_map.apply_to_symbols(symbols)

    text = SymbolIdDict.symbols_to_text(mapped_symbols)
    # an empty resulting text is not a problem here
    sents = text_to_sentences(
      text=text,
      lang=sentence.lang,
      logger=logger,
    )

    for new_sent_text in sents:
      new_symbols = text_to_symbols(
        new_sent_text,
        lang=sentence.lang,
        ipa_settings=ipa_settings,
        logger=logger,
      )

      # carry the sentence's first accent id over to every new symbol
      if len(accent_ids) > 0:
        new_accent_ids = [accent_ids[0]] * len(new_symbols)
      else:
        new_accent_ids = []

      assert len(new_accent_ids) == len(new_symbols)

      new_sent_id += 1
      new_sentence = Sentence(
        sent_id=new_sent_id,
        text=new_sent_text,
        lang=sentence.lang,
        orig_lang=sentence.orig_lang,
        # not strictly correct, but the closest approximation currently possible
        original_text=sentence.original_text,
        serialized_accents=serialize_list(new_accent_ids),
        serialized_symbols=""
      )
      sents_new_symbols.append(new_symbols)

      assert len(new_sentence.get_accent_ids()) == len(new_symbols)
      result.append(new_sentence)

  return update_symbols_and_text(result, sents_new_symbols)
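
A usage sketch; the mapping payload and the SymbolsMap constructor are assumptions:

# Hypothetical: replace symbols the model was not trained on before inference.
symbols_map = SymbolsMap({"ä": "a", "ö": "o"})  # assumed dict-like constructor
new_symbol_ids, mapped = sents_map(
  sentences=sentences,
  text_symbols=text_symbols,
  symbols_map=symbols_map,
  ignore_arcs=True,
  logger=logger,
)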
Example #5
def get_ngram_rarity(data: PreparedDataList, corpus: PreparedDataList,
                     symbols: SymbolIdDict,
                     ngram: int) -> OrderedDictType[int, float]:
    data_symbols_dict = OrderedDict({
        x.entry_id: symbols.get_symbols(x.serialized_symbol_ids)
        for x in data.items()
    })
    corpus_symbols_dict = OrderedDict({
        x.entry_id: symbols.get_symbols(x.serialized_symbol_ids)
        for x in corpus.items()
    })

    rarity = get_rarity_ngrams(
        data=data_symbols_dict,
        corpus=corpus_symbols_dict,
        n_gram=ngram,
        ignore_symbols=None,
    )

    return rarity
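
A short sketch of how the rarity scores might be consumed; the sorting step is illustrative, not from the source:

# Hypothetical: rank validation utterances by trigram rarity w.r.t. the train set.
rarity = get_ngram_rarity(data=valset, corpus=trainset, symbols=symbols, ngram=3)
rarest_first = sorted(rarity.items(), key=lambda kv: kv[1], reverse=True)
print(rarest_first[:10])  # the ten entries with the rarest trigrams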
Example #6
@classmethod
def from_sentences(cls, sentences: SentenceList, accents: AccentsDict, symbols: SymbolIdDict):
  res = cls()
  for sentence in sentences.items():
    infer_sent = InferSentence(
      sent_id=sentence.sent_id,
      symbols=symbols.get_symbols(sentence.serialized_symbols),
      accents=accents.get_accents(sentence.serialized_accents),
      original_text=sentence.original_text,
    )
    assert len(infer_sent.symbols) == len(infer_sent.accents)
    res.append(infer_sent)
  return res
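
A sketch of the intended call site; the class name InferSentenceList is an assumption based on the method's use of cls:

# Hypothetical: build inference sentences from a parsed sentence list.
infer_sents = InferSentenceList.from_sentences(sentences, accents, symbols)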
Example #7
def sents_accent_template(sentences: SentenceList, text_symbols: SymbolIdDict, accent_ids: AccentsDict) -> AccentedSymbolList:
  res = AccentedSymbolList()
  for i, sent in enumerate(sentences.items()):
    symbols = text_symbols.get_symbols(sent.serialized_symbols)
    accents = accent_ids.get_accents(sent.serialized_accents)
    for j, (symbol, accent) in enumerate(zip(symbols, accents)):
      accented_symbol = AccentedSymbol(
        position=f"{i}-{j}",
        symbol=symbol,
        accent=accent
      )
      res.append(accented_symbol)
  return res
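
A consumption sketch; iterating the result directly is an assumption about AccentedSymbolList:

# Hypothetical: print the template so accents can be adjusted by hand.
template = sents_accent_template(sentences, text_symbols, accent_ids)
for accented in template:  # assumed iterable of AccentedSymbol
  print(accented.position, accented.symbol, accented.accent)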
Example #8
def sents_normalize(sentences: SentenceList, text_symbols: SymbolIdDict, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
  # TODO: maybe log a hint when an unknown symbol is encountered
  sents_new_symbols = []
  for sentence in sentences.items():
    new_symbols, new_accent_ids = symbols_normalize(
      symbols=text_symbols.get_symbols(sentence.serialized_symbols),
      lang=sentence.lang,
      accent_ids=deserialize_list(sentence.serialized_accents),
      logger=logger,
    )
    # TODO: check if new sentences resulted and then split them.
    sentence.serialized_accents = serialize_list(new_accent_ids)
    sents_new_symbols.append(new_symbols)

  return update_symbols_and_text(sentences, sents_new_symbols)
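
A hedged call sketch, mirroring the IPA conversion above:

# Hypothetical: normalize all sentences in place and refresh the symbol dict.
new_symbol_ids, sentences = sents_normalize(sentences, text_symbols, logger)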
Example #9
def get_ngram_stats_df(symbols: SymbolIdDict, trainset: PreparedDataList,
                       valset: PreparedDataList, testset: PreparedDataList,
                       restset: PreparedDataList, n: int, logger: Logger):
    total_set = get_total_set(trainset, valset, testset, restset)
    logger.info(f"Getting all {n}-gram stats...")
    tot_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids)
        for x in total_set.items()
    ]
    tot_symbols_ngrams = [get_ngrams(x, n=n) for x in tot_symbols]
    symbol_order = list(sorted({x for y in tot_symbols_ngrams for x in y}))

    ngram_stats = _get_ngram_stats_df_core(
        symbol_order=symbol_order,
        symbols=symbols,
        trainset=trainset,
        valset=valset,
        testset=testset,
        restset=restset,
        n=n,
        logger=logger,
    )
    (occurrences_count_df, occurrences_percent_df,
     occurrences_distribution_percent_df, utterance_occurrences_count_df,
     utterance_occurrences_percent_df, uniform_occurrences_count_df,
     uniform_occurrences_percent_df) = ngram_stats

    symbol_dfs = [
        occurrences_count_df,
        occurrences_percent_df,
        occurrences_distribution_percent_df,
        utterance_occurrences_count_df,
        utterance_occurrences_percent_df,
        uniform_occurrences_count_df,
        uniform_occurrences_percent_df,
    ]

    # drop the duplicated symbol column from all but the first frame
    for i in range(1, len(symbol_dfs)):
        symbol_dfs[i] = symbol_dfs[i].loc[:, symbol_dfs[i].columns != FIRST_COL_NAME]

    symbol_stats = pd.concat(
        symbol_dfs,
        axis=1,
        join='inner',
    )

    # symbol_stats = symbol_stats.round(decimals=2)
    symbol_stats = symbol_stats.sort_values(by='TOTAL_OCCURRENCES_COUNT',
                                            ascending=False)
    print(symbol_stats)
    return symbol_stats
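
A call sketch; writing the result to CSV is illustrative (the function returns a pandas DataFrame built via pd.concat):

# Hypothetical: unigram statistics over the four standard splits.
stats = get_ngram_stats_df(symbols, trainset, valset, testset, restset, n=1, logger=logger)
stats.to_csv("symbol_stats_1gram.csv", index=False)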
Example #10
def log_stats(data: MergedDataset, symbols: SymbolIdDict,
              accent_ids: AccentsDict, speakers: SpeakersDict, logger: Logger):
    logger.info(
        f"Speakers ({len(speakers)}): {', '.join(sorted(speakers.get_all_speakers()))}"
    )
    logger.info(
        f"Symbols ({len(symbols)}): {' '.join(sorted(symbols.get_all_symbols()))}"
    )
    logger.info(
        f"Accents ({len(accent_ids)}): {', '.join(sorted(accent_ids.get_all_accents()))}"
    )
    logger.info(
        f"Entries ({len(data)}): {data.get_total_duration_s()/60:.2f}m")
    symbol_counter = get_counter(
        [symbols.get_symbols(x.serialized_symbol_ids) for x in data.items()])
    logger.info(symbol_counter)
Example #11
def prep_data_list_to_dict_with_symbols(
    l: PreparedDataList, symbols: SymbolIdDict) -> OrderedDictType[int, List[str]]:
  res = OrderedDict({
    x.entry_id: symbols.get_symbols(x.serialized_symbol_ids)
    for x in l.items()
  })
  return res
Example #12
def get_formatted_v2(self, symbol_id_dict: SymbolIdDict):
  return get_formatted_core_v2(
    sent_id=self.sent_id,
    symbols=symbol_id_dict.get_symbols(self.serialized_symbols),
    original_text=self.original_text,
  )
Example #13
def _get_ngram_stats_df_core(symbol_order: List[str], symbols: SymbolIdDict,
                             trainset: PreparedDataList,
                             valset: PreparedDataList,
                             testset: PreparedDataList,
                             restset: PreparedDataList, n: int,
                             logger: Logger):
    logger.info(f"Get {n}-grams...")
    trn_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in trainset.items()
    ]
    val_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in valset.items()
    ]
    tst_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in testset.items()
    ]
    rst_symbols = [
        symbols.get_symbols(x.serialized_symbol_ids) for x in restset.items()
    ]

    trn_symbols_ngrams = [get_ngrams(x, n=n) for x in trn_symbols]
    val_symbols_ngrams = [get_ngrams(x, n=n) for x in val_symbols]
    tst_symbols_ngrams = [get_ngrams(x, n=n) for x in tst_symbols]
    rst_symbols_ngrams = [get_ngrams(x, n=n) for x in rst_symbols]
    logger.info("Get stats...")

    occurrences_count_df = get_occ_df_of_all_symbols(
        symbols=symbol_order,
        data_trn=trn_symbols_ngrams,
        data_val=val_symbols_ngrams,
        data_tst=tst_symbols_ngrams,
        data_rst=rst_symbols_ngrams,
    )
    occurrences_count_df.columns = [
        FIRST_COL_NAME, 'TRAIN_OCCURRENCES_COUNT', 'VAL_OCCURRENCES_COUNT',
        'TEST_OCCURRENCES_COUNT', 'REST_OCCURRENCES_COUNT',
        'TOTAL_OCCURRENCES_COUNT'
    ]
    print(occurrences_count_df)

    occurrences_percent_df = get_rel_occ_df_of_all_symbols(occurrences_count_df)
    occurrences_percent_df.columns = [
        FIRST_COL_NAME, 'TRAIN_OCCURRENCES_PERCENT', 'VAL_OCCURRENCES_PERCENT',
        'TEST_OCCURRENCES_PERCENT', 'REST_OCCURRENCES_PERCENT'
    ]
    print(occurrences_percent_df)

    occurrences_distribution_percent_df = get_dist_among_other_symbols_df_of_all_symbols(
        occs_df=occurrences_count_df,
        data_trn=trn_symbols_ngrams,
        data_val=val_symbols_ngrams,
        data_tst=tst_symbols_ngrams,
        data_rst=rst_symbols_ngrams,
    )
    occurrences_distribution_percent_df.columns = [
        FIRST_COL_NAME, 'TRAIN_OCCURRENCES_DISTRIBUTION_PERCENT',
        'VAL_OCCURRENCES_DISTRIBUTION_PERCENT',
        'TEST_OCCURRENCES_DISTRIBUTION_PERCENT',
        'REST_OCCURRENCES_DISTRIBUTION_PERCENT',
        'TOTAL_OCCURRENCES_DISTRIBUTION_PERCENT'
    ]
    print(occurrences_distribution_percent_df)

    utterance_occurrences_count_df = get_utter_occ_df_of_all_symbols(
        symbols=symbol_order,
        data_trn=trn_symbols_ngrams,
        data_val=val_symbols_ngrams,
        data_tst=tst_symbols_ngrams,
        data_rst=rst_symbols_ngrams,
    )
    utterance_occurrences_count_df.columns = [
        FIRST_COL_NAME, 'TRAIN_UTTERANCE_OCCURRENCES_COUNT',
        'VAL_UTTERANCE_OCCURRENCES_COUNT', 'TEST_UTTERANCE_OCCURRENCES_COUNT',
        'REST_UTTERANCE_OCCURRENCES_COUNT', 'TOTAL_UTTERANCE_OCCURRENCES_COUNT'
    ]
    print(utterance_occurrences_count_df)

    utterance_occurrences_percent_df = get_rel_utter_occ_df_of_all_symbols(
        utterance_occurrences_count_df)
    utterance_occurrences_percent_df.columns = [
        FIRST_COL_NAME, 'TRAIN_UTTERANCE_OCCURRENCES_PERCENT',
        'VAL_UTTERANCE_OCCURRENCES_PERCENT',
        'TEST_UTTERANCE_OCCURRENCES_PERCENT',
        'REST_UTTERANCE_OCCURRENCES_PERCENT'
    ]
    print(utterance_occurrences_percent_df)

    uniform_occurrences_count_df = get_uniform_distr_df_for_occs(
        symbols=symbol_order,
        occ_df=occurrences_count_df,
    )
    uniform_occurrences_count_df.columns = [
        FIRST_COL_NAME, 'TRAIN_UNIFORM_OCCURRENCES_COUNT',
        'VAL_UNIFORM_OCCURRENCES_COUNT', 'TEST_UNIFORM_OCCURRENCES_COUNT',
        'REST_UNIFORM_OCCURRENCES_COUNT', 'TOTAL_UNIFORM_OCCURRENCES_COUNT'
    ]
    print(uniform_occurrences_count_df)

    uniform_occurrences_percent_df = get_rel_uniform_distr_df_for_occs(
        symbols=symbol_order,
    )
    uniform_occurrences_percent_df.columns = [
        FIRST_COL_NAME, 'UNIFORM_OCCURRENCES_PERCENT'
    ]
    print(uniform_occurrences_percent_df)

    return (occurrences_count_df, occurrences_percent_df,
            occurrences_distribution_percent_df, utterance_occurrences_count_df,
            utterance_occurrences_percent_df, uniform_occurrences_count_df,
            uniform_occurrences_percent_df)
Example #14
def int_set_to_symbols(symbol_ids: Optional[Set[int]],
                       symbols: SymbolIdDict) -> Optional[Set[str]]:
    if symbol_ids is None:
        return None
    ignore_symbols = set(symbols.get_symbols(list(symbol_ids)))
    return ignore_symbols
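
A tiny sketch; treating ids 0 and 1 as the ones to ignore is purely an assumption:

# Hypothetical: resolve ids that should be ignored into symbol strings.
ignore_symbols = int_set_to_symbols({0, 1}, symbols)
assert int_set_to_symbols(None, symbols) is None  # None is passed through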