Example #1
0
def filter_symbols(data: MergedDataset, symbols: SymbolIdDict,
                   accent_ids: AccentsDict, speakers: SpeakersDict,
                   allowed_symbol_ids: Set[int],
                   logger: Logger) -> MergedDatasetContainer:
    """Drop every utterance containing a symbol outside *allowed_symbol_ids*.

    Logs statistics before and after filtering and rebuilds the symbol,
    accent and speaker dictionaries from the surviving entries.
    """
    # TODO: maybe check all symbol ids are valid before
    kept_symbols = [symbols.get_symbol(sid) for sid in allowed_symbol_ids]
    dropped_symbols = [
        symbols.get_symbol(sid) for sid in symbols.get_all_symbol_ids()
        if sid not in allowed_symbol_ids
    ]
    logger.info(
        f"Keep utterances with these symbols: {' '.join(kept_symbols)}")
    logger.info(
        f"Remove utterances with these symbols: {' '.join(dropped_symbols)}"
    )
    logger.info("Statistics before filtering:")
    log_stats(data, symbols, accent_ids, speakers, logger)

    filtered = MergedDataset([
        entry for entry in data.items()
        if contains_only_allowed_symbols(
            deserialize_list(entry.serialized_symbol_ids), allowed_symbol_ids)
    ])
    if len(filtered) > 0:
        removed_count = len(data) - len(filtered)
        kept_ratio = len(filtered) / len(data) * 100
        logger.info(
            f"Removed {removed_count} from {len(data)} total entries and got {len(filtered)} entries ({kept_ratio:.2f}%)."
        )
    else:
        logger.info("Removed all utterances!")

    # Rebuild the id dictionaries so they cover only what survived filtering.
    new_symbol_ids = update_symbols(filtered, symbols)
    new_accent_ids = update_accents(filtered, accent_ids)
    new_speaker_ids = update_speakers(filtered, speakers)
    logger.info("Statistics after filtering:")
    log_stats(filtered, new_symbol_ids, new_accent_ids, new_speaker_ids, logger)

    return MergedDatasetContainer(
        name=None,
        data=filtered,
        accent_ids=new_accent_ids,
        speaker_ids=new_speaker_ids,
        symbol_ids=new_symbol_ids,
    )
Example #2
0
def sims_to_csv(sims: Dict[int, List[Tuple[int, float]]],
                symbols: SymbolIdDict) -> pd.DataFrame:
    lines = []
    assert len(sims) == len(symbols)
    for symbol_id, similarities in sims.items():
        sims = [f"{symbols.get_symbol(symbol_id)}", "<=>"]
        for other_symbol_id, similarity in similarities:
            sims.append(symbols.get_symbol(other_symbol_id))
            sims.append(f"{similarity:.2f}")
        lines.append(sims)
    df = pd.DataFrame(lines)
    return df
Example #3
0
def plot_embeddings(
        symbols: SymbolIdDict, emb: torch.Tensor,
        logger: Logger) -> Tuple[pd.DataFrame, go.Figure, go.Figure]:
    """Build a similarity table plus 2D and 3D figures for *emb*.

    Expects one embedding row per symbol (asserted below).
    """
    assert emb.shape[0] == len(symbols)

    logger.info(f"Emb size {emb.shape}")
    logger.info(f"Sym len {len(symbols)}")

    similarity_table = sims_to_csv(get_similarities(emb.numpy()), symbols)

    # Symbols in id order, used as point labels in both plots.
    ordered_symbols = [symbols.get_symbol(sym_id) for sym_id in range(len(symbols))]
    normed = norm2emb(emb)
    fig_2d = emb_plot_2d(normed, ordered_symbols)
    fig_3d = emb_plot_3d(normed, ordered_symbols)

    return similarity_table, fig_2d, fig_3d