def update_symbols(data: MergedDataset, symbols: SymbolIdDict) -> SymbolIdDict:
    """Build a symbol dictionary restricted to the symbols used in ``data``.

    Every entry of ``data`` is decoded with ``symbols`` and the decoded
    symbols are collected into one set. A new ``SymbolIdDict`` is created
    from that set with ``DEFAULT_PADDING_SYMBOL`` passed as the pad symbol.
    If the new dictionary's symbol set differs from that of ``symbols``,
    each entry's ``serialized_symbol_ids`` is re-encoded in place with the
    new dictionary.

    Args:
        data: Dataset whose entries expose ``serialized_symbol_ids``;
            entries may be mutated.
        symbols: Dictionary the entries are currently encoded with.

    Returns:
        The newly created dictionary (returned even when no entry had to
        be re-encoded).
    """
    # Pass 1: gather every symbol that any entry actually references.
    used: Set[str] = set()
    for item in data.items():
        used.update(symbols.get_symbols(item.serialized_symbol_ids))

    compact_ids = SymbolIdDict.init_from_symbols_with_pad(
        used, pad_symbol=DEFAULT_PADDING_SYMBOL)

    # NOTE(review): entries are left untouched when the symbol sets match,
    # which presumes both dictionaries then assign identical ids -- confirm.
    needs_remap = compact_ids.get_all_symbols() != symbols.get_all_symbols()
    if needs_remap:
        # Pass 2: decode with the old dictionary, encode with the new one.
        for item in data.items():
            decoded = symbols.get_symbols(item.serialized_symbol_ids)
            item.serialized_symbol_ids = compact_ids.get_serialized_ids(
                decoded)

    return compact_ids
def make_common_symbol_ids(self) -> SymbolIdDict:
    """Re-encode every dataset in ``self.data`` with one shared dictionary.

    The symbol sets of all datasets' ``symbol_ids`` are merged and a new
    ``SymbolIdDict`` is created from the merged set with
    ``DEFAULT_PADDING_SYMBOL`` passed as the pad symbol. Each entry of each
    dataset is then decoded with that dataset's own dictionary and encoded
    with the shared one, after which the dataset's ``symbol_ids`` is
    replaced by the shared dictionary. All changes happen in place.

    Returns:
        The shared dictionary now referenced by every dataset.
    """
    # Merge the symbol inventories of all datasets into a single set.
    merged: Set[str] = set().union(
        *(dataset.symbol_ids.get_all_symbols() for dataset in self.data))

    shared_ids = SymbolIdDict.init_from_symbols_with_pad(
        merged, pad_symbol=DEFAULT_PADDING_SYMBOL)

    for dataset in self.data:
        # Keep the old dictionary at hand: it is needed to decode every
        # entry before the dataset is switched over to the shared one.
        previous_ids = dataset.symbol_ids
        for item in dataset.data.items():
            decoded = previous_ids.get_symbols(item.serialized_symbol_ids)
            item.serialized_symbol_ids = shared_ids.get_serialized_ids(
                decoded)
        dataset.symbol_ids = shared_ids

    return shared_ids