def test_invert_dict(self):
    data = {"s": 0, "y": 1, "v": 2, "l": 3, "i": 4, "p": 5, "b": 6, "z": 7,
            "c": 8, "a": 9, "k": 10, "e": 11, "d": 12, "j": 13, "x": 14,
            "u": 15, "o": 16, "n": 17, "t": 18, "f": 19, "g": 20, "h": 21,
            "r": 22, "w": 23, "m": 24, "q": 25}
    expected = {0: 's', 1: 'y', 2: 'v', 3: 'l', 4: 'i', 5: 'p', 6: 'b',
                7: 'z', 8: 'c', 9: 'a', 10: 'k', 11: 'e', 12: 'd', 13: 'j',
                14: 'x', 15: 'u', 16: 'o', 17: 'n', 18: 't', 19: 'f',
                20: 'g', 21: 'h', 22: 'r', 23: 'w', 24: 'm', 25: 'q'}
    self.assertDictEqual(utils.invert_dict(data), expected)

    data = {1: 40, 2: 30, 3: 40, 30: 2}
    expected = {40: [1, 3], 30: 2, 2: 30}
    self.assertEqual(utils.invert_dict(data), expected)
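
# For reference, a minimal sketch of the behavior the second assertion above
# exercises (an illustration only, not necessarily the actual
# utils.invert_dict implementation): values become keys, and keys that share
# a value are grouped into a list.
def _invert_dict_sketch(d: dict) -> dict:
    result = {}
    for key, value in d.items():
        if value in result:
            existing = result[value]
            # Promote a single key to a list on the first collision.
            result[value] = existing + [key] if isinstance(existing, list) else [existing, key]
        else:
            result[value] = key
    return result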
LF platforms: Multics, Unix and Unix-like systems (Linux, macOS, FreeBSD,
AIX, Xenix, etc.), BeOS, Amiga, RISC OS, and others
"""
LF = '\n'

"""
CR platforms: Commodore 8-bit machines (C64, C128), Acorn BBC, ZX Spectrum,
TRS-80, Apple II series, Oberon, the classic Mac OS, MIT Lisp Machine and OS-9
"""
CR = "\r"

# UNIX is DEFAULT
DEFAULT_NEW_LINE_SEP = LF

_NEW_LINE_SEP_MAP = {CRLF: "CRLF", LF: "LF", CR: "CR"}
_STR_NEW_LINE_SEP_MAP = invert_dict(_NEW_LINE_SEP_MAP)


class FileBase(metaclass=ABCMeta):
    """
    High-level API for creating and manipulating files
    """
    __size_map = {"B": 1.e0, "KB": 1.e3, "MB": 1.e6, "GB": 1.e9, "TB": 1.e12}

    _new_line_sep_map = _NEW_LINE_SEP_MAP.copy()
    _str_new_line_sep_map = _STR_NEW_LINE_SEP_MAP.copy()
    _default_new_line_sep = DEFAULT_NEW_LINE_SEP
    _dont_read = [".pyc"]
    _ignore_dir = [".git"]
    _allowed_ext = ()
    _date_format = "%Y-%m-%d %H:%M:%S"
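
# A minimal sketch of how these separator maps are typically used (hypothetical
# helper, not part of FileBase): normalize every line separator in raw text to
# a single target separator. CRLF is collapsed first so the lone-CR pass
# cannot split it.
def _normalize_new_lines_sketch(text: str, new_line_sep: str = DEFAULT_NEW_LINE_SEP) -> str:
    text = text.replace(CRLF, LF).replace(CR, LF)
    if new_line_sep != LF:
        text = text.replace(LF, new_line_sep)
    return text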
class Tokenizer:
    _n_unks = 10
    # Bidirectional placeholder map: '{0}'..'{9}' <-> 0..9, used for unknown items.
    __unks = dict({f'{{{i}}}': i for i in range(_n_unks)})
    __unks.update(invert_dict(__unks))

    def __init__(self, data: Union[List[str], dict], preprocess_function=None,
                 load_mode=False, use_unk=True):
        self._temp_unks = dict()
        self._hash = self.new_hash  # default
        self._unk_memory_max = 10000  # prevents memory leak
        self._use_unk = use_unk
        self._preprocess_function = preprocess_function
        self._warning_not_use_unk = False
        if isinstance(data, dict) and load_mode:
            logger.info("Building from file.")
            keys = data.keys()
            assert '_metadata' in keys and 'data' in keys, 'Invalid content.'
            for k, v in data['_metadata'].items():
                if k == '_preprocess_function':  # load function
                    if v is not None:
                        v = pickle.loads(string_to_literal(v))
                setattr(self, k, v)
            self._uniques = set(data['data'].values())
            self._index_to_item = {int(k): v for k, v in data['data'].items()}
        else:
            self._uniques = self.get_uniques(data)
            self._index_to_item = dict(enumerate(self._uniques, self._n_unks))
        self._item_to_index = invert_dict(self._index_to_item)

    @property
    def preprocess_function(self):
        return self._preprocess_function or (lambda x: x)

    @property
    def last_index(self):
        return self._n_unks + len(self._index_to_item) - 1

    def add_item(self, item):
        index_ = self.last_index + 1
        self._index_to_item[index_] = item
        self._item_to_index[item] = index_

    @property
    def unks(self):
        return self.__unks

    @property
    def unk_memory(self):
        return self._temp_unks

    @property
    def hash_default(self):
        return self._hash

    @property
    def new_hash(self):
        return secrets.token_hex(7)

    @classmethod
    def normalize(cls, data) -> List[Any]:
        if data is None:
            data = []
        if isinstance(data, str):
            return [data]
        elif isinstance(data, (int, float, bytes)):
            return [data]
        assert is_sequence(data), TypeError(
            f'this data type is not supported, try sending a {str}, {list} or tuple'
        )
        return data

    def get_uniques(self, values: List[str]) -> set:
        data = self.normalize(values)
        result = []
        for v in data:
            result += v.split()
        return set(map(self.preprocess_function, result))

    def item_index(self, word: str) -> int:
        return self._item_to_index.get(word)

    def index_item(self, index: int) -> str:
        return self._index_to_item.get(index, self.unks.get(index))

    def _encode(self, data: List[str], hash__):
        if hash__ not in self._temp_unks and self._use_unk:
            if len(self._temp_unks) > self._unk_memory_max:
                self._temp_unks = {}
                logger.warning("Memory leak detected. Cleaned temp UNKs.")
            self._temp_unks[hash__] = {}
        n = 0
        result = []
        for word in map(self.preprocess_function, data):
            index = self.item_index(word)
            if index is None:
                if not self._use_unk:
                    if not self._warning_not_use_unk:
                        logger.warning(
                            "use_unk is False. All unknown items encountered will be "
                            "added to the tokenizer; don't forget to save your "
                            "tokenizer after encoding.")
                        self._warning_not_use_unk = True
                    self.add_item(word)
                    index = self.last_index
                    result.append(index)
                    continue
                unk = f'{{{n % self._n_unks}}}'
                index = self.unks.get(unk)
                self._temp_unks[hash__].update({(n, f"{word}"): unk})
                n += 1
            result.append(index)
        return result

    def encode(self, data: Union[str, List[str]]
               ) -> Union[Tuple[List[int], str], List[List[int]]]:
        """
        Encodes values into a sequence of numbers.
        The hash is used to decode "unks" in the correct order.
        e.g:
            tokenizer = Tokenizer(data=['i like it', 'my name is Joab', 'hello'])
            sequences = tokenizer.encode(data=['hello my friend, how are you?', 'my name is mário'])
            # [([9, 7, 15, 10, 16, 5], 'e9bc59cb0d1564'), ([7, 13, 2, 15], '9f92140ebb0e19')]
        """
        result = []
        for sentence in self.normalize(data):
            assert isinstance(sentence, str), "send List[str] or str"
            if not self._use_unk:
                result.append(self._encode(sentence.split(), None))
                continue
            __hash = self.new_hash
            result.append((self._encode(sentence.split(), __hash), __hash))
        return result

    def decode(self, data: Union[List[int], int]):
        return [self.index_item(index) for index in self.normalize(data)]

    def to_json(self, path_: str):
        try:
            preprocess_function = str(pickle.dumps(self._preprocess_function)) \
                if self._preprocess_function else None
        except Exception as err:
            raise Exception(f'Error on preprocess function save: {err}')
        use_unk = self._use_unk
        tokenizer_data = {
            '_metadata': {
                '_preprocess_function': preprocess_function,
                '_use_unk': use_unk
            },
            'data': self._index_to_item
        }
        FileIO.create(path_, tokenizer_data).save(exist_ok=True)

    @classmethod
    def load_from_json(cls, path_: str):
        data = FileIO.load(path_)
        return cls(data.data, load_mode=True)

    def replace_unks(self, sentence: str, hash_):
        assert isinstance(sentence, str), 'expected a string.'
        try:
            _temp_unks = self._temp_unks[hash_].copy()
            if not _temp_unks:
                return sentence
            for (indx, word), unk in _temp_unks.items():
                # cannot have an exception
                sentence = sentence.replace(unk, word, 1)
                self._temp_unks[hash_].pop((indx, word))
            self._temp_unks.pop(hash_)
        except KeyError as err:
            logger.error(
                msg=f"{err}: There's something wrong. Please open an issue at "
                    f"https://github.com/jlsneto/cereja/issues/new?template=bug-report.md"
            )
        return sentence
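
# A minimal end-to-end sketch of the API above: encode a sentence containing an
# out-of-vocabulary word, decode it back, then restore the unknown word via the
# hash returned by encode(). Illustrative usage only, not part of the class.
if __name__ == '__main__':
    tokenizer = Tokenizer(data=['i like it', 'my name is Joab', 'hello'])
    pairs = tokenizer.encode('my name is mário')
    indices, sentence_hash = pairs[0]
    decoded = ' '.join(tokenizer.decode(indices))  # unknown words decode to '{0}', '{1}', ...
    restored = tokenizer.replace_unks(decoded, sentence_hash)
    assert restored == 'my name is mário'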
def __invert__(self):
    """Invert the dict, swapping values and keys."""
    return CJDict(invert_dict(self))
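
# Usage sketch for the operator above (assumes CJDict subclasses dict and
# defines __invert__ as shown): the unary ~ returns a new CJDict with values
# and keys swapped, grouping keys that share a value into a list, exactly as
# invert_dict does.
if __name__ == '__main__':
    d = CJDict({'a': 1, 'b': 2, 'c': 1})
    print(~d)  # -> {1: ['a', 'c'], 2: 'b'}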