Example #1
    def test_invert_dict(self):
        data = {"s": 0, "y": 1, "v": 2, "l": 3, "i": 4, "p": 5, "b": 6, "z": 7, "c": 8, "a": 9, "k": 10, "e": 11,
                "d": 12, "j": 13, "x": 14, "u": 15, "o"
                :    16, "n": 17, "t": 18, "f": 19, "g": 20, "h": 21, "r": 22, "w": 23, "m": 24, "q": 25}
        expected = {0:  's', 1: 'y', 2: 'v', 3: 'l', 4: 'i', 5: 'p', 6: 'b', 7: 'z', 8: 'c', 9: 'a', 10: 'k', 11: 'e',
                    12: 'd', 13: 'j', 14: 'x', 15: 'u', 16: 'o', 17: 'n', 18: 't', 19: 'f', 20: 'g', 21: 'h', 22: 'r',
                    23: 'w', 24: 'm', 25: 'q'}
        self.assertDictEqual(utils.invert_dict(data), expected)

        data = {1: 40, 2: 30, 3: 40, 30: 2}
        expected = {40: [1, 3], 30: 2, 2: 30}
        self.assertEqual(utils.invert_dict(data), expected)
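The tests above pin down the contract of `utils.invert_dict`: with unique values, keys and values simply swap; when several keys share a value, the inverted entry collects those keys in a list. A minimal sketch with that behavior (the actual cereja implementation may differ in detail; `invert_dict_sketch` is an illustrative name):

def invert_dict_sketch(d: dict) -> dict:
    # Swap keys and values; keys that share a value are grouped into a list.
    result = {}
    for key, value in d.items():
        if value in result:
            existing = result[value]
            result[value] = existing + [key] if isinstance(existing, list) else [existing, key]
        else:
            result[value] = key
    return result

assert invert_dict_sketch({1: 40, 2: 30, 3: 40, 30: 2}) == {40: [1, 3], 30: 2, 2: 30}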
Example #2
def __init__(self,
             data: Union[List[str], dict],
             preprocess_function=None,
             load_mode=False,
             use_unk=True):
    self._temp_unks = dict()
    self._hash = self.new_hash  # default
    self._unk_memory_max = 10000  # prevents memory leak
    self._use_unk = use_unk
    self._preprocess_function = preprocess_function
    self._warning_not_use_unk = False
    if isinstance(data, dict) and load_mode:
        logger.info("Building from file.")
        keys = data.keys()
        assert '_metadata' in keys and 'data' in keys, 'Invalid content.'
        for k, v in data['_metadata'].items():
            if k == '_preprocess_function':
                # load function
                if v is not None:
                    v = pickle.loads(string_to_literal(v))
            setattr(self, k, v)
        self._uniques = set(data['data'].values())
        self._index_to_item = {int(k): v for k, v in data['data'].items()}
    else:
        self._uniques = self.get_uniques(data)
        self._index_to_item = dict(enumerate(self._uniques, self._n_unks))
    self._item_to_index = invert_dict(self._index_to_item)
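The last two lines build the tokenizer's bidirectional lookup: `enumerate(..., self._n_unks)` assigns indices starting above the reserved UNK slots, and `invert_dict` derives the reverse mapping. Since those indices are unique, the inversion is a plain one-to-one swap, e.g. (values shown are illustrative; set ordering is arbitrary):

n_unks = 10  # indices 0-9 stay reserved for UNK placeholders
index_to_item = dict(enumerate({"hello", "world"}, n_unks))  # e.g. {10: 'hello', 11: 'world'}
item_to_index = {v: k for k, v in index_to_item.items()}     # e.g. {'hello': 10, 'world': 11}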
Example #3
"""
LF platforms:
Multics, Unix and Unix-like systems (Linux, macOS, FreeBSD, AIX, Xenix, etc.), BeOS, Amiga, RISC OS, and others
"""
LF = '\n'
"""
CR platforms:
Commodore 8-bit machines (C64, C128), Acorn BBC, ZX Spectrum, TRS-80, Apple II series, Oberon,
the classic Mac OS, MIT Lisp Machine and OS-9
"""
CR = "\r"

# UNIX is DEFAULT
DEFAULT_NEW_LINE_SEP = LF

_NEW_LINE_SEP_MAP = {CRLF: "CRLF", LF: "LF", CR: "CR"}
_STR_NEW_LINE_SEP_MAP = invert_dict(_NEW_LINE_SEP_MAP)


class FileBase(metaclass=ABCMeta):
    """
    High-level API for creating and manipulating files
    """
    __size_map = {"B": 1.e0, "KB": 1.e3, "MB": 1.e6, "GB": 1.e9, "TB": 1.e12}

    _new_line_sep_map = _NEW_LINE_SEP_MAP.copy()
    _str_new_line_sep_map = _STR_NEW_LINE_SEP_MAP.copy()
    _default_new_line_sep = DEFAULT_NEW_LINE_SEP
    _dont_read = [".pyc"]
    _ignore_dir = [".git"]
    _allowed_ext = ()
    _date_format = "%Y-%m-%d %H:%M:%S"
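Because `_STR_NEW_LINE_SEP_MAP` is derived from `_NEW_LINE_SEP_MAP` via `invert_dict`, the two mappings stay in sync and support lookups in both directions:

assert _NEW_LINE_SEP_MAP["\n"] == "LF"
assert _STR_NEW_LINE_SEP_MAP["LF"] == "\n"
target_sep = _STR_NEW_LINE_SEP_MAP["CRLF"]  # '\r\n', e.g. when converting line endings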
Example #4
class Tokenizer:
    _n_unks = 10
    # Maps each UNK placeholder to a reserved index and, after the update, each
    # index back to its placeholder: {'{0}': 0, ..., 0: '{0}', ...}
    __unks = {f'{{{i}}}': i for i in range(_n_unks)}
    __unks.update(invert_dict(__unks))

    def __init__(self,
                 data: Union[List[str], dict],
                 preprocess_function=None,
                 load_mode=False,
                 use_unk=True):
        self._temp_unks = dict()
        self._hash = self.new_hash  # default
        self._unk_memory_max = 10000  # prevents memory leak
        self._use_unk = use_unk
        self._preprocess_function = preprocess_function
        self._warning_not_use_unk = False
        if isinstance(data, dict) and load_mode:
            logger.info("Building from file.")
            keys = data.keys()
            assert '_metadata' in keys and 'data' in keys, 'Invalid content.'
            for k, v in data['_metadata'].items():
                if k == '_preprocess_function':
                    # load function
                    if v is not None:
                        v = pickle.loads(string_to_literal(v))
                setattr(self, k, v)
            self._uniques = set(data['data'].values())
            self._index_to_item = {int(k): v for k, v in data['data'].items()}
        else:
            self._uniques = self.get_uniques(data)
            self._index_to_item = dict(enumerate(self._uniques, self._n_unks))
        self._item_to_index = invert_dict(self._index_to_item)

    @property
    def preprocess_function(self):
        return self._preprocess_function or (lambda x: x)

    @property
    def last_index(self):
        return self._n_unks + len(self._index_to_item) - 1

    def add_item(self, item):
        index_ = self.last_index + 1
        self._index_to_item[index_] = item
        self._item_to_index[item] = index_

    @property
    def unks(self):
        return self.__unks

    @property
    def unk_memory(self):
        return self._temp_unks

    @property
    def hash_default(self):
        return self._hash

    @property
    def new_hash(self):
        return secrets.token_hex(7)

    @classmethod
    def normalize(cls, data) -> List[Any]:
        if data is None:
            data = []
        if isinstance(data, str):
            return [data]
        elif isinstance(data, (int, float, bytes)):
            return [data]
        assert is_sequence(data), TypeError(
            f'this data type is not supported, try sending a {str}, {list} or tuple'
        )
        return data

    def get_uniques(self, values: List[str]) -> set:
        data = self.normalize(values)
        result = []
        for v in data:
            result += v.split()
        return set(map(self.preprocess_function, result))

    def item_index(self, word: str) -> int:
        return self._item_to_index.get(word)

    def index_item(self, index: int) -> str:
        return self._index_to_item.get(index, self.unks.get(index))

    def _encode(self, data: List[str], hash__):
        if hash__ not in self._temp_unks and self._use_unk:
            if len(self._temp_unks) > self._unk_memory_max:
                self._temp_unks = {}
                logger.warning("Memory Leak Detected. Cleaned temp UNK's")
            self._temp_unks[hash__] = {}
        n = 0
        result = []
        for word in map(self.preprocess_function, data):
            index = self.item_index(word)
            if index is None:
                if not self._use_unk:
                    if not self._warning_not_use_unk:
                        logger.warning(
                            "use_unk is False: every unknown item will be added to the "
                            "tokenizer during encoding, so remember to save the tokenizer "
                            "afterwards.")
                        self._warning_not_use_unk = True
                    self.add_item(word)
                    index = self.last_index
                    result.append(index)
                    continue
                unk = f'{{{n % self._n_unks}}}'
                index = self.unks.get(unk)
                self._temp_unks[hash__].update({(n, f"{word}"): unk})
                n += 1
            result.append(index)
        return result

    def encode(
        self, data: Union[str, List[str]]
    ) -> Union[Tuple[List[int], str], List[List[int]]]:
        """
        Encodes values in a sequence of numbers

        the hash is used to decode "unks" in the correct order.

        e.g:
            tokenizer = Tokenizer(data=['i like it', 'my name is Joab', 'hello'])
            sequences = tokenizer.encode(data=['hello my friend, how are you?', 'my name is mário'])
            # [([9, 7, 15, 10, 16, 5], 'e9bc59cb0d1564'), ([7, 13, 2, 15], '9f92140ebb0e19')]

        """
        result = []
        for sentence in self.normalize(data):
            assert isinstance(sentence, str), "expected str or List[str]"
            if not self._use_unk:
                result.append(self._encode(sentence.split(), None))
                continue
            __hash = self.new_hash
            result.append((self._encode(sentence.split(), __hash), __hash))
        return result

    def decode(self, data: Union[List[int], int]):
        return [self.index_item(index) for index in self.normalize(data)]

    def to_json(self, path_: str):
        try:
            preprocess_function = str(pickle.dumps(self._preprocess_function)
                                      ) if self._preprocess_function else None
        except Exception as err:
            raise Exception(f'Failed to serialize the preprocess function: {err}')
        use_unk = self._use_unk
        tokenizer_data = {
            '_metadata': {
                '_preprocess_function': preprocess_function,
                '_use_unk': use_unk
            },
            'data': self._index_to_item
        }
        FileIO.create(path_, tokenizer_data).save(exist_ok=True)

    @classmethod
    def load_from_json(cls, path_: str):
        data = FileIO.load(path_)
        return cls(data.data, load_mode=True)

    def replace_unks(self, sentence: str, hash_):
        assert isinstance(sentence, str), 'expected a string.'
        try:
            _temp_unks = self._temp_unks[hash_].copy()
            if not _temp_unks:
                return sentence
            # iterate over a copy so entries can be popped from the live dict safely
            for (indx, word), unk in _temp_unks.items():
                sentence = sentence.replace(unk, word, 1)
                self._temp_unks[hash_].pop((indx, word))
            self._temp_unks.pop(hash_)
        except KeyError as err:
            logger.error(
                msg=f"{err}: something went wrong; please open an issue at "
                f"https://github.com/jlsneto/cereja/issues/new?template=bug-report.md"
            )
        return sentence
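Putting the pieces together, a round trip through the class above might look like the sketch below (indices and hashes vary per run, since index assignment depends on set ordering and each hash is random; 'friend' is just an illustrative unknown word):

tokenizer = Tokenizer(data=['i like it', 'my name is Joab', 'hello'])

# 'friend' was never seen, so it encodes to a reserved UNK index.
(indices, hash_), = tokenizer.encode('hello friend')

# Decoding yields the known token plus a placeholder such as '{0}'.
decoded = ' '.join(tokenizer.decode(indices))

# replace_unks restores the original unknown word using the per-call hash.
restored = tokenizer.replace_unks(decoded, hash_)  # 'hello friend'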
Example #5
def __invert__(self):
    """
    Invert the dict: values become keys.
    """
    return CJDict(invert_dict(self))
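Defining `__invert__` lets the `~` operator invert a `CJDict` directly, e.g. (assuming CJDict otherwise behaves like a plain dict):

d = CJDict({'a': 1, 'b': 2})
print(~d)  # keys and values swapped: {1: 'a', 2: 'b'}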