Beispiel #1
0
 def save(self,
          on_new_path: Union[str, Path] = None,
          mode=None,
          exist_ok=False,
          overwrite=False,
          encoding=None,
          **kwargs):
     """Write the file content to disk, staging it in a temporary directory.

     The content is written by ``self._save_fp`` into a temporary file, and
     that temporary file is then copied to ``self.path``.

     :param on_new_path: if given, ``set_path`` is called with it before saving.
     :param mode: mode passed to ``open``; falls back to ``self._get_mode('w')``.
     :param exist_ok: allow saving when ``self.path`` already exists.
     :param overwrite: skip the check comparing ``self._last_update`` with
         ``self.updated_at``.
     :param encoding: text encoding for non-byte files (default ``'utf-8'``).
         Byte files are always opened without an encoding.
     :param kwargs: extra keyword arguments forwarded to ``open``.
     :raises AssertionError: if a change is detected and ``overwrite`` is
         False, if the path exists and ``exist_ok`` is False, if the file is
         marked as only-read, or if ``mode`` does not contain ``'w'``.
     :raises IOError: if writing the temporary file fails.
     """
     if (self._last_update is not None
             and overwrite is False) and not self._is_deleted:
         if self._last_update != self.updated_at:
             raise AssertionError(
                 f"File change detected (last change {self.updated_at}), if you want to overwrite "
                 f"set overwrite=True")
     if on_new_path is not None:
         self.set_path(on_new_path)
     # These checks stay as assertions so callers catching AssertionError keep working.
     assert exist_ok or not self.path.exists, FileExistsError(
         "File exists. If you want override, please send 'exist_ok=True'")
     assert self._only_read is False, f"{self.name} is only read."
     mode = mode or self._get_mode('w')
     assert 'w' in mode, f"{mode} for write isn't valid."
     # Honour the caller's encoding for text files; binary mode cannot take one.
     encoding = None if self._is_byte else (encoding or 'utf-8')
     with tempfile.TemporaryDirectory() as tmpdirname:
         tmp_file = Path(tmpdirname).join(self.name)
         if not self._newline and not self._is_byte:
             # newline='' makes open() write line endings untranslated.
             kwargs['newline'] = ''
         try:
             with open(tmp_file, mode=mode, encoding=encoding,
                       **kwargs) as fp:
                 self._save_fp(fp)
         except Exception as err:
             raise IOError(f'Error writing to the file system file: {err}') from err
         # Copy to the final destination only after the write succeeded.
         tmp_file.cp(self.path)
Beispiel #2
0
    def test_sanity(self):
        """Check basic ``Path`` behaviour: parent, name, equality, concatenation, listing and suffix."""
        self.assertEqual(
            Path(__file__).parent,
            Path(os.path.realpath(__file__)).parent)
        p_test = 'cereja/test/sanity'
        p = Path(p_test)
        # assertEqual, not assertTrue: assertTrue would treat the expected
        # value as the failure message and pass for any truthy name.
        self.assertEqual(p.name, 'sanity')
        self.assertEqual(p.parent.name, 'test')
        self.assertTrue(p == p_test)
        self.assertTrue('sanity' in p)
        # Concatenating a list of parts must match joining the same parts.
        p = p + ['/con/', 'cat']
        p_test = Path('cereja/test/sanity').join('con', 'cat')
        self.assertEqual(p_test.parts[-2:], ('con', 'cat'))
        self.assertTrue(p == p_test)
        # list_dir(only_name=True) is compared with os.listdir entries cut at the first dot.
        self.assertListEqual(
            Path(__file__).parent.list_dir(only_name=True),
            list(
                map(lambda x: x.rsplit('.')[0],
                    os.listdir(Path(__file__).parent))))

        with TempDir() as tmp_dir:
            tmp_dir = Path(tmp_dir)
            suffix_case = tmp_dir.join('test.suffix')
            self.assertEqual(suffix_case.suffix, '.suffix')
            # Once the path exists as a directory, its suffix is expected to be empty.
            mkdir(suffix_case)
            self.assertEqual(suffix_case.suffix, '')
Beispiel #3
0
 def unzip(cls, file_path, save_on: str = None, load_on_memory=False):
     """Open a zip archive and optionally extract and load its content.

     :param file_path: path of the zip archive to read.
     :param save_on: directory under which the archive is extracted, in a
         sub-directory named after the archive stem.
     :param load_on_memory: when true, the extracted files are loaded with
         ``FileIO.load_files`` and returned. A temporary directory is used
         for extraction if ``save_on`` is not given.
     :return: the loaded files when ``load_on_memory`` is true, otherwise the
         list of member names of the archive.
     """
     with ZipFile(file_path, mode='r') as archive:
         # Nothing to extract: just report the member names.
         if not (save_on or load_on_memory):
             return archive.namelist()
         with tempfile.TemporaryDirectory() as scratch_dir:
             base_dir = save_on if save_on else scratch_dir
             target_dir = Path(base_dir).join(Path(archive.filename).stem)
             mkdir(target_dir)
             archive.extractall(target_dir)
             if load_on_memory:
                 # Load while the temporary directory still exists.
                 return FileIO.load_files(target_dir)
         return archive.namelist()
Beispiel #4
0
 def save(self, on_new_path: Union[str, Path] = None, exist_ok=False, overwrite=False, force=False,
          **kwargs):
     """Build the zip archive in a temporary directory and copy it to ``self.path``.

     Every entry of ``self.data`` is added to the archive with
     ``ZipFile.write`` using deflate compression. After the copy,
     ``self._last_update`` is refreshed from ``self.updated_at``.

     NOTE(review): ``on_new_path``, ``exist_ok``, ``overwrite``, ``force`` and
     ``kwargs`` are accepted but not read by this body.

     :raises IOError: if building the temporary archive fails.
     """
     with tempfile.TemporaryDirectory() as scratch_dir:
         staged_zip = Path(scratch_dir).join(self.name)
         try:
             with ZipFile(staged_zip.path, mode='w', compression=ZIP_DEFLATED) as archive:
                 for member in self.data:
                     archive.write(member)
         except Exception as err:
             raise IOError(f'Error writing to the file system file: {err}')
         # Publish the finished archive, then record the new modification time.
         staged_zip.cp(self.path)
         self._last_update = self.updated_at
Beispiel #5
0
    def test_sanity(self):
        """Encode/decode round trip of ``Tokenizer``, before and after a JSON save/load."""
        sentence = 'oi tudo tranquilo'

        with tempfile.TemporaryDirectory() as tempdir:
            tokenizer = Tokenizer(['oi amigo tudo bem'],
                                  preprocess_function=separate,
                                  use_unk=False)

            def check_round_trip(tok, token_ids):
                # Shared checks for the fresh and the reloaded tokenizer.
                self.assertEqual(tok.last_index, 14)
                self.assertEqual(token_ids, expected_ids)
                self.assertEqual(' '.join(tok.decode(token_ids[0])), sentence)

            self.assertEqual(tokenizer.last_index, 13)
            first_ids = tokenizer.encode([sentence])
            # Built after encoding: 'tranquilo' is not in the initial sentence,
            # and last_index is expected to go from 13 to 14.
            expected_ids = [[
                tokenizer._item_to_index[word]
                for word in ('oi', 'tudo', 'tranquilo')
            ]]
            check_round_trip(tokenizer, first_ids)

            json_path = Path(tempdir).join('tokenizer.json')
            tokenizer.to_json(json_path)

            # Valid.
            tokenizer = Tokenizer.load_from_json(json_path)
            reloaded_ids = tokenizer.encode([sentence])
            check_round_trip(tokenizer, reloaded_ids)
Beispiel #6
0
    def load_files(cls, path_, ext=None, contains_in_name: List = (), not_contains_in_name=(), take_empty=True,
                   recursive=False):
        """Load every matching file found under ``path_``.

        Candidate paths come from ``Path.list_files``; each one is loaded with
        ``cls.load`` and then filtered again by extension and name.

        :param path_: location handed to ``Path.list_files``.
        :param ext: extension to keep (with or without the leading dot);
            ``None`` or ``''`` keeps every extension.
        :param contains_in_name: keep a file only if its name (without
            extension) contains at least one of these fragments.
        :param not_contains_in_name: drop a file if its name (without
            extension) contains any of these fragments.
        :param take_empty: NOTE(review): as written, empty files are skipped
            when this is ``True`` - confirm whether that is intended.
        :param recursive: forwarded to ``Path.list_files``; directories in the
            result are descended into.
        :return: list of loaded file objects.
        """
        ext = ext or ''
        candidates = Path.list_files(path_, ext=ext, contains_in_name=contains_in_name,
                                     not_contains_in_name=not_contains_in_name, recursive=recursive)
        result = []
        for entry in candidates:
            if recursive and entry.is_dir:
                result += cls.load_files(path_=entry, ext=ext, contains_in_name=contains_in_name,
                                         not_contains_in_name=not_contains_in_name, take_empty=take_empty,
                                         recursive=recursive)
                continue
            # Only existing, non-directory paths can be loaded.
            if not (entry.exists and not entry.is_dir):
                continue
            file_ = cls.load(path_=entry)
            if file_ is None:
                continue
            if take_empty is True and file_.is_empty:
                continue
            if file_.ext != f'.{ext.strip(".")}' and ext != '':
                continue
            if contains_in_name:
                name_has = file_.name_without_ext.__contains__
                if not any(name_has(fragment) for fragment in contains_in_name):
                    continue
            if not_contains_in_name:
                name_has = file_.name_without_ext.__contains__
                if any(name_has(fragment) for fragment in not_contains_in_name):
                    continue
            result.append(file_)
        return result
Beispiel #7
0
 def create(
         cls, path_: Union[Type[Path], str], data: Any,
         **kwargs) -> Union[_TxtIO, _JsonIO, _Mp4IO, _CsvIO, _GenericFile]:
     """Create a new file object for ``path_`` holding ``data``.

     The class is chosen by ``cls.lookup`` from the path suffix and is
     instantiated in creation mode; ``kwargs`` are forwarded to it.
     """
     path_ = Path(path_)
     file_class = cls.lookup(path_.suffix)
     return file_class(path_=path_, data=data, creation_mode=True, **kwargs)
 def test_eval(self):
     """Data given to ``FileIO.create`` must survive a save followed by a load."""
     payload = [1, 2, 3, 'hi', ['oi']]
     with tempfile.TemporaryDirectory() as workdir:
         target = Path(workdir).join(self.file.name)
         created = FileIO.create(target, payload)
         self.assertEqual(created.data, payload)
         created.save()
         # Read the file back from disk and compare with what was written.
         reloaded = FileIO.load(created.path)
         self.assertEqual(reloaded.data, payload)
 def test_sanity(self):
     """Save the fixture file into a temporary directory, reload it, then delete it."""
     console.log(f'Testing {self.name}')
     with tempfile.TemporaryDirectory() as workdir:
         destination = Path(workdir).join(self.file.name)
         self.file.set_path(destination)
         expected_data = self.file.data
         self.file.save(exist_ok=True)

         reloaded = FileIO.load(self.file.path, **self.load_kwargs)
         self.assertTrue(reloaded.path.exists, msg="File don't exist.")
         self.assertEqual(reloaded.data, expected_data, msg='Data corrupted')

         # After delete() the path must be gone.
         reloaded.delete()
         self.assertFalse(reloaded.path.exists)
Beispiel #10
0
 def __init__(self, path_: Path, data=None, creation_mode=False, **kwargs):
     """Bind the object to ``path_`` and populate its data.

     :param path_: file location; converted to ``Path`` when needed.
     :param data: content parsed with ``self.parse`` in creation mode.
     :param creation_mode: when true, ``data`` is used; otherwise the content
         comes from ``self.load(**kwargs)``.
     :raises AssertionError: if ``self._ext_allowed`` is set and the path
         suffix is not in it.
     """
     super().__init__()
     path_ = path_ if isinstance(path_, Path) else Path(path_)
     self._creation_mode = creation_mode
     self._path = path_
     if self._ext_allowed:
         assert self._path.suffix in self._ext_allowed, f'{path_.suffix} != {self._ext_allowed}'
     # Upper-case extension without the dot.
     suffix = self._path.suffix
     self._ext_without_point = suffix.strip(".").upper()
     self._data = self.parse(data) if creation_mode else self.load(**kwargs)
Beispiel #11
0
 def __init__(self, path_: Path, data=None, creation_mode=False, **kwargs):
     """Bind the object to ``path_`` and populate its data.

     :param path_: file location; converted to ``Path`` when needed.
     :param data: content parsed with ``self.parse`` in creation mode.
     :param creation_mode: when true, ``data`` is used; otherwise the content
         comes from ``self._load(**kwargs)``.
     :raises AssertionError: if ``self._ext_allowed`` is set and the path
         suffix is not in it.
     """
     super().__init__()
     path_ = path_ if isinstance(path_, Path) else Path(path_)
     self._creation_mode = creation_mode
     self._path = path_
     if self._ext_allowed:
         assert self._path.suffix in self._ext_allowed, f'{path_.suffix} != {self._ext_allowed}'
     # Upper-case extension without the dot.
     suffix = self._path.suffix
     self._ext_without_point = suffix.strip(".").upper()
     self._built = True  # next setattr will be saved in history to be future recovery
     self._data = self.parse(data) if creation_mode else self._load(**kwargs)
Beispiel #12
0
    def parse(self, data):
        """Normalise ``data`` into a list of existing ``Path`` objects.

        A single ``str``/``Path`` is treated as a one-element list; falsy
        input yields an empty list.

        :raises ValueError: if ``data`` is not a sequence, or if any of the
            paths does not exist.
        """
        if not data:
            return []

        candidates = [data] if isinstance(data, (str, Path)) else data
        if not is_sequence(candidates):
            raise ValueError("Send list of paths invalid.")

        res = []
        for p in map(Path, candidates):
            # Fail on the first missing path.
            if not p.exists:
                raise ValueError(f'{p.path} not Found.')
            res.append(p)
        return res
Beispiel #13
0
    def save_freq(self, save_on: str, prefix='freq', ext: str = 'json', probability=False):
        """Write the word and phrase frequencies as two files inside ``save_on``.

        The files are named ``<prefix>_words.<ext>`` and
        ``<prefix>_phrases.<ext>`` and are written with ``to_json`` using
        ``exist_ok=True``.

        :param save_on: destination directory.
        :param prefix: file name prefix.
        :param ext: file extension, with or without the leading dot.
        :param probability: forwarded to ``to_json``.
        """
        ext = ext.strip('.')  # accept both "json" and ".json"
        save_on = Path(save_on)
        json_options = {'probability': probability, 'exist_ok': True}

        words_target = save_on.join(f'{prefix}_words.{ext}')
        self.words_freq.to_json(words_target, **json_options)

        phrases_target = save_on.join(f'{prefix}_phrases.{ext}')
        self.phrases_freq.to_json(phrases_target, **json_options)
Beispiel #14
0
 def __init__(self, path_: Path, data=None, creation_mode=False, is_byte=None, **kwargs):
     """Bind the object to ``path_`` and populate its data.

     :param path_: file location; converted to ``Path`` when needed.
     :param data: content parsed with ``self.parse`` in creation mode.
     :param creation_mode: when true, ``data`` is used; otherwise the content
         comes from ``self._load(**kwargs)``.
     :param is_byte: when not ``None``, sets ``self._is_byte`` to its boolean
         value.
     :raises AssertionError: if ``self._ext_allowed`` is set and the path
         suffix is not in it.
     """
     super().__init__()
     if is_byte is not None:
         self._is_byte = bool(is_byte)
     path_ = path_ if isinstance(path_, Path) else Path(path_)
     self._creation_mode = creation_mode
     self._path = path_
     if self._ext_allowed:
         msg_assert_err = f'{path_.suffix} != {self._ext_allowed}. Force with support_ext = ".xyz" '
         assert self._path.suffix in self._ext_allowed, msg_assert_err
     # Upper-case extension without the dot.
     suffix = self._path.suffix
     self._ext_without_point = suffix.strip(".").upper()
     self._built = True  # next setattr will be saved in history to be future recovery
     self._data = self.parse(data) if creation_mode else self._load(**kwargs)
Beispiel #15
0
    def save(self, save_on_dir: str, take_split: bool = True, test_max_size: int = None, source_vocab_size: int = None,
             target_vocab_size: int = None, shuffle=True, prefix=None, ext='align', **kwargs):
        """Write the source and target data as separate files in ``save_on_dir``.

        With ``take_split`` the data is first split by ``self.split_data`` and
        four files are written (train/test for each language). Otherwise the
        full source and target data are written as two files.

        File names follow ``<prefix>_<language>.<ext>``. In split mode the
        prefix is ``<prefix>_train``/``<prefix>_test``, or ``train``/``test``
        when ``prefix`` is None. Without a split and without a prefix the
        file is named ``<language>.<ext>``.

        :param save_on_dir: destination directory.
        :param take_split: whether to split into train and test sets.
        :param test_max_size: forwarded to ``split_data``.
        :param source_vocab_size: forwarded to ``split_data``.
        :param target_vocab_size: forwarded to ``split_data``.
        :param shuffle: forwarded to ``split_data``.
        :param prefix: optional file name prefix.
        :param ext: file extension, with or without the leading dot.
        :param kwargs: forwarded to the ``save`` call of each created file.
        """
        save_on_dir = Path(save_on_dir)
        ext = ext.strip(".")  # normalise once instead of per file
        if take_split:
            x_train, y_train, x_test, y_test = self.split_data(test_max_size=test_max_size,
                                                               source_vocab_size=source_vocab_size,
                                                               target_vocab_size=target_vocab_size,
                                                               take_parallel_data=False,
                                                               shuffle=shuffle)
            if prefix is not None:
                train_prefix, test_prefix = f'{prefix}_train', f'{prefix}_test'
            else:
                train_prefix, test_prefix = 'train', 'test'
            data_to_save = ((train_prefix, x_train, y_train), (test_prefix, x_test, y_test))
        else:
            data_to_save = ((prefix, self.source.data, self.target.data),)

        for name_prefix, x, y in data_to_save:
            for language, content in ((self.source_language, x), (self.target_language, y)):
                # A None prefix would otherwise be rendered as "None_<language>".
                if name_prefix is None:
                    file_name = f'{language}.{ext}'
                else:
                    file_name = f'{name_prefix}_{language}.{ext}'
                FileIO.create(save_on_dir.join(file_name), data=content).save(**kwargs)
Beispiel #16
0
 def from_url(cls, url: str, *args, **kwargs):
     """Fetch ``url`` and build an instance from the response data.

     Extra positional and keyword arguments are forwarded to ``request.get``.

     :return: a new instance created in creation mode with the path
         ``./subtitle.srt`` when the response code is 200, otherwise ``None``.
     """
     # TODO: need improves
     resp = request.get(url, *args, **kwargs)
     if not resp.code == 200:
         return None
     return cls(Path('./subtitle.srt'), resp.data, creation_mode=True)
Beispiel #17
0
 def set_path(self, path_):
     """Point this object at ``path_``; a directory is rejected.

     :raises AssertionError: if ``path_`` is a directory.
     """
     p = Path(path_)
     points_to_directory = p.is_dir
     assert not points_to_directory, f"{p.path} is a directory."
     self._path = p
Beispiel #18
0
 def set_path(self, path_):
     """Replace the stored path with ``path_`` converted to ``Path``."""
     new_path = Path(path_)
     self._path = new_path
Beispiel #19
0
 def create(cls, path_: Union[Type[Path], str], data: Any, **kwargs) -> _FileIO:
     """Create a file object for ``path_`` holding ``data``.

     The class returned by ``cls.lookup`` for the path suffix does the work
     through its own ``create``; ``kwargs`` are forwarded to it.
     """
     path_ = Path(path_)
     handler = cls.lookup(path_.suffix)
     return handler.create(path_=path_, data=data, **kwargs)
Beispiel #20
0
 def load(cls, path_: Union[str, Path], **kwargs) -> _FileIO:
     """Load an existing file with the class ``cls.lookup`` returns for its suffix.

     :param path_: location of the file to load.
     :param kwargs: forwarded to the selected class.
     :raises FileNotFoundError: if ``path_`` does not exist.
     """
     path_ = Path(path_)
     if path_.exists:
         handler = cls.lookup(path_.suffix)
         return handler(path_=path_, **kwargs)
     raise FileNotFoundError(f"{path_.path} not found.")
Beispiel #21
0
 def load_from_clipboard(cls):
     """Load the file whose path is returned by ``clipboard()``."""
     copied_path = Path(clipboard())
     return cls.load(copied_path)