def save(self, on_new_path: Union[str, Path] = None, mode=None, exist_ok=False, overwrite=False, encoding=None,
         **kwargs):
    """
    Persist the in-memory data to disk, writing to a temporary file first and
    then copying it over the target path (avoids leaving a partially written file).

    :param on_new_path: if given, the file's path is changed before saving.
    :param mode: open() mode; defaults to this object's write mode. Must contain 'w'.
    :param exist_ok: allow writing over an existing file on disk.
    :param overwrite: skip the external-modification check.
    :param encoding: text encoding; defaults to 'utf-8' for text files. Ignored for byte files.
    :raises AssertionError: if the file changed on disk since last load, already exists
                            (without exist_ok), or is read-only.
    :raises IOError: if writing the temporary file fails.
    """
    # Guard against clobbering changes made to the file by another process.
    if (self._last_update is not None and overwrite is False) and not self._is_deleted:
        if self._last_update != self.updated_at:
            raise AssertionError(
                    f"File change detected (last change {self.updated_at}), if you want to overwrite "
                    f"set overwrite=True")
    if on_new_path is not None:
        self.set_path(on_new_path)
    assert exist_ok or not self.path.exists, FileExistsError(
            "File exists. If you want override, please send 'exist_ok=True'")
    assert self._only_read is False, f"{self.name} is only read."
    mode = mode or self._get_mode('w')
    assert 'w' in mode, f"{mode} for write isn't valid."
    # BUG FIX: the caller-supplied `encoding` was previously discarded and always
    # replaced; now it is honored for text files. Binary mode cannot take an encoding.
    if self._is_byte:
        encoding = None
    elif encoding is None:
        encoding = 'utf-8'
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_file = Path(tmpdirname).join(self.name)
        if not self._newline and not self._is_byte:
            kwargs.update({'newline': ''})
        try:
            with open(tmp_file, mode=mode, encoding=encoding, **kwargs) as fp:
                self._save_fp(fp)
        except Exception as err:
            # Chain the original exception so the root cause isn't lost.
            raise IOError(f'Error writing to the file system file: {err}') from err
        tmp_file.cp(self.path)
def test_sanity(self):
    """Sanity checks for Path: name/parent, join, equality, list_dir and suffix."""
    self.assertEqual(
            Path(__file__).parent,
            Path(os.path.realpath(__file__)).parent)
    p_test = 'cereja/test/sanity'
    p = Path(p_test)
    # BUG FIX: assertTrue(value, msg) always passes for a truthy value because the
    # second argument is the failure *message*; these were meant to compare values.
    self.assertEqual(p.name, 'sanity')
    self.assertEqual(p.parent.name, 'test')
    self.assertTrue(p == p_test)
    self.assertTrue('sanity' in p)
    p = p + ['/con/', 'cat']
    p_test = Path('cereja/test/sanity').join('con', 'cat')
    self.assertEqual(p_test.parts[-2:], ('con', 'cat'))
    self.assertTrue(p == p_test)
    self.assertListEqual(
            Path(__file__).parent.list_dir(only_name=True),
            list(map(lambda x: x.rsplit('.')[0], os.listdir(Path(__file__).parent))))
    with TempDir() as tmp_dir:
        tmp_dir = Path(tmp_dir)
        suffix_case = tmp_dir.join('test.suffix')
        self.assertEqual(suffix_case.suffix, '.suffix')
        mkdir(suffix_case)
        # Once the path exists as a directory, it no longer reports a suffix.
        self.assertEqual(suffix_case.suffix, '')
def unzip(cls, file_path, save_on: str = None, load_on_memory=False):
    """
    Extract a zip archive.

    When `save_on` is given, files are extracted under a subdirectory named after
    the archive; with `load_on_memory` the extracted files are loaded and returned.
    Otherwise only the archive's name list is returned.
    """
    with ZipFile(file_path, mode='r') as archive:
        if save_on or load_on_memory:
            with tempfile.TemporaryDirectory() as tmpdirname:
                # Extract into <target>/<archive stem>/ — tempdir if no save_on.
                target_dir = Path(save_on or tmpdirname).join(Path(archive.filename).stem)
                mkdir(target_dir)
                archive.extractall(target_dir)
                if load_on_memory:
                    return FileIO.load_files(target_dir)
        return archive.namelist()
def save(self, on_new_path: Union[str, Path] = None, exist_ok=False, overwrite=False, force=False, **kwargs):
    """
    Write the zip archive to disk via a temporary file, then copy it onto the
    target path and record the new modification time.

    :param on_new_path: if given, the file's path is changed before saving
                        (consistent with the other `save` implementations).
    :param exist_ok: accepted for interface compatibility.
    :param overwrite: accepted for interface compatibility.
    :param force: accepted for interface compatibility.
    :raises IOError: if building the archive fails.
    """
    # BUG FIX: `on_new_path` was accepted but silently ignored; honor it like
    # the text-file `save` does.
    if on_new_path is not None:
        self.set_path(on_new_path)
    # NOTE(review): exist_ok/overwrite/force are still unused here — confirm
    # whether the same pre-save checks as the text `save` were intended.
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_file = Path(tmpdirname).join(self.name)
        try:
            with ZipFile(tmp_file.path, mode='w', compression=ZIP_DEFLATED) as myzip:
                for i in self.data:
                    myzip.write(i)
        except Exception as err:
            # Chain the original exception so the root cause isn't lost.
            raise IOError(f'Error writing to the file system file: {err}') from err
        tmp_file.cp(self.path)
    self._last_update = self.updated_at
def test_sanity(self):
    """Round-trip a Tokenizer through encode/decode and JSON persistence."""
    with tempfile.TemporaryDirectory() as tempdir:
        sample = 'oi tudo tranquilo'
        tokenizer = Tokenizer(['oi amigo tudo bem'], preprocess_function=separate, use_unk=False)
        self.assertEqual(tokenizer.last_index, 13)
        encoded = tokenizer.encode([sample])
        expected_tokens = [[
            tokenizer._item_to_index['oi'],
            tokenizer._item_to_index['tudo'],
            tokenizer._item_to_index['tranquilo']
        ]]
        # Encoding the unseen word 'tranquilo' grows the vocabulary by one.
        self.assertEqual(tokenizer.last_index, 14)
        self.assertEqual(encoded, expected_tokens)
        decoded = ' '.join(tokenizer.decode(encoded[0]))
        self.assertEqual(decoded, sample)
        json_path = Path(tempdir).join('tokenizer.json')
        tokenizer.to_json(json_path)
        # Valid: reload from JSON and verify the state survived the round trip.
        tokenizer = Tokenizer.load_from_json(json_path)
        encoded = tokenizer.encode([sample])
        self.assertEqual(tokenizer.last_index, 14)
        self.assertEqual(encoded, expected_tokens)
        decoded = ' '.join(tokenizer.decode(encoded[0]))
        self.assertEqual(decoded, sample)
def load_files(cls, path_, ext=None, contains_in_name: List = (), not_contains_in_name=(), take_empty=True,
               recursive=False):
    """
    Load files under `path_` as file objects, with optional name/extension filters.

    :param path_: directory (or path) to scan.
    :param ext: only keep files with this extension; empty/None keeps all.
    :param contains_in_name: keep only files whose stem contains any of these substrings.
    :param not_contains_in_name: drop files whose stem contains any of these substrings.
    :param take_empty: NOTE(review): despite the name, True causes empty files to be
                       *skipped* — confirm the intended semantics before relying on it.
    :param recursive: descend into subdirectories.
    :return: list of loaded file objects.
    """
    ext = ext or ''
    f_paths = Path.list_files(path_, ext=ext, contains_in_name=contains_in_name,
                              not_contains_in_name=not_contains_in_name, recursive=recursive)
    loaded = []
    for p in f_paths:
        # Recurse into directories when requested; list_files may yield them.
        if recursive and p.is_dir:
            loaded.extend(cls.load_files(path_=p, ext=ext, contains_in_name=contains_in_name,
                                         not_contains_in_name=not_contains_in_name, take_empty=take_empty,
                                         recursive=recursive))
            continue
        if not p.exists or p.is_dir:
            continue
        file_ = cls.load(path_=p)
        if file_ is None:
            continue
        # See take_empty note above: True skips empty files.
        if take_empty is True and file_.is_empty:
            continue
        # Re-check the extension on the loaded object (list_files already filtered by ext;
        # presumably this guards against loaders changing the effective extension).
        if not (file_.ext == f'.{ext.strip(".")}' or ext == ''):
            continue
        if contains_in_name:
            if not any(map(file_.name_without_ext.__contains__, contains_in_name)):
                continue
        if not_contains_in_name:
            if any(map(file_.name_without_ext.__contains__, not_contains_in_name)):
                continue
        loaded.append(file_)
    return loaded
def create(
        cls, path_: Union[Type[Path], str], data: Any,
        **kwargs) -> Union[_TxtIO, _JsonIO, _Mp4IO, _CsvIO, _GenericFile]:
    """Instantiate the IO class registered for the path's suffix, in creation mode."""
    resolved = Path(path_)
    io_cls = cls.lookup(resolved.suffix)
    return io_cls(path_=resolved, data=data, creation_mode=True, **kwargs)
def test_eval(self):
    """Saving then reloading preserves mixed python-literal data."""
    with tempfile.TemporaryDirectory() as tempdir:
        payload = [1, 2, 3, 'hi', ['oi']]
        created = FileIO.create(Path(tempdir).join(self.file.name), payload)
        self.assertEqual(created.data, payload)
        created.save()
        # Reload from disk and compare against the original payload.
        reloaded = FileIO.load(created.path)
        self.assertEqual(reloaded.data, payload)
def test_sanity(self):
    """Save, reload, verify and delete the fixture file."""
    console.log(f'Testing {self.name}')
    with tempfile.TemporaryDirectory() as tempdir:
        self.file.set_path(Path(tempdir).join(self.file.name))
        expected_data = self.file.data
        self.file.save(exist_ok=True)
        loaded = FileIO.load(self.file.path, **self.load_kwargs)
        self.assertTrue(loaded.path.exists, msg="File don't exist.")
        self.assertEqual(loaded.data, expected_data, msg='Data corrupted')
        # Deleting must make the path disappear from disk.
        loaded.delete()
        self.assertFalse(loaded.path.exists)
def __init__(self, path_: Path, data=None, creation_mode=False, **kwargs):
    """
    Build a file object for `path_`.

    In creation mode `data` is parsed into the internal representation;
    otherwise the content is read from disk via `load`.
    """
    super().__init__()
    path_ = path_ if isinstance(path_, Path) else Path(path_)
    self._creation_mode = creation_mode
    self._path = path_
    if self._ext_allowed:
        # Refuse paths whose extension this class does not handle.
        assert self._path.suffix in self._ext_allowed, f'{path_.suffix} != {self._ext_allowed}'
    self._ext_without_point = self._path.suffix.strip(".").upper()
    self._data = self.parse(data) if creation_mode else self.load(**kwargs)
def __init__(self, path_: Path, data=None, creation_mode=False, **kwargs):
    """
    Build a file object for `path_`.

    In creation mode `data` is parsed into the internal representation;
    otherwise the content is read from disk via `_load`.
    """
    super().__init__()
    path_ = path_ if isinstance(path_, Path) else Path(path_)
    self._creation_mode = creation_mode
    self._path = path_
    if self._ext_allowed:
        # Refuse paths whose extension this class does not handle.
        assert self._path.suffix in self._ext_allowed, f'{path_.suffix} != {self._ext_allowed}'
    self._ext_without_point = self._path.suffix.strip(".").upper()
    self._built = True  # next setattr will be saved in history to be future recovery
    self._data = self.parse(data) if creation_mode else self._load(**kwargs)
def parse(self, data):
    """
    Normalize `data` into a list of existing Path objects.

    Accepts a single path (str/Path) or a sequence of paths; an empty/falsy
    value yields an empty list.

    :raises ValueError: if `data` is not a sequence of paths, or a path does not exist.
    """
    if not data:
        return []
    if isinstance(data, (str, Path)):
        data = [data]
    if not is_sequence(data):
        raise ValueError("Send list of paths invalid.")
    parsed = []
    for item in data:
        path_obj = Path(item)
        # Every path must already exist on disk.
        if not path_obj.exists:
            raise ValueError(f'{path_obj.path} not Found.')
        parsed.append(path_obj)
    return parsed
def save_freq(self, save_on: str, prefix='freq', ext: str = 'json', probability=False):
    """
    Write the word and phrase frequency maps as two JSON files under `save_on`.

    Files are named '<prefix>_words.<ext>' and '<prefix>_phrases.<ext>'.
    """
    ext = ext.strip('.')  # normalize
    base_dir = Path(save_on)
    words_path = base_dir.join(f'{prefix}_words.{ext}')
    self.words_freq.to_json(words_path, probability=probability, exist_ok=True)
    phrases_path = base_dir.join(f'{prefix}_phrases.{ext}')
    self.phrases_freq.to_json(phrases_path, probability=probability, exist_ok=True)
def __init__(self, path_: Path, data=None, creation_mode=False, is_byte=None, **kwargs):
    """
    Build a file object for `path_`, optionally forcing byte mode.

    In creation mode `data` is parsed into the internal representation;
    otherwise the content is read from disk via `_load`.
    """
    super().__init__()
    if is_byte is not None:
        self._is_byte = bool(is_byte)
    path_ = path_ if isinstance(path_, Path) else Path(path_)
    self._creation_mode = creation_mode
    self._path = path_
    if self._ext_allowed:
        # Refuse paths whose extension this class does not handle.
        msg_assert_err = f'{path_.suffix} != {self._ext_allowed}. Force with support_ext = ".xyz" '
        assert self._path.suffix in self._ext_allowed, msg_assert_err
    self._ext_without_point = self._path.suffix.strip(".").upper()
    self._built = True  # next setattr will be saved in history to be future recovery
    self._data = self.parse(data) if creation_mode else self._load(**kwargs)
def save(self, save_on_dir: str, take_split: bool = True, test_max_size: int = None, source_vocab_size: int = None,
         target_vocab_size: int = None, shuffle=True, prefix=None, ext='align', **kwargs):
    """
    Persist source/target data as aligned files under `save_on_dir`.

    With `take_split` the data is first split into train/test sets; each set is
    written as '<prefix>_<language>.<ext>' per language.
    """
    target_dir = Path(save_on_dir)
    if take_split:
        x_train, y_train, x_test, y_test = self.split_data(test_max_size=test_max_size,
                                                           source_vocab_size=source_vocab_size,
                                                           target_vocab_size=target_vocab_size,
                                                           take_parallel_data=False,
                                                           shuffle=shuffle)
        if prefix is not None:
            train_prefix, test_prefix = f'{prefix}_train', f'{prefix}_test'
        else:
            train_prefix, test_prefix = 'train', 'test'
        batches = ((train_prefix, x_train, y_train), (test_prefix, x_test, y_test))
    else:
        # NOTE(review): with take_split=False and prefix=None the files come out
        # named 'None_<language>.<ext>' — confirm this is intended.
        batches = ((prefix, self.source.data, self.target.data),)
    for file_prefix, source_data, target_data in batches:
        source_path = target_dir.join(f'{file_prefix}_{self.source_language}.{ext.strip(".")}')
        FileIO.create(source_path, data=source_data).save(**kwargs)
        target_path = target_dir.join(f'{file_prefix}_{self.target_language}.{ext.strip(".")}')
        FileIO.create(target_path, data=target_data).save(**kwargs)
def from_url(cls, url: str, *args, **kwargs):  # TODO: need improves
    """
    Fetch subtitle content from `url` and wrap it in a new instance.

    Returns None (implicitly) when the response status is not 200.
    """
    response = request.get(url, *args, **kwargs)
    if response.code != 200:
        return None
    return cls(Path('./subtitle.srt'), response.data, creation_mode=True)
def set_path(self, path_):
    """
    Change this file's path.

    :param path_: new path (str or Path); must not be a directory.
    :raises AssertionError: if `path_` points to a directory.
    """
    p = Path(path_)
    # BUG FIX: `assert` is stripped under `python -O`, so the directory check
    # could silently disappear. Raise explicitly instead; AssertionError is
    # kept so existing callers that catch it still work.
    if p.is_dir:
        raise AssertionError(f"{p.path} is a directory.")
    self._path = p
def set_path(self, path_):
    """Replace this object's path with `path_`, coerced to a Path."""
    new_path = Path(path_)
    self._path = new_path
def create(cls, path_: Union[Type[Path], str], data: Any, **kwargs) -> _FileIO:
    """Delegate creation to the IO class registered for the path's suffix."""
    resolved = Path(path_)
    handler = cls.lookup(resolved.suffix)
    return handler.create(path_=resolved, data=data, **kwargs)
def load(cls, path_: Union[str, Path], **kwargs) -> _FileIO:
    """
    Load an existing file, dispatching on its suffix.

    :raises FileNotFoundError: if the path does not exist.
    """
    resolved = Path(path_)
    if not resolved.exists:
        raise FileNotFoundError(f"{resolved.path} not found.")
    handler = cls.lookup(resolved.suffix)
    return handler(path_=resolved, **kwargs)
def load_from_clipboard(cls):
    """Treat the clipboard's current content as a path and load that file."""
    clipboard_path = Path(clipboard())
    return cls.load(clipboard_path)