def save(self, save_on_dir: str, take_split: bool = True, test_max_size: int = None, source_vocab_size: int = None,
         target_vocab_size: int = None, shuffle=True, prefix=None, ext='align', **kwargs):
    """Write the source and target sides of the corpus to per-language files.

    For every data set to be saved, two files are created inside
    ``save_on_dir``: one named after ``self.source_language`` and one named
    after ``self.target_language``.

    :param save_on_dir: directory that receives the files.
    :param take_split: when true, the data comes from ``self.split_data`` and
        both a train and a test set are written; otherwise
        ``self.source.data`` / ``self.target.data`` are written as a single set.
    :param test_max_size: forwarded to ``self.split_data``.
    :param source_vocab_size: forwarded to ``self.split_data``.
    :param target_vocab_size: forwarded to ``self.split_data``.
    :param shuffle: forwarded to ``self.split_data``.
    :param prefix: optional file name prefix. With ``take_split`` the files
        are prefixed ``<prefix>_train`` / ``<prefix>_test`` (or plain
        ``train`` / ``test`` when no prefix is given). Without
        ``take_split`` the files are named ``<prefix>_<language>.<ext>``, or
        just ``<language>.<ext>`` when no prefix is given.
    :param ext: file extension; leading/trailing dots are stripped.
    :param kwargs: forwarded to the ``save`` call of each created file.
    """
    save_on_dir = Path(save_on_dir)
    # Loop-invariant: normalise the extension once instead of per file.
    extension = ext.strip(".")

    if take_split:
        x_train, y_train, x_test, y_test = self.split_data(
            test_max_size=test_max_size,
            source_vocab_size=source_vocab_size,
            target_vocab_size=target_vocab_size,
            take_parallel_data=False,
            shuffle=shuffle)
        if prefix is not None:
            train_prefix, test_prefix = f'{prefix}_train', f'{prefix}_test'
        else:
            train_prefix, test_prefix = 'train', 'test'
        data_to_save = ((train_prefix, x_train, y_train), (test_prefix, x_test, y_test))
    else:
        data_to_save = ((prefix, self.source.data, self.target.data),)

    for file_prefix, x, y in data_to_save:
        for language, data in ((self.source_language, x), (self.target_language, y)):
            # A missing prefix must not be rendered as the literal text "None"
            # in the file name, so it is simply left out.
            stem = language if file_prefix is None else f'{file_prefix}_{language}'
            save_on = save_on_dir.join(f'{stem}.{extension}')
            FileIO.create(save_on, data=data).save(**kwargs)
def load_corpus_from_csv(cls, path_: str, src_col_name: str, trg_col_name: str, source_name=None,
                         target_name=None):
    """Build a corpus from two columns of a CSV file.

    The file at ``path_`` is loaded through ``FileIO`` and parsed with
    ``csv.DictReader``. The values found under ``src_col_name`` and
    ``trg_col_name`` are collected row by row and handed to ``cls``.

    :param path_: location of the CSV file.
    :param src_col_name: column holding the source-side values.
    :param trg_col_name: column holding the target-side values.
    :param source_name: forwarded to ``cls`` as ``source_name``.
    :param target_name: forwarded to ``cls`` as ``target_name``.
    :raises ValueError: if a row does not contain one of the two columns.
    :return: the object produced by calling ``cls`` with the collected data.
    """
    reader = csv.DictReader(FileIO.load(path_).data)
    required_columns = (src_col_name, trg_col_name)
    source_values, target_values = [], []

    for row in reader:
        # Validate both columns before reading any value from this row.
        for column in required_columns:
            if column in row:
                continue
            raise ValueError(f"Not found col <{column}> in {list(row.keys())}")
        source_values.append(row[src_col_name])
        target_values.append(row[trg_col_name])

    return cls(source_values, target_values, source_name=source_name, target_name=target_name)
def save(self, filepath: str):
    """Write this object's language data to ``filepath``.

    The file is created through ``FileIO.create`` and saved with
    ``exist_ok=True``.

    :param filepath: destination of the saved data.
    """
    output_file = FileIO.create(filepath, data=self.__language_data)
    output_file.save(exist_ok=True)
def load(self, filepath: str):
    """Replace this object's language data with the content of ``filepath``.

    The instance is flagged as a model (``_is_model``) and its language data
    is taken from the ``data`` attribute of the file loaded via ``FileIO``.

    :param filepath: location of the previously saved data.
    :return: this same instance, so calls can be chained.
    """
    # The flag is set before loading, exactly as the data assignment follows it.
    self._is_model = True
    loaded_file = FileIO.load(filepath)
    self.__language_data = loaded_file.data
    return self
def save(self, path_, **kwargs):
    """Write the held source code (``self._source_code``) to a ``.py`` file.

    :param path_: destination path; its suffix must be ``.py``.
    :param kwargs: forwarded to the ``save`` call of the created file.
    :raises AssertionError: if ``path_`` does not end in ``.py``.
    """
    from cereja import FileIO, Path

    # Raised explicitly rather than via ``assert`` so the check is still
    # enforced when Python runs with -O (which strips assert statements).
    # AssertionError is kept so existing callers see the same exception type.
    if Path(path_).suffix != '.py':
        raise AssertionError("Only python source code.")
    FileIO.create(path_, self._source_code).save(**kwargs)