class Storage: def __init__(self, session: Session, logger=None, component: ComponentType = None, max_file_size: int = 10**5, max_count=10**3): self.file_provider = FileProvider(session) self.provider = DagStorageProvider(session) self.task_provider = TaskProvider(session) self.library_provider = DagLibraryProvider(session) self.dag_provider = DagProvider(session) self.logger = logger self.component = component self.max_file_size = max_file_size self.max_count = max_count def log_info(self, message: str): if self.logger: self.logger.info(message, self.component) def copy_from(self, src: int, dag: Dag): storages = self.provider.query(DagStorage). \ filter(DagStorage.dag == src). \ all() libraries = self.library_provider.query(DagLibrary). \ filter(DagLibrary.dag == src). \ all() s_news = [] for s in storages: s_new = DagStorage(dag=dag.id, file=s.file, path=s.path, is_dir=s.is_dir) s_news.append(s_new) l_news = [] for l in libraries: l_new = DagLibrary(dag=dag.id, library=l.library, version=l.version) l_news.append(l_new) self.provider.add_all(s_news) self.library_provider.add_all(l_news) def _build_spec(self, folder: str): ignore_file = os.path.join(folder, 'file.ignore.txt') if not os.path.exists(ignore_file): ignore_patterns = [] else: ignore_patterns = read_lines(ignore_file) ignore_patterns.extend( ['log', '/data', '/models', '__pycache__', '*.ipynb']) return pathspec.PathSpec.from_lines( pathspec.patterns.GitWildMatchPattern, ignore_patterns) def upload(self, folder: str, dag: Dag, control_reqs: bool = True): self.log_info('upload started') hashs = self.file_provider.hashs(dag.project) self.log_info('hashes are retrieved') all_files = [] spec = self._build_spec(folder) files = glob(os.path.join(folder, '**')) for file in files[:]: path = os.path.relpath(file, folder) if spec.match_file(path) or path == '.': continue if os.path.isdir(file): child_files = glob(os.path.join(folder, file, '**'), recursive=True) files.extend(child_files) if self.max_count and len(files) > self.max_count: raise Exception(f'files count = {len(files)} ' f'But max count = {self.max_count}') self.log_info('list of files formed') folders_to_add = [] files_to_add = [] files_storage_to_add = [] total_size_added = 0 for o in files: path = os.path.relpath(o, folder) if spec.match_file(path) or path == '.': continue if isdir(o): folder_to_add = DagStorage(dag=dag.id, path=path, is_dir=True) folders_to_add.append(folder_to_add) continue content = open(o, 'rb').read() size = sys.getsizeof(content) if self.max_file_size and size > self.max_file_size: raise Exception( f'file = {o} has size {size}.' f' But max size is set to {self.max_file_size}') md5 = hashlib.md5(content).hexdigest() all_files.append(o) if md5 not in hashs: file = File(md5=md5, content=content, project=dag.project, dag=dag.id, created=now()) hashs[md5] = file files_to_add.append(file) total_size_added += size file_storage = DagStorage(dag=dag.id, path=path, file=hashs[md5], is_dir=False) files_storage_to_add.append(file_storage) self.log_info('inserting DagStorage folders') if len(folders_to_add) > 0: self.provider.bulk_save_objects(folders_to_add) self.log_info('inserting Files') if len(files_to_add) > 0: self.file_provider.bulk_save_objects(files_to_add, return_defaults=True) self.log_info('inserting DagStorage Files') if len(files_storage_to_add) > 0: for file_storage in files_storage_to_add: if isinstance(file_storage.file, File): # noinspection PyUnresolvedReferences file_storage.file = file_storage.file.id self.provider.bulk_save_objects(files_storage_to_add) dag.file_size += total_size_added self.dag_provider.update() if INSTALL_DEPENDENCIES and control_reqs: reqs = control_requirements(folder, files=all_files) for name, rel, version in reqs: self.library_provider.add( DagLibrary(dag=dag.id, library=name, version=version)) def download_dag(self, dag: int, folder: str): os.makedirs(folder, exist_ok=True) items = self.provider.by_dag(dag) items = sorted(items, key=lambda x: x[1] is not None) for item, file in items: path = os.path.join(folder, item.path) if item.is_dir: os.makedirs(path, exist_ok=True) else: with open(path, 'wb') as f: f.write(file.content) def download(self, task: int): task = self.task_provider.by_id( task, joinedload(Task.dag_rel, innerjoin=True)) folder = join(TASK_FOLDER, str(task.id)) self.download_dag(task.dag, folder) config = Config.from_yaml(task.dag_rel.config) info = config['info'] try: data_folder = os.path.join(DATA_FOLDER, info['project']) os.makedirs(data_folder, exist_ok=True) os.symlink(data_folder, os.path.join(folder, 'data'), target_is_directory=True) except FileExistsError: pass try: model_folder = os.path.join(MODEL_FOLDER, info['project']) os.makedirs(model_folder, exist_ok=True) os.symlink(model_folder, os.path.join(folder, 'models'), target_is_directory=True) except FileExistsError: pass sys.path.insert(0, folder) return folder def import_executor(self, folder: str, base_folder: str, executor: str, libraries: List[Tuple] = None): sys.path.insert(0, base_folder) spec = self._build_spec(folder) was_installation = False folders = [ p for p in glob(f'{folder}/*', recursive=True) if os.path.isdir(p) and not spec.match_file(p) ] folders += [folder] library_names = set(n for n, v in (libraries or [])) library_versions = {n: v for n, v in (libraries or [])} for n in library_names: try: version = pkg_resources.get_distribution(n).version need_install = library_versions[n] != version except Exception: need_install = True if INSTALL_DEPENDENCIES and need_install: os.system(f'pip install {n}=={library_versions[n]}') was_installation = True def is_valid_class(cls: pyclbr.Class): return cls.name == executor or \ cls.name.lower() == executor or \ to_snake(cls.name) == executor def relative_name(path: str): rel = os.path.relpath(path, base_folder) parts = [str(p).split('.')[0] for p in rel.split(os.sep)] return '.'.join(parts) for (module_loader, module_name, ispkg) in pkgutil.iter_modules(folders): module = module_loader.find_module(module_name) rel_path = os.path.relpath( os.path.splitext(module.path)[0], base_folder).replace('/', '.') try: classes = pyclbr.readmodule(rel_path, path=[base_folder]) except Exception: continue for k, v in classes.items(): if is_valid_class(v): importlib.import_module(relative_name(module.path)) return True, was_installation return False, was_installation
class Storage: def __init__(self, session: Session): self.file_provider = FileProvider(session) self.provider = DagStorageProvider(session) self.task_provider = TaskProvider(session) self.library_provider = DagLibraryProvider(session) def copy_from(self, src: int, dag: Dag): storages = self.provider.query(DagStorage). \ filter(DagStorage.dag == src). \ all() libraries = self.library_provider.query(DagLibrary). \ filter(DagLibrary.dag == src). \ all() s_news = [] for s in storages: s_new = DagStorage(dag=dag.id, file=s.file, path=s.path, is_dir=s.is_dir) s_news.append(s_new) l_news = [] for l in libraries: l_new = DagLibrary(dag=dag.id, library=l.library, version=l.version) l_news.append(l_new) self.provider.add_all(s_news) self.library_provider.add_all(l_news) def _build_spec(self, folder: str): ignore_file = os.path.join(folder, 'file.ignore.txt') if not os.path.exists(ignore_file): ignore_patterns = [] else: ignore_patterns = read_lines(ignore_file) ignore_patterns.extend(['log', 'data', 'models', '__pycache__']) return pathspec.PathSpec.from_lines( pathspec.patterns.GitWildMatchPattern, ignore_patterns) def upload(self, folder: str, dag: Dag, control_reqs: bool = True): hashs = self.file_provider.hashs(dag.project) files = [] all_files = [] spec = self._build_spec(folder) for o in glob(os.path.join(folder, '**'), recursive=True): path = os.path.relpath(o, folder) if spec.match_file(path) or path == '.': continue if isdir(o): self.provider.add( DagStorage(dag=dag.id, path=path, is_dir=True)) continue content = open(o, 'rb').read() md5 = hashlib.md5(content).hexdigest() all_files.append(o) if md5 in hashs: file_id = hashs[md5] else: file = File(md5=md5, content=content, project=dag.project, dag=dag.id, created=now()) self.file_provider.add(file) file_id = file.id hashs[md5] = file.id files.append(o) self.provider.add( DagStorage(dag=dag.id, path=path, file=file_id, is_dir=False)) if INSTALL_DEPENDENCIES and control_reqs: reqs = control_requirements(folder, files=all_files) for name, rel, version in reqs: self.library_provider.add( DagLibrary(dag=dag.id, library=name, version=version)) def download(self, task: int): task = self.task_provider.by_id( task, joinedload(Task.dag_rel, innerjoin=True)) folder = join(TASK_FOLDER, str(task.id)) os.makedirs(folder, exist_ok=True) items = self.provider.by_dag(task.dag) items = sorted(items, key=lambda x: x[1] is not None) for item, file in items: path = os.path.join(folder, item.path) if item.is_dir: os.makedirs(path, exist_ok=True) else: with open(path, 'wb') as f: f.write(file.content) config = Config.from_yaml(task.dag_rel.config) info = config['info'] try: data_folder = os.path.join(DATA_FOLDER, info['project']) os.makedirs(data_folder, exist_ok=True) os.symlink(data_folder, os.path.join(folder, 'data'), target_is_directory=True) except FileExistsError: pass try: model_folder = os.path.join(MODEL_FOLDER, info['project']) os.makedirs(model_folder, exist_ok=True) os.symlink(model_folder, os.path.join(folder, 'models'), target_is_directory=True) except FileExistsError: pass sys.path.insert(0, folder) return folder def import_executor(self, folder: str, base_folder: str, executor: str, libraries: List[Tuple] = None): sys.path.insert(0, base_folder) spec = self._build_spec(folder) was_installation = False folders = [ p for p in glob(f'{folder}/*', recursive=True) if os.path.isdir(p) and not spec.match_file(p) ] folders += [folder] library_names = set(n for n, v in (libraries or [])) library_versions = {n: v for n, v in (libraries or [])} for n in library_names: try: version = pkg_resources.get_distribution(n).version need_install = library_versions[n] != version except Exception: need_install = True if INSTALL_DEPENDENCIES and need_install: os.system(f'pip install {n}=={library_versions[n]}') was_installation = True def is_valid_class(cls: pyclbr.Class): super_names = get_super_names(cls) if 'Executor' not in super_names: return False return cls.name == executor or \ cls.name.lower() == executor or \ to_snake(cls.name) == executor def relative_name(path: str): rel = os.path.relpath(path, base_folder) parts = [str(p).split('.')[0] for p in rel.split(os.sep)] return '.'.join(parts) for (module_loader, module_name, ispkg) in pkgutil.iter_modules(folders): module = module_loader.find_module(module_name) module_folder = dirname(module.path) classes = pyclbr.readmodule(module_name, path=[module_folder]) for k, v in classes.items(): if is_valid_class(v): importlib.import_module(relative_name(module.path)) return True, was_installation return False, was_installation