def update_code():
    """Persist edited file content coming from the client.

    Reads the request payload, and either updates the existing file row
    (same dag) or creates a fresh file row and repoints the storage entry
    (different dag). Returns the id of the file that now holds the content,
    or None when the content is unchanged.
    """
    payload = request_data()
    provider = FileProvider(_write_session)
    file = provider.by_id(payload['file_id'])

    encoded = payload['content'].encode('utf-8')
    digest = hashlib.md5(encoded).hexdigest()

    # Content identical to what is stored — nothing to persist.
    if digest == file.md5:
        return

    if file.dag == payload['dag']:
        # Same dag: overwrite the existing record in place.
        file.content = encoded
        file.md5 = digest
        provider.commit()
        return {'file': file.id}

    # Different dag: keep the old file untouched, add a new one and
    # point the dag's storage entry at it.
    new_file = File(md5=digest, content=encoded,
                    project=file.project, dag=payload['dag'])
    provider.add(new_file)
    storage = DagStorageProvider(_write_session).by_id(payload['storage'])
    storage.file = new_file.id
    provider.commit()
    return {'file': new_file.id}
def create_providers(self):
    """Instantiate the providers this builder needs, all bound to one session."""
    self.log_info('create_providers')

    session = self.session
    self.dag_provider = DagProvider(session)
    self.task_provider = TaskProvider(session)
    self.file_provider = FileProvider(session)
    self.dag_storage_provider = DagStorageProvider(session)
def __init__(self, session: Session, logger=None,
             component: ComponentType = None,
             max_file_size: int = 10 ** 5, max_count=10 ** 3):
    """Wire up all storage-related providers and remember upload limits.

    :param session: database session shared by every provider
    :param logger: optional logger; when None, logging is skipped
    :param component: component tag passed along with log messages
    :param max_file_size: per-file size cap for uploads
    :param max_count: cap on the number of paths considered per upload
    """
    # Every provider shares the same session so they commit together.
    self.provider = DagStorageProvider(session)
    self.file_provider = FileProvider(session)
    self.dag_provider = DagProvider(session)
    self.task_provider = TaskProvider(session)
    self.library_provider = DagLibraryProvider(session)

    self.logger = logger
    self.component = component

    self.max_file_size = max_file_size
    self.max_count = max_count
def code():
    """Return a dag's stored files as a tree of nodes, sorted with
    directories first.

    Relies on the provider yielding directory rows before the rows of
    their children, so a child's parent node already exists in ``by_path``.
    """
    dag_id = request_data()
    roots = OrderedDict()   # top-level nodes keyed by basename
    by_path = dict()        # directory path -> its tree node

    for storage, file in DagStorageProvider(_read_session).by_dag(dag_id):
        storage.path = storage.path.strip()
        parent_path = os.path.dirname(storage.path)
        base = os.path.basename(storage.path)
        if not base:
            continue

        if storage.is_dir:
            node = {
                'name': base,
                'children': [],
                'id': storage.id,
                'dag': dag_id,
                'storage': storage.id
            }
            if parent_path:
                # noinspection PyUnresolvedReferences
                by_path[parent_path]['children'].append(node)
                by_path[os.path.join(parent_path, base)] = node
            else:
                roots[base] = node
                by_path[storage.path] = node
            continue

        node = {'name': base, 'id': file.id, 'dag': dag_id,
                'storage': storage.id}
        # Binary files that cannot be decoded are shown as empty.
        try:
            node['content'] = file.content.decode('utf-8')
        except UnicodeDecodeError:
            node['content'] = ''

        if parent_path:
            # noinspection PyUnresolvedReferences
            by_path[parent_path]['children'].append(node)
        else:
            roots[base] = node

    def order_key(n):
        # Non-empty folders sort before files via an underscore sentinel.
        if n.get('children'):
            return '_' * 5 + n['name']
        return n['name']

    def order_tree(n):
        kids = n.get('children')
        if kids:
            n['children'] = sorted(kids, key=order_key)
            for child in n['children']:
                order_tree(child)

    items = sorted(roots.values(), key=order_key)
    for item in items:
        order_tree(item)
    return {'items': items}
class Storage:
    """Stores a local code folder in the database per dag and restores it
    to disk for task execution.

    Files are deduplicated by md5 within a project (``File`` rows);
    the folder layout lives in ``DagStorage`` rows. ``max_file_size`` and
    ``max_count`` guard against uploading huge working directories.
    """

    def __init__(self, session: Session, logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10 ** 5, max_count=10 ** 3):
        # All providers share one session so writes land together.
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)
        self.dag_provider = DagProvider(session)

        self.logger = logger
        self.component = component
        # Limits for upload(); a falsy value disables the corresponding check.
        self.max_file_size = max_file_size
        self.max_count = max_count

    def log_info(self, message: str):
        # Logging is optional; do nothing when no logger was supplied.
        if self.logger:
            self.logger.info(message, self.component)

    def copy_from(self, src: int, dag: Dag):
        """Copy all storage rows and library pins of dag ``src`` into
        ``dag``. File content is shared — only the dag reference changes."""
        storages = self.provider.query(DagStorage). \
            filter(DagStorage.dag == src). \
            all()
        libraries = self.library_provider.query(DagLibrary). \
            filter(DagLibrary.dag == src). \
            all()

        s_news = []
        for s in storages:
            s_new = DagStorage(dag=dag.id, file=s.file, path=s.path,
                               is_dir=s.is_dir)
            s_news.append(s_new)

        l_news = []
        for l in libraries:
            l_new = DagLibrary(dag=dag.id, library=l.library,
                               version=l.version)
            l_news.append(l_new)

        self.provider.add_all(s_news)
        self.library_provider.add_all(l_news)

    def _build_spec(self, folder: str):
        """Build a gitignore-style PathSpec from ``file.ignore.txt`` (when
        present) plus built-in patterns for logs, data, models, caches and
        notebooks."""
        ignore_file = os.path.join(folder, 'file.ignore.txt')
        if not os.path.exists(ignore_file):
            ignore_patterns = []
        else:
            ignore_patterns = read_lines(ignore_file)
        ignore_patterns.extend(
            ['log', '/data', '/models', '__pycache__', '*.ipynb'])
        return pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, ignore_patterns)

    def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
        """Walk ``folder`` and store its files/folders for ``dag``.

        Deduplicates file content by md5 against the project's existing
        hashes, bulk-inserts new rows, updates ``dag.file_size`` and,
        when enabled, records library requirements for the dag.

        :raises Exception: when the file count or a single file's size
            exceeds the configured limits.
        """
        self.log_info('upload started')

        # md5 -> existing file id (or, below, a freshly built File object).
        hashs = self.file_provider.hashs(dag.project)
        self.log_info('hashes are retrieved')

        all_files = []
        spec = self._build_spec(folder)

        files = glob(os.path.join(folder, '**'))
        # Iterate a snapshot; directories found here append their children
        # onto the live ``files`` list.
        for file in files[:]:
            path = os.path.relpath(file, folder)
            if spec.match_file(path) or path == '.':
                continue

            if os.path.isdir(file):
                # NOTE(review): ``file`` already includes the ``folder``
                # prefix from the outer glob, so joining folder+file may
                # double the path unless ``folder`` is absolute — confirm.
                child_files = glob(os.path.join(folder, file, '**'),
                                   recursive=True)
                files.extend(child_files)

            if self.max_count and len(files) > self.max_count:
                raise Exception(f'files count = {len(files)} '
                                f'But max count = {self.max_count}')

        self.log_info('list of files formed')

        folders_to_add = []
        files_to_add = []
        files_storage_to_add = []
        total_size_added = 0

        for o in files:
            path = os.path.relpath(o, folder)
            if spec.match_file(path) or path == '.':
                continue

            if isdir(o):
                folder_to_add = DagStorage(dag=dag.id, path=path,
                                           is_dir=True)
                folders_to_add.append(folder_to_add)
                continue

            # NOTE(review): file handle is never closed explicitly;
            # consider a ``with`` block.
            content = open(o, 'rb').read()
            # Object size in bytes (slightly above the raw file size).
            size = sys.getsizeof(content)
            if self.max_file_size and size > self.max_file_size:
                raise Exception(
                    f'file = {o} has size {size}.'
                    f' But max size is set to {self.max_file_size}')

            md5 = hashlib.md5(content).hexdigest()
            all_files.append(o)

            if md5 not in hashs:
                # New content: the File object itself is stored in the map
                # until bulk_save assigns it an id.
                file = File(md5=md5, content=content, project=dag.project,
                            dag=dag.id, created=now())
                hashs[md5] = file
                files_to_add.append(file)
                total_size_added += size

            file_storage = DagStorage(dag=dag.id, path=path,
                                      file=hashs[md5], is_dir=False)
            files_storage_to_add.append(file_storage)

        self.log_info('inserting DagStorage folders')
        if len(folders_to_add) > 0:
            self.provider.bulk_save_objects(folders_to_add)

        self.log_info('inserting Files')
        if len(files_to_add) > 0:
            # return_defaults=True makes the db assign ids we need below.
            self.file_provider.bulk_save_objects(files_to_add,
                                                 return_defaults=True)

        self.log_info('inserting DagStorage Files')
        if len(files_storage_to_add) > 0:
            # Replace File objects stashed in ``file`` with their now-known ids.
            for file_storage in files_storage_to_add:
                if isinstance(file_storage.file, File):
                    # noinspection PyUnresolvedReferences
                    file_storage.file = file_storage.file.id
            self.provider.bulk_save_objects(files_storage_to_add)

        dag.file_size += total_size_added
        self.dag_provider.update()

        if INSTALL_DEPENDENCIES and control_reqs:
            reqs = control_requirements(folder, files=all_files)
            for name, rel, version in reqs:
                self.library_provider.add(
                    DagLibrary(dag=dag.id, library=name, version=version))

    def download_dag(self, dag: int, folder: str):
        """Materialize all stored files of ``dag`` under ``folder``."""
        os.makedirs(folder, exist_ok=True)

        items = self.provider.by_dag(dag)
        # Rows without file content (directories) sort first, so parent
        # directories exist before their files are written.
        items = sorted(items, key=lambda x: x[1] is not None)

        for item, file in items:
            path = os.path.join(folder, item.path)
            if item.is_dir:
                os.makedirs(path, exist_ok=True)
            else:
                with open(path, 'wb') as f:
                    f.write(file.content)

    def download(self, task: int):
        """Restore the code of ``task``'s dag into the task folder, link the
        shared data/models folders into it, and put it on sys.path.

        :return: the task folder path
        """
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        self.download_dag(task.dag, folder)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']

        # Symlink the project-wide data folder; ignore an existing link.
        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)
            os.symlink(data_folder,
                       os.path.join(folder, 'data'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        # Same for the project-wide models folder.
        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)
            os.symlink(model_folder,
                       os.path.join(folder, 'models'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder

    def import_executor(self, folder: str, base_folder: str, executor: str,
                        libraries: List[Tuple] = None):
        """Find and import the module under ``folder`` that defines a class
        named like ``executor`` (exact, lowercased or snake_cased match),
        installing pinned libraries first when versions differ.

        :return: (found, was_installation) tuple
        """
        sys.path.insert(0, base_folder)

        spec = self._build_spec(folder)
        was_installation = False

        folders = [
            p for p in glob(f'{folder}/*', recursive=True)
            if os.path.isdir(p) and not spec.match_file(p)
        ]
        folders += [folder]

        library_names = set(n for n, v in (libraries or []))
        library_versions = {n: v for n, v in (libraries or [])}

        for n in library_names:
            try:
                version = pkg_resources.get_distribution(n).version
                need_install = library_versions[n] != version
            except Exception:
                # Not installed (or metadata unreadable) — install it.
                need_install = True

            if INSTALL_DEPENDENCIES and need_install:
                os.system(f'pip install {n}=={library_versions[n]}')
                was_installation = True

        def is_valid_class(cls: pyclbr.Class):
            # Match by exact name, lowercase name or snake_case name.
            return cls.name == executor or \
                   cls.name.lower() == executor or \
                   to_snake(cls.name) == executor

        def relative_name(path: str):
            # Turn a file path into a dotted module path relative to base_folder.
            rel = os.path.relpath(path, base_folder)
            parts = [str(p).split('.')[0] for p in rel.split(os.sep)]
            return '.'.join(parts)

        for (module_loader, module_name,
             ispkg) in pkgutil.iter_modules(folders):
            # NOTE(review): find_module is deprecated in modern Python;
            # consider find_spec.
            module = module_loader.find_module(module_name)
            # NOTE(review): replaces '/' rather than os.sep — verify on Windows.
            rel_path = os.path.relpath(
                os.path.splitext(module.path)[0],
                base_folder).replace('/', '.')
            try:
                classes = pyclbr.readmodule(rel_path, path=[base_folder])
            except Exception:
                continue
            for k, v in classes.items():
                if is_valid_class(v):
                    importlib.import_module(relative_name(module.path))
                    return True, was_installation

        return False, was_installation
class DagCopyBuilder:
    """Clones an existing dag: the dag row, its top-level tasks with their
    dependencies, and its stored files — optionally applying textual
    replacements (``file_changes``, yaml-encoded) to file contents."""

    def __init__(self, session: Session, dag: int, file_changes: str = '',
                 dag_suffix: str = '', logger=None,
                 component: ComponentType = None):
        # Source dag id and yaml-encoded replacement rules.
        self.dag = dag
        self.file_changes = file_changes

        self.session = session
        self.logger = logger
        self.component = component
        self.dag_suffix = dag_suffix

        # Populated by create_providers()/create_dag() during build().
        self.dag_db = None
        self.dag_provider = None
        self.task_provider = None
        self.file_provider = None
        self.dag_storage_provider = None

    def log_info(self, message: str):
        # Logging is optional; skip when no logger was supplied.
        if self.logger:
            self.logger.info(message, self.component)

    def create_providers(self):
        """Instantiate the providers bound to this builder's session."""
        self.log_info('create_providers')

        self.dag_provider = DagProvider(self.session)
        self.task_provider = TaskProvider(self.session)
        self.file_provider = FileProvider(self.session)
        self.dag_storage_provider = DagStorageProvider(self.session)

    def create_dag(self):
        """Insert the cloned dag row, copying config and docker image from
        the source dag; size counters start at zero."""
        dag = self.dag_provider.by_id(self.dag)
        name = dag.name
        if self.dag_suffix:
            name += ' ' + self.dag_suffix
        dag_new = Dag(name=name, created=now(), config=dag.config,
                      project=dag.project, docker_img=dag.docker_img,
                      img_size=0, file_size=0, type=dag.type)
        self.dag_provider.add(dag_new)
        self.dag_db = dag_new

    def find_replace(self, changes: dict, path: str):
        """Return the replacement for the first key-regex that matches
        ``path``, or None when nothing matches."""
        for k, v in changes.items():
            if not re.match(k, path):
                continue
            return v

    def create_tasks(self):
        """Copy top-level tasks, their dependency links and the dag's file
        storage rows, applying ``file_changes`` replacements on the way."""
        tasks = self.task_provider.by_dag(self.dag)
        tasks_new = []
        tasks_old = []

        for t in tasks:
            # Child tasks are skipped — only top-level tasks are cloned.
            if t.parent:
                continue

            task = Task(
                name=t.name,
                status=TaskStatus.NotRan.value,
                computer=t.computer,
                gpu=t.gpu,
                gpu_max=t.gpu_max,
                cpu=t.cpu,
                executor=t.executor,
                memory=t.memory,
                steps=t.steps,
                dag=self.dag_db.id,
                debug=t.debug,
                type=t.type,
            )
            task.additional_info = t.additional_info
            tasks_new.append(task)
            tasks_old.append(t)

        # return_defaults=True assigns ids we need for the dependency map.
        self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
        old2new = {
            t_old.id: t_new.id
            for t_new, t_old in zip(tasks_new, tasks_old)
        }

        dependencies = self.task_provider.get_dependencies(self.dag)
        dependencies_new = []
        for d in dependencies:
            d_new = TaskDependence(task_id=old2new[d.task_id],
                                   depend_id=old2new[d.depend_id])
            dependencies_new.append(d_new)
        self.task_provider.bulk_save_objects(dependencies_new,
                                             return_defaults=False)

        changes = yaml_load(self.file_changes)
        storages = self.dag_storage_provider.by_dag(self.dag)
        storages_new = []

        for s, f in storages:
            # NOTE(review): when ``changes`` is not a dict, every storage
            # row is skipped and nothing is copied — confirm that is the
            # intended behavior (vs. copying files unchanged).
            if not isinstance(changes, dict):
                continue

            replace = self.find_replace(changes, s.path)
            if replace is not None and f:
                content = f.content.decode('utf-8')
                if s.path.endswith('.yml'):
                    # Yaml files are merged structurally.
                    data = yaml_load(content)
                    data = merge_dicts_smart(data, replace)
                    content = yaml_dump(data)
                else:
                    # NOTE(review): iterating ``replace`` directly yields
                    # dict keys; should this be ``replace.items()``?
                    # Confirm the expected shape of the replacement value.
                    for k, v in replace:
                        if k not in content:
                            raise Exception(f'{k} is not in the content')
                        content = content.replace(k, v)
                content = content.encode('utf-8')
                md5 = hashlib.md5(content).hexdigest()
                f = self.file_provider.by_md5(md5)
                if not f:
                    f = File(content=content, created=now(),
                             project=self.dag_db.project, md5=md5,
                             dag=self.dag_db.id)
                self.file_provider.add(f)

            # NOTE(review): ``f`` may be None for directory rows, which
            # would make ``f.id`` raise — verify what by_dag yields here.
            s_new = DagStorage(dag=self.dag_db.id, file=f.id, path=s.path,
                               is_dir=s.is_dir)
            storages_new.append(s_new)

        self.dag_storage_provider.bulk_save_objects(storages_new,
                                                    return_defaults=False)

    def build(self):
        """Run the full copy pipeline: providers, dag row, then tasks/files."""
        self.create_providers()
        self.create_dag()
        self.create_tasks()
def __init__(self, session: Session):
    """Bind the storage-related providers to a single database session."""
    # One shared session keeps all provider writes in the same transaction.
    self.provider = DagStorageProvider(session)
    self.file_provider = FileProvider(session)
    self.task_provider = TaskProvider(session)
    self.library_provider = DagLibraryProvider(session)
class Storage:
    """Stores a local code folder in the database per dag and restores it
    for task execution.

    NOTE(review): this appears to be a simpler variant of the Storage class
    defined earlier in this file (no size limits, row-by-row inserts,
    executor superclass check) — confirm which one is current.
    """

    def __init__(self, session: Session):
        # All providers share the same session.
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)

    def copy_from(self, src: int, dag: Dag):
        """Copy storage rows and library pins of dag ``src`` into ``dag``.
        File content is shared — only the dag reference changes."""
        storages = self.provider.query(DagStorage). \
            filter(DagStorage.dag == src). \
            all()
        libraries = self.library_provider.query(DagLibrary). \
            filter(DagLibrary.dag == src). \
            all()

        s_news = []
        for s in storages:
            s_new = DagStorage(dag=dag.id, file=s.file, path=s.path,
                               is_dir=s.is_dir)
            s_news.append(s_new)

        l_news = []
        for l in libraries:
            l_new = DagLibrary(dag=dag.id, library=l.library,
                               version=l.version)
            l_news.append(l_new)

        self.provider.add_all(s_news)
        self.library_provider.add_all(l_news)

    def _build_spec(self, folder: str):
        """Build a gitignore-style PathSpec from ``file.ignore.txt`` (when
        present) plus built-in patterns for logs, data, models and caches."""
        ignore_file = os.path.join(folder, 'file.ignore.txt')
        if not os.path.exists(ignore_file):
            ignore_patterns = []
        else:
            ignore_patterns = read_lines(ignore_file)
        ignore_patterns.extend(['log', 'data', 'models', '__pycache__'])
        return pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, ignore_patterns)

    def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
        """Walk ``folder`` recursively and store its files/folders for
        ``dag``, deduplicating file content by md5 within the project."""
        # md5 -> existing file id for this project.
        hashs = self.file_provider.hashs(dag.project)
        files = []
        all_files = []
        spec = self._build_spec(folder)

        for o in glob(os.path.join(folder, '**'), recursive=True):
            path = os.path.relpath(o, folder)
            if spec.match_file(path) or path == '.':
                continue

            if isdir(o):
                self.provider.add(
                    DagStorage(dag=dag.id, path=path, is_dir=True))
                continue

            # NOTE(review): file handle is never closed explicitly;
            # consider a ``with`` block.
            content = open(o, 'rb').read()
            md5 = hashlib.md5(content).hexdigest()
            all_files.append(o)

            if md5 in hashs:
                # Content already stored for this project — reuse it.
                file_id = hashs[md5]
            else:
                file = File(md5=md5, content=content, project=dag.project,
                            dag=dag.id, created=now())
                self.file_provider.add(file)
                file_id = file.id
                hashs[md5] = file.id
                files.append(o)

            self.provider.add(
                DagStorage(dag=dag.id, path=path, file=file_id,
                           is_dir=False))

        if INSTALL_DEPENDENCIES and control_reqs:
            reqs = control_requirements(folder, files=all_files)
            for name, rel, version in reqs:
                self.library_provider.add(
                    DagLibrary(dag=dag.id, library=name, version=version))

    def download(self, task: int):
        """Restore the code of ``task``'s dag into the task folder, link the
        shared data/models folders into it, and put it on sys.path.

        :return: the task folder path
        """
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        os.makedirs(folder, exist_ok=True)

        items = self.provider.by_dag(task.dag)
        # Rows without file content (directories) sort first, so parent
        # directories exist before files are written.
        items = sorted(items, key=lambda x: x[1] is not None)

        for item, file in items:
            path = os.path.join(folder, item.path)
            if item.is_dir:
                os.makedirs(path, exist_ok=True)
            else:
                with open(path, 'wb') as f:
                    f.write(file.content)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']

        # Symlink the project-wide data folder; ignore an existing link.
        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)
            os.symlink(data_folder,
                       os.path.join(folder, 'data'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        # Same for the project-wide models folder.
        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)
            os.symlink(model_folder,
                       os.path.join(folder, 'models'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder

    def import_executor(self, folder: str, base_folder: str, executor: str,
                        libraries: List[Tuple] = None):
        """Find and import the module under ``folder`` defining an Executor
        subclass named like ``executor`` (exact, lowercased or snake_cased),
        installing pinned libraries first when versions differ.

        :return: (found, was_installation) tuple
        """
        sys.path.insert(0, base_folder)

        spec = self._build_spec(folder)
        was_installation = False

        folders = [
            p for p in glob(f'{folder}/*', recursive=True)
            if os.path.isdir(p) and not spec.match_file(p)
        ]
        folders += [folder]

        library_names = set(n for n, v in (libraries or []))
        library_versions = {n: v for n, v in (libraries or [])}

        for n in library_names:
            try:
                version = pkg_resources.get_distribution(n).version
                need_install = library_versions[n] != version
            except Exception:
                # Not installed (or metadata unreadable) — install it.
                need_install = True

            if INSTALL_DEPENDENCIES and need_install:
                os.system(f'pip install {n}=={library_versions[n]}')
                was_installation = True

        def is_valid_class(cls: pyclbr.Class):
            # Require an Executor superclass, then match by name variants.
            super_names = get_super_names(cls)
            if 'Executor' not in super_names:
                return False
            return cls.name == executor or \
                cls.name.lower() == executor or \
                to_snake(cls.name) == executor

        def relative_name(path: str):
            # Turn a file path into a dotted module path relative to base_folder.
            rel = os.path.relpath(path, base_folder)
            parts = [str(p).split('.')[0] for p in rel.split(os.sep)]
            return '.'.join(parts)

        for (module_loader, module_name,
             ispkg) in pkgutil.iter_modules(folders):
            # NOTE(review): find_module is deprecated in modern Python;
            # consider find_spec.
            module = module_loader.find_module(module_name)
            module_folder = dirname(module.path)
            classes = pyclbr.readmodule(module_name, path=[module_folder])
            for k, v in classes.items():
                if is_valid_class(v):
                    importlib.import_module(relative_name(module.path))
                    return True, was_installation

        return False, was_installation