Example #1
0
    def create_providers(self):
        """Instantiate all DB providers on this builder's session."""
        self.log_info('create_providers')

        session = self.session
        self.dag_provider = DagProvider(session)
        self.task_provider = TaskProvider(session)
        self.file_provider = FileProvider(session)
        self.dag_storage_provider = DagStorageProvider(session)
Example #2
0
def update_code():
    """Persist edited file content from the current request.

    No-op when the uploaded content's md5 matches the stored one.
    If the file belongs to a different dag, a fresh File row is created
    and the storage record is re-pointed at it; otherwise the existing
    row is updated in place.
    """
    data = request_data()
    provider = FileProvider(_write_session)
    file = provider.by_id(data['file_id'])

    content = data['content'].encode('utf-8')
    md5 = hashlib.md5(content).hexdigest()
    if md5 == file.md5:
        # Content unchanged - nothing to write.
        return

    if file.dag == data['dag']:
        # Same dag: mutate the existing row in place.
        file.content = content
        file.md5 = md5
        provider.commit()
        return {'file': file.id}

    # Different dag: create a new File and re-point the storage entry.
    new_file = File(md5=md5,
                    content=content,
                    project=file.project,
                    dag=data['dag'])
    provider.add(new_file)

    storage = DagStorageProvider(_write_session).by_id(data['storage'])
    storage.file = new_file.id
    provider.commit()
    return {'file': new_file.id}
Example #3
0
    def __init__(self, session: Session, logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10 ** 5, max_count=10 ** 3):
        """Set up the providers and upload limits for this storage helper."""
        # All providers share the one SQLAlchemy session.
        self.provider = DagStorageProvider(session)
        self.file_provider = FileProvider(session)
        self.task_provider = TaskProvider(session)
        self.dag_provider = DagProvider(session)
        self.library_provider = DagLibraryProvider(session)

        # Limits enforced during upload; optional logging context.
        self.max_file_size = max_file_size
        self.max_count = max_count
        self.logger = logger
        self.component = component
Example #4
0
def remove_files():
    """Delete the files described by the current request payload."""
    payload = request_data()
    provider = FileProvider(_write_session)
    return provider.remove(payload)
Example #5
0
class Storage:
    """Manages the file snapshot attached to a dag: uploads a local
    folder into the database, copies snapshots between dags and
    materializes them back to disk for task execution.

    All providers share one SQLAlchemy session. ``max_file_size`` bounds
    a single uploaded file and ``max_count`` bounds the number of files
    accepted by :meth:`upload`; a falsy value disables either check.
    """

    def __init__(self,
                 session: Session,
                 logger=None,
                 component: ComponentType = None,
                 max_file_size: int = 10**5,
                 max_count=10**3):
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)
        self.dag_provider = DagProvider(session)

        self.logger = logger
        self.component = component
        self.max_file_size = max_file_size
        self.max_count = max_count

    def log_info(self, message: str):
        """Log *message* when a logger was supplied; otherwise do nothing."""
        if self.logger:
            self.logger.info(message, self.component)

    def copy_from(self, src: int, dag: Dag):
        """Copy all storage entries and library records from dag *src* to
        *dag*. File contents are shared; only references are duplicated.
        """
        storages = self.provider.query(DagStorage). \
            filter(DagStorage.dag == src). \
            all()
        libraries = self.library_provider.query(DagLibrary). \
            filter(DagLibrary.dag == src). \
            all()

        s_news = []
        for s in storages:
            s_new = DagStorage(dag=dag.id,
                               file=s.file,
                               path=s.path,
                               is_dir=s.is_dir)
            s_news.append(s_new)
        l_news = []
        for l in libraries:
            l_new = DagLibrary(dag=dag.id,
                               library=l.library,
                               version=l.version)
            l_news.append(l_new)

        self.provider.add_all(s_news)
        self.library_provider.add_all(l_news)

    def _build_spec(self, folder: str):
        """Build a pathspec matcher from ``file.ignore.txt`` in *folder*
        (if present) plus a fixed set of always-ignored patterns.
        """
        ignore_file = os.path.join(folder, 'file.ignore.txt')
        if not os.path.exists(ignore_file):
            ignore_patterns = []
        else:
            ignore_patterns = read_lines(ignore_file)
        ignore_patterns.extend(
            ['log', '/data', '/models', '__pycache__', '*.ipynb'])

        return pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, ignore_patterns)

    def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
        """Upload every non-ignored file under *folder* into the dag's
        storage, deduplicating file content by md5 within the project.

        Raises:
            Exception: when the file count exceeds ``max_count`` or a
                single file exceeds ``max_file_size``.
        """
        self.log_info('upload started')
        hashs = self.file_provider.hashs(dag.project)
        self.log_info('hashes are retrieved')

        all_files = []
        spec = self._build_spec(folder)

        # First glob is non-recursive; non-ignored sub-folders are
        # expanded recursively below.
        files = glob(os.path.join(folder, '**'))
        for file in files[:]:
            path = os.path.relpath(file, folder)
            if spec.match_file(path) or path == '.':
                continue
            if os.path.isdir(file):
                # BUGFIX: `file` already carries `folder` as a prefix;
                # joining `folder` again double-prefixed the pattern and
                # broke recursion whenever `folder` was a relative path.
                child_files = glob(os.path.join(file, '**'),
                                   recursive=True)
                files.extend(child_files)

        # NOTE(review): the count check runs before ignore filtering, so
        # ignored entries and directories count toward the limit.
        if self.max_count and len(files) > self.max_count:
            raise Exception(f'files count = {len(files)} '
                            f'But max count = {self.max_count}')

        self.log_info('list of files formed')

        folders_to_add = []
        files_to_add = []
        files_storage_to_add = []

        total_size_added = 0

        for o in files:
            path = os.path.relpath(o, folder)
            if spec.match_file(path) or path == '.':
                continue

            if isdir(o):
                folder_to_add = DagStorage(dag=dag.id, path=path, is_dir=True)
                folders_to_add.append(folder_to_add)
                continue
            # Close the handle deterministically (was a bare open().read()).
            with open(o, 'rb') as fp:
                content = fp.read()
            size = sys.getsizeof(content)
            if self.max_file_size and size > self.max_file_size:
                raise Exception(
                    f'file = {o} has size {size}.'
                    f' But max size is set to {self.max_file_size}')
            md5 = hashlib.md5(content).hexdigest()

            all_files.append(o)

            if md5 not in hashs:
                file = File(md5=md5,
                            content=content,
                            project=dag.project,
                            dag=dag.id,
                            created=now())
                # A File object is stored here; it is swapped for its id
                # after the bulk insert assigns primary keys (below).
                hashs[md5] = file
                files_to_add.append(file)
                total_size_added += size

            file_storage = DagStorage(dag=dag.id,
                                      path=path,
                                      file=hashs[md5],
                                      is_dir=False)
            files_storage_to_add.append(file_storage)

        self.log_info('inserting DagStorage folders')

        if len(folders_to_add) > 0:
            self.provider.bulk_save_objects(folders_to_add)

        self.log_info('inserting Files')

        if len(files_to_add) > 0:
            # return_defaults=True populates the newly assigned ids.
            self.file_provider.bulk_save_objects(files_to_add,
                                                 return_defaults=True)

        self.log_info('inserting DagStorage Files')

        if len(files_storage_to_add) > 0:
            for file_storage in files_storage_to_add:
                if isinstance(file_storage.file, File):
                    # noinspection PyUnresolvedReferences
                    file_storage.file = file_storage.file.id

            self.provider.bulk_save_objects(files_storage_to_add)

        dag.file_size += total_size_added

        self.dag_provider.update()

        if INSTALL_DEPENDENCIES and control_reqs:
            reqs = control_requirements(folder, files=all_files)
            for name, rel, version in reqs:
                self.library_provider.add(
                    DagLibrary(dag=dag.id, library=name, version=version))

    def download_dag(self, dag: int, folder: str):
        """Materialize the storage snapshot of *dag* into *folder*."""
        os.makedirs(folder, exist_ok=True)

        items = self.provider.by_dag(dag)
        # Directory entries (file is None) sort first so parent folders
        # exist before any file is written into them.
        items = sorted(items, key=lambda x: x[1] is not None)
        for item, file in items:
            path = os.path.join(folder, item.path)
            if item.is_dir:
                os.makedirs(path, exist_ok=True)
            else:
                with open(path, 'wb') as f:
                    f.write(file.content)

    def download(self, task: int):
        """Prepare the working folder for *task*: dag files plus
        symlinked data/model folders. Prepends the folder to ``sys.path``
        and returns it.
        """
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        self.download_dag(task.dag, folder)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']
        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)

            os.symlink(data_folder,
                       os.path.join(folder, 'data'),
                       target_is_directory=True)
        except FileExistsError:
            # Symlink left over from a previous run - keep it.
            pass

        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)

            os.symlink(model_folder,
                       os.path.join(folder, 'models'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder

    def import_executor(self,
                        folder: str,
                        base_folder: str,
                        executor: str,
                        libraries: List[Tuple] = None):
        """Locate and import the module that defines *executor*.

        Installs missing or version-mismatched libraries first when
        INSTALL_DEPENDENCIES is set. Returns ``(found, was_installation)``.
        """

        sys.path.insert(0, base_folder)

        spec = self._build_spec(folder)
        was_installation = False

        folders = [
            p for p in glob(f'{folder}/*', recursive=True)
            if os.path.isdir(p) and not spec.match_file(p)
        ]
        folders += [folder]
        library_names = set(n for n, v in (libraries or []))
        library_versions = {n: v for n, v in (libraries or [])}

        for n in library_names:
            try:
                version = pkg_resources.get_distribution(n).version
                need_install = library_versions[n] != version
            except Exception:
                # Not installed (or metadata unreadable) - install it.
                need_install = True

            if INSTALL_DEPENDENCIES and need_install:
                os.system(f'pip install {n}=={library_versions[n]}')
                was_installation = True

        def is_valid_class(cls: pyclbr.Class):
            # Accept exact, lower-cased or snake_cased name matches.
            return cls.name == executor or \
                   cls.name.lower() == executor or \
                   to_snake(cls.name) == executor

        def relative_name(path: str):
            # Convert a file path under base_folder to a dotted module name.
            rel = os.path.relpath(path, base_folder)
            parts = [str(p).split('.')[0] for p in rel.split(os.sep)]
            return '.'.join(parts)

        for (module_loader, module_name,
             ispkg) in pkgutil.iter_modules(folders):
            # NOTE(review): find_module is deprecated in favour of
            # find_spec; kept as-is for compatibility.
            module = module_loader.find_module(module_name)
            rel_path = os.path.relpath(
                os.path.splitext(module.path)[0],
                base_folder).replace('/', '.')
            try:
                classes = pyclbr.readmodule(rel_path, path=[base_folder])
            except Exception:
                # Module not parseable by pyclbr - skip it.
                continue
            for k, v in classes.items():
                if is_valid_class(v):
                    importlib.import_module(relative_name(module.path))
                    return True, was_installation

        return False, was_installation
Example #6
0
class DagCopyBuilder:
    """Builds a copy of an existing dag: the dag row itself, its tasks,
    task dependencies and file storage entries, optionally rewriting
    copied file contents according to ``file_changes``.
    """

    def __init__(self,
                 session: Session,
                 dag: int,
                 file_changes: str = '',
                 dag_suffix: str = '',
                 logger=None,
                 component: ComponentType = None):
        # Id of the source dag to copy.
        self.dag = dag
        # YAML text mapping path regexps to replacement specs; applied
        # to copied file contents in create_tasks().
        self.file_changes = file_changes
        self.session = session
        self.logger = logger
        self.component = component
        # Appended to the copied dag's name when non-empty.
        self.dag_suffix = dag_suffix

        # The newly created Dag row; set by create_dag().
        self.dag_db = None

        # Providers are created lazily in create_providers().
        self.dag_provider = None
        self.task_provider = None
        self.file_provider = None
        self.dag_storage_provider = None

    def log_info(self, message: str):
        """Log *message* when a logger was supplied; otherwise no-op."""
        if self.logger:
            self.logger.info(message, self.component)

    def create_providers(self):
        """Instantiate the DB providers on the builder's session."""
        self.log_info('create_providers')

        self.dag_provider = DagProvider(self.session)
        self.task_provider = TaskProvider(self.session)
        self.file_provider = FileProvider(self.session)
        self.dag_storage_provider = DagStorageProvider(self.session)

    def create_dag(self):
        """Insert the copied Dag row (img/file sizes reset to zero)."""
        dag = self.dag_provider.by_id(self.dag)
        name = dag.name
        if self.dag_suffix:
            name += ' ' + self.dag_suffix
        dag_new = Dag(name=name,
                      created=now(),
                      config=dag.config,
                      project=dag.project,
                      docker_img=dag.docker_img,
                      img_size=0,
                      file_size=0,
                      type=dag.type)
        self.dag_provider.add(dag_new)
        self.dag_db = dag_new

    def find_replace(self, changes: dict, path: str):
        """Return the value of the first *changes* key that regex-matches
        *path* (via re.match), or None when nothing matches.
        """
        for k, v in changes.items():
            if not re.match(k, path):
                continue
            return v

    def create_tasks(self):
        """Copy tasks, their dependencies and storage rows to the new
        dag, applying ``file_changes`` to matching file contents.
        """
        tasks = self.task_provider.by_dag(self.dag)
        tasks_new = []
        tasks_old = []

        for t in tasks:
            # Child tasks are skipped; only top-level tasks are copied.
            if t.parent:
                continue

            task = Task(
                name=t.name,
                status=TaskStatus.NotRan.value,
                computer=t.computer,
                gpu=t.gpu,
                gpu_max=t.gpu_max,
                cpu=t.cpu,
                executor=t.executor,
                memory=t.memory,
                steps=t.steps,
                dag=self.dag_db.id,
                debug=t.debug,
                type=t.type,
            )
            task.additional_info = t.additional_info
            tasks_new.append(task)
            tasks_old.append(t)

        # return_defaults=True populates the new ids used for the
        # old-id -> new-id mapping below.
        self.task_provider.bulk_save_objects(tasks_new, return_defaults=True)
        old2new = {
            t_old.id: t_new.id
            for t_new, t_old in zip(tasks_new, tasks_old)
        }
        dependencies = self.task_provider.get_dependencies(self.dag)
        dependencies_new = []
        for d in dependencies:
            d_new = TaskDependence(task_id=old2new[d.task_id],
                                   depend_id=old2new[d.depend_id])
            dependencies_new.append(d_new)

        self.task_provider.bulk_save_objects(dependencies_new,
                                             return_defaults=False)

        changes = yaml_load(self.file_changes)
        storages = self.dag_storage_provider.by_dag(self.dag)
        storages_new = []

        for s, f in storages:
            # NOTE(review): when file_changes is empty or not a mapping,
            # no storage rows are copied at all - confirm this is the
            # intended behavior rather than "copy unchanged".
            if not isinstance(changes, dict):
                continue

            replace = self.find_replace(changes, s.path)
            if replace is not None and f:
                content = f.content.decode('utf-8')
                if s.path.endswith('.yml'):
                    data = yaml_load(content)
                    data = merge_dicts_smart(data, replace)
                    content = yaml_dump(data)
                else:
                    # NOTE(review): iterating `replace` directly only
                    # unpacks if it is a sequence of (old, new) pairs; a
                    # dict here would raise ValueError. Confirm the
                    # expected file_changes schema for non-yml paths.
                    for k, v in replace:
                        if k not in content:
                            raise Exception(f'{k} is not in the content')
                        content = content.replace(k, v)
                content = content.encode('utf-8')
                md5 = hashlib.md5(content).hexdigest()
                f = self.file_provider.by_md5(md5)
                if not f:
                    f = File(content=content,
                             created=now(),
                             project=self.dag_db.project,
                             md5=md5,
                             dag=self.dag_db.id)
                # NOTE(review): add() is also called when an existing
                # file was found by md5 - presumably harmless for an
                # already-persisted row; verify.
                self.file_provider.add(f)

            s_new = DagStorage(dag=self.dag_db.id,
                               file=f.id,
                               path=s.path,
                               is_dir=s.is_dir)
            storages_new.append(s_new)

        self.dag_storage_provider.bulk_save_objects(storages_new,
                                                    return_defaults=False)

    def build(self):
        """Run the full copy: providers, dag row, then tasks/storage."""
        self.create_providers()
        self.create_dag()
        self.create_tasks()
Example #7
0
 def __init__(self, session: Session):
     """Create the DB providers this storage uses, all on *session*."""
     self.provider = DagStorageProvider(session)
     self.file_provider = FileProvider(session)
     self.library_provider = DagLibraryProvider(session)
     self.task_provider = TaskProvider(session)
Example #8
0
class Storage:
    """Uploads a project folder into a dag's storage, copies storage
    between dags and restores it to disk for task execution.

    All providers share one SQLAlchemy session.
    """

    def __init__(self, session: Session):
        self.file_provider = FileProvider(session)
        self.provider = DagStorageProvider(session)
        self.task_provider = TaskProvider(session)
        self.library_provider = DagLibraryProvider(session)

    def copy_from(self, src: int, dag: Dag):
        """Copy all storage entries and library records from dag *src*
        to *dag*. File contents are shared; only references are copied.
        """
        storages = self.provider.query(DagStorage). \
            filter(DagStorage.dag == src). \
            all()
        libraries = self.library_provider.query(DagLibrary). \
            filter(DagLibrary.dag == src). \
            all()

        s_news = []
        for s in storages:
            s_new = DagStorage(dag=dag.id,
                               file=s.file,
                               path=s.path,
                               is_dir=s.is_dir)
            s_news.append(s_new)
        l_news = []
        for l in libraries:
            l_new = DagLibrary(dag=dag.id,
                               library=l.library,
                               version=l.version)
            l_news.append(l_new)

        self.provider.add_all(s_news)
        self.library_provider.add_all(l_news)

    def _build_spec(self, folder: str):
        """Build a pathspec matcher from ``file.ignore.txt`` in *folder*
        (if present) plus a fixed set of always-ignored patterns.
        """
        ignore_file = os.path.join(folder, 'file.ignore.txt')
        if not os.path.exists(ignore_file):
            ignore_patterns = []
        else:
            ignore_patterns = read_lines(ignore_file)
        ignore_patterns.extend(['log', 'data', 'models', '__pycache__'])

        return pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, ignore_patterns)

    def upload(self, folder: str, dag: Dag, control_reqs: bool = True):
        """Upload every non-ignored file under *folder* into the dag's
        storage, deduplicating file content by md5 within the project.
        """
        hashs = self.file_provider.hashs(dag.project)

        files = []
        all_files = []
        spec = self._build_spec(folder)

        for o in glob(os.path.join(folder, '**'), recursive=True):
            path = os.path.relpath(o, folder)
            if spec.match_file(path) or path == '.':
                continue

            if isdir(o):
                self.provider.add(
                    DagStorage(dag=dag.id, path=path, is_dir=True))
                continue
            # Close the handle deterministically (was a bare open().read()).
            with open(o, 'rb') as fp:
                content = fp.read()
            md5 = hashlib.md5(content).hexdigest()

            all_files.append(o)

            if md5 in hashs:
                file_id = hashs[md5]
            else:
                file = File(md5=md5,
                            content=content,
                            project=dag.project,
                            dag=dag.id,
                            created=now())
                self.file_provider.add(file)
                # NOTE(review): relies on add() flushing so file.id is
                # already populated here - confirm provider behavior.
                file_id = file.id
                hashs[md5] = file.id
                files.append(o)

            self.provider.add(
                DagStorage(dag=dag.id, path=path, file=file_id, is_dir=False))

        if INSTALL_DEPENDENCIES and control_reqs:
            reqs = control_requirements(folder, files=all_files)
            for name, rel, version in reqs:
                self.library_provider.add(
                    DagLibrary(dag=dag.id, library=name, version=version))

    def download(self, task: int):
        """Prepare the working folder for *task*: dag files plus
        symlinked data/model folders. Prepends the folder to ``sys.path``
        and returns it.
        """
        task = self.task_provider.by_id(
            task, joinedload(Task.dag_rel, innerjoin=True))
        folder = join(TASK_FOLDER, str(task.id))
        os.makedirs(folder, exist_ok=True)
        items = self.provider.by_dag(task.dag)
        # Directory entries (file is None) sort first so parent folders
        # exist before any file is written into them.
        items = sorted(items, key=lambda x: x[1] is not None)
        for item, file in items:
            path = os.path.join(folder, item.path)
            if item.is_dir:
                os.makedirs(path, exist_ok=True)
            else:
                with open(path, 'wb') as f:
                    f.write(file.content)

        config = Config.from_yaml(task.dag_rel.config)
        info = config['info']
        try:
            data_folder = os.path.join(DATA_FOLDER, info['project'])
            os.makedirs(data_folder, exist_ok=True)

            os.symlink(data_folder,
                       os.path.join(folder, 'data'),
                       target_is_directory=True)
        except FileExistsError:
            # Symlink left over from a previous run - keep it.
            pass

        try:
            model_folder = os.path.join(MODEL_FOLDER, info['project'])
            os.makedirs(model_folder, exist_ok=True)

            os.symlink(model_folder,
                       os.path.join(folder, 'models'),
                       target_is_directory=True)
        except FileExistsError:
            pass

        sys.path.insert(0, folder)
        return folder

    def import_executor(self,
                        folder: str,
                        base_folder: str,
                        executor: str,
                        libraries: List[Tuple] = None):
        """Locate and import the module that defines *executor* (a class
        deriving from Executor). Installs missing or version-mismatched
        libraries first when INSTALL_DEPENDENCIES is set.

        Returns ``(found, was_installation)``.
        """

        sys.path.insert(0, base_folder)

        spec = self._build_spec(folder)
        was_installation = False

        folders = [
            p for p in glob(f'{folder}/*', recursive=True)
            if os.path.isdir(p) and not spec.match_file(p)
        ]
        folders += [folder]
        library_names = set(n for n, v in (libraries or []))
        library_versions = {n: v for n, v in (libraries or [])}

        for n in library_names:
            try:
                version = pkg_resources.get_distribution(n).version
                need_install = library_versions[n] != version
            except Exception:
                # Not installed (or metadata unreadable) - install it.
                need_install = True

            if INSTALL_DEPENDENCIES and need_install:
                os.system(f'pip install {n}=={library_versions[n]}')
                was_installation = True

        def is_valid_class(cls: pyclbr.Class):
            # Only classes deriving from Executor qualify; match the
            # name exactly, lower-cased or snake_cased.
            super_names = get_super_names(cls)
            if 'Executor' not in super_names:
                return False

            return cls.name == executor or \
                cls.name.lower() == executor or \
                to_snake(cls.name) == executor

        def relative_name(path: str):
            # Convert a file path under base_folder to a dotted module name.
            rel = os.path.relpath(path, base_folder)
            parts = [str(p).split('.')[0] for p in rel.split(os.sep)]
            return '.'.join(parts)

        for (module_loader, module_name,
             ispkg) in pkgutil.iter_modules(folders):
            # NOTE(review): find_module is deprecated in favour of
            # find_spec; kept as-is for compatibility.
            module = module_loader.find_module(module_name)
            module_folder = dirname(module.path)
            classes = pyclbr.readmodule(module_name, path=[module_folder])
            for k, v in classes.items():
                if is_valid_class(v):
                    importlib.import_module(relative_name(module.path))

                    return True, was_installation

        return False, was_installation