def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    config_text = open(config, 'r').read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        return dag_standard(
            session=_session,
            config=config_parsed,
            debug=debug,
            config_text=config_text,
            config_path=config,
            control_reqs=control_reqs,
            logger=logger,
            component=ComponentType.Client
        )

    return dag_pipe(
        session=_session,
        config=config_parsed,
        config_text=config_text
    )
def build(self):
    try:
        self.create_base()
        self.check_status()
        self.change_status()
        self.download()
        self.create_executor()
        self.execute()
    except Exception as e:
        if Session.sqlalchemy_error(e):
            Session.cleanup(key='ExecuteBuilder')
            self.session = Session.create_session(key='ExecuteBuilder')
            self.logger.session = create_logger(self.session,
                                                'ExecuteBuilder')

        step = self.executor.step.id if \
            (self.executor and self.executor.step) else None
        self.error(traceback.format_exc(), step)
        self.provider.change_status(self.task, TaskStatus.Failed)
        raise e
    finally:
        if app.current_task:
            app.current_task.update_state(state=states.SUCCESS)
        app.close()

        if self.exit:
            # noinspection PyProtectedMember
            os._exit(0)
def execute(config: str, debug: bool):
    _create_computer()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)
    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when other tasks arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dag
    created_dag = _dag(config, debug)
    for ids in created_dag.values():
        for id in ids:
            task = provider.by_id(id)
            task.gpu_assigned = ','.join(
                [str(i) for i, _ in enumerate(GPUtil.getGPUs())])
            provider.commit()

            execute_by_id(id, exit=False)
def execute(config: str, debug: bool, params):
    check_statuses()
    _create_computer()
    _create_docker()

    # Fail all InProgress Tasks
    logger = create_logger(_session, __name__)
    provider = TaskProvider(_session)
    step_provider = StepProvider(_session)

    for t in provider.by_status(TaskStatus.InProgress,
                                worker_index=WORKER_INDEX):
        step = step_provider.last_for_task(t.id)
        logger.error(
            f'Task Id = {t.id} was in InProgress state '
            f'when other tasks arrived at the same worker',
            ComponentType.Worker, t.computer_assigned, t.id, step)
        provider.change_status(t, TaskStatus.Failed)

    # Create dags
    dags = _dag(config, debug, params=params)
    for dag in dags:
        for ids in dag.values():
            for id in ids:
                task = provider.by_id(id)
                task.gpu_assigned = ','.join(
                    [str(i) for i in range(torch.cuda.device_count())])
                provider.commit()

                execute_by_id(id, exit=False)
def worker_supervisor():
    """
    Start the worker supervisor.

    It controls the workers running on the same machine
    and writes resource-consumption metrics.
    """
    host = socket.gethostname()
    logger = create_logger(_session, 'worker_supervisor')
    logger.info('worker_supervisor start',
                ComponentType.WorkerSupervisor, host)

    _create_computer()
    _create_docker()
    start_schedule([(stop_processes_not_exist, 10)])

    if DOCKER_MAIN:
        syncer = FileSync()
        start_schedule([(worker_usage, 0)])
        start_schedule([(syncer.sync, 0)])

    name = f'{host}_{DOCKER_IMG}_supervisor'
    argv = [
        'worker', '--loglevel=INFO', '-P=solo', f'-n={name}', '-O fair',
        '-c=1', '--prefetch-multiplier=1', '-Q', f'{name}'
    ]

    logger.info('worker_supervisor run celery',
                ComponentType.WorkerSupervisor, host)
    app.worker_main(argv)
def build(self):
    try:
        # if self.fast_check():
        #     return

        self.auxiliary = {'time': now()}

        self.create_base()
        self.process_stop_tasks()
        self.process_start_dags()
        self.process_parent_tasks()
        self.load_tasks()
        self.load_computers()
        self.process_tasks()
        self.write_auxiliary()
    except ObjectDeletedError:
        pass
    except Exception as e:
        if Session.sqlalchemy_error(e):
            Session.cleanup(key='SupervisorBuilder')
            self.session = Session.create_session(key='SupervisorBuilder')
            self.logger = create_logger(self.session, 'SupervisorBuilder')

        self.logger.error(traceback.format_exc(), ComponentType.Supervisor)
def sync_directed(
        session: Session, source: Computer, target: Computer,
        folders: List
):
    current_computer = socket.gethostname()
    logger = create_logger(session, __name__)

    for folder, excluded in folders:
        end = ' --perms --chmod=777 --size-only'
        if len(excluded) > 0:
            parts = []
            folder_excluded = False
            for i in range(len(excluded)):
                if excluded[i] == folder:
                    folder_excluded = True
                    break

                if not excluded[i].startswith(folder):
                    continue

                part = os.path.relpath(excluded[i], folder)
                part = f'--exclude {part}'
                parts.append(part)

            if folder_excluded:
                continue

            if len(parts) > 0:
                end += ' ' + ' '.join(parts)

        source_folder = join(source.root_folder, folder)
        target_folder = join(target.root_folder, folder)

        if current_computer == source.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f'{source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
        elif current_computer == target.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {source.port} -o StrictHostKeyChecking=no" ' \
                      f'{source.user}@{source.ip}:{source_folder}/ ' \
                      f'{target_folder}/ {end}'
        else:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f' {source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
            command = f'ssh -p {source.port} ' \
                      f'{source.user}@{source.ip} "{command}"'

        logger.info(command, ComponentType.WorkerSupervisor, current_computer)

        try:
            subprocess.check_output(command, shell=True,
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
        except subprocess.CalledProcessError as exc:
            raise Exception(exc.output)
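# Illustrative call (hedged: 'data_computer', 'model_computer' and the
# project name are made-up values, not from the original source). Each entry
# in 'folders' is a [folder, excluded-subpaths] pair, matching how
# FileSync.sync builds folders_excluded below: excluded paths inside a folder
# become rsync '--exclude' options, and a folder that excludes itself is
# skipped entirely.
#
#   folders = [
#       [join('data', 'my_project'), [join('data', 'my_project', 'tmp')]],
#       [join('models', 'my_project'), []],
#   ]
#   sync_directed(session, source=data_computer, target=model_computer,
#                 folders=folders)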
def sync(self):
    hostname = socket.gethostname()
    try:
        provider = ComputerProvider(self.session)
        task_synced_provider = TaskSyncedProvider(self.session)

        computer = provider.by_name(hostname)
        sync_start = now()

        if FILE_SYNC_INTERVAL == 0:
            time.sleep(1)
        else:
            computers = provider.all_with_last_activtiy()
            computers = [
                c for c in computers
                if (now() - c.last_activity).total_seconds() < 10
            ]
            computers_names = {c.name for c in computers}

            for c, project, tasks in task_synced_provider.for_computer(
                    computer.name):
                if c.name not in computers_names:
                    self.logger.info(
                        f'Computer = {c.name} '
                        f'is offline. Can not sync',
                        ComponentType.WorkerSupervisor, hostname)
                    continue

                if c.syncing_computer:
                    continue

                excluded = list(map(str, yaml_load(project.ignore_folders)))
                folders_excluded = [[join('data', project.name), excluded],
                                    [join('models', project.name), []]]

                computer.syncing_computer = c.name
                provider.update()

                sync_directed(self.session, c, computer, folders_excluded)

                for t in tasks:
                    task_synced_provider.add(
                        TaskSynced(computer=computer.name, task=t.id))

            time.sleep(FILE_SYNC_INTERVAL)

        computer.last_synced = sync_start
        computer.syncing_computer = None
        provider.update()
    except Exception as e:
        if Session.sqlalchemy_error(e):
            Session.cleanup('FileSync')
            self.session = Session.create_session(key='FileSync')
            self.logger = create_logger(self.session, 'FileSync')

        self.logger.error(traceback.format_exc(),
                          ComponentType.WorkerSupervisor, hostname)
def process_error(self, e: Exception):
    if Session.sqlalchemy_error(e):
        Session.cleanup('FileSync')
        self.session = Session.create_session(key='FileSync')
        self.logger = create_logger(self.session, 'FileSync')

    hostname = socket.gethostname()
    self.logger.error(
        traceback.format_exc(),
        ComponentType.WorkerSupervisor, hostname
    )
def __init__(self, id: int, repeat_count: int = 1, exit=True):
    self.session = Session.create_session(key='ExecuteBuilder')
    self.id = id
    self.repeat_count = repeat_count
    self.logger = create_logger(self.session, 'ExecuteBuilder')
    self.logger_db = create_logger(self.session, 'ExecuteBuilder.db',
                                   console=False)
    self.exit = exit

    self.provider = None
    self.library_provider = None
    self.storage = None
    self.task = None
    self.dag = None
    self.executor = None
    self.hostname = None
    self.docker_img = None
    self.worker_index = None
    self.queue_personal = None
    self.config = None
    self.executor_type = None
def __init__(self):
    self.session = Session.create_session(key='SupervisorBuilder')
    self.logger = create_logger(self.session, 'SupervisorBuilder')
    self.provider = None
    self.computer_provider = None
    self.docker_provider = None
    self.auxiliary_provider = None
    self.dag_provider = None
    self.queues = None
    self.not_ran_tasks = None
    self.dep_status = None
    self.computers = None
    self.auxiliary = {}
def wrapper():
    try:
        f(wrapper_vars['session'], wrapper_vars['logger'])
    except Exception as e:
        if Session.sqlalchemy_error(e):
            Session.cleanup(name)
            wrapper_vars['session'] = Session.create_session(key=name)
            wrapper_vars['logger'] = create_logger(wrapper_vars['session'],
                                                   name)

        wrapper_vars['logger'].error(traceback.format_exc(),
                                     ComponentType.WorkerSupervisor,
                                     hostname)
def find_imports(path: str, files: List[str] = None,
                 exclude_patterns: List[str] = None, encoding='utf-8'):
    res = []
    raw_imports = []
    files = files if files is not None \
        else glob(os.path.join(path, '**', '*.py'), recursive=True)
    exclude_patterns = exclude_patterns \
        if exclude_patterns is not None else []
    spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern,
                                        exclude_patterns)

    for file in files:
        if not file.endswith('.py'):
            continue

        file_rel = os.path.relpath(file, path)
        if spec.match_file(file_rel):
            continue

        with open(file, 'r', encoding=encoding) as f:
            content = f.read()
            try:
                tree = ast.parse(content)
                for node in ast.walk(tree):
                    if isinstance(node, ast.Import):
                        for subnode in node.names:
                            raw_imports.append((subnode.name, file_rel))
                    elif isinstance(node, ast.ImportFrom):
                        raw_imports.append((node.module, file_rel))
            except Exception as exc:
                logger = create_logger(Session.create_session(), __name__)
                logger.error('Failed on file: %s' % file_rel)
                raise exc

    for lib, file in raw_imports:
        name = lib.split('.')[0]
        try:
            if name in _mapping:
                name = _mapping[name]
            version = pkg_resources.get_distribution(name).version
            res.append((name, version))
        except Exception:
            pass

    return res
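# Usage sketch (an assumption for illustration, not part of the original
# module): running find_imports over the current project tree. The returned
# pairs are (distribution name, installed version) for every import that
# pkg_resources can resolve; unresolved imports are silently skipped above.
if __name__ == '__main__':
    found = find_imports(
        path='.',                        # project root to scan recursively
        exclude_patterns=['tests/*']     # gitwildmatch patterns to skip
    )
    for name, version in sorted(set(found)):
        print(f'{name}=={version}')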
def _dag(config: str, debug: bool = False, control_reqs=True,
         params: Tuple[str] = ()):
    logger = create_logger(_session, name='_dag')
    logger.info('started', ComponentType.Client)

    config_text = open(config, 'r').read()
    config_parsed = yaml_load(config_text)
    params = dict_from_list_str(params)
    config_parsed = merge_dicts_smart(config_parsed, params)
    config_text = yaml_dump(config_parsed)

    logger.info('config parsed', ComponentType.Client)

    try:
        commit = subprocess.check_output(["git", "rev-parse",
                                          "HEAD"]).strip()
        config_parsed['info']['name'] += f'_{commit.decode("utf-8")[:6]}'
    except Exception:
        logger.info('commit not parsed')

    type_name = config_parsed['info'].get('type', 'standard')
    if type_name == DagType.Standard.name.lower():
        cells = grid_cells(
            config_parsed['grid']) if 'grid' in config_parsed else [None]
        dags = []
        for cell in cells:
            dag = dag_standard(session=_session,
                               config=config_parsed,
                               debug=debug,
                               config_text=config_text,
                               config_path=config,
                               control_reqs=control_reqs,
                               logger=logger,
                               component=ComponentType.Client,
                               grid_cell=cell)
            dags.append(dag)

        return dags

    return [
        dag_pipe(session=_session, config=config_parsed,
                 config_text=config_text)
    ]
def sync_directed(
        session: Session, source: Computer, target: Computer,
        ignore_folders: List
):
    current_computer = socket.gethostname()
    end = ' --perms --chmod=777 --size-only'
    logger = create_logger(session, __name__)

    for folder, excluded in ignore_folders:
        if len(excluded) > 0:
            for i in range(len(excluded)):
                excluded[i] = f'--exclude {excluded[i]}'
            end += ' ' + ' '.join(excluded)

        source_folder = join(source.root_folder, folder)
        target_folder = join(target.root_folder, folder)

        if current_computer == source.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f'{source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
        elif current_computer == target.name:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {source.port} -o StrictHostKeyChecking=no" ' \
                      f'{source.user}@{source.ip}:{source_folder}/ ' \
                      f'{target_folder}/ {end}'
        else:
            command = f'rsync -vhru -e ' \
                      f'"ssh -p {target.port} -o StrictHostKeyChecking=no" ' \
                      f' {source_folder}/ ' \
                      f'{target.user}@{target.ip}:{target_folder}/ {end}'
            command = f'ssh -p {source.port} ' \
                      f'{source.user}@{source.ip} "{command}"'

        logger.info(command, ComponentType.WorkerSupervisor, current_computer)
        subprocess.check_output(command, shell=True)
def error_handler(f):
    name = f.__name__
    wrapper_vars = {'session': Session.create_session(key=name)}
    wrapper_vars['logger'] = create_logger(wrapper_vars['session'], name)
    hostname = socket.gethostname()

    def wrapper():
        try:
            f(wrapper_vars['session'], wrapper_vars['logger'])
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup(name)
                wrapper_vars['session'] = Session.create_session(key=name)
                wrapper_vars['logger'] = create_logger(
                    wrapper_vars['session'], name)

            wrapper_vars['logger'].error(traceback.format_exc(),
                                         ComponentType.WorkerSupervisor,
                                         hostname)

    return wrapper
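# Usage sketch (hypothetical, not from the original source): error_handler
# expects a function taking (session, logger) and returns a zero-argument
# wrapper, which makes the decorated callable suitable for start_schedule.
# On a SQLAlchemy failure the wrapper recreates the session and logger and
# logs the traceback instead of letting the scheduled job die.
@error_handler
def worker_usage_example(session, logger):
    # A real scheduled job would write metrics through a provider here.
    logger.info('collecting worker usage', ComponentType.WorkerSupervisor)

# start_schedule([(worker_usage_example, 10)])  # run every 10 seconds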
def decorated(*args, **kwargs):
    global _read_session, _write_session, logger

    success = True
    status = 200
    error = ''
    try:
        res = f(*args, **kwargs)
    except Exception as e:
        if Session.sqlalchemy_error(e):
            Session.cleanup('server.read')
            Session.cleanup('server.write')
            _read_session = Session.create_session(key='server.read')
            _write_session = Session.create_session(key='server.write')
            logger = create_logger(_write_session, __name__)

        logger.error(
            f'Requested Url: {request.path}\n\n{traceback.format_exc()}',
            ComponentType.API
        )
        error = traceback.format_exc()
        success = False
        status = 500
        res = None

    res = res or {}
    if isinstance(res, Response):
        return res

    res['success'] = success
    res['error'] = error
    return Response(json.dumps(res), status=status)
class FileSync:
    session = Session.create_session(key='FileSync')
    logger = create_logger(session, 'FileSync')

    def sync_manual(self, computer: Computer, provider: ComputerProvider):
        """
        Sync requested manually via the sync button.
        """
        if not computer.meta:
            return

        meta = yaml_load(computer.meta)
        if 'manual_sync' not in meta:
            return

        manual_sync = meta['manual_sync']

        project_provider = ProjectProvider(self.session)
        docker_provider = DockerProvider(self.session)

        dockers = docker_provider.get_online()
        project = project_provider.by_id(manual_sync['project'])

        for docker in dockers:
            if docker.computer == computer.name:
                continue

            source = provider.by_name(docker.computer)
            ignore_folders = [
                [join('models', project.name), []]
            ]
            sync_directed(self.session, target=computer, source=source,
                          ignore_folders=ignore_folders)

        del meta['manual_sync']
        computer.meta = yaml_dump(meta)
        provider.update()

    def sync(self):
        hostname = socket.gethostname()
        try:
            provider = ComputerProvider(self.session)
            task_synced_provider = TaskSyncedProvider(self.session)

            computer = provider.by_name(hostname)
            sync_start = now()

            if FILE_SYNC_INTERVAL == 0:
                time.sleep(1)
            else:
                self.sync_manual(computer, provider)

                computers = provider.all_with_last_activtiy()
                computers = [
                    c for c in computers
                    if (now() - c.last_activity).total_seconds() < 10
                ]
                computers_names = {c.name for c in computers}

                for c, project, tasks in task_synced_provider.for_computer(
                        computer.name):
                    if c.sync_with_this_computer:
                        if c.name not in computers_names:
                            self.logger.info(f'Computer = {c.name} '
                                             f'is offline. Can not sync',
                                             ComponentType.WorkerSupervisor,
                                             hostname)
                            continue

                        if c.syncing_computer:
                            continue

                        ignore_folders = [
                            [join('models', project.name), []]
                        ]

                        computer.syncing_computer = c.name
                        provider.update()

                        sync_directed(self.session, c, computer,
                                      ignore_folders)

                    for t in tasks:
                        task_synced_provider.add(
                            TaskSynced(computer=computer.name, task=t.id)
                        )

                time.sleep(FILE_SYNC_INTERVAL)

            computer.last_synced = sync_start
            computer.syncing_computer = None
            provider.update()
        except Exception as e:
            if Session.sqlalchemy_error(e):
                Session.cleanup('FileSync')
                self.session = Session.create_session(key='FileSync')
                self.logger = create_logger(self.session, 'FileSync')

            self.logger.error(
                traceback.format_exc(),
                ComponentType.WorkerSupervisor, hostname
            )
import socket
from enum import Enum

from kaggle.models import DatasetNewRequest

from mlcomp.db.core import Session
from mlcomp.db.enums import ComponentType
from mlcomp.db.providers import ModelProvider
from mlcomp.worker.executors.base.equation import Equation
from mlcomp.worker.executors.base.executor import Executor
from mlcomp.utils.logging import create_logger
from mlcomp.utils.config import Config

try:
    from kaggle import api
except OSError:
    logger = create_logger(Session.create_session(), __name__)
    logger.warning(
        'Could not find kaggle.json. '
        'Kaggle executors can not be used',
        ComponentType.Worker, socket.gethostname())


class DownloadType(Enum):
    Kaggle = 0
    Link = 1


@Executor.register
class Download(Executor):
    def __init__(self, output: str,
from mlcomp.server.back.supervisor import register_supervisor
from mlcomp.utils.logging import create_logger
from mlcomp.utils.io import from_module_path, zip_folder
from mlcomp.server.back.create_dags import dag_model_add, dag_model_start
from mlcomp.utils.misc import to_snake, now
from mlcomp.db.models import Model, Report, ReportLayout, Task
from mlcomp.utils.io import yaml_load, yaml_dump
from mlcomp.worker.storage import Storage

app = Flask(__name__)
CORS(app)

_read_session = Session.create_session(key='server.read')
_write_session = Session.create_session(key='server.write')

logger = create_logger(_write_session, __name__)


@app.route('/', defaults={'path': ''}, methods=['GET'])
@app.route('/<path:path>', methods=['GET'])
def send_static(path):
    file = 'index.html'
    if '.' in path:
        file = path

    module_path = from_module_path(__file__, f'../front/dist/mlcomp/')
    return send_from_directory(module_path, file)


def request_data():
    return json.loads(request.data.decode('utf-8'))