def start(self):
    if not _mpi_enabled:
        raise OptionalModuleMissing("mpi4py",
                                    "Cannot initialize ExtremeScaleExecutor without mpi4py")
    else:
        # This is only to stop flake8 from complaining
        logger.debug("MPI version :{}".format(mpi4py.__version__))

    super().start()
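The `_mpi_enabled` flag checked above (and the other `_*_enabled` flags throughout this section) is typically set by a guarded import at module level; a minimal sketch of that pattern, assuming the module-level names used here, looks like:

# Sketch of the guarded-import pattern assumed by the _mpi_enabled check above.
# The same idiom underlies the other optional-module flags in this section.
try:
    import mpi4py
    _mpi_enabled = True
except ImportError:
    _mpi_enabled = False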
def __init__(self, hostname, username=None, script_dir=None, envs=None, port=22):
    ''' Initialize a persistent connection to the remote system.
    We should know at this point whether ssh connectivity is possible

    Args:
        - hostname (String) : Hostname

    KWargs:
        - username (string) : Username on remote system
        - script_dir (string) : Full path to a script dir where
            generated scripts could be sent to.
        - envs (dict) : A dictionary of env variables to be set when executing commands
        - port (int) : Port at which the SSHService is running

    Raises:
    '''

    if not _oauth_ssh_enabled:
        raise OptionalModuleMissing(
            ['oauth_ssh'],
            "OauthSSHChannel requires oauth_ssh module and config.")

    self.hostname = hostname
    self.username = username
    self.script_dir = script_dir
    self.port = port
    self.envs = {}
    if envs is not None:
        self.envs = envs

    try:
        access_token = find_access_token(hostname)
    except Exception:
        logger.exception("Failed to find the access token for {}".format(hostname))
        raise

    try:
        self.service = SSHService(hostname, port)
        self.transport = self.service.login(access_token, username)
    except Exception:
        logger.exception("Caught an exception in the OAuth authentication step with {}".format(hostname))
        raise

    self.sftp_client = paramiko.SFTPClient.from_transport(self.transport)
def __init__(self,
             label='ExtremeScaleExecutor',
             provider=LocalProvider(),
             launch_cmd=None,
             address="127.0.0.1",
             worker_ports=None,
             worker_port_range=(54000, 55000),
             interchange_port_range=(55000, 56000),
             storage_access=None,
             working_dir=None,
             worker_debug=False,
             ranks_per_node=1,
             heartbeat_threshold=120,
             heartbeat_period=30,
             managed=True):
    super().__init__(label=label,
                     provider=provider,
                     launch_cmd=launch_cmd,
                     address=address,
                     worker_ports=worker_ports,
                     worker_port_range=worker_port_range,
                     interchange_port_range=interchange_port_range,
                     storage_access=storage_access,
                     working_dir=working_dir,
                     worker_debug=worker_debug,
                     heartbeat_threshold=heartbeat_threshold,
                     heartbeat_period=heartbeat_period,
                     managed=managed)

    if not _mpi_enabled:
        raise OptionalModuleMissing(
            "mpi4py",
            "Cannot initialize ExtremeScaleExecutor without mpi4py")
    else:
        # This is only to stop flake8 from complaining
        logger.debug("MPI version :{}".format(mpi4py.__version__))

    self.ranks_per_node = ranks_per_node

    logger.debug("Initializing ExtremeScaleExecutor")

    if not launch_cmd:
        self.launch_cmd = (
            "mpiexec -np {ranks_per_node} mpi_worker_pool.py "
            "{debug} "
            "--task_url={task_url} "
            "--result_url={result_url} "
            "--logdir={logdir} "
            "--hb_period={heartbeat_period} "
            "--hb_threshold={heartbeat_threshold} ")
    self.worker_debug = worker_debug
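For illustration only, the placeholders in launch_cmd are later filled in with str.format; the sketch below uses made-up values (the real ones are supplied by the executor when a block is launched):

# Illustrative only: shows how the launch_cmd template's placeholders could be
# substituted. All values below are placeholders, not the executor's actual ones.
launch_cmd = (
    "mpiexec -np {ranks_per_node} mpi_worker_pool.py "
    "{debug} "
    "--task_url={task_url} "
    "--result_url={result_url} "
    "--logdir={logdir} "
    "--hb_period={heartbeat_period} "
    "--hb_threshold={heartbeat_threshold} ")

example = launch_cmd.format(ranks_per_node=2,
                            debug="--debug",
                            task_url="tcp://127.0.0.1:55001",
                            result_url="tcp://127.0.0.1:55002",
                            logdir="runinfo/000/ExtremeScaleExecutor",
                            heartbeat_period=30,
                            heartbeat_threshold=120)
print(example)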
def __init__(self,
             image: str,
             namespace: str = 'default',
             nodes_per_block: int = 1,
             init_blocks: int = 4,
             min_blocks: int = 0,
             max_blocks: int = 10,
             max_cpu: float = 2,
             max_mem: str = "500Mi",
             init_cpu: float = 1,
             init_mem: str = "250Mi",
             parallelism: float = 1,
             worker_init: str = "",
             pod_name: Optional[str] = None,
             user_id: Optional[str] = None,
             group_id: Optional[str] = None,
             run_as_non_root: bool = False,
             secret: Optional[str] = None,
             persistent_volumes: List[Tuple[str, str]] = []) -> None:
    if not _kubernetes_enabled:
        raise OptionalModuleMissing(
            ['kubernetes'],
            "Kubernetes provider requires kubernetes module and config.")

    config.load_kube_config()

    self.namespace = namespace
    self.image = image
    self.nodes_per_block = nodes_per_block
    self.init_blocks = init_blocks
    self.min_blocks = min_blocks
    self.max_blocks = max_blocks
    self.max_cpu = max_cpu
    self.max_mem = max_mem
    self.init_cpu = init_cpu
    self.init_mem = init_mem
    self.parallelism = parallelism
    self.worker_init = worker_init
    self.secret = secret
    self.pod_name = pod_name
    self.user_id = user_id
    self.group_id = group_id
    self.run_as_non_root = run_as_non_root
    self.persistent_volumes = persistent_volumes

    self.kube_client = client.CoreV1Api()

    # Dictionary that keeps track of jobs, keyed on job_id
    self.resources = {}  # type: Dict[object, Dict[str, Any]]
def get_db_logger(logger_name='parsl_db_logger',
                  is_logging_server=False,
                  monitoring_config=None,
                  **kwargs):
    """
    Parameters
    ----------
    logger_name : str, optional
        Name of the logger to use. Prevents adding repeat handlers or incorrect handlers
    is_logging_server : bool, optional
        Used internally to determine which handler to return when using local db logging
    monitoring_config : MonitoringConfig, optional
        Pass in a logger class object to use for generating loggers.

    Returns
    -------
    logging.Logger object

    Raises
    ------
    OptionalModuleMissing
    """
    logger = logging.getLogger(logger_name)
    if monitoring_config is None:
        logger.addHandler(NullHandler())
        return logger

    if monitoring_config.database_type == 'elasticsearch':
        if not _es_logging_enabled:
            raise OptionalModuleMissing(
                ['CMRESHandler'],
                "Logging to ElasticSearch requires the cmreslogging module")

        handler = CMRESHandler(hosts=[{'host': monitoring_config.host,
                                       'port': monitoring_config.port}],
                               use_ssl=monitoring_config.enable_ssl,
                               auth_type=CMRESHandler.AuthType.NO_AUTH,
                               es_index_name=monitoring_config.index_name,
                               es_additional_fields={
                                   'Campaign': "test",
                                   'Version': monitoring_config.version,
                                   'Username': getpass.getuser()})
        logger = logging.getLogger(monitoring_config.logger_name)
        logger.setLevel(logging.INFO)
        logger.addHandler(handler)
    elif monitoring_config.database_type == 'local_database' and not is_logging_server:
        # add a handler that will pass logs to the logging server
        handler = RemoteHandler(monitoring_config.web_app_host,
                                monitoring_config.web_app_port)
        # use the specific name generated by the server or the monitor wrapper
        logger = logging.getLogger(logger_name)
        logger.setLevel(logging.INFO)
        logger.addHandler(handler)
    elif monitoring_config.database_type == 'local_database' and is_logging_server:
        # add a handler that will take logs being received on the server and log them to the database
        handler = DatabaseHandler(monitoring_config.eng_link)
        # use the specific name generated by the server or the monitor wrapper
        logger = logging.getLogger(logger_name)
        logger.setLevel(logging.INFO)
        logger.addHandler(handler)
    else:
        raise ValueError(
            'database_type must be one of ["local_database", "elasticsearch"]')

    return logger
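A minimal usage sketch of get_db_logger: with no monitoring_config the returned logger only carries a NullHandler, so logging calls are no-ops.

# Minimal usage sketch: without a monitoring_config the returned logger has only
# a NullHandler attached, so the message below is silently discarded.
db_logger = get_db_logger()
db_logger.info("task 0 completed")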
class Database:

    if not _sqlalchemy_enabled:
        raise OptionalModuleMissing(
            ['sqlalchemy'],
            ("Default database logging requires the sqlalchemy library."
             " Enable monitoring support with: pip install parsl[monitoring]"))
    if not _sqlalchemy_utils_enabled:
        raise OptionalModuleMissing(
            ['sqlalchemy_utils'],
            ("Default database logging requires the sqlalchemy_utils library."
             " Enable monitoring support with: pip install parsl[monitoring]"))

    Base = declarative_base()

    def __init__(self,
                 url: str = 'sqlite:///monitoring.db',
                 ):
        self.eng = sa.create_engine(url)
        self.meta = self.Base.metadata

        self.meta.create_all(self.eng)
        self.meta.reflect(bind=self.eng)

        Session = sessionmaker(bind=self.eng)
        self.session = Session()

    def update(self, *, table: str, columns: List[str],
               messages: List[Dict[str, Any]]) -> None:
        table_obj = self.meta.tables[table]
        mappings = self._generate_mappings(table_obj, columns=columns,
                                           messages=messages)
        mapper = get_mapper(table_obj)
        self.session.bulk_update_mappings(mapper, mappings)
        self.session.commit()

    def insert(self, *, table: str, messages: List[Dict[str, Any]]) -> None:
        table_obj = self.meta.tables[table]
        mappings = self._generate_mappings(table_obj, messages=messages)
        mapper = get_mapper(table_obj)
        self.session.bulk_insert_mappings(mapper, mappings)
        self.session.commit()

    def rollback(self) -> None:
        self.session.rollback()

    def _generate_mappings(self, table: Table, columns: Optional[List[str]] = None,
                           messages: List[Dict[str, Any]] = []) -> List[Dict[str, Any]]:
        mappings = []
        for msg in messages:
            m = {}
            if columns is None:
                columns = table.c.keys()
            for column in columns:
                m[column] = msg.get(column, None)
            mappings.append(m)
        return mappings

    class Workflow(Base):
        __tablename__ = WORKFLOW
        run_id = Column(Text, nullable=False, primary_key=True)
        workflow_name = Column(Text, nullable=True)
        workflow_version = Column(Text, nullable=True)
        time_began = Column(DateTime, nullable=False)
        time_completed = Column(DateTime, nullable=True)
        host = Column(Text, nullable=False)
        user = Column(Text, nullable=False)
        rundir = Column(Text, nullable=False)
        tasks_failed_count = Column(Integer, nullable=False)
        tasks_completed_count = Column(Integer, nullable=False)

    class Status(Base):
        __tablename__ = STATUS
        task_id = Column(Integer, sa.ForeignKey('task.task_id'), nullable=False)
        task_status_name = Column(Text, nullable=False)
        timestamp = Column(DateTime, nullable=False)
        run_id = Column(Text, sa.ForeignKey('workflow.run_id'), nullable=False)
        try_id = Column('try_id', Integer, nullable=False)
        __table_args__ = (PrimaryKeyConstraint('task_id', 'run_id',
                                               'task_status_name', 'timestamp'),)

    class Task(Base):
        __tablename__ = TASK
        task_id = Column('task_id', Integer, nullable=False)
        run_id = Column('run_id', Text, nullable=False)
        task_depends = Column('task_depends', Text, nullable=True)
        task_func_name = Column('task_func_name', Text, nullable=False)
        task_memoize = Column('task_memoize', Text, nullable=False)
        task_hashsum = Column('task_hashsum', Text, nullable=True)
        task_inputs = Column('task_inputs', Text, nullable=True)
        task_outputs = Column('task_outputs', Text, nullable=True)
        task_stdin = Column('task_stdin', Text, nullable=True)
        task_stdout = Column('task_stdout', Text, nullable=True)
        task_stderr = Column('task_stderr', Text, nullable=True)
        task_time_invoked = Column('task_time_invoked', DateTime, nullable=True)
        task_time_returned = Column('task_time_returned', DateTime, nullable=True)
        task_fail_count = Column('task_fail_count', Integer, nullable=False)
        __table_args__ = (PrimaryKeyConstraint('task_id', 'run_id'),)

    class Try(Base):
        __tablename__ = TRY
        try_id = Column('try_id', Integer, nullable=False)
        task_id = Column('task_id', Integer, nullable=False)
        run_id = Column('run_id', Text, nullable=False)
        hostname = Column('hostname', Text, nullable=True)
        task_executor = Column('task_executor', Text, nullable=False)
        task_try_time_launched = Column('task_try_time_launched', DateTime, nullable=True)
        task_try_time_running = Column('task_try_time_running', DateTime, nullable=True)
        task_try_time_returned = Column('task_try_time_returned', DateTime, nullable=True)
        task_fail_history = Column('task_fail_history', Text, nullable=True)
        task_joins = Column('task_joins', Text, nullable=True)
        __table_args__ = (PrimaryKeyConstraint('try_id', 'task_id', 'run_id'),)

    class Node(Base):
        __tablename__ = NODE
        id = Column('id', Integer, nullable=False, primary_key=True, autoincrement=True)
        run_id = Column('run_id', Text, nullable=False)
        hostname = Column('hostname', Text, nullable=False)
        uid = Column('uid', Text, nullable=False)
        block_id = Column('block_id', Text, nullable=False)
        cpu_count = Column('cpu_count', Integer, nullable=False)
        total_memory = Column('total_memory', Integer, nullable=False)
        active = Column('active', Boolean, nullable=False)
        worker_count = Column('worker_count', Integer, nullable=False)
        python_v = Column('python_v', Text, nullable=False)
        timestamp = Column('timestamp', DateTime, nullable=False)
        last_heartbeat = Column('last_heartbeat', DateTime, nullable=False)

    class Block(Base):
        __tablename__ = BLOCK
        run_id = Column('run_id', Text, nullable=False)
        executor_label = Column('executor_label', Text, nullable=False)
        block_id = Column('block_id', Text, nullable=False)
        job_id = Column('job_id', Text, nullable=True)
        timestamp = Column('timestamp', DateTime, nullable=False)
        status = Column("status", Text, nullable=False)
        __table_args__ = (PrimaryKeyConstraint('run_id', 'block_id',
                                               'executor_label', 'timestamp'),)

    class Resource(Base):
        __tablename__ = RESOURCE
        try_id = Column('try_id', Integer, sa.ForeignKey('try.try_id'), nullable=False)
        task_id = Column('task_id', Integer, sa.ForeignKey('task.task_id'), nullable=False)
        run_id = Column('run_id', Text, sa.ForeignKey('workflow.run_id'), nullable=False)
        timestamp = Column('timestamp', DateTime, nullable=False)
        resource_monitoring_interval = Column('resource_monitoring_interval', Float, nullable=True)
        psutil_process_pid = Column('psutil_process_pid', Integer, nullable=True)
        psutil_process_cpu_percent = Column('psutil_process_cpu_percent', Float, nullable=True)
        psutil_process_memory_percent = Column('psutil_process_memory_percent', Float, nullable=True)
        psutil_process_children_count = Column('psutil_process_children_count', Float, nullable=True)
        psutil_process_time_user = Column('psutil_process_time_user', Float, nullable=True)
        psutil_process_time_system = Column('psutil_process_time_system', Float, nullable=True)
        psutil_process_memory_virtual = Column('psutil_process_memory_virtual', Float, nullable=True)
        psutil_process_memory_resident = Column('psutil_process_memory_resident', Float, nullable=True)
        psutil_process_disk_read = Column('psutil_process_disk_read', Float, nullable=True)
        psutil_process_disk_write = Column('psutil_process_disk_write', Float, nullable=True)
        psutil_process_status = Column('psutil_process_status', Text, nullable=True)
        __table_args__ = (PrimaryKeyConstraint('try_id', 'task_id', 'run_id', 'timestamp'),)
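A hedged usage sketch of the Database wrapper above: insert() looks up the table by name in the reflected metadata and bulk-inserts dictionaries whose keys match the column names. WORKFLOW is the table-name constant used by the Workflow model; the row values below are made up.

# Hedged usage sketch (illustrative values only): insert a single workflow row.
import datetime

db = Database(url='sqlite:///monitoring.db')
db.insert(table=WORKFLOW,
          messages=[{
              'run_id': 'example-run-id',          # placeholder identifier
              'workflow_name': 'example-workflow',
              'time_began': datetime.datetime.now(),
              'host': 'examplehost',
              'user': 'exampleuser',
              'rundir': 'runinfo/000',
              'tasks_failed_count': 0,
              'tasks_completed_count': 0,
          }])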
def __init__(self,
             vm_reference,
             init_blocks=1,
             min_blocks=0,
             max_blocks=10,
             parallelism=1,
             worker_init='',
             location='westus',
             group_name='parsl.group',
             key_name=None,
             key_file=None,
             vnet_name="parsl.vnet",
             linger=False,
             launcher=SingleNodeLauncher()):
    if not _api_enabled:
        raise OptionalModuleMissing(
            ['azure', 'msrestazure'],
            "Azure Provider requires the azure module.")

    self._label = 'azure'
    self.init_blocks = init_blocks
    self.min_blocks = min_blocks
    self.max_blocks = max_blocks
    self.max_nodes = max_blocks
    self.parallelism = parallelism
    self.nodes_per_block = 1

    self.worker_init = worker_init
    self.vm_reference = vm_reference
    self.region = location
    self.vnet_name = vnet_name

    self.key_name = key_name
    self.key_file = key_file
    self.location = location
    self.group_name = group_name

    self.launcher = launcher
    self.linger = linger
    self.resources = {}
    self.instances = []

    env_specified = (os.getenv("AZURE_CLIENT_ID") is not None
                     and os.getenv("AZURE_CLIENT_SECRET") is not None
                     and os.getenv("AZURE_TENANT_ID") is not None
                     and os.getenv("AZURE_SUBSCRIPTION_ID") is not None)

    if key_file is None and not env_specified:
        raise ConfigurationError(
            ("Must specify either: 'key_file', or "
             "`AZURE_CLIENT_ID`, `AZURE_CLIENT_SECRET`, "
             "and `AZURE_TENANT_ID` environment variables."))

    if key_file is None:
        self.clientid = os.getenv("AZURE_CLIENT_ID")
        self.clientsecret = os.getenv("AZURE_CLIENT_SECRET")
        self.tenantid = os.getenv("AZURE_TENANT_ID")
        self.subid = os.getenv("AZURE_SUBSCRIPTION_ID")
    else:
        with open(key_file) as fh:
            keys = json.load(fh)
            self.clientid = keys.get("AZURE_CLIENT_ID")
            self.clientsecret = keys.get("AZURE_CLIENT_SECRET")
            self.tenantid = keys.get("AZURE_TENANT_ID")
            self.subid = keys.get("AZURE_SUBSCRIPTION_ID")

    self.get_clients()
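Judging from the keys.get() calls above, the key_file is expected to contain JSON keyed by the same names as the environment variables; a hedged sketch of producing such a file (the file name and all values are placeholders):

# Hedged sketch: write a credentials file in the shape the __init__ above reads.
# All values are placeholders.
import json

with open("azure_keys.json", "w") as fh:
    json.dump({
        "AZURE_CLIENT_ID": "<client-id>",
        "AZURE_CLIENT_SECRET": "<client-secret>",
        "AZURE_TENANT_ID": "<tenant-id>",
        "AZURE_SUBSCRIPTION_ID": "<subscription-id>",
    }, fh, indent=2)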
def __init__(self,
             label: str = "WorkQueueExecutor",
             provider: ExecutionProvider = LocalProvider(),
             working_dir: str = ".",
             managed: bool = True,
             project_name: Optional[str] = None,
             project_password_file: Optional[str] = None,
             address: Optional[str] = None,
             port: int = WORK_QUEUE_DEFAULT_PORT,
             env: Optional[Dict] = None,
             shared_fs: bool = False,
             storage_access: Optional[List[Staging]] = None,
             use_cache: bool = False,
             source: bool = False,
             pack: bool = False,
             extra_pkgs: Optional[List[str]] = None,
             autolabel: bool = False,
             autolabel_window: int = 1,
             autocategory: bool = True,
             max_retries: Optional[int] = 1,
             init_command: str = "",
             worker_options: str = "",
             full_debug: bool = True,
             worker_executable: str = 'work_queue_worker'):
    NoStatusHandlingExecutor.__init__(self)
    self._provider = provider
    self._scaling_enabled = True

    if not _work_queue_enabled:
        raise OptionalModuleMissing(
            ['work_queue'],
            "WorkQueueExecutor requires the work_queue module.")

    self.label = label
    self.managed = managed
    self.task_queue = multiprocessing.Queue()  # type: multiprocessing.Queue
    self.collector_queue = multiprocessing.Queue()  # type: multiprocessing.Queue
    self.blocks = {}  # type: Dict[str, str]
    self.address = address
    self.port = port
    self.task_counter = -1
    self.project_name = project_name
    self.project_password_file = project_password_file
    self.env = env
    self.init_command = init_command
    self.shared_fs = shared_fs
    self.storage_access = storage_access
    self.use_cache = use_cache
    self.working_dir = working_dir
    self.registered_files = set()  # type: Set[str]
    self.full_debug = full_debug
    self.source = True if pack else source
    self.pack = pack
    self.extra_pkgs = extra_pkgs or []
    self.autolabel = autolabel
    self.autolabel_window = autolabel_window
    self.autocategory = autocategory
    self.max_retries = max_retries
    self.should_stop = multiprocessing.Value(c_bool, False)
    self.cached_envs = {}  # type: Dict[int, str]
    self.worker_options = worker_options
    self.worker_executable = worker_executable

    if not self.address:
        self.address = socket.gethostname()

    if self.project_password_file is not None and not os.path.exists(self.project_password_file):
        raise WorkQueueFailure('Could not find password file: {}'.format(
            self.project_password_file))

    if self.project_password_file is not None:
        if os.path.exists(self.project_password_file) is False:
            logger.debug("Password File does not exist, no file used")
            self.project_password_file = None

    # Build foundations of the launch command
    self.launch_cmd = ("{package_prefix}python3 exec_parsl_function.py {mapping} {function} {result}")
    if self.init_command != "":
        self.launch_cmd = self.init_command + "; " + self.launch_cmd
def __init__(self,
             image_id,
             key_name,
             init_blocks=1,
             min_blocks=0,
             max_blocks=10,
             nodes_per_block=1,
             parallelism=1,
             worker_init='',
             instance_type='t2.small',
             region='us-east-2',
             spot_max_bid=0,
             key_file=None,
             profile=None,
             iam_instance_profile_arn='',
             state_file=None,
             walltime="01:00:00",
             linger=False,
             launcher=SingleNodeLauncher()):
    if not _boto_enabled:
        raise OptionalModuleMissing(
            ['boto3'], "AWS Provider requires the boto3 module.")

    self.image_id = image_id
    self._label = 'ec2'
    self.init_blocks = init_blocks
    self.min_blocks = min_blocks
    self.max_blocks = max_blocks
    self.nodes_per_block = nodes_per_block
    self.max_nodes = max_blocks * nodes_per_block
    self.parallelism = parallelism

    self.worker_init = worker_init
    self.instance_type = instance_type
    self.region = region
    self.spot_max_bid = spot_max_bid

    self.key_name = key_name
    self.key_file = key_file
    self.profile = profile
    self.iam_instance_profile_arn = iam_instance_profile_arn

    self.walltime = walltime
    self.launcher = launcher
    self.linger = linger
    self.resources = {}
    self.state_file = state_file if state_file is not None else 'awsproviderstate.json'

    env_specified = (os.getenv("AWS_ACCESS_KEY_ID") is not None
                     and os.getenv("AWS_SECRET_ACCESS_KEY") is not None)

    if profile is None and key_file is None and not env_specified:
        raise ConfigurationError(
            "Must specify either 'profile', 'key_file', or "
            "'AWS_ACCESS_KEY_ID' and 'AWS_SECRET_ACCESS_KEY' environment variables.")

    try:
        self.initialize_boto_client()
    except Exception as e:
        logger.error("{} failed to initialize.".format(self))
        raise e

    state_file_exists = False
    try:
        self.read_state_file(self.state_file)
        state_file_exists = True
    except Exception:
        logger.info(
            "No state file found. Cannot load previous options. Creating new infrastructure.")

    if not state_file_exists:
        try:
            self.create_vpc().id
        except Exception as e:
            logger.info("Failed to create ec2 infrastructure: {0}".format(e))
            raise
        else:
            self.write_state_file()
class Database:

    if not _sqlalchemy_enabled:
        raise OptionalModuleMissing(
            ['sqlalchemy'],
            ("Default database logging requires the sqlalchemy library."
             " Enable monitoring support with: pip install 'parsl[monitoring]'"))

    Base = declarative_base()

    def __init__(self,
                 url: str = 'sqlite:///runinfomonitoring.db',
                 ):
        self.eng = sa.create_engine(url)
        self.meta = self.Base.metadata

        # TODO: this code wants a read lock on the sqlite3 database, and fails if it cannot
        # - for example, if someone else is querying the database at the point that the
        # monitoring system is initialized. See PR #1917 for related locked-for-read fixes
        # elsewhere in this file.
        self.meta.create_all(self.eng)
        self.meta.reflect(bind=self.eng)

        Session = sessionmaker(bind=self.eng)
        self.session = Session()

    def _get_mapper(self, table_obj: Table) -> Mapper:
        if hasattr(mapperlib, '_all_registries'):
            all_mappers = set()
            for mapper_registry in mapperlib._all_registries():  # type: ignore
                all_mappers.update(mapper_registry.mappers)
        else:  # SQLAlchemy <1.4
            all_mappers = mapperlib._mapper_registry  # type: ignore
        mapper_gen = (mapper for mapper in all_mappers
                      if table_obj in mapper.tables)
        try:
            mapper = next(mapper_gen)
            second_mapper = next(mapper_gen, False)
        except StopIteration:
            raise ValueError(f"Could not get mapper for table {table_obj}")

        if second_mapper:
            raise ValueError(f"Multiple mappers for table {table_obj}")
        return mapper

    def update(self, *, table: str, columns: List[str],
               messages: List[MonitoringMessage]) -> None:
        table_obj = self.meta.tables[table]
        mappings = self._generate_mappings(table_obj, columns=columns,
                                           messages=messages)
        mapper = self._get_mapper(table_obj)
        self.session.bulk_update_mappings(mapper, mappings)
        self.session.commit()

    def insert(self, *, table: str, messages: List[MonitoringMessage]) -> None:
        table_obj = self.meta.tables[table]
        mappings = self._generate_mappings(table_obj, messages=messages)
        mapper = self._get_mapper(table_obj)
        self.session.bulk_insert_mappings(mapper, mappings)
        self.session.commit()

    def rollback(self) -> None:
        self.session.rollback()

    def _generate_mappings(self, table: Table, columns: Optional[List[str]] = None,
                           messages: List[MonitoringMessage] = []) -> List[Dict[str, Any]]:
        mappings = []
        for msg in messages:
            m = {}
            if columns is None:
                columns = table.c.keys()
            for column in columns:
                m[column] = msg.get(column, None)
            mappings.append(m)
        return mappings

    class Workflow(Base):
        __tablename__ = WORKFLOW
        run_id = Column(Text, nullable=False, primary_key=True)
        workflow_name = Column(Text, nullable=True)
        workflow_version = Column(Text, nullable=True)
        time_began = Column(DateTime, nullable=False)
        time_completed = Column(DateTime, nullable=True)
        host = Column(Text, nullable=False)
        user = Column(Text, nullable=False)
        rundir = Column(Text, nullable=False)
        tasks_failed_count = Column(Integer, nullable=False)
        tasks_completed_count = Column(Integer, nullable=False)

    class Status(Base):
        __tablename__ = STATUS
        task_id = Column(Integer, sa.ForeignKey('task.task_id'), nullable=False)
        task_status_name = Column(Text, nullable=False)
        timestamp = Column(DateTime, nullable=False)
        run_id = Column(Text, sa.ForeignKey('workflow.run_id'), nullable=False)
        try_id = Column('try_id', Integer, nullable=False)
        __table_args__ = (PrimaryKeyConstraint('task_id', 'run_id',
                                               'task_status_name', 'timestamp'),)

    class Task(Base):
        __tablename__ = TASK
        task_id = Column('task_id', Integer, nullable=False)
        run_id = Column('run_id', Text, nullable=False)
        task_depends = Column('task_depends', Text, nullable=True)
        task_func_name = Column('task_func_name', Text, nullable=False)
        task_memoize = Column('task_memoize', Text, nullable=False)
        task_hashsum = Column('task_hashsum', Text, nullable=True, index=True)
        task_inputs = Column('task_inputs', Text, nullable=True)
        task_outputs = Column('task_outputs', Text, nullable=True)
        task_stdin = Column('task_stdin', Text, nullable=True)
        task_stdout = Column('task_stdout', Text, nullable=True)
        task_stderr = Column('task_stderr', Text, nullable=True)
        task_time_invoked = Column('task_time_invoked', DateTime, nullable=True)
        task_time_returned = Column('task_time_returned', DateTime, nullable=True)
        task_fail_count = Column('task_fail_count', Integer, nullable=False)
        task_fail_cost = Column('task_fail_cost', Float, nullable=False)
        __table_args__ = (PrimaryKeyConstraint('task_id', 'run_id'),)

    class Try(Base):
        __tablename__ = TRY
        try_id = Column('try_id', Integer, nullable=False)
        task_id = Column('task_id', Integer, nullable=False)
        run_id = Column('run_id', Text, nullable=False)
        block_id = Column('block_id', Text, nullable=True)
        hostname = Column('hostname', Text, nullable=True)
        task_executor = Column('task_executor', Text, nullable=False)
        task_try_time_launched = Column('task_try_time_launched', DateTime, nullable=True)
        task_try_time_running = Column('task_try_time_running', DateTime, nullable=True)
        task_try_time_returned = Column('task_try_time_returned', DateTime, nullable=True)
        task_fail_history = Column('task_fail_history', Text, nullable=True)
        task_joins = Column('task_joins', Text, nullable=True)
        __table_args__ = (PrimaryKeyConstraint('try_id', 'task_id', 'run_id'),)

    class Node(Base):
        __tablename__ = NODE
        id = Column('id', Integer, nullable=False, primary_key=True, autoincrement=True)
        run_id = Column('run_id', Text, nullable=False)
        hostname = Column('hostname', Text, nullable=False)
        uid = Column('uid', Text, nullable=False)
        block_id = Column('block_id', Text, nullable=False)
        cpu_count = Column('cpu_count', Integer, nullable=False)
        total_memory = Column('total_memory', Integer, nullable=False)
        active = Column('active', Boolean, nullable=False)
        worker_count = Column('worker_count', Integer, nullable=False)
        python_v = Column('python_v', Text, nullable=False)
        timestamp = Column('timestamp', DateTime, nullable=False)
        last_heartbeat = Column('last_heartbeat', DateTime, nullable=False)

    class Block(Base):
        __tablename__ = BLOCK
        run_id = Column('run_id', Text, nullable=False)
        executor_label = Column('executor_label', Text, nullable=False)
        block_id = Column('block_id', Text, nullable=False)
        job_id = Column('job_id', Text, nullable=True)
        timestamp = Column('timestamp', DateTime, nullable=False)
        status = Column("status", Text, nullable=False)
        __table_args__ = (PrimaryKeyConstraint('run_id', 'block_id',
                                               'executor_label', 'timestamp'),)

    class Resource(Base):
        __tablename__ = RESOURCE
        try_id = Column('try_id', Integer, sa.ForeignKey('try.try_id'), nullable=False)
        task_id = Column('task_id', Integer, sa.ForeignKey('task.task_id'), nullable=False)
        run_id = Column('run_id', Text, sa.ForeignKey('workflow.run_id'), nullable=False)
        timestamp = Column('timestamp', DateTime, nullable=False)
        resource_monitoring_interval = Column('resource_monitoring_interval', Float, nullable=True)
        psutil_process_pid = Column('psutil_process_pid', Integer, nullable=True)
        psutil_process_memory_percent = Column('psutil_process_memory_percent', Float, nullable=True)
        psutil_process_children_count = Column('psutil_process_children_count', Float, nullable=True)
        psutil_process_time_user = Column('psutil_process_time_user', Float, nullable=True)
        psutil_process_time_system = Column('psutil_process_time_system', Float, nullable=True)
        psutil_process_memory_virtual = Column('psutil_process_memory_virtual', Float, nullable=True)
        psutil_process_memory_resident = Column('psutil_process_memory_resident', Float, nullable=True)
        psutil_process_disk_read = Column('psutil_process_disk_read', Float, nullable=True)
        psutil_process_disk_write = Column('psutil_process_disk_write', Float, nullable=True)
        psutil_process_status = Column('psutil_process_status', Text, nullable=True)
        __table_args__ = (PrimaryKeyConstraint('try_id', 'task_id', 'run_id', 'timestamp'),)
def __init__(
    self,
    image: str,
    namespace: str = "default",
    nodes_per_block: int = 1,
    init_blocks: int = 0,
    min_blocks: int = 0,
    max_blocks: int = 10,
    max_cpu: float = 2,
    max_mem: str = "500Mi",
    init_cpu: float = 1,
    init_mem: str = "250Mi",
    parallelism: float = 1,
    worker_init: str = "",
    pod_name: Optional[str] = None,
    user_id: Optional[str] = None,
    group_id: Optional[str] = None,
    run_as_non_root: bool = False,
    secret: Optional[str] = None,
    incluster_config: Optional[bool] = True,
    persistent_volumes: Optional[List[Tuple[str, str]]] = None,
) -> None:
    if persistent_volumes is None:
        persistent_volumes = []
    if not _kubernetes_enabled:
        raise OptionalModuleMissing(
            ["kubernetes"],
            "Kubernetes provider requires kubernetes module and config.",
        )
    if incluster_config:
        config.load_incluster_config()
    else:
        config.load_kube_config()

    self.namespace = namespace
    self.image = image
    self.nodes_per_block = nodes_per_block
    self.init_blocks = init_blocks
    # Kubernetes provider doesn't really know which pods by container to initialize
    # so best to set init_blocks to 0
    assert init_blocks == 0

    self.min_blocks = min_blocks
    self.max_blocks = max_blocks
    self.max_cpu = max_cpu
    self.max_mem = max_mem
    self.init_cpu = init_cpu
    self.init_mem = init_mem
    self.parallelism = parallelism
    self.worker_init = worker_init
    self.secret = secret
    self.incluster_config = incluster_config
    self.pod_name = pod_name
    self.user_id = user_id
    self.group_id = group_id
    self.run_as_non_root = run_as_non_root
    self.persistent_volumes = persistent_volumes

    self.kube_client = client.CoreV1Api()

    # Dictionary that keeps track of jobs, keyed on job_id
    self.resources_by_pod_name = {}
    # Dictionary that keeps track of jobs, keyed on task_type
    self.resources_by_task_type = {}
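A hedged instantiation sketch, assuming the enclosing class is the Kubernetes provider (referred to here as KubernetesProvider; the class name does not appear in the excerpt) and using placeholder values:

# Hedged usage sketch; 'KubernetesProvider' is assumed to be the enclosing class
# of the __init__ above, and the image and namespace are placeholders.
provider = KubernetesProvider(
    image="python:3.9",           # container image for worker pods (placeholder)
    namespace="parsl-workers",    # placeholder namespace
    incluster_config=False,       # load kube config from file instead of in-cluster config
    max_blocks=4,
)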