Example #1
0
async def ping(uri: str, message: str) -> None:
    """Send a PING command carrying ``message`` to the server at ``uri``.

    The single reply is decoded from JSON.  A non-error reply has its
    response field logged at info level; an error reply is logged at
    error level and then raised.

    :param uri: The location of the server
    :param message: The message you expect to see back
    :raises RuntimeError: If the server returns an error
    """
    async with websockets.connect(uri) as websocket:
        request = json.dumps({APIField.COMMAND: 'PING', APIField.REQUEST: message})
        await websocket.send(request)
        raw_reply = await websocket.recv()
        reply = json.loads(raw_reply)
        if reply[APIField.STATUS] != APIStatus.ERROR:
            # Success path: echo whatever the server sent back.
            LOGGER.info(reply[APIField.RESPONSE])
            return
        LOGGER.error(reply)
        raise RuntimeError(reply)
Example #2
0
async def request_generate_config(ws, config):
    """Send a GENERATE command with ``config`` over a websocket and log the outcome.

    One reply is read and decoded from JSON.  An error status is logged at
    error level; an OK status logs the generated pipeline name taken from
    the reply's response field.  Any other status is ignored.

    :param ws: The websocket URI to connect to
    :param config: The request payload sent with the GENERATE command
    """
    async with websockets.connect(ws) as websocket:
        payload = {
            APIField.COMMAND: 'GENERATE',
            APIField.REQUEST: config
        }
        await websocket.send(json.dumps(payload))

        reply = json.loads(await websocket.recv())
        status = reply[APIField.STATUS]
        if status == APIStatus.ERROR:
            LOGGER.error(reply)
        elif status == APIStatus.OK:
            LOGGER.info('Generated pipeline is called %s',
                        reply[APIField.RESPONSE])
Example #3
0
    def submit(self, task: Task) -> str:
        """Submit a multi-worker PyTorch elastic job Task

        ``task.num_gpus`` is overwritten with 1 before the pod spec is built.
        The job is created with ``task.num_workers`` worker replicas, and both
        ``minReplicas`` and ``maxReplicas`` are set to that same number.

        The rendezvous endpoint host is read from the ``PYTORCH_ELASTIC_ETCD_SVC``
        environment variable.  When that is unset or empty, the cluster IP of the
        service named ``etcd-service`` in the ``elastic-job`` namespace is used.

        :param task: The task definition
        :type task: Task
        :raises IndexError: If ``PYTORCH_ELASTIC_ETCD_SVC`` is not set and no service
            named ``etcd-service`` exists in the ``elastic-job`` namespace
        :return: A string handle name
        :rtype: str
        """
        secrets = self._reference_secrets(task)
        configmaps = self._generate_configmaps(task)
        task.num_gpus = 1
        pod_spec = task_to_pod_spec(task, container_name="pytorch-elasticjob", secrets=secrets, configmaps=configmaps)
        template_metadata = client.V1ObjectMeta(name=task.name)
        template = client.V1PodTemplateSpec(metadata=template_metadata, spec=pod_spec)

        worker_replica_spec = {
            'replicas': task.num_workers,
            'restartPolicy': PyTorchElasticJobHandler.EXIT_CODE,
            'template': template,
        }

        etcd_svc = getenv('PYTORCH_ELASTIC_ETCD_SVC')
        if not etcd_svc:
            LOGGER.warning("No environment variable set for etcd service, looking for first available in elastic-job namespace")
            api = client.CoreV1Api()
            candidates = [
                svc for svc in api.list_namespaced_service('elastic-job').items
                if svc.metadata.name == 'etcd-service'
            ]
            if not candidates:
                # Keep the exception type the bare ``[0]`` lookup used to raise, but say what
                # is actually missing instead of "list index out of range".
                raise IndexError(
                    "No service named 'etcd-service' found in namespace 'elastic-job'; "
                    "set PYTORCH_ELASTIC_ETCD_SVC to the etcd service address"
                )
            etcd_svc = candidates[0].spec.cluster_ip
        LOGGER.info("Using etcd service on %s:%d", etcd_svc, PyTorchElasticJobHandler.ETCD_PORT)

        spec = {
            'replicaSpecs': {'Worker': worker_replica_spec},
            'minReplicas': task.num_workers,
            'maxReplicas': task.num_workers,
            'rdzvEndpoint': f'{etcd_svc}:{PyTorchElasticJobHandler.ETCD_PORT}',
        }
        pytorch_job_spec = {
            'kind': PyTorchElasticJobHandler.NAME,
            'apiVersion': f'{PyTorchElasticJobHandler.GROUP}/{PyTorchElasticJobHandler.VERSION}',
            # generate_name (not name) is passed, so the returned handle is read back
            # from the created object rather than from the task.
            'metadata': client.V1ObjectMeta(generate_name=task.name),
            'spec': spec,
        }

        pytorch_job = self.api.create_namespaced_custom_object(
            PyTorchElasticJobHandler.GROUP,
            PyTorchElasticJobHandler.VERSION,
            self.namespace,
            PyTorchElasticJobHandler.PLURAL,
            pytorch_job_spec,
        )
        return pytorch_job['metadata']['name']
Example #4
0
def create_user_http(url: str, jwt_token: str, username: str, password: str,
                     firstname: str, lastname: str) -> None:
    """Create or update a user over HTTP

    The user is first looked up with a GET.  If the lookup does not return
    200, the user is created with a POST and the function returns.  If the
    lookup returns 200, the existing user is updated with a PUT.  Any
    exception raised during the lookup/create phase (including a 401 on the
    lookup or a non-200 on the create) is logged and the function returns
    without attempting the update.

    :param url: the base URL
    :param jwt_token: The JWT token representing this authentication
    :param username: The user ID
    :param password: The updated password
    :param firstname: The firstname
    :param lastname: The lastname
    """
    user = {"username": username, "password": password}
    # Optional name fields are only sent when they are non-empty.
    for field, value in (('firstname', firstname), ('lastname', lastname)):
        if value:
            user[field] = value
    headers = {'Authorization': f'Bearer {jwt_token}'}

    try:
        lookup = requests.get(f'{url}/v1/users/{username}')
        lookup_status = lookup.status_code
        if lookup_status == 401:
            raise ValueError("Invalid login")
        if lookup_status == 200:
            existing = lookup.json()
            LOGGER.info("Found existing user")
            LOGGER.info(json.dumps(existing))
        else:
            # No such user exists so do a POST
            created = requests.post(f'{url}/v1/users',
                                    headers=headers,
                                    json={"user": user})
            if created.status_code != 200:
                raise Exception(f"Failed to create user: {username}")
            created_body = created.json()
            LOGGER.info("Created new user")
            LOGGER.info(json.dumps(created_body))
            return
    except Exception as ex:
        LOGGER.error(ex)
        return

    # The user already existed: push the supplied fields as an update.
    updated = requests.put(f'{url}/v1/users/{username}',
                           json=user,
                           headers=headers)
    updated_body = updated.json()
    LOGGER.info(json.dumps(updated_body))
Example #5
0
async def schedule_pipeline(ws, work) -> None:
    """Use async to open a connection to serve.py and launch work

    Sends ``work`` as a START command, then keeps reading replies until one
    arrives whose status is ``APIStatus.END`` (the websocket stays open while
    waiting).  A reply with status ``APIStatus.ERROR`` is logged and ends the
    call early.  A response starting with ``PIPE_ID`` has its last
    space-separated token logged as the started pipeline id; any other
    response is logged as-is.

    :param ws: The websocket URI to connect to
    :param work: The request payload sent with the START command
    """
    async with websockets.connect(ws) as websocket:
        await websocket.send(
            json.dumps({
                APIField.COMMAND: 'START',
                APIField.REQUEST: work
            }))

        result = json.loads(await websocket.recv())
        while result[APIField.STATUS] != APIStatus.END:
            if result[APIField.STATUS] == APIStatus.ERROR:
                LOGGER.error(result)
                return

            response = result[APIField.RESPONSE]
            if response.startswith('PIPE_ID'):
                # Split the response string, not ``result``: ``result`` is the decoded
                # message indexed by field, and calling ``split`` on it fails.
                pipe_id = response.split(' ')[-1]
                LOGGER.info('Started %s', pipe_id)
            else:
                LOGGER.info(response)
            result = json.loads(await websocket.recv())