Ejemplo n.º 1
0
 def _monitor_hanging_pods_daemon(self):
     """
     Poll the namespace forever and clean up pods stuck in ``ImagePullBackOff``.

     On every cycle the pods of ``self.namespace`` are listed with
     ``kubectl get pods -o=JSON``. For each pod whose first container status
     is ``waiting`` with reason ``ImagePullBackOff``:

     1. the pod is deleted with ``kubectl delete pods``;
     2. the task whose id is the text after the last ``-`` in the pod name
        is marked as failed through the session API client.

     The loop sleeps ``self._polling_interval`` seconds between cycles and
     never returns. Unparsable or unexpectedly shaped kubectl output is
     logged / skipped instead of terminating the loop.
     """
     while True:
         output = get_bash_output(
             'kubectl get pods -n {namespace} -o=JSON'.format(
                 namespace=self.namespace))
         # the helper's result may be empty/None, text or raw bytes
         if not output:
             output = ''
         elif not isinstance(output, str):
             output = output.decode('utf-8')

         try:
             output_config = json.loads(output)
             # valid JSON is not necessarily an object (e.g. `null`), and
             # "items" may be null; treat both as a failed/empty listing
             pods = output_config.get('items') or []
         except Exception as ex:
             self.log.warning(
                 'K8S Glue pods monitor: Failed parsing kubectl output:\n{}\nEx: {}'
                 .format(output, ex))
             sleep(self._polling_interval)
             continue

         for pod in pods:
             try:
                 # walk pod['status']['containerStatuses'][0]['state']['waiting']['reason']
                 reason = functools.reduce(
                     lambda a, b: a[b],
                     ('status', 'containerStatuses', 0, 'state', 'waiting', 'reason'),
                     pod)
             except (IndexError, KeyError, TypeError):
                 # missing key, empty list, or a null / non-container value
                 # somewhere on the path: this pod is not waiting, skip it
                 continue
             if reason != 'ImagePullBackOff':
                 continue

             pod_name = (pod.get('metadata') or {}).get('name')
             if not pod_name:
                 continue

             # the pod name ends with "-<task id>"
             task_id = pod_name.rpartition('-')[-1]
             delete_pod_cmd = 'kubectl delete pods {} -n {}'.format(
                 pod_name, self.namespace)
             get_bash_output(delete_pod_cmd)
             try:
                 self._session.api_client.tasks.failed(
                     task=task_id,
                     status_reason="K8S glue error due to ImagePullBackOff",
                     status_message="Changed by K8S glue",
                     force=True)
             except Exception as ex:
                 # best effort: a task that cannot be updated must not stop
                 # the monitor (log text kept as-is for log compatibility)
                 self.log.warning(
                     'K8S Glue pods monitor: Failed deleting task "{}"\nEX: {}'
                     .format(task_id, ex))
         sleep(self._polling_interval)
Ejemplo n.º 2
0
    def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
        """
        :summary: Poll the given queues forever and run the tasks they yield.
        :description: Each cycle walks ``queues`` in order (``queues[0]`` has the highest priority).
                      Before a queue is polled, the kubectl cleanup command for old pods is executed.
                      The first queue that returns a task gets that task run, and the scan restarts
                      from the first queue. If no queue returned a task, the loop sleeps for the
                      polling interval. After every cycle the configuration is reloaded when
                      ``agent.reload_config`` is set.
        :param queues: IDs of queues to pull tasks from
        :type queues: list of ``Text``
        :param worker_params: Worker command line arguments, handed to ``run_one_task``
        :type worker_params: ``clearml_agent.helper.process.WorkerParams``
        """
        log_sender = self.get_service(Events)

        # the k8s pending queue must exist; creating it is best effort
        # (any failure here is deliberately ignored)
        # noinspection PyBroadException
        try:
            self._session.api_client.queues.create(self.k8s_pending_queue_name)
        except Exception:
            pass
        # swap the queue name for its resolved id
        self.k8s_pending_queue_name = self._resolve_name(self.k8s_pending_queue_name, "queues")

        # NOTE(review): assigned but never read inside this method
        _last_machine_update_ts = 0
        while True:
            task_was_run = False

            # priority scan: earlier queues win
            for queue in queues:
                # clean up old completed / failed pods
                cleanup_cmd = self.KUBECTL_DELETE_CMD.format(namespace=self.namespace)
                get_bash_output(cleanup_cmd)

                # ask the queue for its next task
                try:
                    response = self._session.api_client.queues.get_next_task(queue=queue)
                except Exception as e:
                    print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
                    continue

                # a response without an entry/task means the queue is empty
                try:
                    task_id = response.entry.task
                except AttributeError:
                    print("No tasks in queue {}".format(queue))
                    continue

                log_sender.send_log_events(
                    self.worker_id,
                    task_id=task_id,
                    lines="task {} pulled from {} by worker {}".format(
                        task_id, queue, self.worker_id
                    ),
                    level="INFO",
                )

                busy_report = ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id)
                self.report_monitor(busy_report)
                self.run_one_task(queue, task_id, worker_params)
                idle_report = ResourceMonitor.StatusReport(queues=self.queues)
                self.report_monitor(idle_report)

                # restart the scan from the highest priority queue
                task_was_run = True
                break

            if not task_was_run:
                # nothing to do in any queue: sleep and retry polling
                print("No tasks in Queues, sleeping for {:.1f} seconds".format(self._polling_interval))
                sleep(self._polling_interval)

            if self._session.config["agent.reload_config"]:
                self.reload_config()
def old_replace(session, line):
    """
    Patch one requirements line so a CUDA-based package pins a suffixed build.

    The suffix is the first value returned by ``old_get_suffix(session)``.
    For every name in ``_cuda_based_packages_hack`` that appears in the line:

    * direct ``http(s)://`` links get the suffix inserted before the first
      ``-cp`` that follows the package name;
    * pypi style specs are rewritten to ``<package>==<version><suffix>``.
      When the line carries no version, the version is looked up with
      ``pip search`` (using ``Worker._pip_extra_index_url`` if set).

    The line is returned unchanged when the suffix cannot be resolved, when
    it is a comment, when it mentions none of the packages, or when no
    purely numeric version could be determined for it.

    :param session: forwarded as-is to ``old_get_suffix``
    :param line: a single requirements line
    :return: the patched line, or the original line when nothing was patched
    """
    try:
        cuda_ver_suffix, _cuda_ver, _cuda_cudnn_ver = old_get_suffix(session)
    except Exception:
        # best effort: without a suffix there is nothing to patch
        return line
    if line.lstrip().startswith('#'):
        return line

    def numeric_parts(version):
        # keep only the dot separated parts that start with a digit
        return '.'.join(
            v for v in version.split('.') if v and '0' <= v[0] <= '9')

    # every version-spec separator becomes a space before tokenizing
    separators_to_space = str.maketrans('=<>;!', '     ')

    for package_name in _cuda_based_packages_hack:
        if package_name not in line:
            continue
        try:
            line_lstrip = line.lstrip()
            if line_lstrip.startswith(('http://', 'https://')):
                pos = line.find(package_name) + len(package_name)
                # patch line with specific version
                line = line[:pos] + \
                    line[pos:].replace('-cp', cuda_ver_suffix + '-cp', 1)
            else:
                # this is a pypi package
                tokens = line.translate(separators_to_space).split()
                if not tokens or package_name != tokens[0]:
                    # the name only appears inside another token,
                    # so this is not the package being looked for
                    continue

                version_number = None
                if len(tokens) > 1:
                    # get the package version info
                    test_version_number = tokens[1]
                    # only plain numeric versions are accepted,
                    # i.e. nothing containing post/dev parts
                    version_number = numeric_parts(test_version_number)
                    if version_number != test_version_number:
                        raise ValueError()

                # we have no version, but we have to have one
                if not version_number:
                    # get the latest version from the extra index list
                    pip_search_cmd = ['pip', 'search']
                    if Worker._pip_extra_index_url:
                        pip_search_cmd.extend(
                            chain.from_iterable(
                                ('-i', x)
                                for x in Worker._pip_extra_index_url))
                    pip_search_cmd += [package_name]
                    pip_search_output = get_bash_output(
                        ' '.join(pip_search_cmd), strip=True)
                    # a failed or empty search, or output that does not list
                    # the package, must fall back to the original line
                    # rather than escape as AttributeError / IndexError
                    if not pip_search_output:
                        raise ValueError()
                    around_name = pip_search_output.split(package_name)
                    if len(around_name) < 2:
                        raise ValueError()
                    candidates = around_name[1].replace('(', ' ').replace(
                        ')', ' ').split()
                    if not candidates:
                        raise ValueError()
                    version_number = numeric_parts(candidates[0])
                    if not version_number:
                        # somewhere along the way we failed
                        raise ValueError()

                package_name_version = package_name + '==' + version_number + cuda_ver_suffix
                if version_number in line:
                    # make sure we have the specific version not >=
                    tokens = line.split(';')
                    line = ';'.join([package_name_version] + tokens[1:])
                else:
                    # add version to the package_name
                    line = line.replace(package_name, package_name_version, 1)
        except ValueError:
            # no usable version for this package: keep the line as it is
            continue
    return line