Example #1
0
 def on_error(self, ws, error):
     """Websocket error callback: mark the upload failed, log, close the socket.

     :param ws: websocket object the error occurred on; it is closed here.
     :param error: the error that was reported.
     :raises ClickException: re-raised when ``error`` is a ClickException
         (those are raised from ``on_message``); other errors are only logged.
     """
     self.STATE = SOCKET_STATE.FAILED
     error_text = str(error)
     russell_logger.debug(error_text)
     ws.close()
     if not isinstance(error, ClickException):
         return
     # Errors raised from on_message are propagated to the caller.
     raise error
Example #2
0
    def init(cls):
        """Write the default ignore list to ``cls.CONFIG_FILE_PATH`` if absent.

        When a file already exists at that path it is left untouched and
        only a debug message is logged.
        """
        ignore_path = cls.CONFIG_FILE_PATH
        if not os.path.isfile(ignore_path):
            logger.debug("Setting default ch ignore in the file {}".format(ignore_path))
            with open(ignore_path, "w") as ignore_file:
                ignore_file.write(DEFAULT_FILE_IGNORE_LIST)
            return

        logger.debug("cl ignore file already present at {}".format(ignore_path))
Example #3
0
    def check_response_status(self, response):
        """
        Check if response is successful. Else raise Exception.

        A response whose Content-Type is neither ``application/json`` nor
        ``text/html`` (or is missing entirely) is treated as a stream and
        returned unchanged without any further checks.

        :param response: response object exposing ``headers``,
            ``status_code``, ``content`` and ``json()``.
        :return: the untouched ``response`` for stream content, otherwise
            the decoded JSON body (a dict).
        :raises AuthenticationException: HTTP status or body ``code`` is 401.
        :raises NotFoundException: HTTP status or body ``code`` is 404.
        :raises BadRequestException: body ``code`` is 400.
        :raises InvalidResponseException: any other non-2xx status/code, a
            body that is not valid JSON, or a JSON body that is not an object.
        """
        # Handle streaming responses.
        content_type = response.headers.get('Content-Type')
        # A missing header yields None; fall back to '' so the substring
        # test cannot raise TypeError.
        searchable = content_type or ''
        if not any(ct in searchable for ct in ('application/json', 'text/html')):
            logger.debug("Content-Type is {}".format(content_type))
            return response
        logger.debug("Http status code: {}".format(response.status_code))
        # Handle standard HTTP error codes.
        if not (200 <= response.status_code < 300):
            if response.status_code == 401:
                raise AuthenticationException()
            elif response.status_code == 404:
                raise NotFoundException()
            else:
                raise InvalidResponseException()
        try:
            resp_json = response.json()
        except Exception as e:
            logger.debug(str(e))
            raise InvalidResponseException()

        # The code below reads the body as a mapping; a JSON array/scalar
        # would otherwise fail with an unrelated AttributeError.
        if not isinstance(resp_json, dict):
            logger.debug("Unexpected JSON body type: {}".format(type(resp_json).__name__))
            raise InvalidResponseException()

        # Handle the application-level error code carried in the body.
        # A body without "code" is treated as an error (defaults to 500).
        code = resp_json.get("code", 500)
        if not (200 <= code < 300):
            message = resp_json.get("data")
            logger.debug("Error received : status_code: {}, message: {}"
                         .format(code, message or response.content))
            if code == 404:
                raise NotFoundException()
            elif code == 401:
                raise AuthenticationException()
            elif code == 400:
                raise BadRequestException()
            else:
                raise InvalidResponseException()
        return resp_json
Example #4
0
    def download(self, url, filename, timeout=10, api_version=1):
        """
        Download the file from the given url at the current path

        The body is streamed to ``filename`` in 1024-byte chunks.

        :param url: path appended to the API base url by ``self.request``.
        :param filename: local path the downloaded bytes are written to.
        :param timeout: request timeout forwarded to ``self.request``.
        :param api_version: API version forwarded to ``self.request``.
        :return: ``filename`` once the download has completed.

        Exits the process (``sys.exit``) when the server cannot be reached.
        """
        logger.debug("Downloading file from url: {}".format(url))

        try:
            response = self.request(method='GET',
                                    url=url,
                                    stream=True,
                                    timeout=timeout,
                                    api_version=api_version)
            try:
                self.check_response_status(response)
                with open(filename, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        # Skip empty chunks.
                        if chunk:
                            f.write(chunk)
            finally:
                # A streamed response keeps its connection checked out until
                # it is fully consumed or closed; close it explicitly so a
                # failure while writing the file does not leak it.
                response.close()
            return filename
        except requests.exceptions.ConnectionError as exception:
            logger.debug("Exception: {}".format(exception))
            sys.exit("Cannot connect to the Russell server. Check your internet connection.")
Example #5
0
    def on_message(self, ws, message):
        """Websocket message callback driving the upload state machine.

        ``message`` is parsed as JSON and its ``code`` field decides what
        happens next:

        * ``200`` while in ``SOCKET_STATE.INIT``: switch to ``UPLOADING`` and
          start a background thread that sends ``self.FILE_NAME``.
        * ``200`` in any other state: switch to ``FINISH`` and close ``ws``.
        * ``522``: switch to ``FAILED`` and raise ``OverPermissionException``.
        * anything else: switch to ``FAILED`` and raise
          ``ServiceBusyException``.

        :param ws: websocket object; ``ws.header`` is read for the ``size``
            entry and ``ws.sock`` is used to send binary frames.
        :param message: JSON text received from the server.
        """
        russell_logger.debug(ws.header)
        russell_logger.debug(message)

        def start_sending(*args):
            """Send ``self.FILE_NAME`` over the socket in 1 MiB binary frames."""
            with open(self.FILE_NAME, 'rb') as f:
                # with progressbar.ProgressBar(maxval=int(ws.header.get('size', 0))) as bar:
                bar = progressbar.ProgressBar(maxval=int(ws.header.get('size', 0))).start()
                try:
                    total_uploaded_size = 0
                    block_size = 1024 * 1024
                    msg = f.read(block_size)
                    while msg:
                        total_uploaded_size += len(msg)
                        ws.sock.send_binary(msg)
                        msg = f.read(block_size)
                        bar.update(total_uploaded_size)
                except Exception as e:
                    # The upload stays best-effort (the worker thread must not
                    # die with a traceback), but the reason is now recorded
                    # instead of being silently discarded.
                    russell_logger.debug('Upload interrupted: {}'.format(e))

        russell_logger.debug('received {}'.format(message))
        resp_json = json.loads(message)
        code = resp_json.get('code')
        if code == 200:  # to be modified
            if self.STATE == SOCKET_STATE.INIT:
                self.STATE = SOCKET_STATE.UPLOADING
                russell_logger.info('Start uploading...')
                _thread.start_new_thread(start_sending, ())
            else:
                self.STATE = SOCKET_STATE.FINISH
                ws.close()
        elif code == 522:
            self.STATE = SOCKET_STATE.FAILED
            raise OverPermissionException()
        else:
            self.STATE = SOCKET_STATE.FAILED
            raise ServiceBusyException()
Example #6
0
    def request(self, method, url, params=None, data=None, json=None, files=None, access_token=None,
                auth=None, timeout=5, stream=False, api_version=1):
        """
        Send an HTTP request via the requests library and validate the reply.

        The target is ``self.base_url`` formatted with ``api_version`` plus
        ``url``. An ``Authorization: Basic ...`` header is built from
        ``access_token`` when given; otherwise, unless ``auth`` is supplied,
        from ``self.access_token.token``. All remaining arguments are
        forwarded to ``requests.request``.

        :return: whatever ``self.check_response_status`` returns.

        Exits the process (``sys.exit``) on a connection error or when the
        response check raises.
        """
        request_url = self.base_url.format(api_version) + url
        logger.debug("Starting request to url: {} with params: {}, data: {}".format(request_url, params, data))

        if access_token:
            headers = {"Authorization": "Basic {}".format(access_token)}
        elif auth:
            # Explicit auth supplied: send no Authorization header of our own.
            headers = {}
        else:
            stored_token = self.access_token
            headers = {"Authorization": "Basic {}".format(stored_token.token if stored_token else None)}

        request_kwargs = dict(
            method=method,
            url=request_url,
            params=params,
            headers=headers,
            data=data,
            json=json,
            files=files,
            timeout=timeout,
            stream=stream,
            auth=auth,
        )
        try:
            response = requests.request(**request_kwargs)
        except requests.exceptions.ConnectionError:
            sys.exit("Cannot connect to the Russell server. Check your internet connection.")

        try:
            if stream:
                logger.debug('HTTP Stream Request/Response...')
            else:
                logger.debug("Response Content: {}, Headers: {}".format(response.content, response.headers))
            return self.check_response_status(response)
        except Exception as e:
            sys.exit(str(e))
Example #7
0
 def set_config(cls, experiment_config):
     """Persist ``experiment_config`` as JSON in ``cls.CONFIG_FILE_PATH``.

     The dict form returned by ``experiment_config.to_dict()`` is what gets
     written, i.e. the same value that is logged.

     :param experiment_config: config object exposing ``to_dict()``.
     """
     config_dict = experiment_config.to_dict()
     russell_logger.debug("Setting {} in the file {}".format(config_dict,
                                                             cls.CONFIG_FILE_PATH))
     # Serialize before opening: mode "w" truncates the file, so a
     # serialization failure must not wipe an existing config.
     serialized = json.dumps(config_dict)
     with open(cls.CONFIG_FILE_PATH, "w") as config_file:
         config_file.write(serialized)
Example #8
0
 def set_config(cls, data_config):
     """Write the dict form of ``data_config`` as JSON to ``cls.CONFIG_FILE_PATH``.

     :param data_config: config object exposing ``to_dict()``.
     """
     debug_text = "Setting {} in the file {}".format(data_config.to_dict(), cls.CONFIG_FILE_PATH)
     logger.debug(debug_text)
     with open(cls.CONFIG_FILE_PATH, "w") as out_file:
         serialized = json.dumps(data_config.to_dict())
         out_file.write(serialized)
Example #9
0
def run(resubmit, command, env, jupyter, tensorboard, data, version, message, os, cputype, cpunum, gputype, gpunum,
        memtype, memnum, eager, value, earliest, deadline, duration):
    '''
    Create a job specification on the server and submit a job request for it.

    Steps performed by the active code: build an ExperimentClient, optionally
    run the resubmit branch, validate the message length and the requested
    resources, create a JobSpecification, print a summary table, submit a
    JobReq, and finally (if requested) report the Jupyter / TensorBoard URLs.

    :param resubmit: when ``True`` the resubmit branch runs first (see the
        NOTE(review) comments in that branch).
    :param command: iterable of strings, joined with single spaces.
    :param env: framework for the job; when falsy, the project's
        ``default_env`` is looked up through ProjectClient.
    :param jupyter: passed as ``enable_jupyter``; when ``True`` the function
        waits for the Jupyter URL and opens it in a browser.
    :param tensorboard: passed as ``enable_tensorboard``; when ``True`` the
        TensorBoard URL is logged.
    :param data: not used by the active code (only by commented-out code).
    :param version: only used in the printed job name and summary table.
    :param message: job message; rejected when longer than 1024 characters.
    :param os: only passed to ``validate_resource_list``; the job
        specification itself hard-codes ``"ubuntu:16"``.
    :param cputype: validated, then passed to JobSpecification.
    :param cpunum: validated, then passed to JobSpecification.
    :param gputype: validated, then passed to JobSpecification.
    :param gpunum: validated, then passed to JobSpecification.
    :param memtype: passed to JobSpecification.
    :param memnum: passed to JobSpecification.
    :param eager: not used by this function.
    :param value: passed to JobReq as ``value``.
    :param earliest: passed to JobReq as ``tw_start``.
    :param deadline: passed to JobReq as ``tw_end``.
    :param duration: passed to JobReq as ``duration``.
    :return: None (every exit path is a bare ``return`` or falls off the end).
    '''
    """
    """
    # Initialize the client
    try:
        ec = ExperimentClient()
    except Exception as e:
        logger.error(str(e))
        return
    if resubmit is True:
        # Only the bidding-related parameters matter here
        # Intended source (per the original author's note): read, from the
        # local config file or the server, the details of the job whose last
        # bid failed (a previously successful one from the local config file
        # would also do).
        # NOTE(review): jobSpec is still an empty placeholder, so the lookup
        # on the next line raises KeyError whenever resubmit is True.
        jobSpec = {}
        jobId = jobSpec["id"]
        # Submit the job request
        jobReq = JobReq(duration=duration, tw_end=deadline, tw_start=earliest, job_id=jobId, value=value,
                        resources=jobSpec["resources"])
        resp = ec.submit(jobId, jobReq)
        if resp["accepted"] == False:
            logger.info("This job submit is not accepted, reason: {}".format(resp["message"]))
            return
        # NOTE(review): there is no return on the accepted path, so execution
        # continues below and creates/submits a new job as well - confirm
        # whether that is intended.
    # Check the length of the message
    if message and len(message) > 1024:
        logger.error("Message body length over limit")
        return

    # Get the authentication token
    access_token = AuthConfigManager.get_access_token()
    # Read the local job configuration
    experiment_config = ExperimentConfigManager.get_config()

    # Join the command parts into one space-separated string
    command_str = ' '.join(command)
    # # Handle mounted datasets
    # success, data_ids = process_data_ids(data)
    # if not success:
    #     return

    # Handle the deep-learning framework setting
    if not env:
        # Not specified: use the default framework of the project this job
        # belongs to as the framework for this job
        env = ProjectClient().get_project_info_by_id(experiment_config["project_id"]).get('default_env')

    # Check whether the combination of requested resources is valid
    if not validate_resource_list(env, jupyter, tensorboard, os, cputype, cpunum, gputype, gpunum):
        return

    # Upload code to the cloud or reference code that is already there
    # # If a code version was specified
    # if version:
    #     module_resp = ModuleClient().get_by_entity_id_version(experiment_config.project_id, version)
    #     if not module_resp:
    #         logger.error("Remote project does not existed")
    #         return
    #     module_id = module_resp.get('id')
    # else:
    #     # Gen temp dir
    #     try:
    #         # upload_files, total_file_size_fmt, total_file_size = get_files_in_directory('.', 'code')
    #         # save_dir(upload_files, _TEMP_DIR)
    #         file_count, size = get_files_in_current_directory('code')
    #         if size > 100 * 1024 * 1024:
    #             sys.exit("Total size: {}. "
    #                      "Code size too large to sync, please keep it under 100MB."
    #                      "If you have data files in the current directory, please upload them "
    #                      "separately using \"russell data\" command and remove them from here.\n".format(
    #                 sizeof_fmt(size)))
    #         copy_files('.', _TEMP_DIR)
    #     except OSError:
    #         sys.exit("Directory contains too many files to upload. Add unused directories to .russellignore file.")
    #         # logger.info("Creating project run. Total upload size: {}".format(total_file_size_fmt))
    #         # logger.debug("Creating module. Uploading: {} files".format(len(upload_files)))
    #
    #     hash_code = dirhash(_TEMP_DIR)
    #     logger.debug("Checking MD5 ...")
    #     module_resp = ModuleClient().get_by_codehash_entity_id(hash_code, experiment_config.project_id)
    #     if module_resp:  # if code same with older version, use existed, don`t need upload
    #         module_id = module_resp.get('id')
    #         version = module_resp.get('version')
    #         logger.info("Use older version-{}.".format(version))
    #     else:
    #         version = experiment_config.version
    #         # Create module
    #         module = Module(name=experiment_config.name,
    #                         description=message,
    #                         family_id=experiment_config.family_id,
    #                         version=version,
    #                         module_type="code",
    #                         entity_id=experiment_config.project_id
    #                         )
    #         module_resp = mc.create(module)
    #         if not module_resp:
    #             logger.error("Remote project does not existed")
    #             return
    #         version = module_resp.get('version')
    #         experiment_config.set_version(version=version)
    #         ExperimentConfigManager.set_config(experiment_config)
    #
    #         module_id = module_resp.get('id')
    #         project_id = module_resp.get('entity_id')
    #         if not project_id == experiment_config.project_id:
    #             logger.error("Project conflict")
    #
    #         logger.debug("Created module with id : {}".format(module_id))
    #
    #         # Upload code to fs
    #         logger.info("Syncing code ...")
    #         fc = FsClient()
    #         try:
    #             fc.socket_upload(file_type="code",
    #                              filename=_TEMP_DIR,
    #                              access_token=access_token.token,
    #                              file_id=module_id,
    #                              user_name=access_token.username,
    #                              data_name=experiment_config.name)
    #         except Exception as e:
    #             shutil.rmtree(_TEMP_DIR)
    #             logger.error("Upload failed: {}".format(str(e)))
    #             return
    #         else:
    #             ### check socket state, some errors like file-server down, cannot be catched by `except`
    #             state = fc.get_state()
    #             if state == SOCKET_STATE.FAILED:
    #                 logger.error("Upload failed, please try after a while...")
    #                 return
    #         finally:
    #             try:
    #                 shutil.rmtree(fc.temp_dir)
    #             except FileNotFoundError:
    #                 pass
    #
    #         ModuleClient().update_codehash(module_id, hash_code)
    #         logger.info("\nUpload finished")
    #
    #     # rm temp dir
    #     shutil.rmtree(_TEMP_DIR)
    #     logger.debug("Created code with id : {}".format(module_id))

    # Create the job specification.
    # code_id and data_ids are empty because the upload / dataset steps above
    # are commented out; the os value is hard-coded rather than taken from
    # the `os` argument.
    jobSpecification = JobSpecification(message=message, code_id="", data_ids=[],
                                        command=command_str,
                                        project_id=experiment_config["project_id"],
                                        framework=env,
                                        enable_jupyter=jupyter,
                                        enable_tensorboard=tensorboard,
                                        os="ubuntu:16",
                                        gpunum=gpunum,
                                        gputype=gputype,
                                        cpunum=cpunum,
                                        cputype=cputype,
                                        memnum=memnum,
                                        memtype=memtype)
    # Submit the job specification; the server stores it
    jobId = ec.create(jobSpecification)
    logger.debug("Created job specification : {}".format(jobId))

    # # Update the local job configuration
    # experiment_config.set_experiment_predecessor(experiment_id)
    # ExperimentConfigManager.set_config(experiment_config)

    # Print the job description
    experiment_name = "{}/{}:{}".format(access_token.username,
                                        experiment_config["project_id"],
                                        version)

    table_output = [["JOB ID", "NAME", "VERSION"],
                    [jobId, experiment_name, version]]
    logger.info(tabulate(table_output, headers="firstrow"))
    logger.info("")

    # Submit the job request
    jobReq = JobReq(duration=duration, tw_end=deadline, tw_start=earliest, job_id=jobId, value=value,
                    resources=jobSpecification.resources)
    resp = ec.submit(jobId, jobReq)
    if resp["accepted"] == False:
        logger.info("This job submit is not accepted, reason: {}".format(resp["message"]))
        return

    # After the job has been submitted successfully, handle jupyter/tensorboard
    task_url = {}
    if jupyter is True:
        # Poll once per second, with no upper bound, until the job has left
        # the "waiting" state and has task instances.
        while True:
            # Wait for the experiment / task instances to become available
            try:
                experiment = ec.get(jobId)
                if experiment.state != "waiting" and experiment.task_instances:
                    break
            except Exception as e:
                # Any failure of the lookup is treated as "not ready yet".
                logger.debug("Experiment not available yet: {}".format(jobId))

            logger.debug("Experiment not available yet: {}".format(jobId))
            sleep(1)
            continue

        task_url = ec.get_task_url(jobId)
        jupyter_url = task_url["jupyter_url"]
        print("Setting up your instance and waiting for Jupyter notebook to become available ...")
        if wait_for_url(jupyter_url, sleep_duration_seconds=2, iterations=900):
            logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            webbrowser.open(jupyter_url)
        else:
            # URL did not respond in time: still print it, but do not open a browser.
            logger.info("\nPath to jupyter notebook: {}".format(jupyter_url))
            logger.info(
                "Notebook is still loading or can not be connected now. View logs to track progress")

    if tensorboard is True:
        # Reuse the task URLs fetched for Jupyter when they already contain
        # a tensorboard entry; otherwise fetch them now.
        if not task_url.get("tensorboard_url"):
            task_url = ec.get_task_url(jobId)
        tensorboard_url = task_url["tensorboard_url"]
        logger.info("\nPath to tensorboard: {}".format(tensorboard_url))

    logger.info("""
        To view logs enter:
            ch logs {}
                """.format(jobId))
Example #10
0
 def on_open(self, ws):
     """Websocket open callback: only logs a debug message.

     :param ws: websocket object (not used here).
     """
     opened_note = 'setup connection to server'
     russell_logger.debug(opened_note)
Example #11
0
 def on_close(self, ws):
     """Websocket close callback: call ``self.clear_archive()``, then log.

     :param ws: websocket object (not used here).
     """
     closed_note = 'close connection to server'
     self.clear_archive()
     russell_logger.debug(closed_note)
Example #12
0
 def set_access_token(cls, access_token):
     """Write the dict form of ``access_token`` as JSON to ``cls.CONFIG_FILE_PATH``.

     :param access_token: token object exposing ``to_dict()``.
     """
     debug_text = "Setting {} in the file {}".format(access_token.to_dict(), cls.CONFIG_FILE_PATH)
     russell_logger.debug(debug_text)
     with open(cls.CONFIG_FILE_PATH, "w") as token_file:
         serialized = json.dumps(access_token.to_dict())
         token_file.write(serialized)