Example #1
def main(arg_list=None):
    utils.init_logging(constants.FN_XT_EVENTS, logger, "XT Demo")

    args = parse_args(arg_list)
    auto_mode = args.auto
    nomonitor = args.nomonitor
    nogui = args.nogui
    quick_test = args.quick_test
    philly = args.philly

    build_cmds(auto_mode, quick_test, nomonitor, nogui, philly=philly)

    steps = parse_steps(args.steps)
    response = ""

    if not auto_mode:
        print()
        print("This demonstrates how to run common XT commands")
        print("Press ENTER to execute each command (or s=SKIP, b=BACK, q=QUIT)")
        print()

        print("hit any key to continue: ", end="", flush=True)
        response = wait_for_any_key(auto_mode)

    if response != "q":
        navigate(cmds, auto_mode, steps)

    # clean-up
    file_utils.ensure_dir_deleted(ARCHIVES_DIR)

    print("end of xt_demo")

    return cmd_count
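The demo driver above leans on helpers such as parse_steps and wait_for_any_key that are defined elsewhere in xt_demo. As a rough sketch of what a steps parser like this might look like (the real xt_demo helper may differ), a spec such as "1,3-5,8" could be expanded into a list of step numbers:

# hypothetical sketch of a parse_steps-style helper; the actual
# xt_demo implementation may differ
def parse_steps(spec):
    """Expand a spec like '1,3-5,8' into [1, 3, 4, 5, 8]; None means all steps."""
    if not spec:
        return None
    steps = []
    for part in spec.split(","):
        if "-" in part:
            first, last = part.split("-")
            steps.extend(range(int(first), int(last) + 1))
        else:
            steps.append(int(part))
    return steps

assert parse_steps("1,3-5,8") == [1, 3, 4, 5, 8]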
Example #2
def main():
    # init environment
    config = xt_config.get_merged_config()
    file_utils.ensure_dir_exists(TEST_DIR)

    with DirChange(TEST_DIR):
        tester = StorageProviderTests()

        tester.test_impl("xtsandboxstorage")
        tester.test_impl("filestorage")
    
    file_utils.ensure_dir_deleted(TEST_DIR)
    return tester._assert_count
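DirChange is used here as a context manager that runs the storage tests from inside TEST_DIR and then restores the original working directory. A minimal sketch of such a context manager (the xtlib class may add error handling) looks like:

import os

# minimal sketch of a DirChange-style context manager; the real
# xtlib class may differ in details
class DirChange:
    def __init__(self, new_dir):
        self.new_dir = new_dir
        self.old_dir = None

    def __enter__(self):
        self.old_dir = os.getcwd()
        os.chdir(self.new_dir)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # restore the original directory even if the body raised
        os.chdir(self.old_dir)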
Example #3
def cleanup():
    set_test_group("cleanup...")

    file_utils.ensure_dir_deleted("upload_testing")
    file_utils.ensure_dir_deleted("download_testing")

    # check for errors in runs
    text = xt("xt list runs --status=error", capture_output=True)
    # print("\nruns with errors:")
    # print(text)

    if not "no matching runs found" in text[0]:
        errors.internal_error("quick-test: above 'list runs' contains errors")
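The xt() helper used above runs an XT command line and, with capture_output=True, returns its output so the caller can scan it (here, text[0] holds the captured text). A standalone subprocess-based wrapper along these lines would behave similarly; this is a hypothetical sketch, not the quick-test implementation:

import subprocess

# hypothetical stand-in for the quick-test xt() helper
def xt(cmd, capture_output=False):
    # naive split; quoted arguments would need shlex.split
    result = subprocess.run(cmd.split(), capture_output=capture_output, text=True)
    if capture_output:
        # return (stdout, returncode) so callers can scan the text
        return result.stdout, result.returncode
    return None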
Example #4
    def create_demo(self, destination, response, overwrite):
        '''
        This command removes the specified destination directory if it exists (prompting the user for approval).
        Specifying the current directory as the destination will produce an error.
        '''

        # set up from_dir
        from_dir = file_utils.get_xtlib_dir() + "/demo_files"

        # set up dest_dir
        dest_dir = destination
        if not dest_dir:
            errors.syntax_error("An output directory must be specified")

        create = True
        console.print("creating demo files at: {}".format(
            os.path.abspath(dest_dir)))

        if os.path.exists(dest_dir):
            answer = pc_utils.input_response(
                "'{}' already exists; OK to delete? (y/n): ".format(dest_dir),
                response)
            if answer != "y":
                create = False

        if create:
            file_utils.ensure_dir_deleted(dest_dir)

            shutil.copytree(from_dir, dest_dir)
            #file_utils.copy_tree(from_dir, dest_dir)

            if not self.store.does_workspace_exist("xt-demo"):
                # import xt-demo workspace from archive file
                console.print(
                    "importing xt-demo workspace (usually takes about 30 seconds)"
                )
                impl_storage_api = ImplStorageApi(self.config, self.store)

                fn_archive = os.path.join(file_utils.get_xtlib_dir(),
                                          "demo_files", "xt-demo-archive.zip")
                impl_storage_api.import_workspace(fn_archive,
                                                  "xt-demo",
                                                  "xtd",
                                                  overwrite=overwrite,
                                                  show_output=False)
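Several of these examples rely on file_utils.ensure_dir_deleted to make directory removal idempotent. A minimal sketch of that behavior, assuming it simply removes the tree when present (the xtlib version may retry on Windows file-locking errors), is:

import os
import shutil

# minimal sketch of ensure_dir_deleted-style behavior; the actual
# xtlib file_utils implementation may differ
def ensure_dir_deleted(path):
    if os.path.isdir(path):
        shutil.rmtree(path)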
Example #5
    def snapshot_all_code(self, snapshot_dir, cmd_parts, args):
        '''
        make local snapshot of each code_dir (and xtlib, if needed)
        '''
        code_dirs = args["code_dirs"]
        xtlib_capture = args["xtlib_upload"]
        code_omit = args["code_omit"]
        script_dir = None

        code_upload = args["code_upload"]
        
        # this step should always be done so that script_dir is removed from cmd_parts
        script_dir = self.remove_script_dir_from_parts(cmd_parts)

        if code_upload:
            for i, code_dir in enumerate(code_dirs):
                # fixup "$scriptdir" relative paths
                if "$scriptdir" in code_dir:
                    code_dir = code_dir.replace("$scriptdir", script_dir)

                if "::" in code_dir:
                    code_dir, dest_dir = code_dir.split("::")
                else:
                    dest_dir = "."
                self.make_local_snapshot(snapshot_dir, code_dir, dest_dir, code_omit)
        else:
            script_dir = snapshot_dir

        if xtlib_capture:
            # copy XTLIB directory to "xtlib" subdir of temp
            xtlib_dir = file_utils.get_xtlib_dir()
            dest_dir = snapshot_dir + "/xtlib"
            file_utils.ensure_dir_deleted(dest_dir)

            # don't copy the "demo_files" directory
            shutil.copytree(xtlib_dir, dest_dir, ignore=shutil.ignore_patterns("demo_files"))

        console.diag("after create local snapshot")
        return script_dir
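The code_dirs entries processed above follow a small convention: "$scriptdir" is replaced by the directory of the target script, and an optional "::dest" suffix names the destination subdirectory in the snapshot (defaulting to "."). A standalone sketch of that parsing, a hypothetical helper mirroring the loop above:

# hypothetical helper mirroring the code_dir parsing in the loop above
def parse_code_dir(code_dir, script_dir):
    """Resolve '$scriptdir' and split an optional '::dest' suffix."""
    if "$scriptdir" in code_dir:
        code_dir = code_dir.replace("$scriptdir", script_dir)

    if "::" in code_dir:
        code_dir, dest_dir = code_dir.split("::")
    else:
        dest_dir = "."
    return code_dir, dest_dir

assert parse_code_dir("$scriptdir/utils::helpers", "/proj") == ("/proj/utils", "helpers")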
Example #6
    def process_run_command(self, args):
        self.args = args

        # ensure workspace exists
        workspace = args['workspace']
        dry_run = args['dry_run']
        fake_submit = args["fake_submit"]

        if not fake_submit:
            self.store.ensure_workspace_exists(workspace, flag_as_error=False)

        # PRE-PROCESS ARGS
        service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, compute, compute_def = \
            self.process_args(args)

        # create backend helper (pool, philly, batch, aml)
        cluster = utils.safe_value(compute_def, "cluster")
        vc = utils.safe_value(compute_def, "vc")
        self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

        # add conda_packages and pip_packages from SETUP to ARGS
        setup_def = self.config.get_setup_from_target_def(compute_def)

        conda_packages = utils.safe_value(setup_def, "conda-packages")
        pip_packages = utils.safe_value(setup_def, "pip-packages")

        args["conda_packages"] = conda_packages if conda_packages else []
        args["pip_packages"] = pip_packages if pip_packages else []

        self.adjust_pip_packages(args)

        snapshot_dir = self.temp_dir

        if fake_submit:
            script_dir = snapshot_dir
        else:
            # note: always create a snapshot dir for backends to add needed files
            file_utils.ensure_dir_deleted(snapshot_dir)
            script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

        self.script_dir = script_dir
        direct_run = args["direct_run"]

        # do we need to start the xt controller?
        use_controller = not direct_run
        adjustment_scripts = None

        # create a job_secret that can later be used to authenticate with the XT controller
        # NOTE: we currently log this secret as a job property, which allows all team members to view and control this job
        job_secret = str(uuid.uuid4())

        # do we need to build a "docker run" command?
        if not self.backend.provides_container_support():
            docker_name = args["docker"]
            if not docker_name:
                docker_name = utils.safe_value(compute_def, "docker")

            if docker_name and docker_name != "none":
                cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args)
                args["docker"] = docker_name     # for use in building run context info

        # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
        cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style = \
            self.build_cmds_with_search(service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args)

        if dry_run:
            return

        # make new values available
        args["search_style"] = search_style
        args["total_run_count"] = total_run_count

        resume_name = args['resume_name']
        keep_name = False  # args['keep_name']
        experiment = args['experiment']
        is_distributed = args['distributed']
        direct_run = args["direct_run"]

        # CREATE JOB to hold all runs
        if fake_submit:
            # use lastrun/lastjob info to get a fast incremental fake job number
            xtd = xt_dict.read_xt_dict()
            fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
            xtd["fake_job_num"] = fake_job_num + 1
            xt_dict.write_xt_dict(xtd)
            job_id = "fake_job" + str(fake_job_num)
        else:
            job_id = self.store.create_job()
        fb.feedback(job_id)

        # start the feedback (by parts)
        fb.feedback("{}: {}".format("target", compute))

        # write hparams to FILES
        boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

        if sweeps_text and not fake_submit:
            self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

        # if num_boxes > 1 and service_type != "batch":
        #     fb.feedback("", is_final=True)

        parent_name = None

        # BUILD RUNS, by box
        job_runs = []
        run_count = 1 if is_distributed else len(boxes) 
        secrets_by_node = {}
        remote_control = args["remote_control"]

        for i in range(run_count):
            box_name = boxes[i]

            # generate a box secret for talking to XT controller for this node
            box_secret = str(uuid.uuid4()) if remote_control else ""

            # build runs for box_name
            run_data = self.build_first_run_for_node(i, boxes[i], target_file, ps_path, using_hp, using_aml_hparam, run_specs, job_id, 
                parent_name, cmds, pool_info, repeat_count, fake_submit, search_style, box_secret, args)

            # for now, adhere to the more general design of multiple runs per box
            box_runs = [run_data]      
            job_runs.append(box_runs)

            node_id = utils.node_id(i)            
            secrets_by_node[node_id] = box_secret

            # FEEDBACK 
            ptype = "single " if search_style == "single" else "parent "
            if is_distributed:
                ptype = "master "

            if run_count == 1:
                node_msg = "creating {}run".format(ptype)
            else:
                node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

            if service_type == "pool":
                node_msg += ", box: " + box_name

            fb.feedback(node_msg, id="node_msg")  # , add_seperator=is_last)
            last_msg = node_msg

            # run the job

        # build the {box: runs} dict for the job info file
        runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)

        # now that we have run names for all static runs on all nodes, we can adjust cmds (and "before" files) for using the controller
        if use_controller:
            # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will be captured to JOB)
            # this will also adjust commands for each node to run the XT controller
            adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        else:
            adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        # add env vars used by both controller and runs
        env_vars = args["env_vars"]

        # create a job guid to uniquely identify this job across all XT instances
        job_guid = str(uuid.uuid4())

        # we add with "node0" and "job_secret", but backend service will override for each node
        scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

        data_local = args["data_local"]
        if "$scriptdir" in data_local:
            data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
            args["data_local"] = data_local

        model_local = args["model_local"]
        if "$scriptdir" in model_local:
            model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
            args["model_local"] = model_local

        # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed, as a way of adjusting/wrapping run commands
        self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args=args)

        # upload CODE from snapshot_dir
        code_upload = args["code_upload"]
        code_omit = args["code_omit"]
        code_zip = args["code_zip"]
    
        if not fake_submit:
            if code_upload:
                self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code", code_omit, code_zip, "code", args)

            # upload DATA from data_local (do we need to keep this?  should we upload to normal DATA location, vs. job?)
            data_upload = args["data_upload"]
            if data_upload:
                if not data_local:
                    errors.config_error("cannot do data-upload because no data-local path is defined in the XT config file")

                data_omit = args["data_omit"]
                data_zip = "none"

                self.core.upload_before_files_to_job(job_id, data_local, "before/data", data_omit, data_zip, "data", args)
        
        # dispatch to BACKEND submitters
        '''
        Note: backend submitter functions are responsible for:
            - submitting the job (for each node, queueing the runs for that node)
            - returning the service job id (or a list of them, one per node)

        NOTE: there is a timing issue: the submitted job needs access to the job info, but the final
        piece of job info (the service info) is only returned after the job is submitted.  Therefore, we structure the steps as follows:

            - primary job info is logged
            - job is submitted thru backend
            - service info for job is logged
        '''

        # LOG PRIMARY JOB INFO
        dd = {}

        if not fake_submit:
            # mark runs as QUEUED
            for runs in runs_by_box.values():
                first_run = runs[0]
                self.store.log_run_event(workspace, first_run["run_name"], "status-change", {"status": "queued"}) 

            # write the job info file (now that backend has had a chance to update it)
            job_num = int(job_id[3:])

            xt_cmd = args["xt_cmd"]
            schedule = args["schedule"]
            concurrent = args["concurrent"]

            # this job property is used to ensure we don't exceed the specified # of runs when using repeat_count on each node
            dynamic_runs_remaining = None if search_style == "single" else total_run_count
            node_count = len(runs_by_box)

            # static_runs_by_node = None
            # if schedule == "static":
            #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
            #console.diag("static_runs_by_node=", static_runs_by_node)

            active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)

            dd = {"job_id": job_id, "job_num": job_num, "compute": compute, "ws_name": workspace, "exper_name": experiment, 
                "pool_info": compute_def, "runs_by_box": runs_by_box, 
                "primary_metric": args["primary_metric"], 
                "run_count": total_run_count, "repeat": repeat_count, "search_type": args["search_type"], 
                "username": args["username"], "hold": args["hold"], "started": utils.get_time(),
                "job_status": "submitted", "running_nodes": 0, 
                "running_runs": 0, "error_runs": 0, "completed_runs": 0, "job_guid": job_guid, "job_secret": job_secret,
                "dynamic_runs_remaining": dynamic_runs_remaining, "search_style": search_style,     
                "active_runs": active_runs,  "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,  
                "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count, "concurrent": concurrent,
                "service_job_info": None, "service_info_by_node": None,
            }

            self.store.log_job_info(job_id, dd)

        # SUBMIT JOB 
        # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated with explicit args)
        service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs, workspace, pool_info, resume_name, 
            repeat_count, using_hp, runs_by_box, experiment, snapshot_dir, adjustment_scripts, args)

        # POST SUBMIT processing

        # update job info 
        if not fake_submit:
            dd["service_job_info"] = service_job_info
            dd["service_info_by_node"] = service_info_by_node
            self.store.log_job_info(job_id, dd)

        # update lastrun/lastjob info
        xtd = xt_dict.read_xt_dict()
        xtd["last_run"] = last_run
        xtd["last_job"] = job_id
        xt_dict.write_xt_dict(xtd)

        # return values for API support (X)
        return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id
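Both the fake-job numbering and the final lastrun/lastjob update use xt_dict as a small read-modify-write store of per-user state. Assuming it is a simple JSON file keyed by property name (the real xtlib module may use a different location and format), the pattern looks like:

import json
import os

# hedged sketch of the xt_dict read-modify-write pattern; the real
# xtlib module may store this state elsewhere
FN_XT_DICT = os.path.expanduser("~/.xt/xt_dict.json")   # hypothetical path

def read_xt_dict():
    if os.path.exists(FN_XT_DICT):
        with open(FN_XT_DICT) as f:
            return json.load(f)
    return {}

def write_xt_dict(xtd):
    os.makedirs(os.path.dirname(FN_XT_DICT), exist_ok=True)
    with open(FN_XT_DICT, "w") as f:
        json.dump(xtd, f, indent=2)

# usage, as in the fake-submit branch above
xtd = read_xt_dict()
fake_job_num = xtd.get("fake_job_num", 1)
xtd["fake_job_num"] = fake_job_num + 1
write_xt_dict(xtd)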