Ejemplo n.º 1
0
    def process_connection(self, conn):
        '''
        Read one JSON command from an accepted connection and send back the reply.

        The request is a JSON object.  The command is selected by the presence of
        one of the keys "get_creds", "set_creds" or "terminate"; the optional
        "team_name" key selects which entry of self.creds is read or written.
        "terminate" sets self.terminate to True.

        Args:
            conn: connected socket object (must support recv() and sendall()).
        '''
        console.print("new connection established")

        data = conn.recv(16000)
        if not data:
            # nothing was received, so there is no command to answer
            return

        # decode command
        cd = json.loads(data.decode())
        team_name = cd.get("team_name")

        first_key = next(iter(cd))
        console.print("cmd: {}".format(first_key))

        if "get_creds" in cd:
            response = self.creds[team_name]
        elif "set_creds" in cd:
            self.creds[team_name] = cd["set_creds"]
            response = "OK"
        elif "terminate" in cd:
            self.terminate = True
            response = "OK"
        else:
            # NOTE(review): other code in this project calls `errors.internal_error`;
            # confirm that the name `error` is really in scope here.  If this call
            # returns instead of raising, `response` is unbound below.
            error.internal_error(
                "unrecognized cmd received by xt_cache_server: {}".format(
                    cd))

        # sendall() keeps writing until the whole reply is transmitted;
        # plain send() may stop after a partial write
        conn.sendall(response.encode())
Ejemplo n.º 2
0
            def set_timer(timeout):
                '''
                Sleep for self.timeout seconds, then call plt.close("all").

                NOTE(review): the `timeout` argument is never read; the delay is taken
                from self.timeout of the enclosing scope -- confirm this is intended.
                '''
                delay = self.timeout
                console.print("set_timer called: timeout=", delay)

                # block this thread of execution until the delay has elapsed
                time.sleep(delay)
                console.diag("timer triggered!")

                plt.close("all")
                print("closed all plots and the fig")
Ejemplo n.º 3
0
    def listen_for_commands(self):
        '''
        Bind a TCP socket to (HOST, CACHE_SERVER_PORT) and serve client
        connections one at a time until self.terminate becomes true.

        Each accepted connection is handed to self.process_connection().  Any
        exception raised while processing a connection is logged and printed,
        after which the loop continues with the next connection.  Because
        accept() blocks, self.terminate is only re-checked after a connection
        has been handled.
        '''
        # if not os.path.exists(FN_CERT):
        #     errors.internal_error("cert file is missing: " + FN_CERT)

        # context = ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH, capath=FN_CERT)
        # context.set_ciphers('EECDH+AESGCM:EDH+AESGCM:AES256+EECDH:AES256+EDH')

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as normal_sock:

            if self.use_ssl:
                # NOTE(review): `context` is only created in the commented-out
                # lines above, so unless it exists at module level this branch
                # raises NameError -- confirm before enabling use_ssl.
                sock = context.wrap_socket(normal_sock,
                                           server_hostname=HOST,
                                           ca_certs="server.crt",
                                           cert_reqs=ssl.CERT_REQUIRED)
            else:
                sock = normal_sock

            # Free up the port for reuse if the process is killed
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

            sock.bind((HOST, CACHE_SERVER_PORT))
            sock.listen()
            console.print("waiting for client input...")

            # serve connections sequentially; a "terminate" command handled by
            # process_connection() sets self.terminate and ends the loop
            while not self.terminate:
                conn, addr = sock.accept()
                with conn:
                    try:
                        self.process_connection(conn)
                    except BaseException as ex:
                        # keep the server alive: report the failure and go on
                        # to the next connection
                        logger.exception(
                            "Error during communication in xt_server, ex={}".
                            format(ex))
                        console.print("exception: " + str(ex))
Ejemplo n.º 4
0
    def get_status(self, fn_entry):
        '''
        Report the status of a queue entry as "queued", "running" or "completed".

        Args:
            fn_entry: file name of the entry inside self.psm_queue_path.

        Returns:
            "queued" when listing the entry's path mentions fn_entry; "running"
            when fn_entry is the current running entry and either the runner
            script or the controller process id is found; otherwise "completed".
        '''
        queue_entry_path = file_utils.path_join(self.psm_queue_path, fn_entry, for_windows=False)
        listing = self.run_cmd("ls -lt " + queue_entry_path)

        if listing and fn_entry in listing:
            return "queued"

        running_name = self.get_running_entry_name()
        if running_name != fn_entry:
            console.print("PSM current job:", running_name)
            return "completed"

        # entry might be running; is the runner script OR controller active?
        if self._get_runner_script_process_id() or self._get_controller_process_id():
            return "running"

        console.print("--> runner script and controller processes not running")
        return "completed"
Ejemplo n.º 5
0
    def dos2unix(self, name):
        '''
        Rewrite the text file `name` in place without carriage-return characters.

        Args:
            name: path of the file to convert.
        '''
        with open(name, "rt") as src:
            cleaned = src.read().replace("\r", "")

        # newline="" stops open() from translating the newlines we write
        with open(name, "wt", newline="") as dest:
            dest.write(cleaned)

        console.print("CR characters removed: {}".format(name))
Ejemplo n.º 6
0
def run_scp_cmd(caller, scp_parts, report_error=True):
    '''
    Run `scp -i <private key> <scp_parts...>` synchronously.

    Args:
        caller: not used by this function.
        scp_parts: sequence of the remaining scp arguments (sources, target, options).
        report_error: when true, a non-zero exit code prints the captured output
            and raises.

    Returns:
        (exit_code, output) as returned by sync_run().

    Raises:
        Exception: when report_error is true and scp exits with a non-zero code.
    '''
    # local import so this function does not depend on the module's import block
    import os

    # expand "~" ourselves, exactly as scp_copy_file_to_box() does, because the
    # command is run without a shell
    key_path = os.path.expanduser(constants.LOCAL_KEYPAIR_PRIVATE)

    # list() lets callers pass a tuple (or any other sequence) of parts
    cmd_parts = ["scp", "-i", key_path] + list(scp_parts)
    console.print("  running SCP cmd: {}".format(" ".join(cmd_parts)))

    exit_code, output = sync_run(cmd_parts)
    if report_error and exit_code:
        console.print(output)
        raise Exception("scp copy command failed")

    return exit_code, output
Ejemplo n.º 7
0
    def _send_cmd_to_cache_server(self, cmd_dict, max_retries,
                                  can_start_server):
        '''
        Send a command to the XT cache server and return its reply.

        The command is serialized as JSON and sent over a new TCP connection to
        (HOST, CACHE_SERVER_PORT).  After every failed attempt the method sleeps
        for one second before trying again, so it retries for roughly
        `max_retries` seconds in total (this covers the case where the cache
        server is still starting).

        Args:
            cmd_dict: JSON-serializable dict describing the command.
            max_retries: maximum number of attempts.
            can_start_server: when true, a failure of the first attempt triggers
                self._start_xt_cache_server().

        Returns:
            The decoded reply text, or None when every attempt failed.
        '''
        for i in range(max_retries):
            try:
                byte_buffer = json.dumps(cmd_dict).encode()

                with socket.socket(socket.AF_INET,
                                   socket.SOCK_STREAM) as normal_sock:

                    if self.use_ssl:
                        # NOTE(review): `context` is not created anywhere in this
                        # method; unless it exists at module level this raises
                        # NameError, which is then retried like any other failure.
                        sock = context.wrap_socket(
                            normal_sock,
                            server_hostname=HOST,
                            ca_certs="server.crt",
                            cert_reqs=ssl.CERT_REQUIRED)
                    else:
                        sock = normal_sock

                    sock.connect((HOST, CACHE_SERVER_PORT))

                    # send cmd_dict as bytes
                    sock.sendall(byte_buffer)

                    # read response
                    data = sock.recv(16000)
                    return data.decode()

            except Exception:
                # Exception (not BaseException) so that Ctrl-C / SystemExit are
                # not swallowed by the retry loop.
                if i == 0 and can_start_server:
                    # first try failed; try starting the server
                    self._start_xt_cache_server()

                if i > 0:
                    # we are retrying some error after trying to start the server
                    console.print(".", end="", flush=True)

                # the failure is deliberately not logged: it would show up to
                # the user as a confusing message
                time.sleep(1)

        return None
Ejemplo n.º 8
0
def sync_run(cmd_parts,
             capture_output=True,
             shell=False,
             report_error=False,
             env_vars=None,
             capture_as_bytes=False):
    '''
    Run the specified cmd/app synchronously in the current working directory.

    Per the original note on this function, the target app must be given as a
    fully qualified path.

    Args:
        cmd_parts: list of command parts, or a single string that is split on spaces.
        capture_output: when true, stdout and stderr are captured together.
        shell: passed through to subprocess.run().
        report_error: when true, a non-zero exit code prints the output and raises.
        env_vars: environment passed through to subprocess.run().
        capture_as_bytes: when true, captured output is returned as raw bytes.

    Returns:
        (exit_code, output).  `output` is None when nothing was captured, the raw
        bytes when capture_as_bytes is true, and otherwise the UTF-8 decoded text
        with "\\r" removed and passed through filter_out_verbose_lines().

    Raises:
        Exception: when report_error is true and the exit code is non-zero.
    '''
    console.diag("sync_run: {}".format(cmd_parts))

    # linux won't accept a command, only cmd parts
    if isinstance(cmd_parts, str):
        cmd_parts = cmd_parts.split(" ")

    if not capture_output:
        completed = subprocess.run(cmd_parts, cwd=".", env=env_vars, shell=shell)
        output = None
    else:
        completed = subprocess.run(cmd_parts,
                                   cwd=".",
                                   env=env_vars,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   universal_newlines=False,
                                   shell=shell)
        output = completed.stdout

        if not capture_as_bytes:
            # the process was run in bytes mode, so map bytes to str ourselves
            decoded = output.decode("utf-8", errors='backslashreplace')
            output = filter_out_verbose_lines(decoded.replace('\r', ''))

    exit_code = completed.returncode

    if report_error and exit_code:
        console.print(output)
        raise Exception("sync run failed, exit code={}, error={}".format(
            exit_code, output))

    return exit_code, output
Ejemplo n.º 9
0
def single_char_input(prompt=None, end="\n"):
    '''
    Wait for a single key press and return it.

    Args:
        prompt: optional text printed (without a newline) before waiting.
        end: optional text printed after the key has been read.

    Returns:
        The value from KeyPressChecker.getch_wait(), or constants.CONTROL_C when
        the wait is interrupted by KeyboardInterrupt.
    '''
    if prompt:
        console.print(prompt, end="", flush=True)

    try:
        with KeyPressChecker() as checker:
            key = checker.getch_wait()
    except KeyboardInterrupt:
        # report Ctrl-C as an ordinary key instead of propagating it
        key = constants.CONTROL_C

    if end:
        console.print(end, end="")

    return key
Ejemplo n.º 10
0
def scp_copy_file_to_box(caller,
                         box_addr,
                         fn_local,
                         box_fn,
                         report_error=True):
    '''
    Copy a local file to `box_addr:box_fn` with scp, using the local private key.

    Args:
        caller: not used by this function.
        box_addr: address of the target box (the part before ":" in the scp target).
        fn_local: path of the local file to copy.
        box_fn: destination path on the box.
        report_error: when true, a non-zero exit code prints the output and raises.

    Returns:
        (exit_code, output) as returned by sync_run().

    Raises:
        Exception: when report_error is true and scp exits with a non-zero code.
    '''
    key_path = os.path.expanduser(constants.LOCAL_KEYPAIR_PRIVATE)
    target = "{}:{}".format(box_addr, box_fn)
    cmd_parts = ["scp", "-i", key_path, fn_local, target]
    console.diag("  copying script to box; cmd={}".format(cmd_parts))

    exit_code, output = sync_run(cmd_parts)
    failed = report_error and exit_code
    if failed:
        console.print(output)
        raise Exception("scp copy command failed: {}".format(output))

    return exit_code, output
Ejemplo n.º 11
0
 def report(self):
     '''
     Print a summary of self.runs / self.runsets and refresh
     self.max_runs_per_runset (0 when there are no runsets).
     '''
     run_counts = [len(runset.runs) for runset in self.runsets]

     # largest number of runs held by any single runset
     self.max_runs_per_runset = max(run_counts, default=0)

     # how many runsets hold exactly one run
     single_run_total = run_counts.count(1)

     console.print("{} runs".format(len(self.runs)))
     console.print("{} runsets".format(len(self.runsets)))
     console.print("{} have 1 run".format(single_run_total))
     console.print("{} max runs per runset".format(
         self.max_runs_per_runset))
Ejemplo n.º 12
0
    def _load_node_creds(self):
        '''
        Load store and mongo credentials from the XT_STORE_CREDS and
        XT_MONGO_CONN_STR environment variables and apply them.

        Both values are passed through utils.base64_to_text() before use.

        Returns:
            True when both values were present and self.apply_creds() was called,
            otherwise False.
        '''
        sc = utils.base64_to_text(os.getenv("XT_STORE_CREDS"))
        mc = utils.base64_to_text(os.getenv("XT_MONGO_CONN_STR"))

        if not (sc and mc):
            return False

        # XT client on compute node

        # cleanup (for testing from client): unescape quotes, drop wrapping quotes
        sc = sc.replace('\\"', '"')
        mc = mc.replace('\\"', '"')
        if mc.startswith('"'):
            mc = mc[1:-1]

        store_info = json.loads(sc)
        store_name = store_info["name"]
        store_key = store_info["key"]

        # mc has the form <scheme>://<name>:<rest>; the name is the text between
        # "://" and the first ":" that follows it
        _, after_scheme = mc.split("://", 1)
        mc_name, _ = after_scheme.split(":", 1)

        # creds are limited in this case to just Store access [storage + mongo]
        self.apply_creds(json.dumps({store_name: store_key, mc_name: mc}))

        console.print(
            "init_creds: using compute node ENV VAR settings for store={}, mongo={}"
            .format(store_name, mc_name))

        return True
Ejemplo n.º 13
0
 def console_callback(run_name, msg):
     # Callback for each console msg of an attached run.  It relies on `self`,
     # `context` and `show_run_name` from the enclosing scope.
     if not msg.startswith(constants.APP_EXIT_MSG):
         # ordinary output: echo it, optionally prefixed with the run name
         line = run_name + ": " + msg if show_run_name else msg
         console.print(line, end="", flush=True)
     else:
         # exit message: the status is the text after the first ":"
         exit_status = msg.split(":")[1].strip()
         console.print(self.status_to_desc(run_name, exit_status), flush=True)
         context.remote_app_is_running = False
     sys.stdout.flush()
Ejemplo n.º 14
0
def fixup_jobs_if_needed(client):
    '''
    Upgrade, batch by batch, the records of the "__jobs__" collection that have
    a job_id but no job_num.

    Each batch holds at most `batch_size` records and is passed to
    process_job_batch(); the loop ends when a query returns no records.

    Args:
        client: mapping-like object giving access to collections by name.
    '''
    jobs = client["__jobs__"]

    # original records where JOB_ID is defined but JOB_NUM is not
    needs_upgrade = {"job_id": {"$exists": True}, "job_num": {"$exists": False}}
    total_updated = 0

    while True:
        batch = list(jobs.find(needs_upgrade, {"_iid": 1}).limit(batch_size))
        if not batch:
            break

        if not total_updated:
            # announce the upgrade once, before the first batch is processed
            console.print("found mongo-db JOB records written by older version of XT; upgrading them to new format...")

        process_job_batch(jobs, batch)
        total_updated += len(batch)
        console.print("update progress=", total_updated)

    if total_updated:
        console.print("upgrade complete (updated {:,} records)".format(total_updated))
Ejemplo n.º 15
0
def fixup_runs_if_needed(client, workspace):
    '''
    Upgrade, batch by batch, the records of the `workspace` collection that
    have a run_name and a run_num equal to 0.

    Each batch holds at most `batch_size` records and is passed to
    process_run_batch(); the loop ends when a query returns no records.

    Args:
        client: mapping-like object giving access to collections by name.
        workspace: name of the collection to upgrade.
    '''
    runs = client[workspace]

    # original records: run_name is defined and run_num is still 0
    needs_upgrade = {"run_name": {"$exists": True}, "run_num": 0}
    total_updated = 0

    while True:
        batch = list(runs.find(needs_upgrade, {"_iid": 1}).limit(batch_size))
        if not batch:
            break

        if not total_updated:
            # announce the upgrade once, before the first batch is processed
            console.print("found mongo-db RUN records written by older version of XT; upgrading them to new format...")

        process_run_batch(runs, batch)
        total_updated += len(batch)
        console.print("update progress=", total_updated)

    if total_updated:
        console.print("upgrade complete (updated {:,} records)".format(total_updated))
Ejemplo n.º 16
0
    def attach_task_to_console(self,
                               ws_name,
                               run_name,
                               show_waiting_msg=False,
                               show_run_name=False,
                               escape=0):
        '''
        Attach to a run and echo its console messages until the run exits, the
        user detaches, or the optional timeout elapses.

        Args:
            ws_name: workspace name of the run.
            run_name: name of the run to attach to.
            show_waiting_msg: when true, the "<detached from run ...>" message is
                printed even if the user did not request the detach.
            show_run_name: when true, each echoed message is prefixed with
                "<run_name>: ".
            escape: timeout in seconds (converted with float()); a falsy value
                disables the timeout.

        When self.xtc.attach() reports that the attach failed, only the status
        description is printed.
        '''
        full_run_name = ws_name + "/" + run_name

        # callback for each console msg from ATTACHED task
        def console_callback(run_name, msg):
            if msg.startswith(constants.APP_EXIT_MSG):
                # exit message: the status is the text after the first ":"
                #console.print(msg)
                status = msg.split(":")[1].strip()
                desc = self.status_to_desc(run_name, status)
                console.print(desc, flush=True)
                # tells the ATTACH LOOP below to stop
                context.remote_app_is_running = False
            else:
                if show_run_name:
                    console.print(run_name + ": " + msg, end="", flush=True)
                else:
                    console.print(msg, end="", flush=True)
            sys.stdout.flush()

        # RPYC bug workaround - callback cannot write to variable in its context
        # but it CAN write to an object's attribute
        context = Bag()
        context.remote_app_is_running = True

        # NOTE(review): show_detach_msg is assigned but never read in this method
        show_detach_msg = False
        detach_requested = False

        attached, status = self.xtc.attach(ws_name, run_name, console_callback)
        #console.print("attached=", attached, ", status=", status)

        if attached:
            #if show_waiting_msg:
            #    console.print("\n<attached: {}>\n".format(full_run_name))

            started = time.time()
            timeout = escape
            if timeout:
                timeout = float(timeout)

            try:
                with KeyPressChecker() as checker:

                    # ATTACH LOOP
                    # polls about every 0.1 sec until the callback clears
                    # context.remote_app_is_running
                    #console.print("entering ATTACH WHILE LOOP...")
                    while context.remote_app_is_running:
                        #console.print(".", end="")
                        #sys.stdout.flush()

                        # key code 27 is the ESCAPE key: the user asked to detach
                        if checker.getch_nowait() == 27:
                            detach_requested = True
                            break

                        time.sleep(.1)

                        if timeout:
                            elapsed = time.time() - started
                            if elapsed >= timeout:
                                break

            except KeyboardInterrupt:
                # Ctrl-C is treated like a detach request
                detach_requested = True
            finally:
                # always detach the callback, however the loop ended
                self.xtc.detach(ws_name, run_name, console_callback)

            if detach_requested or show_waiting_msg:
                console.print(
                    "\n<detached from run: {}>".format(full_run_name))
        else:
            # attach failed: report the status returned by attach()
            desc = self.status_to_desc(run_name, status)
            console.print(desc)
Ejemplo n.º 17
0
def hex_dump(fn):
    '''
    Print a hex dump of the file `fn`, 16 bytes per row.

    Each row starts with the 4-digit hex address of its first byte, followed by
    the bytes as 2-digit hex values and the text returned by get_nice_text()
    for that row.  An empty file prints only the header and the first address.

    Args:
        fn: path of the file to dump.
    '''
    console.print("hex dump of: {}\n".format(fn))

    # read file raw
    with open(fn, "rb") as infile:
        byte_buff = infile.read()

    start_index = 0  # index of the first byte of the row being printed
    addr = 0
    console.print("{:04x}    ".format(addr), end="")

    for i, value in enumerate(byte_buff):
        console.print("{:02x} ".format(value), end="")

        if (i + 1) % 16 == 0:
            # row complete: print its text column, then start the next row
            text = get_nice_text(byte_buff, start_index, i)
            console.print("   " + text)
            addr += 16
            console.print("{:04x}    ".format(addr), end="")
            start_index = i + 1

    # print the text of a trailing partial row, if any.  last_index is computed
    # from the buffer length so that an empty file (no loop iteration) and a
    # trailing row of a single byte are both handled.
    last_index = len(byte_buff) - 1
    if start_index <= last_index:
        text = get_nice_text(byte_buff, start_index, last_index)

        # pad for the hex columns that are missing from this row
        missing = 15 - (last_index % 16)
        console.print("   " * missing + " " + text)
Ejemplo n.º 18
0
#
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
#
# aml_shim.py: AML wants to run a python script, so we use this to launch our shell script
import sys
import os
from xtlib import console

# MAIN code
# AML wants to run a python script, so this shim forwards the real command
# line to the system shell.
args = sys.argv[1:]
console.print("aml_shim: args=", args)

# all are passed as a logical string in args[0]
# (but args[1] is "1", so don't use that)
cmd = args[0]
console.print("aml_shim: about to run cmd=", cmd)
os.system(cmd)
Ejemplo n.º 19
0
        # RUN command
        os.chdir(cwd)
        fb.reset_feedback()

        xt_cmds.main(cmd_text, cmd_started)
        break


# main code

# optional first command-line argument: process id of a previous server
# instance that must be stopped before this one starts
pid = sys.argv[1] if len(sys.argv) > 1 else None
if pid:
    pid = int(pid)

    # kill old process before we try to own resources
    console.print("canceling old version of server: pid=", pid)
    p = psutil.Process(pid)
    p.terminate()

    time.sleep(
        2)  # wait for job to fully terminate so we can access its resources

# directory that contains this file
xtlib_dir = os.path.realpath(os.path.dirname(__file__))
#console.print("xtlib_dir=", xtlib_dir)

# start a WatchWorker on the pattern "<xtlib_dir>/**"
# (presumably it watches the files below that directory -- confirm in WatchWorker)
worker = WatchWorker(xtlib_dir + "/**")
worker.start()

# open the listening TCP socket on (HOST, PORT)
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind((HOST, PORT))
    s.listen()
Ejemplo n.º 20
0
    def _get_creds_from_login(self, authentication, reason=None):
        '''
        Authenticate with Azure, read the XT secrets from the key vault and
        return all resulting creds as a JSON string.

        Args:
            authentication: "browser", "device-code" or "auto" ("auto" picks
                "browser" when pc_utils.has_gui() is true, else "device-code").
            reason: not used by this method.

        Returns:
            json.dumps(self.keys) after the "xt-keys" secret has been applied and
            the "xt_server_cert" and "object_id" entries have been added.
        '''
        # use normal Key Value
        from azure.keyvault.secrets import SecretClient

        def tenant_kwargs():
            # tenant_id is only passed when one has been configured
            if self.azure_tenant_id is None:
                return {}
            return {"tenant_id": self.azure_tenant_id}

        if authentication == "auto":
            authentication = "browser" if pc_utils.has_gui() else "device-code"

        if authentication == "browser":
            console.print("authenticating with azure thru browser... ",
                          flush=True,
                          end="")
            from azure.identity import InteractiveBrowserCredential

            credential = InteractiveBrowserCredential(**tenant_kwargs())
        elif authentication == "device-code":
            from azure.identity import DeviceCodeCredential
            from azure.identity._constants import AZURE_CLI_CLIENT_ID

            console.print(
                "using device-code authorization (Azure AD currently requires 2-4 authenications here)"
            )
            credential = DeviceCodeCredential(client_id=AZURE_CLI_CLIENT_ID,
                                              **tenant_kwargs())
        else:
            errors.syntax_error(
                "unrecognized authentication type '{}'".format(authentication))

        # NOTE(review): get_token() is called without any scope argument --
        # confirm that the installed azure.identity version accepts this.
        token = credential.get_token().token

        # get keys from keyvault
        self.client = SecretClient(self.vault_url, credential=credential)
        key_text = self.get_secret_live("xt-keys")
        console.print("authenticated successfully", flush=True)

        server_cert = self.get_secret_live("xt-servercert")

        # write all our creds to self.keys
        self.apply_creds(key_text)
        self.keys["xt_server_cert"] = server_cert
        self.keys["object_id"] = self.get_me_graph_property(token, "id")

        # return creds as json string
        return json.dumps(self.keys)
Ejemplo n.º 21
0
    def monitor_attach_run(self,
                           ws,
                           run_name,
                           show_waiting_msg=True,
                           escape=0):
        '''
        Wait in a monitor loop until the run can be attached to, then attach the
        console to it.

        Args:
            ws: workspace name of the run.
            run_name: name of the run.
            show_waiting_msg: passed through to self.attach_task_to_console().
            escape: passed to monitor_loop() and self.attach_task_to_console()
                as the escape/timeout value.

        Nothing is attached when monitor_loop() returns a falsy result or when
        the connection to the box was not established.
        '''
        console.print(
            "")  # separate the waiting loop output from previous output
        attach_attempts = 0

        def monitor_work():
            # One polling step.  Returns a str (the waiting message to display)
            # while the run is not attachable yet, or a tuple
            # (azure_task_state, connected, box_name, job_id, attach_attempts)
            # once monitoring should stop.
            nonlocal attach_attempts

            connected = self.xtc.connect()
            #azure_task_state, connected, box_name, job_id = self.connect_to_box_for_run(ws, run_name)
            # NOTE(review): azure_task_state is always None here, so the
            # azure-batch branch below is currently never taken
            azure_task_state = None
            box_name = self.xtc.box_name
            job_id = "xxxxx"  # TODO

            attach_attempts += 1

            if azure_task_state:
                #console.print("azure_task_state=", azure_task_state)
                # its an azure-batch controlled run
                if azure_task_state == "active":
                    text = "Waiting for run to start: {} ({} in azure-batch)".format(
                        run_name.upper(), job_id)
                elif azure_task_state == "running" and not connected:
                    text = "Waiting for run to initialize: {} ({} in azure-batch)".format(
                        run_name.upper(), job_id)
                else:
                    # exit monitor loop
                    return azure_task_state, connected, box_name, job_id, attach_attempts
            else:
                # its a normal box-controller run
                if not connected:
                    errors.env_error("could not connect to box: " + box_name)
                # we are connected, but has run started yet?
                status_dict = self.xtc.get_status_of_runs(ws, [run_name])
                # controller may not have heard of run yet (if we were fast)
                status = status_dict[
                    run_name] if run_name in status_dict else "created"
                if status in ["created", "queued"]:
                    text = "Waiting for run to start: {} (queued to run on {})".format(
                        run_name.upper(), box_name)
                else:
                    # status is one of running, killed, completed, spawning, ...
                    # exit monitor loop
                    return azure_task_state, connected, box_name, job_id, attach_attempts
            return text

        # wait for run to be attachable in a MONITOR LOOP
        result = monitor_loop(True, monitor_work, "[hit ESCAPE to detach] ",
                              escape)
        #console.print("")    # separate the waiting loop output from subsequent output

        if result:
            # unpack the tuple produced by monitor_work()
            state, connected, box_name, job_id, attach_attempts = result
            #console.print("state=", state, ", connected=", connected, ", box_name=", box_name, ", job_id=", job_id)

            if not connected:
                # the `if False` branch is disabled code, kept for reference
                if False:  #   attach_attempts == 1:
                    errors.user_exit(
                        "Unable to attach to run (state={})".format(state))
                else:
                    # not an error in this case
                    console.print(
                        "Unable to attach to run (state={})".format(state))
                    return

            console.print("<attaching to: {}/{}>\n".format(ws, run_name))
            self.attach_task_to_console(ws,
                                        run_name,
                                        show_waiting_msg=show_waiting_msg,
                                        escape=escape)
        else:
            # None returned; user cancelled with ESCAPE, so no further action needed
            pass
Ejemplo n.º 22
0
 def report(self):
     '''
     Print "<name> = <values>", followed by the default setting's value when
     a default setting exists.
     '''
     pieces = ["{} = {}".format(self.name, self.values)]
     if self.default_setting is not None:
         pieces.append("  # default {}".format(self.default_setting.value))
     console.print("".join(pieces))
Ejemplo n.º 23
0
    def choose_config(self):
        '''
        Choose the runset (hyperparameter configuration) to run next.

        With no runsets yet, a random configuration is built: each multi-valued
        hparam gets a random setting id, taken from all of its settings or, when
        it has a default setting, from the default id +/- one.

        Otherwise the runset with the highest metric (the last one on ties) and
        its neighbors (one setting id changed by +/- one) form a neighborhood.
        One member is drawn at random, weighted by how many runs it lacks
        relative to one more than the largest run count in the neighborhood.

        Returns:
            The chosen RunSet.
        '''
        # Gather the subset of hparams with multiple values.
        self.multivalued_hparams = [
            hparam for hparam in self.hparams if hparam.has_multiple_values
        ]

        # If there are no runs yet, just return a random configuration.
        if len(self.runsets) == 0:
            hp_id__setting_id__list = []
            for hparam in self.multivalued_hparams:
                last_setting_id = len(hparam.settings) - 1
                default = hparam.default_setting
                if default is None:
                    # Select from all settings.
                    low, high = 0, last_setting_id
                else:
                    # Select from the default setting, +/- one.
                    center = default.id
                    low = center - 1 if center > 0 else center
                    high = center + 1 if center < last_setting_id else center
                setting_id = dgd_rand.randint(low, high)
                hp_id__setting_id__list.append((hparam.id, setting_id))

            random_runset = RunSet(hp_id__setting_id__list,
                                   str(hp_id__setting_id__list))
            random_runset.report('Random runset   ')
            return random_runset

        # Find the best runset so far (">=" keeps the last one among ties).
        best_runset = self.runsets[0]
        for candidate in self.runsets:
            if candidate.metric >= best_runset.metric:
                best_runset = candidate
        best_runset.report('Best runset    ')

        # Build a neighborhood around (and including) the best runset.
        neighborhood = [best_runset]
        for hp_i, hparam in enumerate(self.multivalued_hparams):
            pair = best_runset.hp_id__setting_id__list[hp_i]
            best_hparam_id = pair[0]
            assert hparam.id == best_hparam_id
            best_setting_id = pair[1]

            # not used below; the lookup fails fast on an out-of-range setting id
            best_setting = hparam.settings[best_setting_id]

            neighbor_ids = []
            if best_setting_id > 0:
                neighbor_ids.append(best_setting_id - 1)
            if best_setting_id < len(hparam.settings) - 1:
                neighbor_ids.append(best_setting_id + 1)

            for neighbor_id in neighbor_ids:
                neighborhood.append(
                    self.get_neighbor_runset(best_runset, hp_i,
                                             best_hparam_id, neighbor_id))

        # Choose one runset, weighted by how many runs it needs to exceed those of the runset with the most.
        ceiling = max(len(runset.runs) for runset in neighborhood) + 1
        console.print("ceiling = {} runs".format(ceiling))

        gaps = [max(0, ceiling - runset.num_runs) for runset in neighborhood]
        probs = np.array(gaps, dtype=float)
        probs /= np.sum(probs)

        for i, (runset, prob) in enumerate(zip(neighborhood, probs)):
            runset.id = i
            runset.report(" {:2d} prob={:6.4f}".format(runset.id, prob))

        chosen_runset = dgd_rand.choices(neighborhood, probs)[0]
        chosen_runset.report(' {:2d} was chosen '.format(chosen_runset.id))
        return chosen_runset
Ejemplo n.º 24
0
 def report(self, title):
     '''
     Print `title` and the config string, followed by the metric and the
     number of runs when a metric is available.
     '''
     header = "{}  {}".format(title, self.config_str)
     stats = ""
     if self.metric is not None:
         stats = "    {:12.5f},  {} runs".format(self.metric, self.num_runs)
     console.print(header + stats)
Ejemplo n.º 25
0
def monitor_loop(monitor, func, action_msg="monitoring ", escape_secs=0):
    '''
    Repeatedly call 'func' and redraw its text output in place until the
    ESCAPE key is pressed, the optional time limit is reached, or 'func'
    returns something that is not a string.

    monitor:     falsy -> call 'func' once, print its result and return;
                 True  -> refresh using the default wait value of 5;
                 otherwise converted with int() and used as the wait value.
    func:        zero-argument callable.  A str result is displayed; any
                 other result ends the loop and is returned to the caller.
    action_msg:  prefix of the status line appended below each result.
    escape_secs: when truthy, converted with float(); the loop ends once
                 that many seconds have passed since setup.

    Returns the non-string value produced by 'func', otherwise None.
    '''
    # one-shot mode: nothing to monitor
    if not monitor:
        console.print(func(), end="")
        return

    pc_utils.enable_ansi_escape_chars_on_windows_10()

    # NOTE(review): '== True' also matches 1 and 1.0, so those values get the
    # default wait as well -- confirm whether that is intended.
    wait_secs = 5 if monitor == True else int(monitor)

    wall_start = datetime.datetime.now()  # used for the displayed elapsed time
    clock_start = time.time()  # used for the escape_secs check
    time_limit = float(escape_secs) if escape_secs else escape_secs

    previous = None

    with KeyPressChecker() as checker:
        while True:
            output = func()

            if not isinstance(output, str):
                # func has decided to end the monitor loop itself
                if previous:
                    console.print("\n")
                return output

            if previous:
                # go back to the start of the line, then move up over the
                # text drawn on the previous pass
                console.print("\r", end="")

                # NOTE: on some systems the number of lines that must be
                # erased seems to vary by 1.  Erasing too many destroys
                # previous output/commands, so the lower count (one per
                # newline in the previous text) is used, at the cost of some
                # leftover output on some systems.
                pc_utils.move_cursor_up(previous.count("\n"), True)

            shown_elapsed = utils.elapsed_time(wall_start)
            output += "\n" + action_msg + "(elapsed time: {})...".format(
                shown_elapsed)

            console.print(output, end="")
            sys.stdout.flush()

            if time_limit and time.time() - clock_start >= time_limit:
                console.print("\nmonitor timed out")
                return None

            # wait between refreshes; a truthy result means ESCAPE was seen
            if pc_utils.wait_for_escape(checker, wait_secs):
                console.print("\nmonitor cancelled")
                return None

            previous = output
Ejemplo n.º 26
0
 def assign_settings(self, hp_config):
     """Announce on the console that settings are being assigned.

     `hp_config` is accepted but not used by this implementation, and
     nothing is returned.
     """
     console.print('assigning settings')
Ejemplo n.º 27
0
            if not hparam.has_multiple_values:
                value = hparam.single_value
                if value == "$randint()":
                    value = np.random.randint(2147483647)
                arg_dict[hparam.name] = value

        # now, output values used in runset
        for hp_i, hparam in enumerate(self.multivalued_hparams):
            hparam_id = runset.hp_id__setting_id__list[hp_i][0]
            assert hparam.id == hparam_id

            value_id = runset.hp_id__setting_id__list[hp_i][1]
            value = hparam.settings[value_id]

            arg_dict[hparam.name] = value.value

        return arg_dict


if __name__ == "__main__":
    # Stand-alone run: build a DGD in unit-test mode and choose one config.
    dgd = DGD(unit_test=True)

    # report every hyperparameter, with a console.print() before and after
    console.print()
    for hparam in dgd.hparams:
        hparam.report()
    console.print()

    # clear the run history before asking for a configuration
    dgd.runs, dgd.runsets, dgd.configstr_runset_dict = [], [], {}

    chosen_runset = dgd.choose_config()
    chosen_runset.report("Chosen runset")
Ejemplo n.º 28
0
    def build_data_frames(self):
        '''
        Build one pandas DataFrame per distinct list of reported metric columns.

        For each record in self.run_log_records:
            1. skip parent runs (search_style is set and is not "single")
            2. collect the reported metrics as metric sets; runs without any
               are remembered and listed in a console note
            3. stop once self.max_runs runs have been accepted (when set)
            4. on the first run with metrics, resolve self.x_col (unless "x"
               was given explicitly) and default self.col_names if empty
            5. for every metric set that has the x column (when one is set)
               and at least one y column, add run/node/job/experiment/
               workspace columns and group the frame with the others that
               share the same metric column list

        Side effects: may update self.x_col and self.col_names, sets
        self.run_names to the accepted runs, and prints progress notes.

        Returns:
            dict mapping str(list of metric columns) -> DataFrame holding the
            rows of all accepted runs that reported exactly those columns.
        '''
        no_metrics = []
        pp_run_names = []
        used_max = False
        frames_by_cols = {}  # str(metric column list) -> list of DataFrames
        got_columns = False

        for record in self.run_log_records:
            # extract identifying info for this run
            run = record["_id"]
            node = utils.node_id(record["node_index"])
            job = record["job_id"]
            experiment = record["exper_name"]
            workspace = record["ws"]
            search_style = utils.safe_value(record, "search_style")
            if search_style and search_style != "single":
                # parent run with children - skip it
                continue

            metric_sets = run_helper.build_metrics_sets(record["log_records"])
            if not metric_sets:
                no_metrics.append(run)
                continue

            if self.max_runs and len(pp_run_names) >= self.max_runs:
                used_max = True
                break

            if not got_columns:
                # set x and y columns
                explicit = qfe.get_explicit_options()
                if "x" not in explicit:
                    self.x_col = self.get_actual_x_column(
                        metric_sets, self.x_col, self.col_names)

                if not self.col_names:
                    # not specified by user, so build defaults
                    self.col_names = self.get_default_y_columns(
                        metric_sets, self.x_col)

                got_columns = True

            row_tags = {
                "run": run,
                "node": node,
                "job": job,
                "experiment": experiment,
                "workspace": workspace,
            }

            for metric_set in metric_sets:
                df = pd.DataFrame(metric_set["records"])

                # The grouping key is the string form of the metric columns,
                # but membership tests use the real list: testing against the
                # string would match substrings ("loss" inside "val_loss").
                col_list = list(df.columns)
                cols = str(col_list)

                # ensure this df has our x_col
                if self.x_col and self.x_col not in col_list:
                    continue

                # ensure this df has at least 1 y_col
                if not any(y in col_list for y in self.col_names):
                    continue

                # tag every row with the run it came from
                n_rows = df.shape[0]
                for tag, value in row_tags.items():
                    df[tag] = [value] * n_rows

                frames_by_cols.setdefault(cols, []).append(df)

            pp_run_names.append(run)

        if no_metrics:
            console.print(
                "\nnote: following runs were skipped (currently have no logged metrics): \n    {}\n"
                .format(", ".join(no_metrics)))

        if used_max:
            console.print(
                "plotting first {} runs (use --max-runs to override)".format(
                    self.max_runs))
        else:
            console.print("plotting {} runs...".format(len(pp_run_names)))

        # update our list of run_names to process
        self.run_names = pp_run_names

        # DataFrame.append was removed in pandas 2.0; concatenate each group
        # once instead (this also avoids re-copying the rows on every merge).
        return {
            cols: frames[0] if len(frames) == 1 else pd.concat(frames)
            for cols, frames in frames_by_cols.items()
        }