Example #1
import os
import pickle
import socketserver
from typing import List, Tuple
from xmlrpc.server import SimpleXMLRPCServer, SimpleXMLRPCRequestHandler

from taskman_client.sync import JsonTiedDict
# Other project-level names used below (KDP, lmap, port) come from the
# project's own modules and are omitted in the original snippet.


class KDPEvalServer:
    def __init__(self):
        self.request_dir = os.environ["request_dir"]
        info_path = os.path.join(self.request_dir, "req_job_info.json")
        self.json_tied_dict = JsonTiedDict(info_path)
        self.next_job_id = self.json_tied_dict.last_id()

    def start(self):
        class RequestHandler(SimpleXMLRPCRequestHandler):
            rpc_paths = ('/RPC2', )

        class RPCThreading(socketserver.ThreadingMixIn, SimpleXMLRPCServer):
            pass

        print("")
        print("  [ KDPEvalServer ]")
        print()
        print("Preparing server")
        server = RPCThreading(("0.0.0.0", port),
                              requestHandler=RequestHandler,
                              allow_none=True)
        server.register_introspection_functions()
        server.register_function(self.eval_job, 'eval_job')
        print("Waiting")
        server.serve_forever()

    def save_request(self, job_id, kdp_list: List[KDP]):
        # Write to a temp file first, then rename, so readers never see a partial file
        save_path = os.path.join(self.request_dir, str(job_id))
        temp_save_path = save_path + ".tmp"
        with open(temp_save_path, "wb") as f:
            pickle.dump(kdp_list, f)
        os.rename(temp_save_path, save_path)

    def eval_job(self, kdp_list_raw: List[Tuple]):
        kdp_list: List[KDP] = lmap(KDP.from_state, kdp_list_raw)
        job_id = self.next_job_id
        self.save_request(job_id, kdp_list)
        self.next_job_id += 1
        self.json_tied_dict.set('last_task_id', self.next_job_id)
        return job_id
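
For context, here is a minimal client-side sketch of how eval_job might be invoked over XML-RPC. Only eval_job, the /RPC2 path, and the list-of-tuples argument appear in the original; the host, the port value, and a KDP.to_state() serializer (the counterpart of KDP.from_state above) are assumptions.

# Hypothetical client sketch; host, port and KDP.to_state() are assumptions.
from xmlrpc.client import ServerProxy

def request_eval(kdp_list, host="localhost", port=8080):
    proxy = ServerProxy("http://{}:{}/RPC2".format(host, port), allow_none=True)
    # eval_job expects plain tuples, so each KDP is serialized first
    kdp_list_raw = [kdp.to_state() for kdp in kdp_list]
    return proxy.eval_job(kdp_list_raw)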
Example #2
import os
import pickle

from cpath import output_path
from taskman_client.sync import JsonTiedDict
# Other project-level names used below (FileWatchingJobRunner, DataIDManager,
# QCKGenDynamicKDP, get_qck_gen_dynamic_kdp, write_records_w_encode_fn,
# add_estimator_job) come from the project's own modules.


class TFRecordMaker:
    def __init__(self):
        self.request_dir = os.environ["request_dir"]
        self.tf_record_dir = os.environ["tf_record_dir"]
        info_path = os.path.join(self.request_dir, "info.json")
        self.json_tied_dict = JsonTiedDict(info_path)
        self.next_job_id = self.json_tied_dict.last_id() + 1
        self.qck_generator: QCKGenDynamicKDP = get_qck_gen_dynamic_kdp()
        self.save_dir = os.path.join(output_path, "cppnc_auto")

        score_save_path_format = os.path.join(self.request_dir, "{}")
        self.job_runner = FileWatchingJobRunner(score_save_path_format,
                                                info_path, self.make_tfrecord,
                                                "tfrecord maker")

        print("")
        print("  [ TFRecordMaker ]")
        print()

    def file_watch_daemon(self):
        self.job_runner.start()
        print("TFRecordMaker thread()")

    def make_tfrecord(self, job_id: int):
        save_path = os.path.join(self.request_dir, str(job_id))
        with open(save_path, "rb") as f:
            kdp_list = pickle.load(f)
        data_id_manager = DataIDManager(0, 1000 * 1000)
        print("{} kdp".format(len(kdp_list)))
        insts = self.qck_generator.generate(kdp_list, data_id_manager)
        record_save_path = os.path.join(self.tf_record_dir, str(job_id))
        write_records_w_encode_fn(record_save_path,
                                  self.qck_generator.encode_fn, insts)
        # Save the data-id-to-info mapping for backup
        info_save_path = os.path.join(self.tf_record_dir,
                                      "{}.info".format(job_id))
        with open(info_save_path, "wb") as f:
            pickle.dump(data_id_manager.id_to_info, f)
        # launch estimator
        add_estimator_job(job_id)
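
A short sketch of how the maker might be started, assuming the request_dir and tf_record_dir environment variables are already set; the thread wrapper itself is not part of the original snippet.

# Hypothetical startup sketch; TFRecordMaker itself is defined above.
import threading

def start_tfrecord_maker() -> TFRecordMaker:
    maker = TFRecordMaker()
    # Run the file watcher in the background so the caller is not blocked
    t = threading.Thread(target=maker.file_watch_daemon)
    t.daemon = True
    t.start()
    return maker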
Example #3
import os
import subprocess
import time
from collections import Counter
from subprocess import PIPE
from typing import List, Dict

from cpath import output_path
from galagos.parse import save_queries_to_file, parse_galago_ranked_list, parse_galago_passage_ranked_list
from galagos.types import SimpleRankedListEntry, GalagoPassageRankEntry
from misc_lib import exist_or_mkdir
from taskman_client.sync import JsonTiedDict

dyn_query_dir = os.path.join(output_path, "dyn_query")
exist_or_mkdir(dyn_query_dir)
info_path = os.path.join(dyn_query_dir, "info.json")
task_info = JsonTiedDict(info_path)


class DocQuery(Dict):
    pass


class PassageQuery(Dict):
    pass


def get_new_query_json_path() -> str:
    last_query_file_idx = get_last_query_file_idx()
    new_query_id = last_query_file_idx + 1
    task_info.last_task_id = new_query_id
    return get_json_path_for_idx(new_query_id)
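
get_last_query_file_idx and get_json_path_for_idx are defined elsewhere in the source file. Here is a plausible reconstruction, inferred from the naming pattern and the JsonTiedDict usage above; both bodies are assumptions.

# Hypothetical reconstructions; the real implementations are not shown above.
def get_last_query_file_idx() -> int:
    # JsonTiedDict persists last_task_id in info.json across runs
    return task_info.last_task_id

def get_json_path_for_idx(idx: int) -> str:
    return os.path.join(dyn_query_dir, "{}.json".format(idx))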
Example #4
import os
import threading
import time

import psutil

from misc_lib import exist_or_mkdir
from taskman_client.sync import JsonTiedDict
# Other project-level names (root_dir, TaskList, ResourceList, Task,
# QueuedTask, the STATUS_* constants, logger, logger2) come from the
# project's own modules.


class Executer:
    def __init__(self):
        print("AAA")
        logger.debug("Executer init")
        # Save/log current jobs so that they can be restarted after a crash
        self.task_info_dir = os.path.join(root_dir, "task_info")
        self.root_info_dir = os.path.join(root_dir, "root_info")
        exist_or_mkdir(self.task_info_dir)
        exist_or_mkdir(self.root_info_dir)

        # Load task info for all active / queued tasks
        self.active_task_list = TaskList(
            os.path.join(self.root_info_dir, "active_task.json"),
            self.task_info_dir)
        self.queued_task_list = TaskList(
            os.path.join(self.root_info_dir, "queued_task.json"),
            self.task_info_dir)
        self.info_dict = JsonTiedDict(
            os.path.join(self.root_info_dir, "info.json"))

        tpu_info_path = os.path.join(self.root_info_dir, "tpu_info.json")
        self.tpu_resource = ResourceList(tpu_info_path, ["v2-tf2", "v2-tf2-2"])
        self.current_task_handles = {}  # task_id -> process object
        # A task_id being in current_task_handles does NOT imply the task is active; handles are never deleted
        self.task_cache = {}  # task_id -> TaskObj
        self._init_info()

    def _get_new_task_id(self):
        new_task_id = self.info_dict.last_task_id + 1
        self.info_dict.set("last_task_id", new_task_id)
        return new_task_id

    def _get_task_info_path(self, task_id):
        return os.path.join(self.task_info_dir, "{}.json".format(task_id))

    def run(self):
        # start _thread
        t = threading.Thread(target=self._thread)
        t.daemon = True
        t.start()

    def add_task_to_schedule(self, task):
        task.task_id = self._get_new_task_id()
        logger.debug("add_task_to_schedule() task_id={} proc_name={}".format(
            task.task_id, task.process_name))
        new_task = QueuedTask.from_task(task,
                                        self._get_task_info_path(task.task_id))
        new_task.set_status(STATUS_WAIT)
        self.queued_task_list.add(new_task)

    def remove_task(self, task_name):
        # Kill task if it is active
        task_obj = self._remove_task_from_active_list(task_name)

        # If it was not active, try removing it from the queued list
        if task_obj is None:
            task_obj = self._remove_task_from_queued_list(task_name)
        return task_obj

    def _remove_task_from_active_list(self, task_name):
        deleted_task_obj = None
        for task_obj in self.active_task_list:
            if task_obj.task_name == task_name:
                self._kill_task(task_obj)
                task_obj.set_status(STATUS_CANCELLED)
                deleted_task_obj = task_obj
                break
        # Only remove when a matching task was actually found
        if deleted_task_obj is not None:
            self.active_task_list.remove(deleted_task_obj)
        return deleted_task_obj

    def _remove_task_from_queued_list(self, task_name):
        deleted_task_obj = None
        for task_obj in self.queued_task_list:
            if task_obj.task_name == task_name:
                task_obj.set_status(STATUS_CANCELLED)
                deleted_task_obj = task_obj
                break
        # Only remove when a matching task was actually found
        if deleted_task_obj is not None:
            self.queued_task_list.remove(deleted_task_obj)
        return deleted_task_obj

    def _kill_task(self, task_obj):
        task_id = task_obj.task_id
        p = self.current_task_handles[task_id]
        p.kill()

    def _init_info(self):
        print("Init Info")
        logger.info("Init_info")
        logger2.info("Init Info")

        # Init self.current_task_handles
        task_to_mark_complete = []
        for task_obj in self.active_task_list:
            print("ActiveTask : ", task_obj.task_id)
            try:
                print("Acquiring Handle")

                logger.debug("Acquiring Handle {}".format(task_obj.task_id))
                self.current_task_handles[task_obj.task_id] = psutil.Process(
                    task_obj.pid)
                logger.debug("Find task, task_id={} pid={}".format(
                    task_obj.task_id, task_obj.pid))
            except psutil.NoSuchProcess as e:
                task_to_mark_complete.append(task_obj)

        self._clean_up_completed_list(task_to_mark_complete)

    # tpu_name must already be acquired before this function is called
    # TODO: handle stdout redirection
    def _execute(self, task: Task, tpu_name=None):
        if tpu_name is not None:
            task.update_argument({"tpu_name": tpu_name})
        # With shell=True, Popen expects a single command string; a list here
        # would be misinterpreted (only its first element reaches the shell).
        p = psutil.Popen(
            " ".join([task.process_name, task.get_param_str()]),
            env=task.env,
            shell=True,
        )
        task.pid = p.pid
        return p

    def _task_sanity_check(self, task: Task):
        # TODO : Check if related gs files are available
        #   TODO : Cache information about gs file information
        # TODO : Check if necessary parameters are set
        return True

    def _thread(self):
        # 1. Poll current task status via process handles
        # 2. If a resource is available, execute the next queued task
        logger.info("_thread")
        while True:
            self._check_active_tasks()
            self._launch_task_if_possible()
            time.sleep(1)

    def _check_active_tasks(self):
        logger.info("check_active_tasks")
        task_to_mark_complete = []
        for task_obj in self.active_task_list:
            task_process: psutil.Process = self.current_task_handles[
                task_obj.task_id]
            try:
                status = task_process.status()
                logger.info("Task {} active".format(task_obj.task_id))
            except psutil.NoSuchProcess:
                status = "dead"
                logger.info("Task {} dead".format(task_obj.task_id))

            if status == "running":
                pass
            elif status == "dead":
                task_to_mark_complete.append(task_obj)
        # TODO: also check stdout/stderr to see if the process crashed

        self._clean_up_completed_list(task_to_mark_complete)

    def _launch_task_if_possible(self):
        task_that_just_got_executed = []
        for task_obj in self.queued_task_list:
            is_ready = True
            tpu_name = None
            if task_obj.use_tpu:
                tpu_name = self.tpu_resource.assign()
                if tpu_name is None:
                    is_ready = False

            if not self._task_sanity_check(task_obj):
                is_ready = False

            if is_ready:
                p = self._execute(task_obj, tpu_name)
                task_obj.pid = p.pid
                self.current_task_handles[task_obj.task_id] = p
                task_that_just_got_executed.append(task_obj)
            else:
                # return resource
                if tpu_name is not None:
                    self.tpu_resource.release(tpu_name)
        for task_obj in task_that_just_got_executed:
            logger.debug("execute() task_id={} proc_name={}".format(
                task_obj.task_id, task_obj.process_name))
            assert task_obj.pid is not None
            self.queued_task_list.remove(task_obj)
            self.active_task_list.add(task_obj)
            task_obj.set_status(STATUS_RUNNING)

    def _clean_up_completed_list(self, task_to_mark_complete):
        for task_obj in task_to_mark_complete:
            self.active_task_list.remove(task_obj)
            self._clean_up_completed(task_obj)

    def _clean_up_completed(self, task):
        logger.debug("_clean_up_completed() task_id={} ".format(task.task_id))
        task.set_status(STATUS_COMPLETED)
        if task.use_tpu:
            self.tpu_resource.release(task.tpu_name)
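
Finally, a minimal usage sketch: start the scheduler thread, then enqueue a task. The Task constructor is not shown above, so the fields set here (process_name, env, use_tpu) are assumptions based on the attributes the class reads.

# Hypothetical usage sketch; Task construction details are assumptions.
if __name__ == "__main__":
    executer = Executer()
    executer.run()  # starts the polling _thread as a daemon

    task = Task()  # placeholder construction
    task.process_name = "run_train.sh"  # placeholder command
    task.env = None
    task.use_tpu = True
    executer.add_task_to_schedule(task)

    while True:
        time.sleep(60)  # keep the main thread alive; _thread is a daemon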