Example 1
    def __init__(self, config_dict=None, config_path=None):

        assert config_dict is None or config_path is None

        # If specified, load a config from the given JSON file. Custom modification to the JSON spec
        # permits lines to be commented out using a '#' character for ease of testing
        if config_path is not None:
            with open(config_path, 'r') as f:
                lines = f.readlines()
                for i, line in enumerate(lines):
                    if '#' in line:
                        lines[i] = line[:line.index('#')]

                config_dict = json.loads(''.join(lines))

        config_dict = config_dict if config_dict is not None else {}

        # Update the default values using the supplied configuration dict
        if not isinstance(config_dict, dict):
            raise ConfigurationError(
                "configuration must be a dictionary of keyword/value pairs")

        # check all the configurations
        for k, v in config_dict.items():

            print("checking for attribute {}..".format(k))
            if not hasattr(self, k):
                raise ConfigurationError(
                    "Unexpected configuration keyword provided - {}:{}".format(
                        k, v))
            setattr(self, k, v)

        # ------------ check against schema --------------
        ConfigFormat().validate_json(config_dict)

        # ---------------------------------------------
        # TODO: #nodes does not pass through the model (set by kronos_executor config for now..)
        if self.model:
            if self.model.get('schedule_generation'):
                self.model['schedule_generation']['synthapp_n_nodes'] = 1
        # ---------------------------------------------

        # if input or output folders do not exist, an error is raised
        if not os.path.exists(self.dir_input):
            raise ConfigurationError(
                "input folder {} - does not exist!".format(self.dir_input))

        if not os.path.exists(self.dir_output):
            raise ConfigurationError(
                "output folder {} - does not exist!".format(self.dir_output))

        # ----------------- logging setup --------------------
        root_logger = logging.getLogger()
        fh = logging.FileHandler(self.kronos_log_file, mode='w')
        fh.setFormatter(logging.Formatter(log_msg_format))
        fh.setLevel(logging.DEBUG if self.verbose else logging.INFO)
        root_logger.addHandler(fh)
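
A minimal usage sketch of the constructor above, assuming the enclosing class is the kronos Config object referenced in Example 10; the file name and the 'verbose' key are illustrative only:

# hypothetical usage: exactly one of config_path / config_dict may be given
config = Config(config_path="kronos_config.json")   # the JSON may contain '#'-commented lines
# alternatively an already-parsed dictionary can be passed:
# config = Config(config_dict={"verbose": True})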
Example 2
def ingest_accounting_logs(path, cfg=None):
    """
    Read PBS logs into a dataset
    """
    if not os.path.exists(path):
        raise ConfigurationError(
            "Specified path to ingest accounting profiles does not exist: {}".
            format(path))

    if not os.path.isfile(path):
        raise ConfigurationError(
            "Specified path for accounting log is not a file")

    jobs = read_accounting_logs(path)

    return PBSDataSet(jobs)
Example 3
def ingest_epcc_csv_logs(path, cfg=None):
    """
    Read EPCC CSV logs into a dataset
    """
    if not os.path.exists(path):
        raise ConfigurationError(
            "Specified path to ingest CSV profiles does not exist: {}".format(
                path))

    if not os.path.isfile(path):
        raise ConfigurationError(
            "Specified path for CSV time_schedule is not a file")

    jobs = read_epcc_csv_logs(path)

    return PBSDataSet(jobs)
Example 4
def step_function(function_config):
    """
    Function that defines a step distribution of values
    :param function_config:
    :return:
    """

    required_config_fields = [
        'x_step',
    ]

    # check that all the required fields are set
    for req_item in required_config_fields:
        if req_item not in function_config.keys():
            raise ConfigurationError(
                "'step_function' requires {} to be specified".format(req_item))

    # x of the step (between 0 and 1)
    x_step = function_config['x_step']

    # default value of x points
    n_values = 6
    eps = 1.0e-6

    # then add two very close points at the step location
    x_values = np.sort(
        np.append(np.linspace(0, 1, n_values), [x_step, x_step + eps]))
    y_values = np.array([float(cc) for cc in np.sign(x_values - x_step) > 0])

    return x_values, y_values
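
A minimal usage sketch of step_function (the step position 0.5 is illustrative):

# hypothetical call: a unit step placed at x = 0.5
x_values, y_values = step_function({'x_step': 0.5})
# x_values: 6 evenly spaced points in [0, 1] plus two nearly coincident points at the step
# y_values: 0.0 up to the step location and 1.0 just after it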
Example 5
def ingest_allinea_profiles(path,
                            cfg=None,
                            jobs_n_bins=None,
                            list_json_files=None,
                            json_label_map=None):
    """
    Does what it says on the tin.
    """
    if not os.path.exists(path):
        raise ConfigurationError(
            "Specified path to ingest Allinea profiles does not exist: {}".
            format(path))

    if not list_json_files:
        if os.path.isdir(path):
            jobs = read_allinea_logs(path, cfg=cfg, jobs_n_bins=jobs_n_bins)
        else:
            jobs = [read_allinea_log(path, cfg=cfg, jobs_n_bins=jobs_n_bins)]
    else:
        jobs = read_allinea_logs(path,
                                 cfg=cfg,
                                 jobs_n_bins=jobs_n_bins,
                                 list_json_files=list_json_files)

    if not jobs:
        raise RuntimeError("No file found")

    return AllineaDataSet(jobs, json_label_map=json_label_map)
Example 6
    def ckeck_config(self):
        """
        Check the configuration
        :return:
        """

        # check the json data against the post-processing config schema
        ExportConfigFormat().validate_json(self._config_dict)

        # check all the configurations
        for k, v in self._config_dict.items():

            if not hasattr(self, k):
                raise ConfigurationError("Unexpected configuration keyword provided - {}:{}".format(k, v))

            # if OK, set this attribute..
            setattr(self, k, v)

        # take the timestamp to be used to archive run folders (if existing)
        out_dir = self._config_dict["output_path"]
        if os.path.exists(out_dir):
            time_stamp_now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            time_stamped_outdir = out_dir + "." + time_stamp_now
            print("Dir: {} already exists!\n..moving it to: {}".format(out_dir, time_stamped_outdir))
            os.rename(out_dir, time_stamped_outdir)
Example 7
def bin_array(t, data, bins_in, mode="sum"):
    """
    Function that returns a binned array (elements that fall within a bin are either summed or averaged)
    :param t:
    :param data:
    :param bins_in: [integer]: number of equally spaced bin edges spanning the t vector
                    [numpy.ndarray]: bin edges fully specified
    :param mode: "sum" or "mean"
    :return:
    """

    eps = 1e-8

    t = np.asarray(t)
    data = np.asarray(data)

    if isinstance(bins_in, int):
        bins = np.linspace(min(t)-eps, max(t)+eps, bins_in)
        t_bins = (bins[1:]+bins[:-1])/2.0
    elif isinstance(bins_in, np.ndarray):
        bins = bins_in.astype(float)  # work on a copy so the caller's array is not modified
        bins[0] -= eps
        bins[-1] += eps
        t_bins = (bins[1:]+bins[:-1])/2.0
    else:
        raise ConfigurationError("bins must be either interger or numpy array!")

    digitized = np.digitize(t, bins)

    # method "sum"
    if mode=="sum":
        bin_values = np.asarray([data[digitized == i].sum() if data[digitized == i].size else 0 for i in range(1, len(bins))])
        bin_values = np.asarray(bin_values)

        # just a check..
        if sum(data)-sum(bin_values) > 1e-10:
            print("different sum! orig: {}, binned: {}".format(sum(data), sum(bin_values)))

    # method "mean"
    elif mode == "mean":
        bin_values = np.asarray([data[digitized == i].mean() if data[digitized == i].size else 0 for i in range(1, len(bins))])
        bin_values = np.asarray(bin_values)

    else:
        raise ConfigurationError("mode must be either sum or mean!")

    return t_bins, bin_values
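
A minimal usage sketch of bin_array (the signal and bin count are illustrative):

t = np.linspace(0.0, 10.0, 101)                       # 101 samples over [0, 10]
data = np.ones_like(t)                                # constant signal
t_bins, sums = bin_array(t, data, 5, mode="sum")      # 5 edges -> 4 bins, each summing ~25
t_bins, means = bin_array(t, data, 5, mode="mean")    # per-bin averages (all 1.0 here)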
Example 8
    def check_export_config(self, export_config, out_path, **kwargs):

        # create the output dir if it does not exist
        if not os.path.exists(out_path):
            os.makedirs(out_path)

        # check that export type is consistent with the class type
        if export_config["type"] != self.export_type:
            raise ConfigurationError("Export type {}, does not match class: {}".format(export_config["type"],
                                                                                       self.__class__.__name__))

        if not self.optional_configs and kwargs:
            raise ConfigurationError("Class: {} does not accept optional config keys!".format(self.__class__.__name__))
        else:
            if not all(k in self.optional_configs for k in kwargs.keys()):
                for k in kwargs.keys():
                    if k not in self.optional_configs:
                        print("Class: {} incompatible with config {}".format(self.__class__.__name__, k))
                raise ConfigurationError("invalid optional config keys passed to class {}".format(self.__class__.__name__))
Example 9
    def check_config(self, config):
        """
        make sure that all the required params
        are set in the config
        :return:
        """
        # check that all the required fields are set
        for req_item in self.required_config_fields:
            if req_item not in config.keys():
                err = "{} requires config {}".format(self.__class__.__name__, req_item)
                raise ConfigurationError(err)
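
A minimal sketch of how a class would use the check above; the class name and field names are hypothetical, only the pattern (a required_config_fields list consulted by check_config) comes from the code, and the project's ConfigurationError is assumed importable:

class ExampleExporter(object):
    # hypothetical exporter: declares the config keys it cannot work without
    required_config_fields = ['type', 'output_path']

    def check_config(self, config):
        # same logic as the method above
        for req_item in self.required_config_fields:
            if req_item not in config.keys():
                raise ConfigurationError("{} requires config {}".format(
                    self.__class__.__name__, req_item))

ExampleExporter().check_config({'type': 'csv', 'output_path': '/tmp/out'})  # passes
ExampleExporter().check_config({'type': 'csv'})  # raises ConfigurationError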
Example 10
    def __init__(self, workload_set, config):

        assert all(isinstance(wl, Workload) for wl in workload_set.workloads)
        assert isinstance(config, Config)

        # check that there is the "model" entry in the config file..
        if not config.model:
            raise ConfigurationError(
                "'model' entry not set in config file, but required!")

        self.config = config
        self.workload_set = workload_set

        # check that all the required fields are set
        for req_item in self.required_config_fields:
            if req_item not in self.config.model.keys():
                raise ConfigurationError("{} requires to specify {}".format(
                    self.__class__.__name__, req_item))
Example 11
    def _apply(self, config):

        # Apply each source workload into each destination workload
        n_job_matched = 0
        n_destination_jobs = 0

        for wl_source_tag in config['source_workloads']:

            try:
                wl_source = next(wl for wl in self.workloads
                                 if wl.tag == wl_source_tag)
            except StopIteration:
                raise ConfigurationError(
                    "Source Workload {} not found".format(wl_source_tag))

            for wl_dest_tag in config['apply_to']:

                try:
                    wl_dest = next(wl for wl in self.workloads
                                   if wl.tag == wl_dest_tag)
                except StopIteration:
                    raise ConfigurationError(
                        "Destination Workload {} not found".format(
                            wl_dest_tag))

                n_destination_jobs += len(wl_dest.jobs)

                n_job_matched += self.apply_lookup_table(
                    wl_source,
                    wl_dest,
                    config['similarity_threshold'],
                    config['priority'],
                    config['keywords'],
                )

        logger.info("jobs matched/destination jobs = [{}/{}]".format(
            n_job_matched, n_destination_jobs))
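
A hypothetical configuration for the lookup step above; only the key names come from the code, while the workload tags, threshold, priority and keywords are illustrative:

lookup_config = {
    "source_workloads": ["operational-jobs"],   # workload tags to read from
    "apply_to": ["research-jobs"],              # workload tags to fill in
    "similarity_threshold": 0.8,                # fuzzy matching below 1.0, exact matching at 1.0
    "priority": 2,
    "keywords": ["label", "queue_type"],        # job attributes compared when matching
}
# self._apply(lookup_config) would then fill jobs of "research-jobs" from "operational-jobs"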
Example 12
    def split_by_keywords(workload, split_config_output):
        """
        Auxiliary internal splitting function
        :param workload:
        :param split_config_output:
        :return:
        """

        # Extract configurations for the splitting
        new_wl_name = split_config_output['create_workload']
        split_attr = split_config_output['split_by']
        kw_include = split_config_output['keywords_in']
        kw_exclude = split_config_output['keywords_out']

        sub_wl_jobs = []
        if kw_include and not kw_exclude:
            for j in workload.jobs:
                if getattr(j, split_attr):
                    if all(kw in getattr(j, split_attr) for kw in kw_include):
                        sub_wl_jobs.append(j)

        elif not kw_include and kw_exclude:
            for j in workload.jobs:
                if getattr(j, split_attr):
                    if not any(kw in getattr(j, split_attr)
                               for kw in kw_exclude):
                        sub_wl_jobs.append(j)

        elif kw_include and kw_exclude:

            sub_wl_jobs = [
                j for j in workload.jobs
                if getattr(j, split_attr)
                and all(kw in getattr(j, split_attr) for kw in kw_include)
                and not any(kw in getattr(j, split_attr) for kw in kw_exclude)
            ]

        else:
            raise ConfigurationError(
                "either included or excluded "
                "keywords are needed for splitting a workload")

        if not sub_wl_jobs:
            logger.error("Workload splitting has produced an empty workload!")

        return Workload(jobs=sub_wl_jobs, tag=new_wl_name)
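
A hypothetical split configuration for the function above; only the key names come from the code, the values are illustrative:

split_config_output = {
    "create_workload": "wl_parallel",   # tag given to the new workload
    "split_by": "label",                # job attribute that is inspected
    "keywords_in": ["mpi"],             # keep jobs whose attribute contains all of these
    "keywords_out": ["test"],           # ...and none of these
}
# new_wl = split_by_keywords(existing_workload, split_config_output)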
Example 13
def custom_function(function_config):
    """
    Function that defines a custom distribution of values
    :param function_config:
    :return:
    """

    required_config_fields = ['x_values', 'y_values']

    # check that all the required fields are set
    for req_item in required_config_fields:
        if req_item not in function_config.keys():
            raise ConfigurationError(
                "'custom_function' requires {} to be specified".format(req_item))

    # user-supplied x and y values of the custom distribution
    x_values = np.array(function_config['x_values'])
    y_values = np.array(function_config['y_values'])

    return x_values, y_values
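
A minimal usage sketch of custom_function (the points are illustrative):

# hypothetical call: a piecewise-defined ramp described by three points
x, y = custom_function({'x_values': [0.0, 0.5, 1.0],
                        'y_values': [0.0, 0.2, 1.0]})
# both are returned as numpy arrays of the supplied points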
Example 14
File: base.py Project: ecmwf/kronos
    def __init__(self,
                 path,
                 recursive=None,
                 file_pattern=None,
                 label_method=None,
                 pool_readers=None):
        self.path = path

        print("Log reader ({})".format(self.log_type_name))

        self.label_method = label_method if label_method is not None else self.label_method
        self.recursive = recursive if recursive is not None else self.recursive
        self.pool_readers = pool_readers if pool_readers is not None else self.pool_readers

        # Some checks
        if self.label_method not in self.available_label_methods:
            raise ConfigurationError(
                "Configuring LogReader with unavailable label method ({})".
                format(self.label_method))

        # Only override the file pattern if it is supplied.
        if file_pattern:
            self.file_pattern = file_pattern
Example 15
    def from_logs_path(cls, ingest_path, ingest_config):
        """
        This method should construct a log reader, read the logs and return an IngestedDataSet.

        If the logs are cached, then those should be read in instead.
        """
        abs_ingest_path = os.path.abspath(os.path.realpath(ingest_path))
        cache_file = "cache.{}".format(base64.b64encode(abs_ingest_path.encode()).decode())  # bytes round-trip keeps this Python-3 safe
        dataset = None

        # Remove reparse from the dictionary, so it is never used to compare validity of cached files.
        print(ingest_config)
        reparse = ingest_config.pop('reparse', False)
        cache = ingest_config.pop('cache', True)

        if not reparse:

            try:
                with open(cache_file, 'rb') as f:
                    print("Using cached data from: {}".format(f.name))
                    dataset = pickle.load(f)

            except (IOError, OSError) as e:
                if e.errno == errno.ENOENT:
                    print("No cache file found for ingest path")
                else:
                    # An actual file read error occurred. Throw back to the user.
                    raise

            if dataset:

                if dataset.ingest_config != ingest_config:
                    logger.info(
                        "Log reader configuration doesn't match cache file")
                    logger.info("Reader: {}".format(ingest_config))
                    logger.info("Cached: {}".format(dataset.ingest_config))
                    logger.info(
                        "Please modify configuration, or delete cache file and try again"
                    )
                    raise ConfigurationError(
                        "Log reader configuration doesn't match cache file")

                if os.path.abspath(os.path.realpath(
                        dataset.ingest_path)) != abs_ingest_path:
                    raise ConfigurationError(
                        "Ingestion path in cache file does not match ingestion path"
                    )

        if dataset is None:

            # Finally read the logs, if that is required
            lr = cls.log_reader_class(ingest_path, **ingest_config)
            dataset = cls(lr.read_logs(), ingest_path, ingest_config)

            # Pickle the object for later rapid loading.
            if cache:
                print("Writing cache file: {}".format(cache_file))
                with open(cache_file, "wb") as f:
                    pickle.dump(dataset, f)

        return dataset
Example 16
    def apply_lookup_table(self, look_up_wl, wl_dest, threshold, priority,
                           match_keywords):
        """
        Uses another workload as lookup table to fill missing job information
        :param look_up_wl:
        :param wl_dest:
        :param threshold:
        :param priority:
        :param match_keywords:
        :return:
        """

        logger.info(
            'Applying look up from workload: {} onto workload: {}'.format(
                look_up_wl.tag, wl_dest.tag))

        assert isinstance(look_up_wl, Workload)
        assert isinstance(threshold, float)
        assert isinstance(priority, int)
        assert isinstance(match_keywords, list)

        n_jobs_replaced = 0

        # apply fuzzy matching logic (threshold < 1.0 means an exact match is not required)
        n_print = 10
        if threshold < 1.0:
            for jj, job in enumerate(wl_dest.jobs):

                pc_scanned = progress_percentage(jj, len(wl_dest.jobs),
                                                 n_print)
                if pc_scanned > 0:
                    print("Scanned {}% of source jobs".format(pc_scanned))

                for lu_job in look_up_wl.jobs:

                    # in case of multiple keys, consider the average matching ratio
                    current_match = 0
                    for kw in match_keywords:
                        if getattr(job, kw) and getattr(lu_job, kw):
                            current_match += SequenceMatcher(
                                lambda x: x in "-_", str(getattr(job, kw)),
                                str(getattr(lu_job, kw))).ratio()
                    current_match /= float(len(match_keywords))
                    # ---------------------------------------------------------------

                    if current_match >= threshold:
                        n_jobs_replaced += 1
                        for tsk in job.timesignals.keys():

                            # if the time series happens to be empty, apply the
                            # cross-over
                            if not job.timesignals[tsk] and lu_job.timesignals[
                                    tsk]:
                                logger.warning(
                                    "job {} has empty time series {}!".format(
                                        job.label, tsk))
                                job.timesignals[tsk] = copy.deepcopy(
                                    lu_job.timesignals[tsk])

                            # if there is no priority associated with the target job,
                            # do the cross-over
                            elif not job.timesignals[
                                    tsk].priority and lu_job.timesignals[tsk]:
                                job.timesignals[tsk] = copy.deepcopy(
                                    lu_job.timesignals[tsk])

                            # if the priority associated with the target job is not
                            # higher than the source priority, do the cross-over
                            elif job.timesignals[
                                    tsk].priority <= priority and lu_job.timesignals[
                                        tsk]:
                                job.timesignals[tsk] = copy.deepcopy(
                                    lu_job.timesignals[tsk])

        # compare directly (much faster..)
        elif threshold == 1:
            for jj, job in enumerate(wl_dest.jobs):

                pc_scanned = progress_percentage(jj, len(wl_dest.jobs),
                                                 n_print)
                if pc_scanned > 0:
                    print("Scanned {}% of source jobs".format(pc_scanned))

                for lu_job in look_up_wl.jobs:

                    if all(
                            getattr(job, kw) == getattr(lu_job, kw)
                            for kw in match_keywords):
                        n_jobs_replaced += 1
                        for tsk in job.timesignals.keys():

                            if not job.timesignals[tsk]:
                                job.timesignals[tsk] = lu_job.timesignals[tsk]
                            elif job.timesignals[
                                    tsk].priority <= priority and lu_job.timesignals[
                                        tsk]:
                                job.timesignals[tsk] = lu_job.timesignals[tsk]
                            else:
                                pass
        else:
            raise ConfigurationError(
                "matching threshold should be in [0,1], provided {} instead".
                format(threshold))

        return n_jobs_replaced