Example #1
    def _get_warnings_text_and_table(self):
        """
        Return a string that lists warnings and includes a rendered
        :py:class:`Table <lab.reports.Table>` with one line for each run
        where an unexplained error occurred.
        """
        if not self.ERROR_ATTRIBUTES:
            logging.critical("The list of error attributes must not be empty.")

        table = reports.Table(title="Unexplained errors")
        table.set_column_order(self.ERROR_ATTRIBUTES)

        wrote_to_slurm_err = any(
            "output-to-slurm.err" in run.get("unexplained_errors", [])
            for run in self.runs.values())

        for run in self.runs.values():
            error_message = tools.get_unexplained_errors_message(run)
            if error_message:
                logging.error(error_message)
                run_dir = run["run_dir"]
                for attr in self.ERROR_ATTRIBUTES:
                    value = run.get(attr, "?")
                    if attr == "unexplained_errors":
                        value = self._format_unexplained_errors(value)
                        # Use formatted value as-is.
                        table.cell_formatters[run_dir][
                            attr] = reports.CellFormatter()
                    table.add_cell(run_dir, attr, value)

        errors = []

        if wrote_to_slurm_err:
            src_dir = self.eval_dir.rstrip("/")[:-len("-eval")]
            slurm_err_file = src_dir + "-grid-steps/slurm.err"
            try:
                slurm_err_content = tools.get_slurm_err_content(src_dir)
            except FileNotFoundError:
                slurm_err_file = "*-grid-steps/slurm.err"
                errors.append(
                    f"There was output to {slurm_err_file}, but the file was missing "
                    f"when this report was made.")
            else:
                slurm_err_content = tools.filter_slurm_err_content(
                    slurm_err_content)
                errors.append(
                    f"There was output to {slurm_err_file}. Below is the output without"
                    f'"memory cg" errors:\n```\n{slurm_err_content}\n```')
            logging.error(f"There was output to {slurm_err_file}.")

        if table:
            errors.append(str(table))

        infai_1_nodes = {f"ase{i:02d}.cluster.bc2.ch" for i in range(1, 25)}
        infai_2_nodes = {f"ase{i:02d}.cluster.bc2.ch" for i in range(31, 55)}
        nodes = self._get_node_names()
        if nodes & infai_1_nodes and nodes & infai_2_nodes:
            errors.append(
                "Report combines runs from infai_1 and infai_2 partitions.")

        return "\n".join(errors)
Example #2
    def _scan_planning_data(self):
        problems = set()
        self.domains = defaultdict(list)
        self.problem_runs = defaultdict(list)
        self.domain_algorithm_runs = defaultdict(list)
        self.runs = {}
        for run in self.props.values():
            domain, problem, algo = (
                run["domain"], run["problem"], run["algorithm"])
            problems.add((domain, problem))
            self.problem_runs[(domain, problem)].append(run)
            self.domain_algorithm_runs[(domain, algo)].append(run)
            self.runs[(domain, problem, algo)] = run
        for domain, problem in problems:
            self.domains[domain].append(problem)

        self.algorithms = self._get_algorithm_order()

        num_unexplained_errors = sum(
            int(bool(tools.get_unexplained_errors_message(run)))
            for run in self.runs.values())
        func = logging.info if num_unexplained_errors == 0 else logging.error
        func("Report contains {num_unexplained_errors} runs with unexplained"
             " errors.".format(**locals()))

        if len(problems) * len(self.algorithms) != len(self.runs):
            logging.warning(
                "Not every algorithm has been run on every task. "
                "However, if you applied a filter this is to be "
                "expected. If not, there might be old properties in the "
                "eval-dir that got included in the report. "
                "Algorithms (%d): %s, problems (%d), domains (%d): %s, runs (%d)"
                % (
                    len(self.algorithms),
                    self.algorithms,
                    len(problems),
                    len(self.domains),
                    list(self.domains.keys()),
                    len(self.runs),
                ))

        # Sort each entry in problem_runs by algorithm.
        algo_to_index = {
            algorithm: index
            for index, algorithm in enumerate(self.algorithms)
        }

        def run_key(run):
            return algo_to_index[run["algorithm"]]

        for problem_runs in self.problem_runs.values():
            problem_runs.sort(key=run_key)

        self.algorithm_info = self._scan_algorithm_info()
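
For reference, a small standalone sketch of the lookup structures this method builds. The run records and their values below are invented; only the (domain, problem, algorithm) keying mirrors the code above:

    from collections import defaultdict

    # Hypothetical run records; real runs carry many more attributes.
    props = {
        "gripper-prob01-lama": {
            "domain": "gripper", "problem": "prob01", "algorithm": "lama"},
        "gripper-prob01-blind": {
            "domain": "gripper", "problem": "prob01", "algorithm": "blind"},
    }

    problem_runs = defaultdict(list)  # (domain, problem) -> [run, ...]
    runs = {}                         # (domain, problem, algorithm) -> run
    for run in props.values():
        problem_runs[(run["domain"], run["problem"])].append(run)
        runs[(run["domain"], run["problem"], run["algorithm"])] = run

    print(len(problem_runs[("gripper", "prob01")]))           # 2
    print(runs[("gripper", "prob01", "blind")]["algorithm"])  # blind
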
Example #3

    def __call__(self,
                 src_dir,
                 eval_dir=None,
                 merge=None,
                 filter=None,
                 **kwargs):
        """
        This method can be used to copy properties from an exp-dir or
        eval-dir into an eval-dir. If the destination eval-dir already
        exists, the data will be merged. This means *src_dir* can either
        be an exp-dir or an eval-dir and *eval_dir* can be a new or
        existing directory.

        We recommend using lab.Experiment.add_fetcher() to add fetchers
        to an experiment. See the method's documentation for a
        description of the parameters.

        """
        if not os.path.isdir(src_dir):
            logging.critical(
                "{} is missing or not a directory".format(src_dir))
        run_filter = tools.RunFilter(filter, **kwargs)

        eval_dir = eval_dir or src_dir.rstrip("/") + "-eval"
        logging.info("Fetching properties from {} to {}".format(
            src_dir, eval_dir))

        if merge is None:
            _check_eval_dir(eval_dir)
        elif merge:
            # No action needed, data will be merged.
            pass
        else:
            tools.remove_path(eval_dir)

        # Load properties in the eval_dir if there are any already.
        combined_props = tools.Properties(os.path.join(eval_dir, "properties"))
        fetch_from_eval_dir = not os.path.exists(
            os.path.join(src_dir, "runs-00001-00100"))
        if fetch_from_eval_dir:
            src_props = tools.Properties(
                filename=os.path.join(src_dir, "properties"))
            run_filter.apply(src_props)
            combined_props.update(src_props)
            logging.info("Fetched properties of {} runs.".format(
                len(src_props)))
        else:
            slurm_err_content = tools.get_slurm_err_content(src_dir)
            if slurm_err_content:
                logging.error("There was output to *-grid-steps/slurm.err")

            new_props = tools.Properties()
            run_dirs = sorted(glob(os.path.join(src_dir, "runs-*-*", "*")))
            total_dirs = len(run_dirs)
            logging.info(
                "Scanning properties from {:d} run directories".format(
                    total_dirs))
            for index, run_dir in enumerate(run_dirs, start=1):
                loglevel = logging.INFO if index % 100 == 0 else logging.DEBUG
                logging.log(loglevel,
                            "Scanning: {:6d}/{:d}".format(index, total_dirs))
                props = self.fetch_dir(run_dir)
                if slurm_err_content:
                    props.add_unexplained_error("output-to-slurm.err")
                id_string = "-".join(props["id"])
                new_props[id_string] = props
            run_filter.apply(new_props)
            combined_props.update(new_props)

        unexplained_errors = 0
        for props in combined_props.values():
            error_message = tools.get_unexplained_errors_message(props)
            if error_message:
                logging.error(error_message)
                unexplained_errors += 1

        tools.makedirs(eval_dir)
        combined_props.write()
        logging.info("Wrote properties file (contains {unexplained_errors} "
                     "runs with unexplained errors).".format(**locals()))
Example #4
    def _get_warnings_text_and_table(self):
        """
        Return a string that lists warnings and includes a rendered
        :py:class:`Table <lab.reports.Table>` with one line for each run
        where an unexplained error occurred.
        """
        if not self.ERROR_ATTRIBUTES:
            logging.critical('The list of error attributes must not be empty.')

        table = reports.Table(title='Unexplained errors')
        table.set_column_order(self.ERROR_ATTRIBUTES)

        wrote_to_slurm_err = any(
            'output-to-slurm.err' in run.get('unexplained_errors', [])
            for run in self.runs.values())

        num_unexplained_errors = 0
        for run in self.runs.values():
            error_message = tools.get_unexplained_errors_message(run)
            if error_message:
                logging.error(error_message)
                num_unexplained_errors += 1
                for attr in self.ERROR_ATTRIBUTES:
                    table.add_cell(run['run_dir'], attr, run.get(attr, '?'))

        if num_unexplained_errors:
            logging.error(
                'There were {num_unexplained_errors} runs with unexplained'
                ' errors.'.format(**locals()))

        errors = []

        if wrote_to_slurm_err:
            src_dir = self.eval_dir.rstrip('/')[:-len('-eval')]
            slurm_err_file = src_dir + '-grid-steps/slurm.err'
            try:
                slurm_err_content = tools.get_slurm_err_content(src_dir)
            except IOError:
                slurm_err_content = (
                    'The slurm.err file was missing while creating the report.'
                )
            else:
                slurm_err_content = tools.filter_slurm_err_content(
                    slurm_err_content)

            logging.error(
                'There was output to {slurm_err_file}.'.format(**locals()))

            errors.append(
                ' Contents of {slurm_err_file} without "memory cg"'
                ' errors:\n```\n{slurm_err_content}\n```'.format(**locals()))

        if table:
            errors.append(str(table))

        infai_1_nodes = set('ase{:02d}.cluster.bc2.ch'.format(i)
                            for i in range(1, 25))
        infai_2_nodes = set('ase{:02d}.cluster.bc2.ch'.format(i)
                            for i in range(31, 55))
        nodes = self._get_node_names()
        if nodes & infai_1_nodes and nodes & infai_2_nodes:
            errors.append(
                'Report combines runs from infai_1 and infai_2 partitions.')

        return '\n'.join(errors)