Esempio n. 1
0
    def check_metadata(self):
        """Validate dependencies among the keys of the metadata section.

        Nothing is checked when ``self.config_metadata`` is empty or None.
        Otherwise:

        * ``traverser``: the named class must be instantiable through
          ``TraverserFactory``.
        * ``data_object.read_from_cache``: when enabled, ``read_filename`` must
          be set and must point at an existing path.
        * ``data_object.write_to_cache``: when enabled, ``write_filename`` must
          be set.

        A cache flag counts as enabled when it is a truthy non-string value, or
        a string equal to "true" (case-insensitive). In particular the string
        "false" disables the check rather than being treated as truthy.

        Raises:
            Exception: if instantiating the traverser fails with a KeyError
            ConfigurationError: if a cache flag is enabled without a usable filename

        """

        def is_enabled(flag):
            # Any non-empty string is truthy, so strings are judged by their text.
            if isinstance(flag, str):
                return flag.strip().lower() == "true"
            return bool(flag)

        if not self.config_metadata:
            return

        if "traverser" in self.config_metadata:
            classname = self.config_metadata["traverser"]
            try:
                TraverserFactory().instantiate(classname, self)
            except KeyError as err:
                raise Exception(
                    classname + " is not a valid and/or registered Traverser"
                ) from err

        if "data_object" not in self.config_metadata:
            return

        cfg = self.config_metadata["data_object"]

        if "read_from_cache" in cfg and is_enabled(cfg["read_from_cache"]):

            if "read_filename" not in cfg:
                raise ConfigurationError(
                    "metadata.data_object: if read_from_cache==true, you must set 'read_filename'"
                )

            # just check path exists but not that one can read into a DataObject
            if not os.path.exists(cfg["read_filename"]):
                raise ConfigurationError(
                    "Invalid metadata.data_object.read_filename: "
                    + str(cfg["read_filename"])
                )

        if "write_to_cache" in cfg and is_enabled(cfg["write_to_cache"]):

            if "write_filename" not in cfg:
                raise ConfigurationError(
                    "metadata.data_object: if write_to_cache==true, you must set 'write_filename'"
                )
Esempio n. 2
0
    def perform_any_config_fragment_substitution(config_str):
        """Render a configuration string as a template and return the result.

        ``config_str`` is compiled by a template environment (presumably Jinja2,
        judging by the names used here) whose file loader searches the current
        directory (".") and the filesystem root ("/"). An ``env_override``
        filter is registered on that environment: given a value and a key, it
        yields the environment variable named by the key when it is set, and
        the value otherwise.

        Args:
            config_str (str): content of some configuration file that may or may not contain template expressions

        Returns:
            config_str (str): the rendered configuration string

        Raises:
            ConfigurationError: if a template file referenced by the configuration cannot be found

        """
        def env_override(value, key):
            # the environment variable wins; `value` is only the fallback
            return os.getenv(key, value)

        environment = Environment(loader=FileSystemLoader([".", "/"]))
        environment.filters["env_override"] = env_override

        try:
            rendered = environment.from_string(config_str).render()
        except TemplateNotFound as missing:
            filenames = str(missing)
            raise ConfigurationError(
                f"Substitution files do not exist: {filenames}")

        return rendered
Esempio n. 3
0
    def check_sections(self):
        """Verify that every section of the implementation config is allowed.

        When the metadata declares a non-empty ``section_registry``, the set of
        implementation sections must equal the registered set. Without such a
        registry, every section must be one of the ``OperationType`` values.

        Raises:
            ConfigurationError: if a registered section is missing from the implementation,
            if an implementation section is missing from the registry, or, when no
            registry is declared, if a section is not a supported operation

        """
        metadata = self.config_metadata
        uses_registry = (
            bool(metadata)
            and "section_registry" in metadata
            and len(metadata["section_registry"]) > 0
        )

        if not uses_registry:
            # no user-supplied registry: fall back to the default operations
            allowed = OperationType.values()
            for name in self.config.keys():
                if name not in allowed:
                    raise ConfigurationError("Unspported operation: %s" % name)
            logging.info("OK: all sections are supported operations")
            return

        implemented = set(self.config.keys())
        registered = set(metadata["section_registry"])

        # registered sections that the implementation does not provide
        missing = registered.difference(implemented)
        if len(missing) > 0:
            raise ConfigurationError(
                "Following sections from metadata were not found implementation: "
                + str(missing)
            )

        # implementation sections that the registry does not know about
        unregistered = implemented.difference(registered)
        if len(unregistered) > 0:
            raise ConfigurationError(
                "Following sections from implementation were not found in metadata: "
                + str(unregistered)
            )

        logging.info("OK: section_registry sections match implementation sections")
Esempio n. 4
0
    def check_node_exists(node_names, key):
        """Ensure that a destination name refers to a known node.

        Args:
            node_names (list): collection of node names to search
            key (str): name of the node that must be present

        Raises:
            ConfigurationError: if key is not contained in node_names

        """
        if key in node_names:
            return
        raise ConfigurationError("Destination %s does not exist" % key)
Esempio n. 5
0
    def upstream_keys(self, instance_name):
        """Return the names of the nodes that feed directly into a given node.

        Args:
            instance_name (str): name of the node whose predecessors are wanted

        Returns:
            list of predecessor node names as reported by ``self.G2``

        Raises:
            ConfigurationError: if instance_name is not a node of ``self.G2``

        """
        if self.G2.has_node(instance_name):
            return [*self.G2.predecessors(instance_name)]
        raise ConfigurationError("Node not found in the DAG: %s" %
                                 instance_name)
Esempio n. 6
0
    def check_connected_components(self):
        """Verify that the graph ``self.G`` has at most one connected component.

        Raises:
            ConfigurationError: if multiple connected components are found

        """
        # Materialize the components once: they are needed both for the count
        # and, on failure, for the error message.
        components = list(nx.connected_components(self.G))
        if len(components) > 1:
            raise ConfigurationError(
                "Found multiple connected components: %s" %
                str(components))
        logging.info("OK: 1 connected component")
Esempio n. 7
0
    def check_for_cycles(self):
        """Fail if the directed graph ``self.G2`` contains a cycle.

        Raises:
            ConfigurationError: if a cycle is found

        """
        try:
            found = nx.find_cycle(self.G2)
        except nx.exception.NetworkXNoCycle as no_cycle:
            # the library signals "no cycle" by raising, which is the good case here
            if str(no_cycle) == "No cycle found.":
                logging.info("OK: no cycles found")
                return
            raise no_cycle  # pragma: no cover
        else:
            logging.info("cycles:" + str(found))

        # reaching this point means find_cycle returned without raising
        if found:
            raise ConfigurationError("Cycle(s) found: %s" % str(found))
Esempio n. 8
0
    def dict_raise_on_duplicates(self, ordered_pairs):
        """Build a dict from key/value pairs, rejecting repeated keys.

        Intended as an ``object_pairs_hook`` for a JSON loader, so that a
        duplicate key is an error instead of silently overwriting the earlier
        value.

        Args:
            ordered_pairs (list): list of (key, value) pairs from the config \
            Example: ordered_pairs `[('class', 'CsvReader'), ('filename', 'data/tennis.csv'), ('destinations', ['write_output'])]` \
            ordered_pairs `[('read_data', {'class': 'CsvReader', 'filename': 'data/tennis.csv', 'destinations': ['write_output']})]`

        Returns:
            dictionary (dict): the pairs as a dictionary, in their original order

        Raises:
            ConfigurationError: on the first key that appears more than once

        """
        # https://stackoverflow.com/questions/14902299/json-loads-allows-duplicate-keys-in-a-dictionary-overwriting-the-first-value

        collected = {}
        for name, value in ordered_pairs:
            if name not in collected:
                collected[name] = value
                continue
            raise ConfigurationError("duplicate key: %r" % (name, ))
        return collected
Esempio n. 9
0
    def _parse_config(self):
        """Expose each top-level section of ``self.config`` as an attribute.

        Note:
            Every key of ``self.config`` becomes an attribute of this object
            whose value is that section's dictionary. While doing so, the keys
            inside the sections are checked for uniqueness across all sections.

        Returns:
            Nothing, side effect is that top level keys are added as config attributes

        Raises:
            ConfigurationError: on the first key that occurs in more than one section

        """
        # Uniqueness of the top-level keys is expected to be handled when the
        # configuration is loaded; here the next level down is checked and the
        # first duplicate found is reported.
        seen_names = set()
        for section_name in self.config.keys():
            section = self.config[section_name]
            setattr(self, section_name, section)
            for operation_name in section.keys():
                if operation_name not in seen_names:
                    seen_names.add(operation_name)
                    continue
                raise ConfigurationError(
                    "Operations must all have unique names in the configuration. Duplicate key: '%s'"
                    % operation_name)
Esempio n. 10
0
    def __init__(self,
                 config_location,
                 is_dict_config=False,
                 dict_config=None):
        """Read in a configuration, validate it and populate this object.

        The configuration comes either from a file, parsed according to its
        extension (".json", or ".yaml"/".yml"), or from an in-memory
        dictionary, which is serialized and then parsed like a JSON file. In
        both cases the raw text first goes through
        ``perform_any_config_fragment_substitution``.

        After parsing, the top-level keys are validated, the optional metadata
        section is stored in ``self.config_metadata``, a shallow copy of the
        whole configuration is kept in ``self.complete_config`` and
        ``self.config`` is narrowed to the implementation section. The DAG,
        the configuration string/hash and the creation time are then set up
        before ``_parse_config`` and ``check_config`` run.

        Args:
            config_location (str): valid filepath for file
            is_dict_config (bool): are we passing in a dictionary configuration directly
            dict_config (dict): dictionary object, if is_dict_config

        Raises:
            Exception: if is_dict_config is set but dict_config is None or not a dict,
            or if the configuration file does not exist
            ValueError: if the file extension is not supported or has no parser
            ConfigurationError: on an unsupported top-level key or a missing
            implementation section

        """
        # Record the location on every path so the attribute always exists,
        # including when the configuration is passed in as a dictionary.
        self.config_location = config_location

        if is_dict_config:
            ext = None

            if dict_config is None:
                raise Exception("expected dict_config was None")

            if not isinstance(dict_config, dict):
                raise Exception("did not receive expected dict_config")

            dict_str = jstyleson.dumps(dict_config)

            config_str = Configuration.perform_any_config_fragment_substitution(
                dict_str)

        else:
            logging.info("Loading config file at {}".format(config_location))

            if not os.path.exists(config_location):
                raise Exception(
                    "config file at: {} not found".format(config_location))

            ext = os.path.splitext(config_location)[1].lower()
            if ext not in SUPPORTED_EXTS:
                raise ValueError(
                    "config file at: {} has improper extension type - please use a .json or .yml file"
                    .format(config_location))

            with open(config_location, "r") as f:
                config_str = f.read()

            config_str = Configuration.perform_any_config_fragment_substitution(
                config_str)

        if ext is None or ext == ".json":
            # the hook rejects duplicate keys instead of letting the last one win
            self.config = jstyleson.loads(
                config_str, object_pairs_hook=self.dict_raise_on_duplicates)
        elif ext in [".yaml", ".yml"]:
            # NOTE(review): FullLoader accepts more than plain data; consider
            # yaml.safe_load if configuration files can come from untrusted
            # sources - confirm no config relies on the extra YAML features.
            self.config = yaml.load(config_str, Loader=yaml.FullLoader)
        else:
            # An extension accepted above but without a parser would otherwise
            # leave self.config unset and fail later with an AttributeError.
            raise ValueError(
                "config file at: {} has extension {} for which no parser is available"
                .format(config_location, ext))

        assert isinstance(self.config, dict)

        # check top-level keys
        for k in self.config:
            if k not in ConfigurationSectionType.values():
                msg = "Unsupported top-level key: %s. " % k
                msg += "Supported keys are %s" % str(
                    ConfigurationSectionType.values())
                raise ConfigurationError(msg)

        # metadata section can be optional
        self.config_metadata = None
        if ConfigurationSectionType.METADATA.value in self.config:
            self.config_metadata = self.config[
                ConfigurationSectionType.METADATA.value]

        # implementation_config section is required
        if ConfigurationSectionType.IMPLEMENTATION_CONFIG.value not in self.config:
            raise ConfigurationError(
                "Did not find required top-level key %s" %
                ConfigurationSectionType.IMPLEMENTATION_CONFIG.value)

        # keep a (shallow) copy of the complete configuration
        self.complete_config = self.config.copy()

        # note: config is now just the implementation component of the dictionary
        self.config = self.config[
            ConfigurationSectionType.IMPLEMENTATION_CONFIG.value]

        # store the dag object
        self.dag = ConfigurationDag(self.config)

        # populate configuration file string and hash
        self.config_string, self.config_hash = self._get_configuration_hash()

        # get the formatted time this file was instantiated
        self.config_time = datetime.datetime.now().strftime("%Y%m%d_%H%M")

        # parse the file into an internal config object
        self._parse_config()

        self.check_config()
Esempio n. 11
0
    def check_config(self):
        """Check the configuration as much as we can, as early as we can.

        Steps, in order:

        1. metadata and section checks (``check_metadata``, ``check_sections``)
        2. every node must declare a class key and must not use any of the
           legacy ``destination_*`` keys
        3. every referenced node class must be registered with ``NodeFactory``
           (registration is attempted for classes that are not registered yet)
        4. every node is instantiated and its configuration validated
        5. the DAG checks are run

        Side effects: populates ``self.nodename_to_classname`` (node name to
        class key) and ``self.instance_to_config`` (node name to node config).

        Raises:
            ConfigurationError: if a node has no class key or a node class cannot be registered
            Exception: if a node still uses a legacy ``destination_*`` key
            various exceptions if any of the delegated checks fail

        """
        self.check_metadata()

        self.check_sections()

        self.nodename_to_classname = {}

        unique_class_keys = set()

        self.instance_to_config = {}

        # keys from an older configuration format, replaced by 'destinations'
        legacy_destination_keys = (
            "destination_pipeline",
            "destination_models",
            "destination_postprocesses",
            "destination_writer",
        )

        # check that all child nodes of each section have a Factory.CLASS_KEY field
        for section_key in self.config.keys():

            for child_key in self.config[section_key].keys():

                child = self.config[section_key][child_key]

                self.instance_to_config[child_key] = child

                if NodeFactory.CLASS_KEY not in child:
                    raise ConfigurationError("No class key found in %s.%s" %
                                             (section_key, child_key))

                self.nodename_to_classname[child_key] = child[
                    NodeFactory.CLASS_KEY]

                unique_class_keys.add((child[NodeFactory.CLASS_KEY],
                                       child.get(NodeFactory.CLASS_PREFIX)))

                for k in legacy_destination_keys:
                    if k in child:
                        # interpolate the offending key into the message
                        raise Exception(
                            "Do you have a old config file? You have %s. Nodes just have 'destinations':[] now"
                            % k)

        logging.info("OK: all class keys are present")

        # get class_prefixes by traversing node package
        unique_class_keys = self._traverse_node_package(unique_class_keys)
        unique_nodes = {x[0] for x in unique_class_keys}

        # check that each referenced class is registered in NodeFactory
        # Regex pattern in `_traverse_node_package` should capture the right file (class prefix) to register
        # node (class_key). Here we attempt to register any class that is not already registered.
        for class_key, class_prefix in unique_class_keys:
            if not NodeFactory().is_registered(class_key):
                try:
                    logging.info(f"attempting to register {class_key}")
                    self._register_class(class_key, class_prefix)
                except Exception:
                    # Best effort: another (class_key, prefix) pair may still
                    # register this class; the loop below has the final say.
                    logging.error(
                        f"Cannot register node class {class_key} with prefix {class_prefix}",
                        exc_info=True,
                    )
        for class_key in unique_nodes:
            if not NodeFactory().is_registered(class_key):
                raise ConfigurationError(
                    f"Cannot register node class {class_key}")

        # check necessary_configs
        for instance_name in self.nodename_to_classname:
            class_key = self.nodename_to_classname[instance_name]

            configuration_dict = self.instance_to_config[instance_name]

            instance = NodeFactory().instantiate(class_key, self,
                                                 instance_name)

            NodeFactory().valid_configuration(instance, configuration_dict)

        logging.info("OK: all classes recognized")
        logging.info("OK: good necessary_configs")

        # run our DAG checks. Throws error if not OK
        self.dag.check_dag()
Esempio n. 12
0
    def create_dag(self):
        """Create the graphs describing the configuration.

        Two graphs are built from ``self.config``: ``G`` (an ``nx.Graph``) and
        ``G2`` (an ``nx.DiGraph``). Every node name of every section becomes a
        node in both graphs, and each entry of a node's ``destinations`` list
        is passed to ``ConfigurationDag.add_edge`` together with both graphs.

        Returns:
            nothing. Side effect is to set ``self.G``, ``self.G2``,
            ``self.node_map`` (node name -> section key) and
            ``self.conditional_nodes`` (names of nodes whose class is a
            subclass of ``AbstractConditionalPath``)

        Raises:
            ConfigurationError: if a destination is not a string or does not
            name a node of the configuration

        """
        logging.info("Checking configuration DAG")
        G = nx.Graph()
        G2 = nx.DiGraph()
        node_names = set()
        # NOTE(review): cleanup_nodes and some_postprocess_node are filled in
        # below but not read anywhere in the code visible here - confirm whether
        # the edge-adding "hack" described further down still exists elsewhere.
        cleanup_nodes = set()

        some_postprocess_node = None

        # node name -> section key
        node_map = {}

        self.conditional_nodes = set()

        # first pass: add the nodes to the graphs
        for section_key in self.config.keys():

            for key in self.config[section_key].keys():
                logging.debug("Adding node '%s'" % key)
                G.add_node(key)
                G2.add_node(key)
                node_names.add(key)

                node_map[key] = section_key

                # root out conditional nodes; note that "class" is read
                # unconditionally and looked up in NodeFactory().name_dict
                node_config = self.config[section_key][key]
                node_class = node_config["class"]
                class_obj = NodeFactory().name_dict[node_class]
                if issubclass(class_obj, AbstractConditionalPath):
                    self.conditional_nodes.add(key)

                # cleanup section can be disconnected from rest of graph so let's keep track of these nodes
                if section_key == OperationType.cleanup.value:
                    cleanup_nodes.add(key)

                # hack: the intent is to add an edge from a postprocess step (any one) to cleanup nodes so they
                # are not a separate connected component; here only the first postprocess node is remembered
                if (section_key == OperationType.postprocess.value
                        and some_postprocess_node is None):
                    some_postprocess_node = key

        # second pass: add the edges, now that every node name is known
        for section_key in self.config.keys():

            for key in self.config[section_key].keys():
                d = self.config[section_key][key]

                if "destinations" in d:
                    for destination in d["destinations"]:

                        # destinations must be plain node names
                        if not isinstance(destination, str):
                            raise ConfigurationError(
                                "Unrecognized destination type: %s" %
                                destination)

                        # referential integrity: the destination must be a known node
                        if destination in node_map:
                            ConfigurationDag.add_edge(G, G2, node_names, key,
                                                      destination)
                        else:
                            raise ConfigurationError(
                                "Did not find %s destination in %s.%s" %
                                (destination, section_key, key))

        logging.info("OK: good referential integrity")

        self.G = G
        self.G2 = G2
        self.node_map = node_map