Esempio n. 1
0
 def _run(self, cohort_config):
     if "query" not in cohort_config:
         raise ValueError(
             dedent("""
         Section: cohort_config -
         key 'query' not found. You must define a cohort query."""))
     query = cohort_config["query"]
     if "{as_of_date}" not in query:
         raise ValueError(
             dedent("""
         Section: cohort_config -
         If 'query' is used as cohort_config,
         {as_of_date} must be present"""))
     if 'name' in cohort_config and not string_is_tablesafe(
             cohort_config['name']):
         raise ValueError(
             "Section: cohort_config - "
             "name should only contain letters, numbers, and underscores")
     dated_query = query.replace("{as_of_date}", "2016-01-01")
     logging.info("Validating cohort query")
     try:
         self.db_engine.execute(f"explain {dated_query}")
     except Exception as e:
         raise ValueError(
             dedent(f"""
             Section: cohort_config -
             given query can not be run with a sample as_of_date .
             query: "{query}"
             Full error: {e}"""))
Esempio n. 2
0
    def _run(self, label_config):
        """Validate the label section of a configuration.

        The section must be non-empty and contain exactly one of the keys
        'query' or 'filepath'. After ``load_query_if_needed`` has processed
        the config (presumably loading the query text from 'filepath' --
        confirm against that helper), an optional 'name' must pass
        ``string_is_tablesafe``, and the query and the
        'include_missing_labels_in_train_as' value are handed to the
        dedicated validators on this object.

        Args:
            label_config: mapping holding the label configuration.

        Raises:
            ValueError: if the section is empty, does not contain exactly
                one of 'query'/'filepath', or has a name that is not
                table-safe.
        """
        logger.spam("Validating label configuration")
        if not label_config:
            raise ValueError(
                dedent(
                    """
            Section: label_config -
            Section not found. You must define a label config."""
                )
            )

        query_sources = {"query", "filepath"}.intersection(label_config.keys())
        if len(query_sources) != 1:
            # f-string so the offending keys actually appear in the message
            # instead of the literal placeholder text.
            raise ValueError(
                dedent(
                    f"""
            Section: label_config -
            keys ({list(label_config.keys())}) do not contain exactly one of 'filepath'
            or 'query'. You must pass a filepath to a label query or include one
            in the config."""
                )
            )
        label_config = load_query_if_needed(label_config)
        if "name" in label_config and not string_is_tablesafe(label_config["name"]):
            raise ValueError(
                "Section: label_config - "
                "name should only contain lowercase letters, numbers, and underscores"
            )
        self._validate_query(label_config["query"])
        self._validate_include_missing_labels_in_train_as(
            label_config.get("include_missing_labels_in_train_as", None)
        )
        logger.debug("Validation of label configuration was successful")
Esempio n. 3
0
    def _validate_keys(self, aggregation_config):
        """Validate the keys of a single feature aggregation config.

        Requires the keys 'from_obj', 'intervals', 'knowledge_date_column'
        and 'prefix', and requires the prefix to pass
        ``string_is_tablesafe``. A 'groups' key is tolerated only when it
        equals ``[self.entity_id_column]``, in which case a warning is
        logged; any other value is rejected.

        Args:
            aggregation_config: mapping describing one feature aggregation.

        Raises:
            ValueError: if a required key is missing, the prefix is not
                table-safe, or 'groups' names anything other than the
                entity id column.
        """
        logger.spam("Validating feature aggregation keys")
        required_keys = (
            "from_obj",
            "intervals",
            "knowledge_date_column",
            "prefix",
        )
        for key in required_keys:
            if key not in aggregation_config:
                raise ValueError(
                    dedent(
                        """
                Section: feature_aggregations -
                '{}' required as key: aggregation config: {}""".format(
                            key, aggregation_config
                        )
                    )
                )
        # The literals below open with a line continuation so every line
        # shares the same indentation; otherwise dedent() has no common
        # margin to strip and the source indentation leaks into the message.
        if not string_is_tablesafe(aggregation_config["prefix"]):
            raise ValueError(
                dedent(
                    f"""\
                    Section: feature_aggregations -
                    Feature aggregation prefix should only contain
                    lowercase letters, numbers, and underscores.
                    Aggregation config: {aggregation_config}
                    """
                )
            )
        if "groups" in aggregation_config:
            if aggregation_config["groups"] != [self.entity_id_column]:
                raise ValueError(
                    dedent(
                        """\
                        Specifying groupings for feature aggregation is
                        not supported. Features can only be grouped at the
                        entity_id level."""
                    )
                )
            logger.warning(
                dedent(
                    """\
                    Specifying groupings for feature aggregation is
                    not supported. In the future, please exclude this key
                    from your feature configuration."""
                )
            )

        logger.debug("Validation of feature aggregation keys was successful")
Esempio n. 4
0
 def _run(self, cohort_config):
     """Validate the cohort section of a configuration.

     The section must contain exactly one of the keys 'query' or
     'filepath'. After ``load_query_if_needed`` has processed the config
     (presumably loading the query text from 'filepath' -- confirm against
     that helper), the query must contain the ``{as_of_date}`` placeholder,
     an optional 'name' must pass ``string_is_tablesafe``, and the query
     (with a sample date substituted) must be accepted by
     ``self.db_engine`` when prefixed with ``explain``.

     Args:
         cohort_config: mapping holding the cohort configuration.

     Raises:
         ValueError: if any of the checks fails. When the ``explain``
             statement fails, the underlying error is attached as
             ``__cause__``.
     """
     logger.spam("Validating of cohort configuration")
     query_sources = {"query", "filepath"}.intersection(cohort_config.keys())
     if len(query_sources) != 1:
         # f-string so the offending keys actually appear in the message
         # instead of the literal placeholder text.
         raise ValueError(
             dedent(
                 f"""
         Section: cohort_config -
         keys ({list(cohort_config.keys())}) do not contain exactly one of 'filepath'
         or 'query'. You must pass a filepath to a cohort query or include one
         in the config."""
             )
         )
     cohort_config = load_query_if_needed(cohort_config)
     query = cohort_config["query"]
     # Deliberately a plain string below: "{as_of_date}" is the literal
     # placeholder text, not something to interpolate.
     if "{as_of_date}" not in query:
         raise ValueError(
             dedent(
                 """
         Section: cohort_config -
         If 'query' is used as cohort_config,
         {as_of_date} must be present"""
             )
         )
     if "name" in cohort_config and not string_is_tablesafe(cohort_config["name"]):
         raise ValueError(
             "Section: cohort_config - "
             "name should only contain lowercase letters, numbers, and underscores"
         )
     # Any concrete date will do; the statement is only explained.
     dated_query = query.replace("{as_of_date}", "2016-01-01")
     logger.spam("Validating cohort query via SQL EXPLAIN")
     try:
         self.db_engine.execute(f"explain {dated_query}")
     except Exception as e:
         raise ValueError(
             dedent(
                 f"""
             Section: cohort_config -
             given query can not be run with a sample as_of_date .
             query: "{query}"
             Full error: {e}"""
             )
         ) from e
     else:
         # Kept outside the try body so that a logging failure is not
         # misreported as a query failure.
         logger.debug("Validation of cohort query was successful")
     logger.debug("Validation of cohort configuration was successful")
Esempio n. 5
0
    def _run(self, label_config):
        if not label_config:
            raise ValueError(
                dedent("""
            Section: label_config -
            Section not found. You must define a label config."""))

        if "query" not in label_config:
            raise ValueError(
                dedent("""
            Section: label_config -
            key 'query' not found. You must define a label query."""))
        if 'name' in label_config and not string_is_tablesafe(
                label_config['name']):
            raise ValueError(
                "Section: label_config - "
                "name should only contain letters, numbers, and underscores")
        self._validate_query(label_config["query"])
        self._validate_include_missing_labels_in_train_as(
            label_config.get("include_missing_labels_in_train_as", None))
Esempio n. 6
0
 def _validate_keys(self, aggregation_config):
     for key in [
             "from_obj",
             "intervals",
             "groups",
             "knowledge_date_column",
             "prefix",
     ]:
         if key not in aggregation_config:
             raise ValueError(
                 dedent("""
             Section: feature_aggregations -
             '{} required as key: aggregation config: {}""".format(
                     key, aggregation_config)))
             if not string_is_tablesafe(aggregation_config['prefix']):
                 raise ValueError(
                     dedent(f"""Section: feature_aggregations -
                         Feature aggregation prefix should only contain
                         lowercase letters, numbers, and underscores.
                         Aggregation config: {aggregation_config}
                         """))
Esempio n. 7
0
def test_string_is_not_tablesafe(s):
    """Check that ``string_is_tablesafe`` gives a falsy result for ``s``."""
    verdict = string_is_tablesafe(s)
    assert not verdict
Esempio n. 8
0
    def _run(self, scoring_config):
        """Validate the scoring section of a configuration.

        Logs a warning when 'testing_metric_groups' or
        'training_metric_groups' is absent. Every metric named in either
        group must be a key of
        ``catwalk.evaluation.ModelEvaluator.available_metrics`` and the
        looked-up metric must have a ``greater_is_better`` attribute. Each
        entry of an optional 'subsets' list must have a 'query' and a
        table-safe 'name', and its query must contain both the
        ``{as_of_date}`` placeholder and the text ``entity_id``.

        Subsets are validated once, after both metric groups, because the
        check does not depend on the metric group.

        Args:
            scoring_config: mapping holding the scoring configuration.

        Raises:
            ValueError: if a metric is unavailable or lacks
                ``greater_is_better``, or if a subset definition is
                incomplete or malformed.
        """
        logger.spam("Validating scoring configuration")
        if "testing_metric_groups" not in scoring_config:
            logger.warning(
                "Section: scoring - No testing_metric_groups configured. "
                "Your experiment may run, but you will not have any "
                "evaluation metrics computed"
            )
        if "training_metric_groups" not in scoring_config:
            logger.warning(
                "Section: scoring - No training_metric_groups configured. "
                "If training set evaluation metrics are desired, they must be added"
            )
        metric_lookup = catwalk.evaluation.ModelEvaluator.available_metrics
        available_metrics = set(metric_lookup.keys())
        # The multi-line literals below open with a line continuation so
        # every line shares the same indentation; otherwise dedent() has no
        # common margin to strip.
        for group in ("testing_metric_groups", "training_metric_groups"):
            for metric_group in scoring_config.get(group, {}):
                given_metrics = set(metric_group["metrics"])
                bad_metrics = given_metrics - available_metrics
                if bad_metrics:
                    raise ValueError(
                        dedent("""\
                        Section: scoring -
                        The following given metrics '{}' are unavailable.
                        Available metrics are: '{}'
                        """.format(bad_metrics, available_metrics)))
                for given_metric in given_metrics:
                    metric_function = metric_lookup[given_metric]
                    if not hasattr(metric_function, "greater_is_better"):
                        raise ValueError(
                            dedent("""\
                        Section: scoring -
                        The metric {} does not define the attribute
                        'greater_is_better'. This can only be fixed in the catwalk.metrics
                        module. If you still would like to use this metric, consider
                        submitting a pull request""".format(given_metric)))

        for subset in scoring_config.get("subsets", []):
            # 1. Validate that all required keys are present
            if "query" not in subset:
                raise ValueError(
                    dedent(f"""\
                        Section: subsets -
                        The subset {subset} does not have a query key.
                        To run evaluations on a subset, you must
                        include a query that returns a list of distinct
                        entity_ids and has a placeholder for an
                        as_of_date
                        """))
            if "name" not in subset:
                raise ValueError(
                    dedent(f"""\
                        Section: subsets -
                        The subset {subset} does not have a name key.
                        Please give a name to your subset. This is used
                        in the namespacing of subset tables created by
                        triage.
                        """))
            if not string_is_tablesafe(subset['name']):
                raise ValueError(
                    dedent(f"""\
                        Section: subsets -
                        The subset {subset} name should only contain
                        lowercase letters, numbers, and underscores
                        """))

            # 2. Validate that query conforms to the expectations
            if "{as_of_date}" not in subset["query"]:
                raise ValueError(
                    dedent(f"""\
                        Section: subsets -
                        The subset query {subset["query"]} must
                        include a placeholder for the as_of_date
                        """))
            if "entity_id" not in subset["query"]:
                raise ValueError(
                    dedent(f"""\
                        The subset query {subset["query"]} must
                        return a list of distinct entity_ids
                        """))

        logger.debug("Validation of scoring configuration was successful")