def test_prod_staging_same(self) -> None:
    """Verifies that prod.yaml differs from staging.yaml only in the expected values.

    The only tolerated differences are the RECIDIVIZ_ENV environment variable and
    the Cloud SQL instance names, and the latter must be derivable from the
    staging names through a fixed set of substitutions.
    """
    staging_config = YAMLDict.from_path(self.path_for_build_file("staging.yaml"))
    prod_config = YAMLDict.from_path(self.path_for_build_file("prod.yaml"))

    config_diff = deepdiff.DeepDiff(staging_config.get(), prod_config.get())

    # The environment name is allowed to differ between the two files
    env_change = config_diff["values_changed"].pop(
        "root['env_variables']['RECIDIVIZ_ENV']"
    )
    self.assertEqual({"new_value": "production", "old_value": "staging"}, env_change)

    # The Cloud SQL instance names differ as well, but must follow the same pattern
    instances_change = config_diff["values_changed"].pop(
        "root['beta_settings']['cloud_sql_instances']"
    )
    staging_instances: str = instances_change["old_value"]
    prod_instances = instances_change["new_value"]

    # Substitutions, applied in order, that turn the staging names into prod names
    staging_to_prod_substitutions = (
        ("recidiviz-staging", "recidiviz-123"),  # Staging project becomes production
        ("dev-", "prod-"),  # Dev prefix becomes prod
        ("-0af0a", ""),  # Development case triage suffix is dropped
    )
    expected_prod_instances = staging_instances
    for staging_fragment, prod_fragment in staging_to_prod_substitutions:
        expected_prod_instances = expected_prod_instances.replace(
            staging_fragment, prod_fragment
        )
    self.assertEqual(expected_prod_instances, prod_instances)

    # Nothing else may have changed value between the two files
    self.assertFalse(config_diff.pop("values_changed"))

    # ...and no other kind of difference (added / removed items) may remain
    self.assertFalse(config_diff)
def _get_raw_data_file_configs(self) -> Dict[str, DirectIngestRawFileConfig]:
    """Builds the raw file configs for this region, keyed by file tag.

    Every entry directly inside |self.yaml_config_file_dir| that is neither the
    region's default yaml file nor a directory is parsed into a
    DirectIngestRawFileConfig, using the encoding and separator read from the
    default file.

    Raises:
        ValueError: if the config directory or the default file is missing, if
            a yaml file has an empty file_tag, if a file's name does not match
            its file_tag, or if two files declare the same file_tag.
    """
    config_dir = self.yaml_config_file_dir
    if not os.path.isdir(config_dir):
        raise ValueError(f"Missing raw data configs for region: {self.region_code}")

    default_filename = f"{self.region_code}_default.yaml"
    default_path = os.path.join(config_dir, default_filename)
    if not os.path.exists(default_path):
        raise ValueError(
            f"Missing default raw data configs for region: {self.region_code}"
        )

    defaults = YAMLDict.from_path(default_path)
    default_encoding = defaults.pop("default_encoding", str)
    default_separator = defaults.pop("default_separator", str)

    configs_by_tag: Dict[str, DirectIngestRawFileConfig] = {}
    for entry_name in os.listdir(config_dir):
        entry_path = os.path.join(config_dir, entry_name)
        # The default file was handled above; nested directories are ignored
        if entry_name == default_filename or os.path.isdir(entry_path):
            continue

        entry_contents = YAMLDict.from_path(entry_path)
        tag = entry_contents.pop("file_tag", str)

        if not tag:
            raise ValueError(f"Missing file_tag in [{entry_path}]")

        # Each config file must be named after the tag it declares
        expected_name = f"{self.region_code.lower()}_{tag}.yaml"
        if entry_name != expected_name:
            raise ValueError(
                f"Mismatched file_tag [{tag}] and filename [{entry_name}]"
                f" in [{entry_path}]"
            )

        if tag in configs_by_tag:
            raise ValueError(
                f"Found file tag [{tag}] in [{entry_path}]"
                f" that is already defined in another yaml file."
            )

        configs_by_tag[tag] = DirectIngestRawFileConfig.from_yaml_dict(
            self.region_code,
            tag,
            entry_path,
            default_encoding,
            default_separator,
            entry_contents,
            entry_name,
        )

    return configs_by_tag
def get_model_params(cls, yaml_file_path: str) -> Dict[str, Any]:
    """Reads the yaml file at |yaml_file_path| and assembles the model parameters.

    The returned dictionary has the keys reference_year, time_step,
    disaggregation_axes, data_inputs_raw, user_inputs_raw,
    compartments_architecture and compartment_costs.
    """
    yaml_inputs = YAMLDict.from_path(yaml_file_path)
    cls._check_valid_yaml_inputs(yaml_inputs)

    # Dict literal entries are evaluated top to bottom, so the scalar inputs are
    # popped from the yaml in exactly the order listed here.
    model_params: Dict[str, Any] = {
        "reference_year": yaml_inputs.pop("reference_date", float),
        "time_step": yaml_inputs.pop("time_step", float),
        "disaggregation_axes": yaml_inputs.pop("disaggregation_axes", list),
    }

    model_params["data_inputs_raw"] = cls._get_valid_data_inputs(yaml_inputs)

    # The user inputs helper receives the parameters gathered so far
    model_params["user_inputs_raw"] = cls._get_user_inputs(yaml_inputs, model_params)

    architecture, costs = cls._get_valid_compartments(yaml_inputs)
    model_params["compartments_architecture"] = architecture
    model_params["compartment_costs"] = costs

    return model_params
def _pipeline_regions_by_job_name() -> Dict[str, str]:
    """Parses the production_calculation_pipeline_templates.yaml config file to
    determine which region a pipeline should be run in.

    Returns:
        A dictionary mapping the job_name of every daily and historical pipeline
        to that pipeline's region. If a job_name appears in both groups, the
        region of the historical pipeline is the one kept.
    """
    # Read and parse the templates file a single time; both pipeline groups are
    # then popped off the same parsed contents.
    templates = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH)
    daily_pipelines = templates.pop_dicts("daily_pipelines")
    historical_pipelines = templates.pop_dicts("historical_pipelines")

    # Daily pipelines are visited first so that historical entries win on a
    # duplicate job_name, as later keys overwrite earlier ones.
    return {
        pipeline.pop("job_name", str): pipeline.pop("region", str)
        for pipeline_group in (daily_pipelines, historical_pipelines)
        for pipeline in pipeline_group
    }
def _get_month_range_for_metric_and_state() -> Dict[str, Dict[str, int]]:
    """Determines the maximum number of months that each metric is calculated
    regularly for each state.

    Returns a dictionary in the format: {
        metric_table: {
            state_code: int,
            state_code: int
        }
    }
    where the int values are the number of months for which the metric is
    regularly calculated for that state.

    Raises:
        KeyError: if a pipeline lists a metric type that has no table in
            dataflow_config.DATAFLOW_TABLES_TO_METRIC_TYPES.
    """
    # Map metric type enum values to the corresponding tables in BigQuery
    metric_type_to_table: Dict[str, str] = {
        metric_type.value: table
        for table, metric_type in dataflow_config.DATAFLOW_TABLES_TO_METRIC_TYPES.items()
    }

    all_pipelines = YAMLDict.from_path(dataflow_config.PRODUCTION_TEMPLATES_PATH)
    daily_pipelines = all_pipelines.pop_dicts("daily_pipelines")
    historical_pipelines = all_pipelines.pop_dicts("historical_pipelines")

    # Dict with the format: {metric_table: {state_code: int}}
    month_range_for_metric_and_state: Dict[str, Dict[str, int]] = defaultdict(
        lambda: defaultdict(int)
    )

    for pipeline_config_group in (daily_pipelines, historical_pipelines):
        for pipeline_config in pipeline_config_group:
            if (
                pipeline_config.pop("pipeline", str)
                in dataflow_config.ALWAYS_UNBOUNDED_DATE_PIPELINES
            ):
                # This pipeline is always run in full, and is handled separately
                continue

            metrics = pipeline_config.pop("metric_types", str)
            calculation_month_count = pipeline_config.pop(
                "calculation_month_count", int
            )
            state_code = pipeline_config.pop("state_code", str)

            # Split on any run of whitespace so that repeated spaces or a
            # trailing newline in the yaml value do not produce empty or
            # malformed metric names that would fail the table lookup below.
            for metric in metrics.split():
                months_by_state = month_range_for_metric_and_state[
                    metric_type_to_table[metric]
                ]
                # Missing states start at 0 through the inner defaultdict(int)
                months_by_state[state_code] = max(
                    months_by_state[state_code], calculation_month_count
                )

    return month_range_for_metric_and_state
def default_config(self) -> DirectIngestRawFileDefaultConfig:
    """Loads the default raw file settings from this region's default yaml file.

    The file is expected at |self.yaml_config_file_dir| under the name
    "<lowercased region code>_default.yaml".

    Raises:
        ValueError: if no file exists at that path.
    """
    filename = f"{self.region_code.lower()}_default.yaml"
    path = os.path.join(self.yaml_config_file_dir, filename)

    if not os.path.exists(path):
        raise ValueError(
            f"Missing default raw data configs for region: {self.region_code}. "
            f"None found at path: [{path}]"
        )

    contents = YAMLDict.from_path(path)

    # Keyword arguments are evaluated in order, so the three values are popped
    # in the sequence encoding, separator, ignore_quotes.
    return DirectIngestRawFileDefaultConfig(
        filename=filename,
        default_encoding=contents.pop("default_encoding", str),
        default_separator=contents.pop("default_separator", str),
        default_ignore_quotes=contents.pop("default_ignore_quotes", bool),
    )
def from_yaml(cls, yaml_path: str) -> "DatasetSchemaInfo":
    """Builds a DatasetSchemaInfo from the yaml file at |yaml_path|.

    The file provides a "dataset" string and a "tables" list, where each table
    has a "name" and a list of "columns". Column names are lowercased and
    sorted on each resulting TableSchemaInfo.
    """
    contents = YAMLDict.from_path(yaml_path)
    dataset_name = contents.pop("dataset", str)

    table_infos: List[TableSchemaInfo] = [
        TableSchemaInfo(
            table_name=table_yaml.pop("name", str),
            columns=sorted(
                column.lower() for column in table_yaml.pop("columns", list)
            ),
        )
        for table_yaml in contents.pop_dicts("tables")
    ]

    return DatasetSchemaInfo(dataset=dataset_name, tables=table_infos)
def from_file(cls, path: str = PRODUCTS_CONFIG_PATH) -> "ProductConfigs":
    """Reads the product config file at |path| and returns an instance of this
    class wrapping one ProductConfig per entry in the file's "products" list."""
    raw_products = YAMLDict.from_path(path).pop("products", list)

    parsed_products = []
    for raw_product in raw_products:
        # Required fields are read first, in the same order as they are passed on
        name = raw_product["name"]
        description = raw_product["description"]
        exports = raw_product["exports"]

        # The per-state section is optional; None marks its absence
        if "states" in raw_product:
            state_configs = [
                ProductStateConfig(
                    state_code=raw_state["state_code"],
                    environment=raw_state["environment"],
                )
                for raw_state in raw_product["states"]
            ]
        else:
            state_configs = None

        parsed_products.append(
            ProductConfig(
                name=name,
                description=description,
                exports=exports,
                states=state_configs,
                environment=raw_product.get("environment"),
                is_state_agnostic=raw_product.get("is_state_agnostic", False),
            )
        )

    return cls(products=parsed_products)
def _validate_yaml(yaml_path: str, uploads: List[Dict[str, Any]]) -> None:
    """Validates the yaml file at |yaml_path| against the expected inputs and |uploads|.

    The yaml must contain exactly the required top-level keys, and every
    disaggregation axis it lists must be a column of each uploaded dataframe,
    except for the upload whose table is "total_population_data_raw".

    Raises:
        ValueError: if required keys are missing, unexpected keys are present,
            or an upload's dataframe lacks one of the disaggregation axes.
    """
    yaml_contents = YAMLDict.from_path(yaml_path)

    # The set of top-level keys must match this set exactly
    expected_keys = {
        "user_inputs",
        "compartments_architecture",
        "reference_date",
        "time_step",
        "data_inputs",
        "disaggregation_axes",
        "per_year_costs",
    }
    actual_keys = set(yaml_contents.keys())

    missing_keys = expected_keys.difference(actual_keys)
    if missing_keys:
        raise ValueError(f"Missing yaml inputs: {missing_keys}")

    extra_keys = actual_keys.difference(expected_keys)
    if extra_keys:
        raise ValueError(f"Unexpected yaml inputs: {extra_keys}")

    # Every disaggregation axis has to be a column of every checked dataframe
    disaggregation_axes = yaml_contents.pop("disaggregation_axes", list)
    for axis in disaggregation_axes:
        for upload in uploads:
            # This table is exempt from the disaggregation check
            if upload["table"] == "total_population_data_raw":
                continue

            uploaded_df = upload["data_df"]
            if axis not in uploaded_df.columns:
                raise ValueError(
                    f"All disagregation axes must be included in the input dataframe columns\n"
                    f"Expected: {disaggregation_axes}, Actual: {uploaded_df.columns}"
                )
def test_travis_yaml_parses(self) -> None:
    """Checks that .travis.yml loads and yields truthy contents."""
    travis_config_path = self.path_for_build_file(".travis.yml")
    parsed_config = YAMLDict.from_path(travis_config_path)
    self.assertTrue(parsed_config.get())
def test_staging_yaml_parses(self) -> None:
    """Checks that staging.yaml loads and yields truthy contents."""
    staging_config_path = self.path_for_build_file("staging.yaml")
    parsed_config = YAMLDict.from_path(staging_config_path)
    self.assertTrue(parsed_config.get())
def compare_dataflow_output_to_sandbox(
    sandbox_dataset_prefix: str,
    job_name_to_compare: str,
    base_output_job_id: str,
    sandbox_output_job_id: str,
    additional_columns_to_compare: List[str],
    allow_overwrite: bool = False,
) -> None:
    """Compares the output for all metrics produced by the daily pipeline job with
    the given |job_name_to_compare| between the output from the
    |base_output_job_id| job in the dataflow_metrics dataset and the output from
    the |sandbox_output_job_id| job in the sandbox dataflow dataset.

    One comparison table per metric is written to the dataset
    "<sandbox_dataset_prefix>_dataflow_comparison_output". Tables that end up
    with no rows are deleted again, and if no tables remain the whole dataset is
    deleted.

    Args:
        sandbox_dataset_prefix: Prefix of the sandbox dataflow dataset and of the
            comparison output dataset.
        job_name_to_compare: job_name of the daily pipeline whose metric types
            are compared.
        base_output_job_id: Passed through to the comparison query builder.
        sandbox_output_job_id: Passed through to the comparison query builder.
        additional_columns_to_compare: Passed through to the comparison query
            builder.
        allow_overwrite: When True, existing tables in a non-empty comparison
            output dataset are deleted before the comparison runs.

    Raises:
        ValueError: if the comparison output dataset is non-empty,
            |allow_overwrite| is False and this module is not running as
            __main__ (when it is, the error is logged and the process exits with
            status 1 instead); or if the matching pipeline has no metric_types.
    """
    bq_client = BigQueryClientImpl()
    sandbox_dataflow_dataset_id = (sandbox_dataset_prefix + "_" + DATAFLOW_METRICS_DATASET)

    sandbox_comparison_output_dataset_id = (sandbox_dataset_prefix + "_dataflow_comparison_output")
    sandbox_comparison_output_dataset_ref = bq_client.dataset_ref_for_id(
        sandbox_comparison_output_dataset_id)

    # Refuse to touch (or, with allow_overwrite, empty out) an output dataset
    # that already contains tables.
    if bq_client.dataset_exists(sandbox_comparison_output_dataset_ref) and any(
            bq_client.list_tables(sandbox_comparison_output_dataset_id)):
        if not allow_overwrite:
            # Script invocations get a logged error and exit code; callers
            # importing this function get an exception instead.
            if __name__ == "__main__":
                logging.error(
                    "Dataset %s already exists in project %s. To overwrite, set --allow_overwrite.",
                    sandbox_comparison_output_dataset_id,
                    bq_client.project_id,
                )
                sys.exit(1)
            else:
                raise ValueError(
                    f"Cannot write comparison output to a non-empty dataset. Please delete tables in dataset: "
                    f"{bq_client.project_id}.{sandbox_comparison_output_dataset_id}."
                )
        else:
            # Clean up the existing tables in the dataset
            for table in bq_client.list_tables(
                    sandbox_comparison_output_dataset_id):
                bq_client.delete_table(table.dataset_id, table.table_id)

    bq_client.create_dataset_if_necessary(
        sandbox_comparison_output_dataset_ref,
        TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS)

    # Pairs of (running query job, table the job writes its comparison rows to)
    query_jobs: List[Tuple[QueryJob, str]] = []

    pipelines = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH).pop_dicts(
        "daily_pipelines")

    # NOTE(review): if no daily pipeline has the requested job_name, no query is
    # started and the function appears to fall through to the "identical"
    # branch below - confirm whether an unknown job name should raise instead.
    for pipeline in pipelines:
        if pipeline.pop("job_name", str) == job_name_to_compare:
            pipeline_metric_types = pipeline.peek_optional("metric_types", str)

            if not pipeline_metric_types:
                raise ValueError(
                    f"Pipeline job {job_name_to_compare} missing required metric_types attribute."
                )

            metric_types_for_comparison = pipeline_metric_types.split()

            # Start one comparison query per metric table whose metric type is
            # listed on the pipeline.
            for metric_class, metric_table in DATAFLOW_METRICS_TO_TABLES.items(
            ):
                metric_type_value = DATAFLOW_TABLES_TO_METRIC_TYPES[
                    metric_table].value

                if metric_type_value in metric_types_for_comparison:
                    comparison_query = _query_for_metric_comparison(
                        bq_client,
                        base_output_job_id,
                        sandbox_output_job_id,
                        sandbox_dataflow_dataset_id,
                        metric_class,
                        metric_table,
                        additional_columns_to_compare,
                    )

                    query_job = bq_client.create_table_from_query_async(
                        dataset_id=sandbox_comparison_output_dataset_id,
                        table_id=metric_table,
                        query=comparison_query,
                        overwrite=True,
                    )

                    # Add query job to the list of running jobs
                    query_jobs.append((query_job, metric_table))

    for query_job, output_table_id in query_jobs:
        # Wait for the insert job to complete before looking for the table
        query_job.result()

        output_table = bq_client.get_table(
            sandbox_comparison_output_dataset_ref, output_table_id)

        if output_table.num_rows == 0:
            # If there are no rows in the output table, then the output was identical
            bq_client.delete_table(sandbox_comparison_output_dataset_id,
                                   output_table_id)

    # Whatever tables are left in the output dataset hold diverging rows.
    # NOTE(review): presumably more_itertools.peekable, which is truthy while
    # items remain without consuming them - confirm against the file's imports.
    metrics_with_different_output = peekable(
        bq_client.list_tables(sandbox_comparison_output_dataset_id))

    logging.info(
        "\n*************** DATAFLOW OUTPUT COMPARISON RESULTS ***************\n"
    )

    if metrics_with_different_output:
        for metric_table in metrics_with_different_output:
            # This will always be true, and is here to silence mypy warnings
            assert isinstance(metric_table, bigquery.table.TableListItem)

            logging.warning(
                "Dataflow output differs for metric %s. See %s.%s for diverging rows.",
                metric_table.table_id,
                sandbox_comparison_output_dataset_id,
                metric_table.table_id,
            )
    else:
        logging.info(
            "Dataflow output identical. Deleting dataset %s.",
            sandbox_comparison_output_dataset_ref.dataset_id,
        )
        bq_client.delete_dataset(sandbox_comparison_output_dataset_ref,
                                 delete_contents=True)
def test_circleci_yaml_parses(self) -> None:
    """Checks that .circleci/config.yml loads and yields truthy contents."""
    circleci_config_path = self.path_for_build_file(".circleci/config.yml")
    parsed_config = YAMLDict.from_path(circleci_config_path)
    self.assertTrue(parsed_config.get())
def __init__(
    self,
    products: List[ProductConfig],
    root_calc_docs_dir: str,
):
    """Builds the lookup tables and view DAG used by the rest of this class.

    Stores |products| and |root_calc_docs_dir|, derives product/state and
    metric/state mappings from the products and the production pipeline
    templates, and pre-populates the family information of every node in the
    deployed view DAG.
    """
    self.root_calc_docs_dir = root_calc_docs_dir
    self.products = products

    self.states_by_product = self.get_states_by_product()
    # Inverted form of states_by_product: state -> environment -> product names
    self.products_by_state: Dict[
        StateCode, Dict[GCPEnvironment, List[ProductName]]
    ] = defaultdict(lambda: defaultdict(list))
    for product, states_by_environment in self.states_by_product.items():
        for env, env_states in states_by_environment.items():
            for state_code in env_states:
                self.products_by_state[state_code][env].append(product)

    views_to_update = _build_views_to_update(
        view_source_table_datasets=VIEW_SOURCE_TABLE_DATASETS,
        candidate_view_builders=DEPLOYED_VIEW_BUILDERS,
        dataset_overrides=None,
        override_should_build_predicate=True,
    )
    self.dag_walker = BigQueryViewDagWalker(views_to_update)

    self.prod_templates_yaml = YAMLDict.from_path(PRODUCTION_TEMPLATES_PATH)
    self.daily_pipelines = self.prod_templates_yaml.pop_dicts("daily_pipelines")
    self.historical_pipelines = self.prod_templates_yaml.pop_dicts(
        "historical_pipelines"
    )

    self.metric_calculations_by_state = self._get_state_metric_calculations(
        self.daily_pipelines, "daily"
    )
    # Fold the historical pipeline calculations into the daily ones
    historical_calculations = self._get_state_metric_calculations(
        self.historical_pipelines, "triggered by code changes"
    )
    for state, historical_metric_infos in historical_calculations.items():
        self.metric_calculations_by_state[state].extend(historical_metric_infos)

    # Inverted form of metric_calculations_by_state: metric name -> per-state info
    self.state_metric_calculations_by_metric: Dict[
        str, List[StateMetricInfo]
    ] = defaultdict(list)
    for state, state_metric_infos in self.metric_calculations_by_state.items():
        for info in state_metric_infos:
            state_info = StateMetricInfo(
                name=state,
                month_count=info.month_count,
                frequency=info.frequency,
            )
            self.state_metric_calculations_by_metric[info.name].append(state_info)

    self.metrics_by_generic_types = self._get_metrics_by_generic_types()

    # Keyed by the DATAFLOW_METRICS_TO_TABLES entry of each metric
    self.generic_types_by_metric_name = {
        DATAFLOW_METRICS_TO_TABLES[metric]: generic_type
        for generic_type, metrics in self.metrics_by_generic_types.items()
        for metric in metrics
    }

    def _preprocess_views(
        v: BigQueryView, _parent_results: Dict[BigQueryView, None]
    ) -> None:
        view_node = self.dag_walker.nodes_by_key[DagKey(view_address=v.address)]

        # Fills out full child/parent dependencies and tree representations for use
        # in various sections.
        self.dag_walker.populate_node_family_for_node(
            node=view_node,
            datasets_to_skip={DATAFLOW_METRICS_MATERIALIZED_DATASET}
            | RAW_TABLE_DATASETS,
            custom_node_formatter=self._dependency_tree_formatter_for_gitbook,
            view_source_table_datasets=VIEW_SOURCE_TABLE_DATASETS
            | LATEST_VIEW_DATASETS,
        )

    self.dag_walker.process_dag(_preprocess_views)
    self.all_views_to_document = self._get_all_views_to_document()