def _create_dataset_and_deploy_views(
    views_to_update: List[BigQueryView], set_temp_dataset_table_expiration: bool = False
) -> None:
    """Deploys the given views, creating their datasets first where needed.

    All datasets required by the views are created (if absent) up front. The
    views are then walked as a DAG and each one is created or updated; a view
    with a materialized_view_table_id set is also materialized into a table.

    Args:
        views_to_update: The view objects to create or update.
        set_temp_dataset_table_expiration: When True, newly created datasets get
            an expiration of TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS.
    """
    client = BigQueryClientImpl()

    # Datasets must exist before any view inside them can be deployed.
    _create_all_datasets_if_necessary(
        client, views_to_update, set_temp_dataset_table_expiration
    )

    def _deploy_view(
        view: BigQueryView, parent_results: Dict[BigQueryView, bool]
    ) -> bool:
        """Returns True if this view or any of its parents were updated."""
        return _create_or_update_view_and_materialize_if_necessary(
            client, view, parent_results
        )

    walker = BigQueryViewDagWalker(views_to_update)
    walker.process_dag(_deploy_view)
def test_dag_exception_handling(self) -> None:
    """Verifies that an exception raised while processing escapes process_dag."""

    class TestDagWalkException(ValueError):
        pass

    walker = BigQueryViewDagWalker(self.all_views)

    # Case 1: the process function fails for every view.
    def always_raises(
        _view: BigQueryView, _parent_results: Dict[BigQueryView, None]
    ) -> None:
        raise TestDagWalkException()

    with self.assertRaises(TestDagWalkException):
        walker.process_dag(always_raises)

    # Case 2: root views succeed and only non-root views fail.
    def raises_for_non_root(
        view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
    ) -> DagKey:
        node_key = (view.dataset_id, view.view_id)
        if walker.nodes_by_key[node_key].is_root:
            return node_key
        raise TestDagWalkException()

    with self.assertRaises(TestDagWalkException):
        walker.process_dag(raises_for_non_root)
def rematerialize_views_for_namespace(
    # TODO(#5785): Clarify use case of BigQueryViewNamespace filter (see ticket for more)
    bq_view_namespace: BigQueryViewNamespace,
    candidate_view_builders: Sequence[BigQueryViewBuilder],
    dataset_overrides: Optional[Dict[str, str]] = None,
    skip_missing_views: bool = False,
) -> None:
    """For all views in a given namespace, re-materializes any materialized views.

    This should be called only when we want to refresh the data in the
    materialized view, not when we want to update the underlying query of the
    view.

    Args:
        bq_view_namespace: Namespace whose value tags the failure metric that is
            recorded if anything in this process raises.
        candidate_view_builders: Builders handed to _build_views_to_update to
            produce the views that are walked.
        dataset_overrides: Optional overrides handed to _build_views_to_update.
            When non-empty, datasets created by this process are created with a
            default table expiration.
        skip_missing_views: When True, a materialized view whose view does not
            exist in BigQuery is skipped instead of being materialized.

    Raises:
        Exception: Any exception raised while building, walking or materializing
            the views is re-raised unchanged after the failure metric has been
            recorded.
    """
    set_default_table_expiration_for_new_datasets = bool(dataset_overrides)
    if set_default_table_expiration_for_new_datasets:
        logging.info(
            "Found non-empty dataset overrides. New datasets created in this process will have a "
            "default table expiration of 24 hours."
        )

    try:
        views_to_update = _build_views_to_update(
            candidate_view_builders=candidate_view_builders,
            dataset_overrides=dataset_overrides,
        )

        bq_client = BigQueryClientImpl()
        _create_all_datasets_if_necessary(
            bq_client, views_to_update, set_default_table_expiration_for_new_datasets
        )

        dag_walker = BigQueryViewDagWalker(views_to_update)

        def _materialize_view(
            v: BigQueryView, _parent_results: Dict[BigQueryView, None]
        ) -> None:
            """Materializes a single view, skipping it when there is nothing to do."""
            if not v.materialized_view_table_id:
                logging.info(
                    "Skipping non-materialized view [%s.%s].", v.dataset_id, v.view_id
                )
                return

            if skip_missing_views and not bq_client.table_exists(
                bq_client.dataset_ref_for_id(dataset_id=v.dataset_id), v.view_id
            ):
                logging.info(
                    "Skipping materialization of view [%s.%s] which does not exist",
                    v.dataset_id,
                    v.view_id,
                )
                return

            bq_client.materialize_view_to_table(v)

        dag_walker.process_dag(_materialize_view)
    except Exception:
        # Top-level boundary: record the failure for monitoring, then let the
        # original exception continue to the caller.
        with monitoring.measurements(
            {monitoring.TagKey.CREATE_UPDATE_VIEWS_NAMESPACE: bq_view_namespace.value}
        ) as measurements:
            measurements.measure_int_put(m_failed_view_update, 1)
        # A bare raise keeps the original traceback and avoids setting the
        # exception's __cause__ to itself, which `raise e from e` did.
        raise
def test_dag_returns_parent_results(self) -> None:
    """Verifies parent results reach each child by computing every view's depth.

    Each view reports one more than the largest depth reported by its parents,
    so the view with the greatest depth must not have any children.
    """
    walker = BigQueryViewDagWalker(self.all_views)

    def depth_from_parents(
        _view: BigQueryView, parent_results: Dict[BigQueryView, int]
    ) -> int:
        return max(parent_results.values()) + 1 if parent_results else 1

    depths_by_view = walker.process_dag(depth_from_parents)
    self.assertEqual(len(self.all_views), len(depths_by_view))

    # Only strictly positive depths are candidates; max() keeps the first view
    # seen among ties, matching a strict greater-than scan over the results.
    candidates = {
        view: depth for view, depth in depths_by_view.items() if depth > 0
    }
    deepest_view = max(candidates, key=candidates.get) if candidates else None

    if not deepest_view:
        self.fail("Found no max_depth_view")

    deepest_node = walker.nodes_by_key[
        (deepest_view.dataset_id, deepest_view.view_id)
    ]
    self.assertEqual(set(), deepest_node.child_keys)
def test_dag_process_time(self) -> None:
    """Verifies that walking the DAG is much faster than processing views serially.

    Every view sleeps for MOCK_VIEW_PROCESS_TIME_SECONDS; the whole walk must
    take less than a fifth of the time a one-at-a-time run would need.
    """
    num_views = len(self.all_views)
    walker = BigQueryViewDagWalker(self.all_views)

    serial_processing_time_seconds = num_views * MOCK_VIEW_PROCESS_TIME_SECONDS

    def process_simple(
        view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
    ) -> DagKey:
        time.sleep(MOCK_VIEW_PROCESS_TIME_SECONDS)
        return view.dataset_id, view.table_id

    # Use a monotonic clock for the measurement: wall-clock time can be adjusted
    # while the test runs, which would make this timing assertion flaky.
    start = time.perf_counter()
    result = walker.process_dag(process_simple)
    processing_time_seconds = time.perf_counter() - start

    self.assertEqual(num_views, len(result))

    # We expect to see significant speedup over the processing time if we ran
    # the process function for each view in series.
    self.assertLess(processing_time_seconds * 5, serial_processing_time_seconds)
def test_views_use_materialized_if_present(self) -> None:
    """Checks that every view reads the materialized table of a parent view
    whenever that parent has one."""
    walker = BigQueryViewDagWalker(self.all_views)

    def assert_parents_read_via_materialized_table(
        view: BigQueryView, _parent_results: Dict[BigQueryView, None]
    ) -> None:
        node = walker.node_for_view(view)
        for parent_table_address in node.parent_tables:
            if parent_table_address in walker.materialized_addresss:
                # Already referencing a materialized table; nothing to verify.
                continue

            parent_key = DagKey(view_address=parent_table_address)
            if parent_key not in walker.nodes_by_key:
                # Not a view in this DAG: assumed to be a source data table,
                # which other tests validate.
                continue

            # The parent is referenced through its view address, so it must not
            # have a materialized table that should have been used instead.
            parent_view: BigQueryView = walker.view_for_key(parent_key)
            failure_message = (
                f"Found view [{node.dag_key}] referencing un-materialized version "
                f"of view [{parent_key}] when materialized table "
                f"[{parent_view.materialized_address}] exists."
            )
            self.assertIsNone(parent_view.materialized_address, failure_message)

    results = walker.process_dag(assert_parents_read_via_materialized_table)
    self.assertEqual(len(self.all_views), len(results))
def _create_managed_dataset_and_deploy_views(
    views_to_update: List[BigQueryView],
    bq_region_override: Optional[str],
    force_materialize: bool,
    set_temp_dataset_table_expiration: bool = False,
) -> None:
    """Deploys the given views and their datasets, then-managed cleanup included.

    Every managed dataset is created if it does not exist. Unless a temporary
    dataset table expiration is requested, unmanaged datasets and unmanaged
    views within managed datasets are then deleted. Finally each view is
    created or updated in DAG order; a view with a materialized_address set is
    also materialized into a table.

    Args:
        views_to_update: The view objects to create or update.
        bq_region_override: Passed to the BigQuery client as its region override.
        force_materialize: Forwarded unchanged to
            _create_or_update_view_and_materialize_if_necessary for every view.
        set_temp_dataset_table_expiration: When True, newly created datasets get
            an expiration of TEMP_DATASET_DEFAULT_TABLE_EXPIRATION_MS and the
            cleanup of unmanaged views and datasets is skipped.
    """
    client = BigQueryClientImpl(region_override=bq_region_override)
    walker = BigQueryViewDagWalker(views_to_update)

    managed_addresses_by_dataset = (
        get_managed_view_and_materialized_table_addresses_by_dataset(walker)
    )
    _create_all_datasets_if_necessary(
        client,
        list(managed_addresses_by_dataset.keys()),
        set_temp_dataset_table_expiration,
    )

    if not set_temp_dataset_table_expiration:
        # Sandbox datasets are temporary, so unmanaged views/tables are only
        # deleted when we are not creating them.
        cleanup_datasets_and_delete_unmanaged_views(
            client, managed_addresses_by_dataset, dry_run=False
        )

    def _deploy_view(
        view: BigQueryView, parent_results: Dict[BigQueryView, bool]
    ) -> bool:
        """Returns True if this view or any of its parents were updated."""
        return _create_or_update_view_and_materialize_if_necessary(
            client, view, parent_results, force_materialize
        )

    walker.process_dag(_deploy_view)
def test_dag_parents_materialized_non_default(self) -> None:
    """Verifies parents are found when they materialize to an overridden address.

    view_3 reads view_1 through its view address and view_2 through its
    overridden materialized table address; both must be reported as parents.
    """
    self.maxDiff = None
    view_1 = BigQueryView(
        dataset_id="dataset_1",
        view_id="table_1",
        description="table_1 description",
        should_materialize=True,
        materialized_address_override=BigQueryAddress(
            dataset_id="other_dataset_1", table_id="other_table_1"
        ),
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table`",
    )
    view_2 = BigQueryView(
        dataset_id="dataset_2",
        view_id="table_2",
        description="table_2 description",
        should_materialize=True,
        materialized_address_override=BigQueryAddress(
            dataset_id="other_dataset_2", table_id="other_table_2"
        ),
        view_query_template="SELECT * FROM `{project_id}.source_dataset.source_table_2`",
    )
    view_3 = BigQueryView(
        dataset_id="dataset_3",
        view_id="table_3",
        description="table_3 description",
        view_query_template=""" SELECT * FROM `{project_id}.dataset_1.table_1` JOIN `{project_id}.other_dataset_2.other_table_2` USING (col)""",
    )
    walker = BigQueryViewDagWalker([view_1, view_2, view_3])

    # The callback returns each view's view_id, so the parent results it
    # receives map views to strings (previously mis-annotated as DagKey).
    def process_simple(
        view: BigQueryView, parent_results: Dict[BigQueryView, str]
    ) -> str:
        if view == view_3:
            # View 3 should have two parents
            self.assertEqual(
                {view_1: view_1.view_id, view_2: view_2.view_id}, parent_results
            )

        return view.view_id

    result = walker.process_dag(process_simple)
    self.assertEqual(
        {view_1: view_1.view_id, view_2: view_2.view_id, view_3: view_3.view_id},
        result,
    )
def test_dag_touches_all_views(self) -> None:
    """Verifies that the walk processes every view in the DAG exactly as keyed.

    The set of DAG keys is checked twice: once against the views that key the
    result of the walk, and once against the values the process function
    returned for them.
    """
    walker = BigQueryViewDagWalker(self.all_views)

    def process_simple(
        view: BigQueryView, _parent_results: Dict[BigQueryView, DagKey]
    ) -> DagKey:
        time.sleep(MOCK_VIEW_PROCESS_TIME_SECONDS / 10)
        return DagKey.for_view(view)

    result = walker.process_dag(process_simple)

    expected_view_keys = set(walker.nodes_by_key)

    # Check the views that were walked (the result's keys). This assertion used
    # to be an exact duplicate of the one below, leaving the keys unchecked.
    walked_view_keys_from_views = {DagKey.for_view(view) for view in result}
    self.assertEqual(expected_view_keys, walked_view_keys_from_views)

    # Check what the process function returned for those views.
    walked_view_keys_from_process_results = set(result.values())
    self.assertEqual(expected_view_keys, walked_view_keys_from_process_results)
def test_dag_does_not_process_until_parents_processed(self) -> None:
    """Verifies that a view only starts processing once its parent views are done.

    Each view records itself as processed after a randomized sleep. When a view
    starts, any parent not yet recorded must be a valid source data table, and
    any recorded parent must be present in the parent results it was given.
    """
    walker = BigQueryViewDagWalker(self.all_views)

    # Guards all_processed, which is shared by every processing call.
    mutex = threading.Lock()
    all_processed = set()

    def process_check_parents(
        view: BigQueryView, parent_results: Dict[BigQueryView, None]
    ) -> None:
        node_key = (view.dataset_id, view.view_id)

        with mutex:
            node = walker.nodes_by_key[node_key]
            parent_keys = () if node.is_root else node.parent_keys
            for parent_key in parent_keys:
                if parent_key in all_processed:
                    self.assertIn(
                        walker.nodes_by_key[parent_key].view, parent_results
                    )
                    continue

                # The only parents that won't have been fully processed are
                # source data tables.
                try:
                    self.assertIsValidSourceDataTable(
                        child_view_key=node_key, source_table_key=parent_key
                    )
                except ValueError as e:
                    raise ValueError(
                        f"Found parent view [{parent_key}] that was not processed before "
                        f"child [{node_key}] started processing."
                    ) from e

        # Sleep outside the lock so that other views can be processed meanwhile.
        sleep_seconds = random.uniform(
            MOCK_VIEW_PROCESS_TIME_SECONDS, MOCK_VIEW_PROCESS_TIME_SECONDS * 2
        )
        time.sleep(sleep_seconds)

        with mutex:
            all_processed.add(node_key)

    result = walker.process_dag(process_check_parents)
    self.assertEqual(len(self.all_views), len(result))
class CalculationDocumentationGenerator: """A class for generating documentation about our calculations.""" def __init__( self, products: List[ProductConfig], root_calc_docs_dir: str, ): self.root_calc_docs_dir = root_calc_docs_dir self.products = products self.states_by_product = self.get_states_by_product() # Reverses the states_by_product dictionary self.products_by_state: Dict[StateCode, Dict[ GCPEnvironment, List[ProductName]]] = defaultdict(lambda: defaultdict(list)) for product_name, environments_to_states in self.states_by_product.items( ): for environment, states in environments_to_states.items(): for state in states: self.products_by_state[state][environment].append( product_name) self.dag_walker = BigQueryViewDagWalker( _build_views_to_update( view_source_table_datasets=VIEW_SOURCE_TABLE_DATASETS, candidate_view_builders=DEPLOYED_VIEW_BUILDERS, dataset_overrides=None, override_should_build_predicate=True, )) self.prod_templates_yaml = YAMLDict.from_path( PRODUCTION_TEMPLATES_PATH) self.daily_pipelines = self.prod_templates_yaml.pop_dicts( "daily_pipelines") self.historical_pipelines = self.prod_templates_yaml.pop_dicts( "historical_pipelines") self.metric_calculations_by_state = self._get_state_metric_calculations( self.daily_pipelines, "daily") # combine with the historical pipelines for name, metric_info_list in self._get_state_metric_calculations( self.historical_pipelines, "triggered by code changes").items(): self.metric_calculations_by_state[name].extend(metric_info_list) # Reverse the metric_calculations_by_state dictionary self.state_metric_calculations_by_metric: Dict[ str, List[StateMetricInfo]] = defaultdict(list) for state_name, metric_info_list in self.metric_calculations_by_state.items( ): for metric_info in metric_info_list: self.state_metric_calculations_by_metric[ metric_info.name].append( StateMetricInfo( name=state_name, month_count=metric_info.month_count, frequency=metric_info.frequency, )) self.metrics_by_generic_types = 
self._get_metrics_by_generic_types() self.generic_types_by_metric_name = {} for generic_type, metric_list in self.metrics_by_generic_types.items(): for metric in metric_list: self.generic_types_by_metric_name[ DATAFLOW_METRICS_TO_TABLES[metric]] = generic_type def _preprocess_views( v: BigQueryView, _parent_results: Dict[BigQueryView, None]) -> None: dag_key = DagKey(view_address=v.address) node = self.dag_walker.nodes_by_key[dag_key] # Fills out full child/parent dependencies and tree representations for use # in various sections. self.dag_walker.populate_node_family_for_node( node=node, datasets_to_skip={DATAFLOW_METRICS_MATERIALIZED_DATASET} | RAW_TABLE_DATASETS, custom_node_formatter=self. _dependency_tree_formatter_for_gitbook, view_source_table_datasets=VIEW_SOURCE_TABLE_DATASETS | LATEST_VIEW_DATASETS, ) self.dag_walker.process_dag(_preprocess_views) self.all_views_to_document = self._get_all_views_to_document() def _get_all_export_config_view_builder_addresses( self) -> Set[BigQueryAddress]: all_export_view_builder_addresses: Set[BigQueryAddress] = set() for product in self.products: all_export_view_builder_addresses = all_export_view_builder_addresses.union( self._get_all_config_view_addresses_for_product(product)) return all_export_view_builder_addresses def get_states_by_product( self, ) -> Dict[ProductName, Dict[GCPEnvironment, List[StateCode]]]: """Returns the dict of products to states and environments.""" states_by_product: Dict[ProductName, Dict[ GCPEnvironment, List[StateCode]]] = defaultdict(lambda: defaultdict(list)) for product in self.products: if product.states is not None: for state in product.states: environment = GCPEnvironment(state.environment) state_code = StateCode(state.state_code) states_by_product[product.name][environment].append( state_code) return states_by_product @staticmethod def bulleted_list(string_list: List[str], tabs: int = 1, escape_underscores: bool = True) -> str: """Returns a string holding a bulleted list of the input 
string list.""" return "\n".join([ f"{' '*tabs}- {s.replace('__', ESCAPED_DOUBLE_UNDERSCORE) if escape_underscores else s}" for s in string_list ]) def _get_dataflow_pipeline_enabled_states(self) -> Set[StateCode]: """Returns the set of StateCodes for all states present in our production calc pipeline template.""" states = { pipeline.peek("state_code", str).upper() for pipeline in self.daily_pipelines }.union({ pipeline.peek("state_code", str).upper() for pipeline in self.historical_pipelines }) for state_code in states: if not StateCode.is_state_code(state_code): raise ValueError( f"Found invalid state code value [{state_code}]" f" in pipeline template config.") return {StateCode(state_code) for state_code in states} def _get_product_enabled_states(self) -> Set[StateCode]: states: Set[str] = set() for product in self.products: if product.states is not None: states = states.union( {state.state_code for state in product.states}) for state_code in states: if not StateCode.is_state_code(state_code): raise ValueError( f"Found invalid state code value [{state_code}]" f" in product config.") return {StateCode(state_code) for state_code in states} def _get_calculation_states_summary_str(self) -> str: states = self._get_dataflow_pipeline_enabled_states().union( self._get_product_enabled_states()) state_names = [str(state_code.get_state()) for state_code in states] header = "- States\n" return header + self.bulleted_list( sorted([ f"[{state_name}](calculation/states/{self._normalize_string_for_path(state_name)}.md)" for state_name in state_names ])) def _get_products_summary_str(self) -> str: header = "\n- Products\n" product_names = sorted([product.name for product in self.products]) return header + self.bulleted_list([ f"[{product_name}](calculation/products/" f"{self._normalize_string_for_path(product_name)}/" f"{self._normalize_string_for_path(product_name)}_summary.md)" for product_name in product_names ]) def _get_views_summary_str(self) -> str: header = "\n- Views" 
bullets = "" for dataset_id, dag_key_list in self._get_keys_by_dataset( self.all_views_to_document).items(): bullets += f"\n - {dataset_id}\n" bullets += self.bulleted_list( [ f"[{dag_key.table_id.replace('__', ESCAPED_DOUBLE_UNDERSCORE)}](calculation/views/{dataset_id}/{dag_key.table_id}.md)" for dag_key in dag_key_list ], tabs=2, escape_underscores=False, ) return header + bullets + "\n" @staticmethod def _get_metrics_by_generic_types( ) -> Dict[str, List[Type[RecidivizMetric]]]: metrics_dict: Dict[str, List[Type[RecidivizMetric]]] = defaultdict(list) for metric in DATAFLOW_METRICS_TO_TABLES: if issubclass(metric, SupervisionMetric): metrics_dict["Supervision"].append(metric) elif issubclass(metric, ReincarcerationRecidivismMetric): metrics_dict["Recidivism"].append(metric) elif issubclass(metric, ProgramMetric): metrics_dict["Program"].append(metric) elif issubclass(metric, IncarcerationMetric): metrics_dict["Incarceration"].append(metric) elif issubclass(metric, ViolationMetric): metrics_dict["Violation"].append(metric) else: raise ValueError( f"{metric.__name__} is not a subclass of an expected" f" metric type.)") return metrics_dict def _get_dataflow_metrics_summary_str(self) -> str: dataflow_str = "\n- Dataflow Metrics\n" for header, class_list in self.metrics_by_generic_types.items(): dataflow_str += f" - {header.upper()}\n" dataflow_str += (self.bulleted_list( [ f"[{metric.__name__}](calculation/metrics/{header.lower()}/{DATAFLOW_METRICS_TO_TABLES[metric]}.md)" for metric in class_list ], 2, ) + "\n") return dataflow_str def generate_summary_strings(self) -> List[str]: logging.info("Generating calculation summary markdown") calculation_catalog_summary = ["## Calculation Catalog\n\n"] calculation_catalog_summary.extend( [self._get_calculation_states_summary_str()]) calculation_catalog_summary.extend([self._get_products_summary_str()]) calculation_catalog_summary.extend([self._get_views_summary_str()]) calculation_catalog_summary.extend( 
[self._get_dataflow_metrics_summary_str()]) return calculation_catalog_summary def products_list_for_env(self, state_code: StateCode, environment: GCPEnvironment) -> str: """Returns a bulleted list of products launched in the state in the given environment.""" if environment not in { GCPEnvironment.PRODUCTION, GCPEnvironment.STAGING }: raise ValueError(f"Unexpected environment: [{environment.value}]") if (not state_code in self.products_by_state or environment not in self.products_by_state[state_code] or not self.products_by_state[state_code][environment]): return "N/A" return self.bulleted_list([ f"[{product}](../products/{self._normalize_string_for_path(product)}/{self._normalize_string_for_path(product)}_summary.md)" for product in self.products_by_state[state_code][environment] ]) def states_list_for_env(self, product: ProductConfig, environment: GCPEnvironment) -> str: """Returns a bulleted list of states where a product is launched in the given environment.""" if environment not in { GCPEnvironment.PRODUCTION, GCPEnvironment.STAGING }: raise ValueError(f"Unexpected environment: [{environment.value}]") states_list = [ f"[{str(state_code.get_state())}](../../states/{self._normalize_string_for_path(str(state_code.get_state()))}.md)" for state_code in self.states_by_product[product.name][environment] ] return self.bulleted_list(states_list) if states_list else " N/A" def _get_shipped_states_str(self, product: ProductConfig) -> str: """Returns a string containing lists of shipped states and states in development for a given product.""" shipped_states_str = self.states_list_for_env( product, GCPEnvironment.PRODUCTION) development_states_str = self.states_list_for_env( product, GCPEnvironment.STAGING) return ("##SHIPPED STATES\n" + shipped_states_str + "\n\n## STATES IN DEVELOPMENT\n" + development_states_str + "\n\n") @staticmethod def _get_keys_by_dataset(dag_keys: Set[DagKey]) -> Dict[str, List[DagKey]]: """Given a set of DagKeys, returns a sorted dictionary of 
those keys, organized by dataset.""" datasets_to_views = defaultdict(list) for key in sorted(dag_keys, key=lambda dag_key: (dag_key.dataset_id, dag_key.table_id)): datasets_to_views[key.dataset_id].append(key) return datasets_to_views def _get_dataset_headers_to_views_str( self, dag_keys: Set[DagKey], source_tables_section: bool = False) -> str: """Given a set of DagKeys, returns a str list of those views, organized by dataset.""" datasets_to_keys = self._get_keys_by_dataset(dag_keys) views_str = "" for dataset, keys in datasets_to_keys.items(): views_str += f"####{dataset}\n" views_str += ( f"_{VIEW_SOURCE_TABLE_DATASETS_TO_DESCRIPTIONS[dataset]}_\n" if source_tables_section else "") views_str += (self.bulleted_list( [ self._dependency_tree_formatter_for_gitbook( dag_key, products_section=True) for dag_key in keys ], escape_underscores=False, )) + "\n\n" return views_str def _get_views_str_for_product(self, view_keys: Set[DagKey]) -> str: """Returns the string containing the VIEWS section of the product markdown.""" views_header = "##VIEWS\n\n" if not view_keys: return views_header + "*This product does not use any BigQuery views.*\n\n" return views_header + self._get_dataset_headers_to_views_str(view_keys) def _get_source_tables_str_for_product(self, source_keys: Set[DagKey]) -> str: """Returns the string containing the SOURCE TABLES section of the product markdown.""" source_tables_header = ( "##SOURCE TABLES\n" "_Reference views that are used by other views. Some need to be updated manually._\n\n" ) if not source_keys: return (source_tables_header + "*This product does not reference any source tables.*\n\n") return source_tables_header + self._get_dataset_headers_to_views_str( source_keys, source_tables_section=True) def _get_metrics_str_for_product(self, metric_keys: Set[DagKey]) -> str: """Builds the Metrics string for the product markdown file. 
Creates a table of necessary metric types and whether a state calculates those metrics""" metrics_header = ( "##METRICS\n_All metrics required to support this product and" " whether or not each state regularly calculates the metric._" "\n\n** DISCLAIMER **\nThe presence of all required metrics" " for a state does not guarantee that this product is ready to" " launch in that state.\n\n") if not metric_keys: return (metrics_header + "*This product does not rely on Dataflow metrics.*\n") state_codes = sorted(self._get_dataflow_pipeline_enabled_states(), key=lambda code: code.value) headers = ["**Metric**"] + [ f"**{state_code.value}**" for state_code in state_codes ] table_matrix = [[ f"[{DATAFLOW_TABLES_TO_METRIC_TYPES[metric_key.table_id].value}](../../metrics/{self.generic_types_by_metric_name[metric_key.table_id].lower()}/{metric_key.table_id}.md)" ] + [ "X" if DATAFLOW_TABLES_TO_METRIC_TYPES[metric_key.table_id].value in [ metric.name for metric in self.metric_calculations_by_state[ str(state_code.get_state())] ] else "" for state_code in state_codes ] for metric_key in sorted(metric_keys, key=lambda dag_key: dag_key.table_id)] writer = MarkdownTableWriter(headers=headers, value_matrix=table_matrix, margin=0) return metrics_header + writer.dumps() @staticmethod def _get_all_config_view_addresses_for_product( product: ProductConfig, ) -> Set[BigQueryAddress]: """Returns a set containing a BQ address for each view listed by each export necessary for the given product.""" all_config_view_addresses: Set[BigQueryAddress] = set() for export in product.exports: collection_config = VIEW_COLLECTION_EXPORT_INDEX[export] view_builders = collection_config.view_builders_to_export all_config_view_addresses = all_config_view_addresses.union({ BigQueryAddress( dataset_id=view_builder.dataset_id, table_id=view_builder.view_id, ) for view_builder in view_builders }) return all_config_view_addresses def _get_all_parent_keys_for_product( self, product: ProductConfig) -> Set[DagKey]: 
"""Returns a set containing a DagKey for every view that this product relies upon. """ all_config_view_addresses = self._get_all_config_view_addresses_for_product( product) all_parent_keys: Set[DagKey] = set() for view_address in all_config_view_addresses: dag_key = DagKey(view_address=view_address) node = self.dag_walker.nodes_by_key[dag_key] # Add in the top level view all_parent_keys.add(dag_key) # Add in all ancestors all_parent_keys = all_parent_keys.union( node.node_family.full_parentage) # Ignore materialized metric views as relevant metric info can be found in a # different dataset (DATAFLOW_METRICS_DATASET). all_parent_keys.difference_update({ key for key in all_parent_keys if key.dataset_id == DATAFLOW_METRICS_MATERIALIZED_DATASET }) return all_parent_keys def _get_product_information(self, product: ProductConfig) -> str: """Returns a string containing all relevant information for a given product including name, views used, source tables, and required metrics.""" documentation = f"#{product.name.upper()}\n" documentation += product.description + "\n" documentation += self._get_shipped_states_str(product) all_parent_keys = self._get_all_parent_keys_for_product(product) source_keys = { key for key in all_parent_keys # Metric info will be included in the metric-specific section if key.dataset_id in OTHER_SOURCE_TABLE_DATASETS - {DATAFLOW_METRICS_DATASET} } metric_keys = { key for key in all_parent_keys if key.dataset_id == DATAFLOW_METRICS_DATASET } # Remove metric keys as they are surfaced in a metric-specific section. 
Remove # source table keys as they are surfaced in a reference-specific section view_keys = all_parent_keys - metric_keys - source_keys documentation += self._get_views_str_for_product(view_keys) documentation += self._get_source_tables_str_for_product(source_keys) documentation += self._get_metrics_str_for_product(metric_keys) return documentation @staticmethod def _normalize_string_for_path(target_string: str) -> str: """Returns a lowercase, underscore-separated string.""" return target_string.lower().replace(" ", "_") def generate_products_markdowns(self) -> bool: """Generates markdown files if necessary for the docs/calculation/products directories""" anything_modified = False for product in self.products: # Generate documentation for each product documentation = self._get_product_information(product) # Write documentation to markdown files product_name_for_path = self._normalize_string_for_path( product.name) product_dir_path = os.path.join(self.root_calc_docs_dir, "products", product_name_for_path) os.makedirs(product_dir_path, exist_ok=True) product_markdown_path = os.path.join( product_dir_path, f"{product_name_for_path}_summary.md", ) anything_modified |= persist_file_contents(documentation, product_markdown_path) return anything_modified @staticmethod def _get_state_metric_calculations( pipelines: List[YAMLDict], frequency: str) -> Dict[str, List[PipelineMetricInfo]]: """Returns a dict of state names to lists of info about their regularly calculated metrics.""" state_metric_calculations = defaultdict(list) for pipeline in pipelines: state_metric_calculations[str( StateCode(pipeline.peek("state_code", str)).get_state() )].extend([ PipelineMetricInfo( name=metric, month_count=pipeline.peek_optional( "calculation_month_count", int), frequency=frequency, ) for metric in pipeline.peek("metric_types", str).split() ], ) return state_metric_calculations def _get_sorted_state_metric_info( self) -> Dict[str, List[PipelineMetricInfo]]: """Returns a dictionary of 
state names (in alphabetical order) to their regularly calculated metric information (sorted by metric name)""" sorted_state_metric_calculations: Dict[ str, List[PipelineMetricInfo]] = { state_name_key: sorted( self.metric_calculations_by_state[state_name_key], key=lambda info: info.name, ) for state_name_key in sorted(self.metric_calculations_by_state) } return sorted_state_metric_calculations def _get_metrics_table_for_state(self, state_name: str) -> str: sorted_state_metric_calculations = self._get_sorted_state_metric_info() metric_names_to_tables = { metric.value: table for table, metric in DATAFLOW_TABLES_TO_METRIC_TYPES.items() } if state_name in sorted_state_metric_calculations: headers = [ "**Metric**", "**Number of Months Calculated**", "**Calculation Frequency**", ] table_matrix = [[ f"[{metric_info.name}](../metrics/{self.generic_types_by_metric_name[metric_names_to_tables[metric_info.name]].lower()}/{metric_names_to_tables[metric_info.name]}.md)", metric_info.month_count if metric_info.month_count else "N/A", metric_info.frequency, ] for metric_info in sorted_state_metric_calculations[state_name]] writer = MarkdownTableWriter(headers=headers, value_matrix=table_matrix, margin=0) return writer.dumps() return "_This state has no regularly calculated metrics._" def _get_state_information(self, state_code: StateCode, state_name: str) -> str: """Returns string contents for the state markdown.""" documentation = f"#{state_name}\n\n" # Products section documentation += "##Shipped Products\n\n" documentation += self.products_list_for_env(state_code, GCPEnvironment.PRODUCTION) documentation += "\n\n##Products in Development\n\n" documentation += self.products_list_for_env(state_code, GCPEnvironment.STAGING) # Metrics section documentation += "\n\n##Regularly Calculated Metrics\n\n" documentation += self._get_metrics_table_for_state(state_name) return documentation def generate_states_markdowns(self) -> bool: """Generate markdown files for each state.""" 
anything_modified = False states_dir_path = os.path.join(self.root_calc_docs_dir, "states") os.makedirs(states_dir_path, exist_ok=True) for state_code in self._get_dataflow_pipeline_enabled_states(): state_name = str(state_code.get_state()) # Generate documentation documentation = self._get_state_information(state_code, state_name) # Write to markdown files states_markdown_path = os.path.join( states_dir_path, f"{self._normalize_string_for_path(state_name)}.md", ) anything_modified |= persist_file_contents(documentation, states_markdown_path) return anything_modified def _dependency_tree_formatter_for_gitbook( self, dag_key: DagKey, products_section: bool = False, ) -> str: """Gitbook-specific formatting for the generated dependency tree.""" is_source_table = dag_key.dataset_id in OTHER_SOURCE_TABLE_DATASETS - { DATAFLOW_METRICS_DATASET } is_raw_data_table = dag_key.dataset_id in LATEST_VIEW_DATASETS is_metric = dag_key.dataset_id in DATAFLOW_METRICS_DATASET is_documented_view = not (is_source_table or is_raw_data_table or is_metric) if is_raw_data_table and ( not dag_key.dataset_id.endswith("_raw_data_up_to_date_views") or not dag_key.table_id.endswith("_latest")): raise ValueError( f"Unexpected raw data view address: [{dag_key.dataset_id}.{dag_key.table_id}]" ) staging_link = BQ_LINK_TEMPLATE.format( project="recidiviz-staging", dataset_id=dag_key.dataset_id, table_id=dag_key.table_id, ) prod_link = BQ_LINK_TEMPLATE.format( project="recidiviz-123", dataset_id=dag_key.dataset_id, table_id=dag_key.table_id, ) table_name_str = ( # Include brackets if metric or view ("[" if products_section else f"[{dag_key.dataset_id}.") + f"{dag_key.table_id.replace('__', ESCAPED_DOUBLE_UNDERSCORE)}]" if is_documented_view or is_metric else ("" if products_section else f"{dag_key.dataset_id}.") + f"{dag_key.table_id.replace('__', ESCAPED_DOUBLE_UNDERSCORE)}") if is_source_table: table_name_str += ( f" ([BQ Staging]({staging_link})) ([BQ Prod]({prod_link}))") elif is_metric: 
table_name_str += f"(../../metrics/{self.generic_types_by_metric_name[dag_key.table_id].lower()}/{dag_key.table_id}.md)" elif is_raw_data_table: table_name_str += RAW_DATA_LINKS_TEMPLATE.format( region=dag_key.dataset_id[:-26], raw_data_table=dag_key.table_id[:-7], staging_link=staging_link, prod_link=prod_link, ) else: table_name_str += f"(../{'../views/' if products_section else ''}{dag_key.dataset_id}/{dag_key.table_id}.md)" return table_name_str + " <br/>" @staticmethod def _get_view_tree_string( node_family: BigQueryViewDagNodeFamily, dag_key: DagKey, descendants: bool = False, ) -> str: if (node_family.full_parentage and not descendants) or (node_family.full_descendants and descendants): # Gitbook has a line count limit of ~525 for the view markdowns, so we # only print trees if they are small enough. Otherwise, we direct readers # to a script that prints the tree to console. if descendants: if (node_family.child_dfs_tree_str.count("<br/>") < MAX_DEPENDENCY_TREE_LENGTH): return node_family.child_dfs_tree_str else: if (node_family.parent_dfs_tree_str.count("<br/>") < MAX_DEPENDENCY_TREE_LENGTH): return node_family.parent_dfs_tree_str return DEPENDENCY_TREE_SCRIPT_TEMPLATE.format( dataset_id=dag_key.dataset_id, table_id=dag_key.table_id, descendants=descendants, ) return f"This view has no {'child' if descendants else 'parent'} dependencies." 
def _get_view_information(self, view_key: DagKey) -> str:
    """Returns string contents for a view markdown.

    Looks up the node for |view_key| in the DAG walker and fills
    VIEW_DOCS_TEMPLATE with the view's description, its staging / prod
    BigQuery links and its parent and child dependency trees.
    """
    view_node = self.dag_walker.nodes_by_key[view_key]

    # Gitbook only supports italicizing single lines so we need to ensure multi-line
    # descriptions format correctly: blank lines are dropped and every remaining
    # line is stripped before being joined.
    stripped_lines = (
        line.strip() for line in view_node.view.description.splitlines()
    )
    formatted_description = "_<br/>\n_".join(
        line for line in stripped_lines if line
    )

    staging_link = BQ_LINK_TEMPLATE.format(
        project="recidiviz-staging",
        dataset_id=view_key.dataset_id,
        table_id=view_key.table_id,
    )
    prod_link = BQ_LINK_TEMPLATE.format(
        project="recidiviz-123",
        dataset_id=view_key.dataset_id,
        table_id=view_key.table_id,
    )

    parent_tree = self._get_view_tree_string(view_node.node_family, view_key)
    child_tree = self._get_view_tree_string(
        view_node.node_family, view_key, descendants=True
    )

    return VIEW_DOCS_TEMPLATE.format(
        view_dataset_id=view_key.dataset_id,
        view_table_id=view_key.table_id,
        description=formatted_description,
        staging_link=staging_link,
        prod_link=prod_link,
        parent_tree=parent_tree,
        child_tree=child_tree,
    )


def _get_all_views_to_document(self) -> Set[DagKey]:
    """Retrieve all DAG Walker views that we want to document.

    Every node known to the DAG walker is included unless its dataset is
    listed in DATASETS_TO_SKIP_VIEW_DOCUMENTATION.
    """
    return {
        DagKey(view_address=node.view.address)
        for node in self.dag_walker.nodes_by_key.values()
        if node.dag_key.dataset_id not in DATASETS_TO_SKIP_VIEW_DOCUMENTATION
    }


def generate_view_markdowns(self) -> bool:
    """Generate markdown files for each view.

    Files are written to <root_calc_docs_dir>/views/<dataset_id>/<table_id>.md,
    creating directories as needed.

    Returns:
        The OR of the values returned by persist_file_contents for every
        file (presumably whether anything on disk changed).
    """
    views_dir_path = os.path.join(self.root_calc_docs_dir, "views")
    os.makedirs(views_dir_path, exist_ok=True)

    anything_modified = False
    for view_key in self.all_views_to_document:
        # Generate documentation
        documentation = self._get_view_information(view_key)

        # Write to markdown files
        dataset_dir = os.path.join(views_dir_path, view_key.dataset_id)
        os.makedirs(dataset_dir, exist_ok=True)
        view_markdown_path = os.path.join(dataset_dir, f"{view_key.table_id}.md")
        anything_modified |= persist_file_contents(documentation, view_markdown_path)

    return anything_modified


def _get_metric_information(self, metric: Type[RecidivizMetric]) -> str:
    """Returns string contents for a metric markdown.

    The markdown contains the metric's name and description, its staging /
    prod BigQuery links, and a table with one row per state calculation
    listing the state, the number of months calculated and the frequency.
    """
    metric_table_id = DATAFLOW_METRICS_TO_TABLES[metric]
    metric_type = DATAFLOW_TABLES_TO_METRIC_TYPES[metric_table_id].value

    def _state_info_sort_key(info):
        # month_count may be unset (it is rendered as "N/A" below). Comparing
        # None with an int raises TypeError, so sort on an "is set" flag first.
        # Entries whose month_count is set keep their original relative order.
        return (info.name, info.month_count is not None, info.month_count or 0)

    state_infos_list = sorted(
        self.state_metric_calculations_by_metric[metric_type],
        key=_state_info_sort_key,
    )

    headers = [
        "**State**",
        "**Number of Months Calculated**",
        "**Calculation Frequency**",
    ]
    table_matrix = [
        [
            f"[{state_info.name}](../../states/{self._normalize_string_for_path(state_info.name)}.md)",
            state_info.month_count or "N/A",
            state_info.frequency,
        ]
        for state_info in state_infos_list
    ]
    writer = MarkdownTableWriter(
        headers=headers, value_matrix=table_matrix, margin=0
    )

    staging_link = BQ_LINK_TEMPLATE.format(
        project="recidiviz-staging",
        dataset_id="dataflow_metrics",
        table_id=metric_table_id,
    )
    prod_link = BQ_LINK_TEMPLATE.format(
        project="recidiviz-123",
        dataset_id="dataflow_metrics",
        table_id=metric_table_id,
    )

    return METRIC_DOCS_TEMPLATE.format(
        staging_link=staging_link,
        prod_link=prod_link,
        metric_name=metric.__name__,
        description=metric.get_description(),
        metrics_cadence_table=writer.dumps(),
        metric_table_id=metric_table_id,
    )


def generate_metric_markdowns(self) -> bool:
    """Generate markdown files for each metric.

    Files are written to
    <root_calc_docs_dir>/metrics/<generic type, lowercased>/<metric table id>.md,
    creating directories as needed.

    Returns:
        The OR of the values returned by persist_file_contents for every
        file (presumably whether anything on disk changed).
    """
    metrics_dir_path = os.path.join(self.root_calc_docs_dir, "metrics")
    os.makedirs(metrics_dir_path, exist_ok=True)

    anything_modified = False
    for generic_type, class_list in sorted(self.metrics_by_generic_types.items()):
        generic_type_dir = os.path.join(metrics_dir_path, generic_type.lower())
        os.makedirs(generic_type_dir, exist_ok=True)

        for metric in class_list:
            # Generate documentation
            documentation = self._get_metric_information(metric)

            # Write to markdown files
            metric_markdown_path = os.path.join(
                generic_type_dir,
                f"{DATAFLOW_METRICS_TO_TABLES[metric]}.md",
            )
            anything_modified |= persist_file_contents(
                documentation, metric_markdown_path
            )

    return anything_modified