def test_init_not_called(self) -> None:
    """
    A chain constructed without ``is_init_transformers`` must not call ``init`` on the
    transformers it wraps, while ``transform`` is still invoked once on each of them.
    """
    first, second = MagicMock(), MagicMock()
    chain = ChainedTransformer(transformers=[first, second])

    chain.init(conf=ConfigFactory.from_dict({}))
    chain.transform({'foo': 'bar'})

    # Same assertion order as before: init then transform, first transformer then second.
    for wrapped in (first, second):
        wrapped.init.assert_not_called()
        wrapped.transform.assert_called_once()
def test_init_called(self) -> None:
    """
    With ``is_init_transformers=True`` the chain must call ``init`` exactly once on every
    wrapped transformer, and ``transform`` exactly once on each when a record is transformed.
    """
    first = MagicMock()
    first.get_scope.return_value = 'foo'
    second = MagicMock()
    second.get_scope.return_value = 'bar'

    chain = ChainedTransformer(transformers=[first, second],
                               is_init_transformers=True)
    chain.init(conf=ConfigFactory.from_dict({}))
    chain.transform({'foo': 'bar'})

    for wrapped in (first, second):
        wrapped.init.assert_called_once()
        wrapped.transform.assert_called_once()
def test_transformer_transforms(self) -> None:
    """
    The chain must feed the output of each transformer into the next one:
    "a" -> (+ "b") -> (+ "c") yields "abc" as the first item of the result.
    """
    append_b = MagicMock()
    append_b.transform.side_effect = lambda text: text + "b"
    append_c = MagicMock()
    append_c.transform.side_effect = lambda text: text + "c"

    chain = ChainedTransformer(transformers=[append_b, append_c])
    chain.init(conf=ConfigFactory.from_dict({}))

    # transform() is consumed with next(), i.e. it is expected to return an iterator.
    first_item = next(chain.transform("a"))
    self.assertEqual(first_item, "abc")
def test_init_not_called(self) -> None:
    """
    A chain constructed without ``is_init_transformers`` must not call ``init`` on the
    transformers it wraps; pulling one item from ``transform`` must call each wrapped
    ``transform`` exactly once.
    """
    first = MagicMock()
    first.transform.return_value = "foo"
    second = MagicMock()
    second.transform.return_value = "bar"

    chain = ChainedTransformer(transformers=[first, second])
    chain.init(conf=ConfigFactory.from_dict({}))

    # The result is consumed with next(); only the first item is requested.
    next(chain.transform({"foo": "bar"}))

    for wrapped in (first, second):
        wrapped.init.assert_not_called()
        wrapped.transform.assert_called_once()
class TableauDashboardQueryExtractor(Extractor):
    """
    Extracts metadata about the queries associated with Tableau workbooks.

    Tableau's Metadata API calls these queries "custom SQL tables". Not every workbook uses
    custom SQL; most are built from a mix of datasource fields used directly and "calculated"
    columns. The extractor walks one query at a time and yields a relationship for every
    downstream workbook that uses the query.
    """

    # Configuration keys re-exported from the shared Tableau constants module.
    API_BASE_URL = const.API_BASE_URL
    API_VERSION = const.API_VERSION
    CLUSTER = const.CLUSTER
    EXCLUDED_PROJECTS = const.EXCLUDED_PROJECTS
    SITE_NAME = const.SITE_NAME
    TABLEAU_ACCESS_TOKEN_NAME = const.TABLEAU_ACCESS_TOKEN_NAME
    TABLEAU_ACCESS_TOKEN_SECRET = const.TABLEAU_ACCESS_TOKEN_SECRET
    VERIFY_REQUEST = const.VERIFY_REQUEST

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, build the GraphQL extractor and the record-to-model transformer.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        # self.query must be set before _build_extractor() runs, because that method reads it.
        self.query = """query { customSQLTables { id name query downstreamWorkbooks { name projectName } } }"""
        self._extractor = self._build_extractor()

        to_model = DictToModel()
        scoped_conf = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        model_defaults = ConfigFactory.from_dict(
            {MODEL_CLASS: 'databuilder.models.dashboard.dashboard_query.DashboardQuery'})
        to_model.init(conf=scoped_conf.with_fallback(model_defaults))

        self._transformer = ChainedTransformer(transformers=[to_model])

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.tableau_dashboard_query'

    def _build_extractor(self) -> TableauGraphQLApiQueryExtractor:
        """
        Builds a TableauGraphQLApiQueryExtractor.
        All data required can be retrieved with a single GraphQL call.

        :return: A TableauGraphQLApiQueryExtractor that provides dashboard query metadata.
        """
        extractor = TableauGraphQLApiQueryExtractor()

        # Layering: extractor-scoped config, then this extractor's whole config, then the defaults.
        layered_conf = Scoped.get_scoped_conf(self._conf, extractor.get_scope()).with_fallback(self._conf)
        defaults = ConfigFactory.from_dict({
            TableauGraphQLApiExtractor.QUERY: self.query,
            STATIC_RECORD_DICT: {'product': 'tableau'},
        })

        extractor.init(conf=layered_conf.with_fallback(defaults))
        return extractor
class ModeDashboardExtractor(Extractor):
    """
    An Extractor that extracts core metadata on Mode dashboards. https://app.mode.com/

    It extracts a list of reports, each consisting of:
        Dashboard group name (Space name)
        Dashboard group id (Space token)
        Dashboard group description (Space description)
        Dashboard name (Report name)
        Dashboard id (Report token)
        Dashboard description (Report description)

    Other information such as report run, owner, chart name and query name is handled by
    separate extractors.
    """

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, build the REST API extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf)

        # The RestApiQuery payload carries an ISO8601 timestamp; TimestampStringToEpoch turns it
        # into epoch before DictToModel converts the dictionary into a model.
        epoch_transformer = TimestampStringToEpoch()
        epoch_scoped = Scoped.get_scoped_conf(self._conf, epoch_transformer.get_scope())
        epoch_defaults = ConfigFactory.from_dict({
            FIELD_NAME: 'created_timestamp',
        })
        epoch_transformer.init(conf=epoch_scoped.with_fallback(epoch_defaults))

        # Fills 'dashboard_group_url' from the URL template.
        group_url_transformer = TemplateVariableSubstitutionTransformer()
        group_url_scoped = Scoped.get_scoped_conf(self._conf, group_url_transformer.get_scope())
        group_url_defaults = ConfigFactory.from_dict({
            VAR_FIELD_NAME: 'dashboard_group_url',
            TEMPLATE: 'https://app.mode.com/{organization}/spaces/{dashboard_group_id}'
        })
        group_url_transformer.init(conf=group_url_scoped.with_fallback(group_url_defaults))

        # Fills 'dashboard_url' from the URL template.
        report_url_transformer = TemplateVariableSubstitutionTransformer()
        report_url_scoped = Scoped.get_scoped_conf(self._conf, report_url_transformer.get_scope())
        report_url_defaults = ConfigFactory.from_dict({
            VAR_FIELD_NAME: 'dashboard_url',
            TEMPLATE: 'https://app.mode.com/{organization}/reports/{dashboard_id}'
        })
        report_url_transformer.init(conf=report_url_scoped.with_fallback(report_url_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict({
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_metadata.DashboardMetadata'
        })
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        pipeline: List[Transformer] = [
            epoch_transformer,
            group_url_transformer,
            report_url_transformer,
            to_model,
        ]
        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard'

    def _build_restapi_query(self) -> RestApiQuery:
        """
        Build REST API Query. To get Mode Dashboard metadata, two APIs (spaces API and reports API)
        are joined together.

        :return: A RestApiQuery that provides Mode Dashboard metadata
        """
        spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
        auth_params = ModeDashboardUtils.get_auth_params(conf=self._conf)

        # Reports
        # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
        # The JSONPATH expression walks the array under _embedded.reports and picks token, name,
        # description and created_at; field_names below are listed in that same order.
        return ModePaginatedRestApiQuery(
            query_to_join=spaces_query,
            url='https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports',
            params=auth_params,
            json_path='_embedded.reports[*].[token,name,description,created_at]',
            field_names=['dashboard_id', 'dashboard_name', 'description', 'created_timestamp'],
            skip_no_result=True)
class TableauDashboardExternalTableExtractor(Extractor):
    """
    Creates the "external" Tableau tables.

    In this context, "external" tables are "tables" that do not come from a typical database and
    are loaded using some other data format, like CSV files.
    This extractor has been tested with the following types of external tables:
        Excel spreadsheets
        Text files (including CSV files)
        Salesforce connections
        Google Sheets connections

    Excel spreadsheets, Salesforce connections, and Google Sheets connections are all classified as
    "databases" in terms of Tableau's Metadata API, with their "subsheets" forming their "tables"
    when present. However, these tables are not assigned a schema, so this extractor uses the name
    of the parent sheet as the schema and assigns a new table to each subsheet. The connection type
    is always used as the database, and for text files the schema is set using the
    EXTERNAL_SCHEMA_NAME config option. Since these external tables are usually named for human
    consumption only and often contain a wider range of characters, all inputs are transformed to
    remove any problematic occurrences before they are inserted: see the sanitize methods of
    TableauDashboardUtils for specifics.

    A more concrete example: if one had a Google Sheet titled "Growth by Region & County" with
    2 subsheets called "FY19 Report" and "FY20 Report", two tables would be generated with the
    following keys:
        googlesheets://external.growth_by_region_county/FY_19_Report
        googlesheets://external.growth_by_region_county/FY_20_Report
    """

    # Configuration keys re-exported from the shared Tableau constants module.
    API_VERSION = const.API_VERSION
    CLUSTER = const.CLUSTER
    EXCLUDED_PROJECTS = const.EXCLUDED_PROJECTS
    EXTERNAL_CLUSTER_NAME = const.EXTERNAL_CLUSTER_NAME
    EXTERNAL_SCHEMA_NAME = const.EXTERNAL_SCHEMA_NAME
    EXTERNAL_TABLE_TYPES = const.EXTERNAL_TABLE_TYPES
    SITE_NAME = const.SITE_NAME
    TABLEAU_HOST = const.TABLEAU_HOST
    TABLEAU_ACCESS_TOKEN_NAME = const.TABLEAU_ACCESS_TOKEN_NAME
    TABLEAU_ACCESS_TOKEN_SECRET = const.TABLEAU_ACCESS_TOKEN_SECRET
    VERIFY_REQUEST = const.VERIFY_REQUEST

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, prepare the GraphQL query and its variables, then build the
        GraphQL extractor and the record-to-model transformer.

        :param conf: configuration for this extractor; EXTERNAL_TABLE_TYPES is read from it as a list
        """
        self._conf = conf
        # Both self.query and self.query_variables are read by _build_extractor() below.
        self.query = """query externalTables($externalTableTypes: [String]) { databases (filter: {connectionTypeWithin: $externalTableTypes}) { name connectionType description tables { name } } }"""
        external_types = self._conf.get_list(TableauDashboardExternalTableExtractor.EXTERNAL_TABLE_TYPES)
        self.query_variables = {'externalTableTypes': external_types}
        self._extractor = self._build_extractor()

        to_model = DictToModel()
        scoped_conf = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        model_defaults = ConfigFactory.from_dict({
            MODEL_CLASS: 'databuilder.models.table_metadata.TableMetadata'
        })
        to_model.init(conf=scoped_conf.with_fallback(model_defaults))

        self._transformer = ChainedTransformer(transformers=[to_model])

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.tableau_external_table'

    def _build_extractor(self) -> TableauGraphQLExternalTableExtractor:
        """
        Builds a TableauGraphQLExternalTableExtractor.
        All data required can be retrieved with a single GraphQL call.

        :return: A TableauGraphQLExternalTableExtractor that creates external table metadata entities.
        """
        extractor = TableauGraphQLExternalTableExtractor()

        # Layering: extractor-scoped config, then this extractor's whole config, then the defaults.
        layered_conf = Scoped.get_scoped_conf(self._conf, extractor.get_scope()).with_fallback(self._conf)
        defaults = ConfigFactory.from_dict({
            TableauGraphQLApiExtractor.QUERY_VARIABLES: self.query_variables,
            TableauGraphQLApiExtractor.QUERY: self.query
        })

        extractor.init(conf=layered_conf.with_fallback(defaults))
        return extractor
class ModeDashboardQueriesExtractor(Extractor):
    """
    An Extractor that extracts Query information
    """

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Store the configuration, build the REST API extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf
        )

        # Constructing URL using several ID via TemplateVariableSubstitutionTransformer
        url_transformer = TemplateVariableSubstitutionTransformer()
        url_scoped = Scoped.get_scoped_conf(self._conf, url_transformer.get_scope())
        url_defaults = ConfigFactory.from_dict(
            {FIELD_NAME: 'url',
             TEMPLATE: 'https://app.mode.com/{organization}'
                       '/reports/{dashboard_id}/queries/{query_id}'})
        url_transformer.init(conf=url_scoped.with_fallback(url_defaults))

        # Escape backslash as it breaks Cypher statement.
        backslash_transformer = RegexStrReplaceTransformer()
        backslash_scoped = Scoped.get_scoped_conf(self._conf, backslash_transformer.get_scope())
        backslash_defaults = ConfigFactory.from_dict(
            {REGEX_REPLACE_TUPLE_LIST: [('\\', '\\\\')], ATTRIBUTE_NAME: 'query_text'})
        backslash_transformer.init(conf=backslash_scoped.with_fallback(backslash_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict(
            {MODEL_CLASS: 'databuilder.models.dashboard.dashboard_query.DashboardQuery'})
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        self._transformer = ChainedTransformer(
            transformers=[url_transformer, backslash_transformer, to_model])

    def extract(self):
        # type: () -> Any
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self):
        # type: () -> str
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard_query'

    def _build_restapi_query(self):
        # type: () -> RestApiQuery
        """
        Build REST API Query. To get Mode Dashboard queries, three APIs (spaces API, reports API,
        and queries API) are joined together.

        :return: A RestApiQuery that provides the queries of Mode Dashboards
        """
        spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
        auth_params = ModeDashboardUtils.get_auth_params(conf=self._conf)

        # Reports
        # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
        reports_query = ModePaginatedRestApiQuery(
            query_to_join=spaces_query,
            url='https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports',
            params=auth_params,
            json_path='(_embedded.reports[*].token)',
            field_names=['dashboard_id'],
            skip_no_result=True)

        # Queries of each report, joined on the dashboard_id produced above.
        return RestApiQuery(
            query_to_join=reports_query,
            url='https://app.mode.com/api/{organization}/reports/{dashboard_id}/queries',
            params=auth_params,
            json_path='_embedded.queries[*].[token,name,raw_query]',
            field_names=['query_id', 'query_name', 'query_text'],
            skip_no_result=True)
class ModeDashboardUserExtractor(Extractor):
    """
    An Extractor that extracts all Mode Dashboard users and adds the mode_user_id attribute to
    the User model.
    """

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, build the REST API extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf)

        # Remove all unnecessary fields because User model accepts all attributes and push it to Neo4j.
        field_remover = RemoveFieldTransformer()
        remover_scoped = Scoped.get_scoped_conf(self._conf, field_remover.get_scope())
        remover_defaults = ConfigFactory.from_dict({
            FIELD_NAMES: ['organization', 'mode_user_resource_path', 'product']
        })
        field_remover.init(conf=remover_scoped.with_fallback(remover_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict(
            {MODEL_CLASS: 'databuilder.models.user.User'})
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        pipeline: List[Transformer] = [field_remover, to_model]
        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard_owner'

    def _build_restapi_query(self) -> RestApiQuery:
        """
        Build REST API Query. A seed record carrying the organization is joined with the
        memberships API, whose result is in turn joined with the user resource API to get the email.

        :return: A RestApiQuery that provides Mode users
        """
        # Seed query record for next query api to join with
        seed_query = RestApiQuerySeed(seed_record=[{
            'organization': self._conf.get_string(ORGANIZATION),
            'is_active': None,
            'updated_at': None,
            'do_not_update_empty_attribute': True,
        }])

        access_token = self._conf.get_string(MODE_ACCESS_TOKEN)
        password_token = self._conf.get_string(MODE_PASSWORD_TOKEN)
        auth_params = {'auth': HTTPBasicAuth(access_token, password_token)}

        # memberships
        # https://mode.com/developer/api-reference/management/organization-memberships/#listMemberships
        memberships_query = RestApiQuery(
            query_to_join=seed_query,
            url='https://app.mode.com/api/{organization}/memberships',
            params=auth_params,
            json_path='(_embedded.memberships[*].member_username) | (_embedded.memberships[*]._links.user.href)',
            field_names=['mode_user_id', 'mode_user_resource_path'],
            skip_no_result=True,
            json_path_contains_or=True)

        # https://mode.com/developer/api-reference/management/users/
        # HTTP 404 is registered as a skippable status for the user lookup.
        skip_not_found = HttpFailureSkipOnStatus(status_codes_to_skip={404})
        return RestApiQuery(
            query_to_join=memberships_query,
            url='https://app.mode.com{mode_user_resource_path}',
            params=auth_params,
            json_path='email',
            field_names=['email'],
            skip_no_result=True,
            can_skip_failure=skip_not_found.can_skip_failure)
class ModeDashboardExecutionsExtractor(Extractor):
    """
    An Extractor that extracts run (execution) status and timestamp.
    """

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, build the REST API extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf)

        # The RestApiQuery payload carries an ISO8601 timestamp; TimestampStringToEpoch turns it
        # into epoch before DictToModel converts the dictionary into a model.
        epoch_transformer = TimestampStringToEpoch()
        epoch_scoped = Scoped.get_scoped_conf(self._conf, epoch_transformer.get_scope())
        epoch_defaults = ConfigFactory.from_dict({
            FIELD_NAME: 'execution_timestamp',
        })
        epoch_transformer.init(conf=epoch_scoped.with_fallback(epoch_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict({
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_execution.DashboardExecution'
        })
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        pipeline: List[Transformer] = [epoch_transformer, to_model]
        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard_execution'

    def _build_restapi_query(self) -> RestApiQuery:
        """
        Build REST API Query. To get Mode Dashboard last execution, three APIs (spaces API,
        reports API, and run API) are joined together.

        :return: A RestApiQuery that provides Mode Dashboard execution (run)
        """
        spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
        auth_params = ModeDashboardUtils.get_auth_params(conf=self._conf)

        # Reports
        # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
        last_run_path_query = ModePaginatedRestApiQuery(
            query_to_join=spaces_query,
            url='https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports',
            params=auth_params,
            json_path='(_embedded.reports[*].token) | (_embedded.reports[*]._links.last_run.href)',
            field_names=['dashboard_id', 'last_run_resource_path'],
            skip_no_result=True,
            json_path_contains_or=True)

        # https://mode.com/developer/api-reference/analytics/report-runs/#getReportRun
        return RestApiQuery(
            query_to_join=last_run_path_query,
            url='https://app.mode.com{last_run_resource_path}',
            params=auth_params,
            json_path='[state,completed_at]',
            field_names=['execution_state', 'execution_timestamp'],
            skip_no_result=True)
class TableauDashboardExtractor(Extractor):
    """
    Extracts core metadata about Tableau "dashboards".

    For the purposes of this extractor, Tableau "workbooks" are mapped to Amundsen dashboards, and
    the top-level project in which these workbooks reside is the dashboard group. The metadata it
    gathers is:
        Dashboard name (Workbook name)
        Dashboard description (Workbook description)
        Dashboard creation timestamp (Workbook creation timestamp)
        Dashboard group name (Workbook top-level folder name)

    Uses the Metadata API: https://help.tableau.com/current/api/metadata_api/en-us/index.html
    """

    # Configuration keys re-exported from the shared Tableau constants module.
    API_BASE_URL = const.API_BASE_URL
    API_VERSION = const.API_VERSION
    CLUSTER = const.CLUSTER
    EXCLUDED_PROJECTS = const.EXCLUDED_PROJECTS
    SITE_NAME = const.SITE_NAME
    TABLEAU_BASE_URL = const.TABLEAU_BASE_URL
    TABLEAU_ACCESS_TOKEN_NAME = const.TABLEAU_ACCESS_TOKEN_NAME
    TABLEAU_ACCESS_TOKEN_SECRET = const.TABLEAU_ACCESS_TOKEN_SECRET
    VERIFY_REQUEST = const.VERIFY_REQUEST

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, build the GraphQL extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        # self.query must be set before _build_extractor() runs, because that method reads it.
        self.query = """query { workbooks { id name createdAt description projectName projectVizportalUrlId vizportalUrlId } }"""
        self._extractor = self._build_extractor()

        epoch_transformer = TimestampStringToEpoch()
        epoch_scoped = Scoped.get_scoped_conf(self._conf, epoch_transformer.get_scope())
        epoch_defaults = ConfigFactory.from_dict({
            FIELD_NAME: 'created_timestamp',
        })
        epoch_transformer.init(conf=epoch_scoped.with_fallback(epoch_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict({
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_metadata.DashboardMetadata'
        })
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        pipeline: List[Transformer] = [epoch_transformer, to_model]
        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.tableau_dashboard_metadata'

    def _build_extractor(self) -> TableauGraphQLApiMetadataExtractor:
        """
        Builds a TableauGraphQLApiMetadataExtractor.
        All data required can be retrieved with a single GraphQL call.

        :return: A TableauGraphQLApiMetadataExtractor that provides core dashboard metadata.
        """
        extractor = TableauGraphQLApiMetadataExtractor()

        # Layering: extractor-scoped config, then this extractor's whole config, then the defaults.
        layered_conf = Scoped.get_scoped_conf(self._conf, extractor.get_scope()).with_fallback(self._conf)
        defaults = ConfigFactory.from_dict({
            TableauGraphQLApiExtractor.QUERY: self.query,
            STATIC_RECORD_DICT: {'product': 'tableau'},
        })

        extractor.init(conf=layered_conf.with_fallback(defaults))
        return extractor
class ModeDashboardChartsExtractor(Extractor):
    """
    An Extractor that extracts Dashboard charts
    """

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Store the configuration, build the REST API extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf)

        # Constructing URL using resource path via TemplateVariableSubstitutionTransformer
        chart_url_transformer = TemplateVariableSubstitutionTransformer()
        chart_url_scoped = Scoped.get_scoped_conf(self._conf, chart_url_transformer.get_scope())
        chart_url_defaults = ConfigFactory.from_dict({
            FIELD_NAME: 'chart_url',
            TEMPLATE: 'https://app.mode.com{chart_url}'
        })
        chart_url_transformer.init(conf=chart_url_scoped.with_fallback(chart_url_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict({
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_chart.DashboardChart'
        })
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        self._transformer = ChainedTransformer(transformers=[chart_url_transformer, to_model])

    def extract(self):
        # type: () -> Any
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self):
        # type: () -> str
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard_chart'

    def _build_restapi_query(self):
        # type: () -> RestApiQuery
        """
        Build REST API Query. To get Mode Dashboard charts, four APIs (spaces API, reports API,
        queries API, and charts API) are joined together.

        :return: A RestApiQuery that provides Mode Dashboard charts
        """
        spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
        auth_params = ModeDashboardUtils.get_auth_params(conf=self._conf)

        # Reports
        # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
        # NOTE(review): this listing is fetched with a plain RestApiQuery; confirm whether the
        # reports endpoint paginates and a paginated query is needed here.
        reports_query = RestApiQuery(
            query_to_join=spaces_query,
            url='https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports',
            params=auth_params,
            json_path='(_embedded.reports[*].token)',
            field_names=['dashboard_id'],
            skip_no_result=True)

        # Queries of each report, joined on dashboard_id.
        queries_query = RestApiQuery(
            query_to_join=reports_query,
            url='https://app.mode.com/api/{organization}/reports/{dashboard_id}/queries',
            params=auth_params,
            json_path='_embedded.queries[*].[token,name]',
            field_names=['query_id', 'query_name'],
            skip_no_result=True)

        # Charts of each query, joined on dashboard_id and query_id.
        return RestApiQuery(
            query_to_join=queries_query,
            url='https://app.mode.com/api/{organization}/reports/{dashboard_id}/queries/{query_id}/charts',
            params=auth_params,
            json_path='(_embedded.charts[*].token) | (_embedded.charts[*]._links.report_viz_web.href)',
            field_names=['chart_id', 'chart_url'],
            skip_no_result=True,
            json_path_contains_or=True)
class ModeDashboardQueriesExtractor(Extractor):
    """
    An Extractor that extracts Query information
    """

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, build the REST API extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf)

        # Constructing URL using several ID via TemplateVariableSubstitutionTransformer
        url_transformer = TemplateVariableSubstitutionTransformer()
        url_scoped = Scoped.get_scoped_conf(self._conf, url_transformer.get_scope())
        url_defaults = ConfigFactory.from_dict({
            FIELD_NAME: 'url',
            TEMPLATE: 'https://app.mode.com/{organization}'
                      '/reports/{dashboard_id}/queries/{query_id}'
        })
        url_transformer.init(conf=url_scoped.with_fallback(url_defaults))

        # Escape backslash as it breaks Cypher statement.
        backslash_transformer = RegexStrReplaceTransformer()
        backslash_scoped = Scoped.get_scoped_conf(self._conf, backslash_transformer.get_scope())
        backslash_defaults = ConfigFactory.from_dict({
            REGEX_REPLACE_TUPLE_LIST: [('\\', '\\\\')],
            ATTRIBUTE_NAME: 'query_text'
        })
        backslash_transformer.init(conf=backslash_scoped.with_fallback(backslash_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict({
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_query.DashboardQuery'
        })
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        pipeline: List[Transformer] = [url_transformer, backslash_transformer, to_model]
        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard_query'

    def _build_restapi_query(self) -> ModePaginatedRestApiQuery:
        """
        Build REST API Query to get Mode Dashboard queries

        :return: A paginated RestApiQuery that provides Mode Dashboard queries
        """
        seed_query = ModeDashboardUtils.get_seed_query(conf=self._conf)
        auth_params = ModeDashboardUtils.get_auth_params(conf=self._conf, discover_auth=True)

        # Queries
        # https://mode.com/developer/discovery-api/analytics/queries/
        # field_names are listed in the same order as the attributes picked by json_path.
        return ModePaginatedRestApiQuery(
            query_to_join=seed_query,
            url='https://app.mode.com/batch/{organization}/queries',
            params=auth_params,
            json_path='queries[*].[report_token, space_token, token, name, raw_query]',
            field_names=['dashboard_id', 'dashboard_group_id', 'query_id', 'query_name', 'query_text'],
            skip_no_result=True,
            max_record_size=1000,
            pagination_json_path='queries[*]')
class ModeDashboardExecutionsExtractor(Extractor):
    """
    An Extractor that extracts run (execution) status and timestamp.
    """

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration, build the REST API extractor and the transformer chain.

        :param conf: configuration for this extractor
        """
        self._conf = conf
        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf
        )

        # The RestApiQuery payload carries an ISO8601 timestamp; TimestampStringToEpoch turns it
        # into epoch before DictToModel converts the dictionary into a model.
        epoch_transformer = TimestampStringToEpoch()
        epoch_scoped = Scoped.get_scoped_conf(self._conf, epoch_transformer.get_scope())
        epoch_defaults = ConfigFactory.from_dict({FIELD_NAME: 'execution_timestamp', })
        epoch_transformer.init(conf=epoch_scoped.with_fallback(epoch_defaults))

        to_model = DictToModel()
        to_model_scoped = Scoped.get_scoped_conf(self._conf, to_model.get_scope())
        to_model_defaults = ConfigFactory.from_dict(
            {MODEL_CLASS: 'databuilder.models.dashboard.dashboard_execution.DashboardExecution'})
        to_model.init(conf=to_model_scoped.with_fallback(to_model_defaults))

        pipeline: List[Transformer] = [epoch_transformer, to_model]
        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record from the wrapped extractor and pass it through the transformer chain.

        :return: None when the wrapped extractor returns a falsy record, otherwise the result of
            the chained transformer for that record.
        """
        raw_record = self._extractor.extract()
        if raw_record:
            return self._transformer.transform(record=raw_record)
        return None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard_execution'

    def _build_restapi_query(self) -> ModePaginatedRestApiQuery:
        """
        Build REST API Query to get Mode Dashboard last execution.

        :return: A RestApiQuery that provides Mode Dashboard execution (run)
        """
        seed_query = ModeDashboardUtils.get_seed_query(conf=self._conf)
        auth_params = ModeDashboardUtils.get_auth_params(conf=self._conf, discover_auth=True)

        # Reports
        # https://mode.com/developer/discovery-api/analytics/reports/
        # field_names are listed in the same order as the attributes picked by json_path.
        return ModePaginatedRestApiQuery(
            query_to_join=seed_query,
            url='https://app.mode.com/batch/{organization}/reports',
            params=auth_params,
            json_path='reports[*].[token, space_token, last_run_at, last_run_state]',
            field_names=['dashboard_id', 'dashboard_group_id', 'execution_timestamp', 'execution_state'],
            skip_no_result=True,
            max_record_size=1000,
            pagination_json_path='reports[*]')
class TableauDashboardTableExtractor(Extractor):
    """
    Extracts metadata about the tables associated with Tableau workbooks.
    Both "regular" database tables and "external" tables are handled
    (see TableauExternalTableExtractor for more info on external tables).
    Assumes that all the nodes for both the dashboards and the tables have already been created.
    """

    API_BASE_URL = const.API_BASE_URL
    API_VERSION = const.API_VERSION
    CLUSTER = const.CLUSTER
    DATABASE = const.DATABASE
    EXCLUDED_PROJECTS = const.EXCLUDED_PROJECTS
    EXTERNAL_CLUSTER_NAME = const.EXTERNAL_CLUSTER_NAME
    SITE_NAME = const.SITE_NAME
    TABLEAU_ACCESS_TOKEN_NAME = const.TABLEAU_ACCESS_TOKEN_NAME
    TABLEAU_ACCESS_TOKEN_SECRET = const.TABLEAU_ACCESS_TOKEN_SECRET
    VERIFY_REQUEST = const.VERIFY_REQUEST

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration and GraphQL query, then build the underlying GraphQL extractor
        and the DictToModel transformer chain.

        :param conf: configuration for this extractor and the components it creates
        """
        self._conf = conf
        self.query = """query { workbooks { name projectName upstreamTables { name schema database { name connectionType } } } }"""
        self._extractor = self._build_extractor()

        # Each extracted dictionary is turned into a DashboardTable model.
        model_builder = DictToModel()
        model_conf = Scoped.get_scoped_conf(self._conf, model_builder.get_scope())
        model_defaults = ConfigFactory.from_dict(
            {MODEL_CLASS: 'databuilder.models.dashboard.dashboard_table.DashboardTable'})
        model_builder.init(conf=model_conf.with_fallback(model_defaults))

        self._transformer = ChainedTransformer(transformers=[model_builder])

    def extract(self) -> Any:
        """
        Fetch the next record and pass it through the transformer chain.

        :return: the chain's output for the next record, or None when the underlying extractor
            returns a falsy record
        """
        raw_record = self._extractor.extract()
        return self._transformer.transform(record=raw_record) if raw_record else None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.tableau_dashboard_table'

    def _build_extractor(self) -> TableauGraphQLDashboardTableExtractor:
        """
        Builds a TableauGraphQLDashboardTableExtractor. All data required can be retrieved with a single
        GraphQL call.

        :return: A TableauGraphQLDashboardTableExtractor that creates dashboard <> table relationships.
        """
        graphql_extractor = TableauGraphQLDashboardTableExtractor()

        # Lookup order: the extractor's own scope, then this extractor's conf, then the defaults.
        scoped_conf = Scoped.get_scoped_conf(self._conf, graphql_extractor.get_scope())
        with_parent_conf = scoped_conf.with_fallback(self._conf)
        default_conf = ConfigFactory.from_dict({
            TableauGraphQLApiExtractor.QUERY: self.query,
            STATIC_RECORD_DICT: {'product': 'tableau'},
        })

        graphql_extractor.init(conf=with_parent_conf.with_fallback(default_conf))
        return graphql_extractor
class TblColUsgAggExtractor(Extractor):
    """
    An aggregate extractor for table column usage.
    RegexStrReplaceTransformer cleanses the SQL statement and SqlToTblColUsageTransformer derives
    the table column usage from it.

    All usage is aggregated in memory and, once the raw extractor is exhausted, returned as a
    single aggregated TableColumnUsage.
    Note that this extractor does all the transformation and aggregation itself, so no further
    transformation is needed after it.
    """

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Initialize the raw extractor found in the configuration and assemble the transformer chain.

        :param conf: configuration holding the raw extractor under RAW_EXTRACTOR and, optionally,
            a section for the regex transformer's scope
        """
        self._extractor = conf.get(RAW_EXTRACTOR)  # type: Extractor
        raw_scope = self._extractor.get_scope()
        self._extractor.init(Scoped.get_scoped_conf(conf, raw_scope))

        # The regex cleansing step is optional: without configuration for its scope a no-op
        # transformer takes its place.
        cleanser = RegexStrReplaceTransformer()  # type: Any
        if not conf.get(cleanser.get_scope(), None):
            LOGGER.info('{} is not defined. Not using it'.format(cleanser.get_scope()))
            cleanser = NoopTransformer()
        else:
            cleanser.init(Scoped.get_scoped_conf(conf, cleanser.get_scope()))

        usage_parser = SqlToTblColUsageTransformer()
        usage_parser.init(Scoped.get_scoped_conf(conf, usage_parser.get_scope()))

        self._transformer = ChainedTransformer((cleanser, usage_parser))

    def extract(self):
        # type: () -> Optional[TableColumnUsage]
        """
        Drain the raw extractor, summing read counts per
        (database, cluster, schema, table, column, email) in memory.
        Table level aggregation is not expected to occupy much memory.

        :return: Provides a record or None if no more to extract
        """
        totals = {}  # type: Dict[TableColumnUsageTuple, int]

        processed = 0
        record = self._extractor.extract()
        while record:
            processed += 1
            if processed % 1000 == 0:
                LOGGER.info('Aggregated {} records'.format(processed))

            usage = self._transformer.transform(record=record)
            record = self._extractor.extract()

            # A falsy transform result means the record was filtered out.
            if usage:
                for reader in usage.col_readers:
                    usage_key = TableColumnUsageTuple(database=reader.database,
                                                      cluster=reader.cluster,
                                                      schema=reader.schema,
                                                      table=reader.table,
                                                      column=reader.column,
                                                      email=reader.user_email)
                    totals[usage_key] = totals.get(usage_key, 0) + reader.read_count

        if not totals:
            return None

        aggregated = []  # type: List[ColumnReader]
        # Entries are popped so the map is emptied while the result list is built.
        while totals:
            usage_key, read_total = totals.popitem()
            aggregated.append(ColumnReader(database=usage_key.database,
                                           cluster=usage_key.cluster,
                                           schema=usage_key.schema,
                                           table=usage_key.table,
                                           column=usage_key.column,
                                           user_email=usage_key.email,
                                           read_count=read_total))

        return TableColumnUsage(col_readers=aggregated)

    def get_scope(self):
        # type: () -> str
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.table_column_usage_aggregate'

    def close(self):
        # type: () -> None
        """
        Close the transformer chain.
        """
        self._transformer.close()
class TableauDashboardLastModifiedExtractor(Extractor):
    """
    Extracts metadata about the time of last update for Tableau dashboards.
    For the purposes of this extractor, Tableau "workbooks" are mapped to Amundsen dashboards, and the
    top-level project in which these workbooks preside is the dashboard group. The metadata it gathers is:
        Dashboard last modified timestamp (Workbook last modified timestamp)
    """

    API_BASE_URL = const.API_BASE_URL
    API_VERSION = const.API_VERSION
    CLUSTER = const.CLUSTER
    EXCLUDED_PROJECTS = const.EXCLUDED_PROJECTS
    SITE_NAME = const.SITE_NAME
    TABLEAU_ACCESS_TOKEN_NAME = const.TABLEAU_ACCESS_TOKEN_NAME
    TABLEAU_ACCESS_TOKEN_SECRET = const.TABLEAU_ACCESS_TOKEN_SECRET
    VERIFY_REQUEST = const.VERIFY_REQUEST

    def init(self, conf: ConfigTree) -> None:
        """
        Store the configuration and GraphQL query, then build the underlying GraphQL extractor
        and the transformer chain.

        :param conf: configuration for this extractor and the components it creates
        """
        self._conf = conf
        self.query = """query { workbooks { id name projectName updatedAt } }"""
        self._extractor = self._build_extractor()

        # First convert the 'last_modified_timestamp' string field to epoch, then turn the
        # dictionary into a DashboardLastModifiedTimestamp model.
        epoch_converter = TimestampStringToEpoch()
        epoch_conf = Scoped.get_scoped_conf(self._conf, epoch_converter.get_scope())
        epoch_defaults = ConfigFactory.from_dict({FIELD_NAME: 'last_modified_timestamp'})
        epoch_converter.init(conf=epoch_conf.with_fallback(epoch_defaults))

        model_builder = DictToModel()
        model_conf = Scoped.get_scoped_conf(self._conf, model_builder.get_scope())
        model_defaults = ConfigFactory.from_dict({
            MODEL_CLASS:
                'databuilder.models.dashboard.dashboard_last_modified.DashboardLastModifiedTimestamp',
        })
        model_builder.init(conf=model_conf.with_fallback(model_defaults))

        pipeline: List[Transformer] = [epoch_converter, model_builder]
        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record and pass it through the transformer chain.

        :return: the chain's output for the next record, or None when the underlying extractor
            returns a falsy record
        """
        raw_record = self._extractor.extract()
        return self._transformer.transform(record=raw_record) if raw_record else None

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.tableau_dashboard_last_modified'

    def _build_extractor(self) -> TableauGraphQLApiLastModifiedExtractor:
        """
        Builds a TableauGraphQLApiExtractor. All data required can be retrieved with a single GraphQL call.

        :return: A TableauGraphQLApiLastModifiedExtractor that provides dashboard update metadata.
        """
        graphql_extractor = TableauGraphQLApiLastModifiedExtractor()

        # Lookup order: the extractor's own scope, then this extractor's conf, then the defaults.
        scoped_conf = Scoped.get_scoped_conf(self._conf, graphql_extractor.get_scope())
        with_parent_conf = scoped_conf.with_fallback(self._conf)
        default_conf = ConfigFactory.from_dict({
            TableauGraphQLApiExtractor.QUERY: self.query,
            STATIC_RECORD_DICT: {'product': 'tableau'},
        })

        graphql_extractor.init(conf=with_parent_conf.with_fallback(default_conf))
        return graphql_extractor
class ModeDashboardExtractor(Extractor):
    """
    An Extractor that extracts core metadata on Mode dashboard. https://app.mode.com/
    It extracts list of reports that consists of:
        Dashboard group name (Space name)
        Dashboard group id (Space token)
        Dashboard group description (Space description)
        Dashboard name (Report name)
        Dashboard id (Report token)
        Dashboard description (Report description)

    Other information such as report run, owner, chart name, query name is in separate extractor.
    """

    def init(self, conf: ConfigTree) -> None:
        """
        Set up the Mode REST API extractor and the transformer chain applied to its records.

        :param conf: configuration for this extractor; DASHBOARD_GROUP_IDS_TO_SKIP (an empty list
            when absent) lists the dashboard group ids whose records are dropped by extract()
        """
        self._conf = conf
        self.dashboard_group_ids_to_skip = self._conf.get_list(DASHBOARD_GROUP_IDS_TO_SKIP, [])

        self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(
            restapi_query=self._build_restapi_query(),
            conf=self._conf
        )

        # Payload from RestApiQuery has timestamp which is ISO8601. TimestampStringToEpoch turns
        # it into epoch, the two template transformers fill in the URL fields, and DictToModel
        # finally converts the dictionary into a model.
        # Each entry pairs a transformer class with the fallback values for its scoped config;
        # entries are listed in the order the transformers run.
        stage_specs = (
            (TimestampStringToEpoch, {
                FIELD_NAME: 'created_timestamp',
            }),
            (TemplateVariableSubstitutionTransformer, {
                VAR_FIELD_NAME: 'dashboard_group_url',
                TEMPLATE: 'https://app.mode.com/{organization}/spaces/{dashboard_group_id}',
            }),
            (TemplateVariableSubstitutionTransformer, {
                VAR_FIELD_NAME: 'dashboard_url',
                TEMPLATE: 'https://app.mode.com/{organization}/reports/{dashboard_id}',
            }),
            (DictToModel, {
                MODEL_CLASS: 'databuilder.models.dashboard.dashboard_metadata.DashboardMetadata',
            }),
        )

        pipeline: List[Transformer] = []
        for stage_cls, fallback_values in stage_specs:
            stage = stage_cls()
            scoped_conf = Scoped.get_scoped_conf(self._conf, stage.get_scope())
            stage.init(conf=scoped_conf.with_fallback(ConfigFactory.from_dict(fallback_values)))
            pipeline.append(stage)

        self._transformer = ChainedTransformer(transformers=pipeline)

    def extract(self) -> Any:
        """
        Fetch the next record whose dashboard group is not skipped and pass it through the
        transformer chain.

        :return: the chain's output for that record, or None when the underlying extractor
            returns a falsy record
        """
        while True:
            candidate = self._extractor.extract()
            if not candidate:
                return None
            # Records belonging to a skipped dashboard group are dropped and the next one is tried.
            if candidate.get('dashboard_group_id') not in self.dashboard_group_ids_to_skip:
                return self._transformer.transform(record=candidate)

    def get_scope(self) -> str:
        """
        :return: the configuration scope of this extractor
        """
        return 'extractor.mode_dashboard'

    def _build_restapi_query(self) -> ModePaginatedRestApiQuery:
        """
        Build REST API Query to get Mode Dashboard metadata

        :return: A RestApiQuery that provides Mode Dashboard metadata
        """
        seed = ModeDashboardUtils.get_seed_query(conf=self._conf)
        auth_params = ModeDashboardUtils.get_auth_params(conf=self._conf, discover_auth=True)

        # The spaces query is merged into the report records on 'dashboard_group_id'.
        spaces = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
        merger = QueryMerger(query_to_merge=spaces, merge_key='dashboard_group_id')

        # Reports
        # https://mode.com/developer/discovery-api/analytics/reports/
        return ModePaginatedRestApiQuery(
            query_to_join=seed,
            url='https://app.mode.com/batch/{organization}/reports',
            params=auth_params,
            json_path='reports[*].[token, name, description, created_at, space_token]',
            field_names=[
                'dashboard_id',
                'dashboard_name',
                'description',
                'created_timestamp',
                'dashboard_group_id',
            ],
            skip_no_result=True,
            max_record_size=1000,
            pagination_json_path='reports[*]',
            query_merger=merger,
        )