def read_dremio_environment_from_directory(self):
    """Read a previously saved Dremio environment from a directory tree.

    Loads the Dremio configuration JSON plus each per-entity-type
    subdirectory ('homes', 'spaces', 'sources', 'reflections', ...) into a
    new DremioData object.

    Returns:
        DremioData populated from the files under
        self._config.source_directory.

    Raises:
        Exception: wrapping any OSError hit while reading files; the
            original error is attached as the cause.
    """
    try:
        source_directory = self._config.source_directory
        dremio_data = DremioData()
        # 'with' guarantees the handle is closed even if json.load raises.
        with open(os.path.join(source_directory, self._config.dremio_conf_filename), "r") as f:
            dremio_data.dremio_get_config = json.load(f)
        # NOTE(review): dremio_data.homes is passed as both the container
        # list and the dataset list here, while 'spaces' uses vds_list for
        # the latter — preserved as-is, confirm against _collect_directory.
        self._collect_directory(os.path.join(source_directory, 'homes'),
                                dremio_data.homes, dremio_data.folders, dremio_data.homes)
        self._collect_directory(os.path.join(source_directory, 'spaces'),
                                dremio_data.spaces, dremio_data.folders, dremio_data.vds_list)
        self._collect_directory(os.path.join(source_directory, 'sources'),
                                dremio_data.sources, None, dremio_data.pds_list)
        self._collect_directory(os.path.join(source_directory, 'reflections'),
                                None, None, dremio_data.reflections)
        self._collect_directory(os.path.join(source_directory, 'referenced_users'),
                                None, None, dremio_data.referenced_users)
        self._collect_directory(os.path.join(source_directory, 'referenced_groups'),
                                None, None, dremio_data.referenced_groups)
        self._collect_directory(os.path.join(source_directory, 'queues'),
                                None, None, dremio_data.queues)
        self._collect_directory(os.path.join(source_directory, 'rules'),
                                None, None, dremio_data.rules)
        self._collect_directory(os.path.join(source_directory, 'tags'),
                                None, None, dremio_data.tags)
        self._collect_directory(os.path.join(source_directory, 'wikis'),
                                None, None, dremio_data.wikis)
    except OSError as e:
        # Chain the OSError so the root cause survives in tracebacks.
        raise Exception("Error reading file. OS Error: " + e.strerror) from e
    return dremio_data
def read_dremio_environment_from_json_file(self, filename):
    """Read a previously saved Dremio environment from a single JSON file.

    The file must contain a top-level 'data' list; each item carries one
    section keyed by entity type (e.g. 'homes', 'vds', 'tags').

    Args:
        filename: path to the JSON export file.

    Returns:
        DremioData populated from the file's 'data' sections.
    """
    # 'with' guarantees the handle is closed even if json.load raises.
    with open(filename, "r") as f:
        data = json.load(f)['data']
    # Maps an item key to the DremioData attribute it populates. Tuple
    # order preserves the original elif chain, so an item containing
    # several keys is dispatched identically.
    key_to_attr = (
        ('containers', 'containers'),
        ('homes', 'homes'),
        ('sources', 'sources'),
        ('spaces', 'spaces'),
        ('folders', 'folders'),
        ('pds', 'pds_list'),
        ('vds', 'vds_list'),
        ('files', 'files'),
        ('reflections', 'reflections'),
        ('referenced_users', 'referenced_users'),
        ('referenced_groups', 'referenced_groups'),
        ('queues', 'queues'),
        ('rules', 'rules'),
        ('tags', 'tags'),
        ('wikis', 'wikis'),
        ('votes', 'votes'),
        ('vds_parents', 'vds_parents'),
        ('dremio_get_config', 'dremio_get_config'),
    )
    dremio_data = DremioData()
    for item in data:
        if 'dremio_environment' in item:
            logging.info("read_dremio_environment: processing environment " + str(item))
            continue
        for key, attr in key_to_attr:
            if key in item:
                setattr(dremio_data, attr, item[key])
                break
        else:
            # logging.warn is a deprecated alias for logging.warning.
            logging.warning("read_dremio_environment: unexpected data in the source file " + str(item))
    return dremio_data
class DremioReader:
    """Reads metadata from a source Dremio environment into a DremioData object.

    Walks the source catalog (homes, spaces, sources, folders, datasets) and
    collects reflections, WLM queues/rules, votes, tags, wikis, and referenced
    ACL users/groups as directed by the job configuration.
    """

    def __init__(self, source_dremio, config):
        # Dremio Cloner Configuration, Utils, ...
        self._config = config
        # Dremio object pointing to the source Dremio environment
        self._dremio_env = source_dremio
        self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
        self._filter = DremioClonerFilter(config)
        # DremioData object containing data from the Dremio source environment.
        # Created per instance: a class-level DremioData() would be shared
        # (and mutated) across all DremioReader instances.
        self._d = DremioData()
        # Current top-level hierarchy context: "HOME", "SPACE", or "SOURCE"
        self._top_level_hierarchy_context = None

    def read_dremio_environment(self):
        """Read all data from the source Dremio environment; return DremioData."""
        self._read_catalog()
        if not self._config.pds_list_useapi and self._filter.is_pds_in_scope():
            self._read_all_pds()
        self._read_reflections()
        self._read_rules()
        self._read_queues()
        self._read_votes()
        # Make sure that all VDS dependencies are included as per configuration
        self._process_vds_dependencies()
        return self._d

    def _read_all_pds(self):
        """Collect PDSs via the non-API path, honoring pds filters."""
        if self._config.pds_list_useapi or not self._filter.is_pds_in_scope():
            self._logger.info("_read_all_pds: skipping PDS reading as per pds.filter configuration.")
        else:
            pds_list = self._dremio_env.list_pds(
                self._d.sources,
                self._config.source_folder_filter,
                self._config.source_folder_exclude_filter,
                self._config.pds_filter,
                self._config.pds_exclude_filter,
                pds_error_list=self._d.pds_error_list)
            for pds in pds_list:
                if self._filter.match_pds_filter(pds):
                    self._d.pds_list.append(pds)

    def _read_catalog(self):
        """Read the Dremio catalog recursively, visiting each top-level container."""
        containers = self._dremio_env.list_catalog()['data']
        for container in containers:
            self._logger.debug("_read_catalog: processing container " + self._utils.get_entity_desc(container))
            self._process_container(container)

    def _process_container(self, container):
        """Identify a container's type and delegate processing."""
        self._logger.debug("_process_container: " + self._utils.get_entity_desc(container))
        if container['containerType'] == "HOME":
            self._read_home(container)
        elif container['containerType'] == "SPACE":
            self._read_space(container)
        elif container['containerType'] == "SOURCE":
            self._read_source(container)
        else:
            self._logger.fatal("_process_container: unexpected entity type " + self._utils.get_entity_desc(container))

    def _read_home(self, container):
        """Read a HOME container and its children (if home processing enabled)."""
        self._logger.debug("_read_home: processing container: " + self._utils.get_entity_desc(container))
        if self._config.home_process_mode == 'process':
            self._top_level_hierarchy_context = "HOME"
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.info("_read_home: " + self._utils.get_entity_desc(entity))
                self._d.homes.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error("_read_home: error reading entity for container: " + self._utils.get_entity_desc(container))
        else:
            self._logger.debug("_read_home: skipping due to job configuration")

    def _read_space(self, container):
        """Read a SPACE container and its children (if it matches the space filter)."""
        self._logger.debug("_read_space: processing container: " + self._utils.get_entity_desc(container))
        self._top_level_hierarchy_context = "SPACE"
        if self._filter.match_space_filter(container):
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.debug("_read_space: " + self._utils.get_entity_desc(container))
                self._d.spaces.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error("_read_space: error reading entity for container: " + self._utils.get_entity_desc(container))

    def _read_source(self, container):
        """Read a SOURCE container; children only when pds_list_useapi is set."""
        self._logger.debug("_read_source: processing container: " + self._utils.get_entity_desc(container))
        if self._config.source_process_mode == 'process' or (
                self._config.pds_process_mode == 'process' and self._config.pds_list_useapi):
            self._top_level_hierarchy_context = "SOURCE"
            if self._filter.match_source_filter(container):
                self._d.containers.append(container)
                entity = self._get_entity_definition_by_id(container)
                if entity is not None:
                    # Re-validate the filter with entity since there is more detail in entity
                    if self._filter.match_source_filter(entity):
                        self._logger.debug("_read_source: " + self._utils.get_entity_desc(entity))
                        self._d.sources.append(entity)
                        self._read_acl(entity)
                        self._read_wiki(entity)
                        # Depending on the useapi flag, PDSs can instead be
                        # collected via INFORMATION_SCHEMA. See also DX16597.
                        if self._config.pds_list_useapi:
                            self._read_source_children(entity)
                else:
                    self._logger.error("_read_source: error reading entity for container: " + self._utils.get_entity_desc(container))
        else:
            self._logger.debug("_read_source: skipping due to job configuration")

    def _read_space_folder(self, folder):
        """Process a folder inside a Space or Home and recurse into its children."""
        self._logger.debug("_read_space_folder: processing folder: " + self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context not in ["SPACE", "HOME"]:
            return
        entity = self._get_entity_definition_by_id(folder)
        if entity is None:
            self._logger.error("_read_space_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder))
            return
        if self._top_level_hierarchy_context == "HOME" or self._filter.match_space_folder_filter(folder):
            self._logger.debug("_read_space_folder: " + self._utils.get_entity_desc(folder))
            self._d.folders.append(entity)
            self._read_acl(entity)
            self._read_wiki(entity)
            # Validate all parent folders in the path have been saved already
            folder_path = entity['path']
            for i in range(1, len(folder_path) - 1):
                folder_saved = False
                for item in self._d.folders:
                    if item['path'][-1] == folder_path[i]:
                        folder_saved = True
                if not folder_saved:
                    parent_entity = self._get_entity_definition_by_path(folder_path[0:i + 1])
                    self._d.folders.append(parent_entity)
        # NOTE(review): children are read even when the folder itself was
        # filtered out — confirm this matches the intended filter semantics.
        self._read_space_children(entity)

    def _read_space_children(self, parent_entity):
        """Dispatch each child of a Space/Home entity to its reader."""
        self._logger.debug("_read_space_children: processing parent_entity: " + self._utils.get_entity_desc(parent_entity))
        if 'entityType' not in parent_entity:
            self._logger.error("_read_space_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity))
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_space_folder(child)
            else:
                self._logger.error("_read_space_children: not supported entity type " + child['type'])

    def _read_source_folder(self, folder):
        """Process a folder inside a Source and recurse into its children."""
        self._logger.debug("_read_source_folder: processing folder: " + self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context == "SOURCE" and self._filter.match_source_folder_filter(folder):
            entity = self._get_entity_definition_by_id(folder)
            if entity is not None:
                self._logger.debug("_read_source_folder: " + self._utils.get_entity_desc(folder))
                self._read_source_children(entity)
            else:
                self._logger.error("_read_source_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder))

    def _read_source_children(self, parent_entity):
        """Dispatch each child of a Source entity to its reader."""
        self._logger.debug("_read_source_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'")
        if 'entityType' not in parent_entity:
            self._logger.error("_read_source_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity))
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_source_folder(child)
            else:
                self._logger.error("_read_source_children: not supported entity type " + child['type'])

    def _read_dataset(self, dataset):
        """Route a dataset into pds_list or vds_list and read its ACL/wiki/tags."""
        self._logger.debug("_read_dataset: processing dataset: " + self._utils.get_entity_desc(dataset))
        entity = self._get_entity_definition_by_id(dataset)
        if entity is not None:
            self._logger.debug("_read_dataset: " + dataset['datasetType'] + " : " + self._utils.get_entity_desc(dataset))
            if dataset['datasetType'] == "PROMOTED" or dataset['datasetType'] == "DIRECT":
                self._d.pds_list.append(entity)
            elif dataset['datasetType'] == "VIRTUAL":
                tags = self._dremio_env.get_catalog_tags(entity['id'])
                if self._filter.match_vds_filter(dataset, tags=tags):
                    self._d.vds_list.append(entity)
            else:
                self._logger.error("_read_dataset: Unexpected dataset type " + dataset['datasetType'] + " for " + self._utils.get_entity_desc(dataset) + ".")
            self._read_acl(entity)
            self._read_wiki(entity)
            self._read_tags(entity)

    def _read_file(self, file_name):
        """Files are intentionally not processed; no-op."""
        # do nothing
        return

    def _read_reflections(self):
        """Collect reflections (not available on CE) with their dataset paths."""
        self._logger.debug("_read_reflections: starting")
        if self._config.reflection_process_mode == 'process' and not self._config.source_ce:
            reflections = self._dremio_env.list_reflections()['data']
            for reflection in reflections:
                reflection_dataset = self._dremio_env.get_catalog_entity_by_id(reflection['datasetId'])
                if reflection_dataset is None:
                    self._logger.debug("_read_reflections: error processing reflection, cannot get path for dataset: " + reflection['datasetId'])
                    continue
                reflection_path = reflection_dataset['path']
                self._logger.debug("_read_reflections: processing reflection " + reflection['datasetId'] + " path: " + str(reflection_path))
                reflection["path"] = reflection_path
                self._d.reflections.append(reflection)
                # self._read_acl(reflection)
                # self._read_wiki(reflection)
        else:
            self._logger.debug("_read_reflections: skipping reflections processing as per job configuration")

    # Note, tags are only available for datasets
    def _read_tags(self, entity):
        """Collect the tags attached to an entity (de-duplicated)."""
        self._logger.debug("_read_tags: for entity " + self._utils.get_entity_desc(entity))
        if self._config.tag_process_mode == 'process':
            tag = self._dremio_env.get_catalog_tags(entity['id'])
            if tag is not None:
                tag['entity_id'] = entity['id']
                # Top-level containers carry 'name' instead of 'path'.
                if entity['entityType'] == 'space' or entity['entityType'] == 'source':
                    tag['path'] = [entity['name']]
                else:
                    tag['path'] = entity['path']
                if tag not in self._d.tags:
                    self._d.tags.append(tag)
        else:
            self._logger.debug("_read_tags: skipping tags processing as per job configuration")

    def _read_wiki(self, entity):
        """Collect the wiki attached to an entity (de-duplicated)."""
        self._logger.debug("_read_wiki: for entity " + self._utils.get_entity_desc(entity))
        if self._config.wiki_process_mode == 'process':
            wiki = self._dremio_env.get_catalog_wiki(entity['id'])
            if wiki is not None:
                wiki["entity_id"] = entity['id']
                # Top-level containers carry 'name' instead of 'path'.
                if entity['entityType'] == 'space' or entity['entityType'] == 'source' or entity['entityType'] == 'home':
                    wiki['path'] = [entity['name']]
                else:
                    wiki['path'] = entity['path']
                if wiki not in self._d.wikis:
                    self._d.wikis.append(wiki)
        else:
            self._logger.debug("_read_wiki: skipping wiki processing as per job configuration")

    def _read_acl(self, entity):
        """Record users/groups referenced by the entity's ACL (de-duplicated)."""
        self._logger.debug("_read_acl: for entity " + self._utils.get_entity_desc(entity))
        if 'accessControlList' in entity:
            acl = entity['accessControlList']
            if 'users' in acl:
                for user in acl['users']:
                    user_entity = self._dremio_env.get_user(user['id'])
                    if user_entity is not None:
                        if user_entity not in self._d.referenced_users:
                            self._d.referenced_users.append(user_entity)
            if 'groups' in acl:
                for group in acl['groups']:
                    group_entity = self._dremio_env.get_group(group['id'])
                    if group_entity is not None:
                        if group_entity not in self._d.referenced_groups:
                            self._d.referenced_groups.append(group_entity)

    def _process_vds_dependencies(self):
        """Resolve VDS dependencies and build the parent graph, if configured."""
        if self._config.vds_dependencies_process_mode == 'get':
            for vds in self._d.vds_list:
                self._discover_dependencies(vds)
            for vds in self._d.vds_list:
                self._populate_dependencies_graph(vds)

    def _discover_dependencies(self, dataset):
        """Discover dependencies of a dataset, adding them to pds_list/vds_list."""
        self._logger.debug("_discover_dependencies: processing dataset: " + self._utils.get_entity_desc(dataset))
        if dataset is not None:
            if 'type' not in dataset:
                self._logger.error("_discover_dependencies: Expected Dataset Entity but got: " + self._utils.get_entity_desc(dataset))
                return
            if dataset['type'] == 'PHYSICAL_DATASET':
                if dataset not in self._d.pds_list:
                    self._d.pds_list.append(dataset)
                return
            elif dataset['type'] == 'VIRTUAL_DATASET':
                if dataset not in self._d.vds_list:
                    self._d.vds_list.append(dataset)
                # Process VDS dependencies
                sql_dependency_paths = self._get_vds_dependency_paths(dataset)
                for dependency_path in sql_dependency_paths:
                    dependency_path = self._utils.get_absolute_path(dependency_path, self._utils.get_sql_context(dataset))
                    entity = self._find_entity(dependency_path)
                    if entity is not None:
                        # Entity has already been read.
                        # NOTE(review): 'return' abandons the remaining
                        # dependency paths of this VDS; 'continue' may be
                        # intended — confirm before changing.
                        return
                    dependency_dataset = self._dremio_env.get_catalog_entity_by_path(dependency_path)
                    if dependency_dataset is None:
                        self._logger.warn("_discover_dependencies: unable to resolve dataset likely due to datasource availability: " + dependency_path)
                    else:
                        self._discover_dependencies(dependency_dataset)
            else:
                self._logger.error("_discover_dependencies: Unknown Entity Type: " + dataset['type'])
        else:
            self._logger.error("_discover_dependencies: Could not resolve dependency: None")

    def _populate_dependencies_graph(self, vds):
        """Record the parent list of a VDS into vds_parents (graph-capable sources only)."""
        self._logger.debug("_populate_dependencies_graph: processing vds: " + self._utils.get_entity_desc(vds))
        vds_parent_list = self._get_vds_dependency_paths(vds)
        vds_parent_json = {'id': vds['id'], 'path': vds['path'], 'parents': vds_parent_list}
        if not self._config.source_ce and self._config.source_graph_support:
            self._d.vds_parents.append(vds_parent_json)

    def _get_vds_dependency_paths(self, vds):
        """Return parent paths of a VDS via the Graph API, else by parsing its SQL."""
        self._logger.debug("_get_vds_dependency_paths: processing vds: " + self._utils.get_entity_desc(vds))
        if self._config.source_ce or not self._config.source_graph_support:
            return parse_sql.tables_in_query(vds['sql'])
        else:
            graph = self._dremio_env.get_catalog_entity_graph_by_id(vds['id'])
            if graph is None:
                # Fall back to SQL parsing when the Graph API is unavailable.
                self._logger.warn("Could not receive Graph via API. Try to set graph_api_support to False in the job configuration.")
                return parse_sql.tables_in_query(vds['sql'])
            vds_parent_list = []
            for parent in graph['parents']:
                vds_parent_list.append(self._utils.normalize_path(parent['path']))
            return vds_parent_list

    def _find_entity(self, path):
        """Return the already-read VDS or PDS matching a normalized path, else None."""
        self._logger.debug("_find_entity: processing path: " + str(path))
        for vds in self._d.vds_list:
            if self._utils.normalize_path(vds['path']) == path:
                return vds
        for pds in self._d.pds_list:
            if self._utils.normalize_path(pds['path']) == path:
                return pds

    # Helper method, used by most read* methods
    def _get_entity_definition_by_id(self, src):
        """Retrieve the full entity definition for src['id']; None on bad data."""
        self._logger.debug("_get_entity_definition_by_id: processing src: " + self._utils.get_entity_desc(src))
        if 'id' not in src:
            self._logger.error("_read_entity_definition: bad data, skipping entity: " + self._utils.get_entity_desc(src))
            return None
        else:
            entity = self._dremio_env.get_catalog_entity_by_id(src['id'])
            if entity is None:
                self._logger.error("_read_entity_definition: cannot retrieve entity for id: " + src['id'])
            return entity

    def _get_entity_definition_by_path(self, path):
        """Retrieve the full entity definition for a (normalized) path."""
        self._logger.debug("_get_entity_definition_by_path: processing path: " + str(path))
        path = self._utils.normalize_path(path)
        entity = self._dremio_env.get_catalog_entity_by_path(path)
        if entity is None:
            self._logger.error("_read_entity_definition: cannot retrieve entity for path: " + str(path))
        return entity

    def _read_queues(self):
        """Collect WLM queues (not available on CE), if configured."""
        self._logger.debug("read_queues: started")
        if self._config.wlm_queue_process_mode == 'process' and not self._config.source_ce:
            self._d.queues = self._dremio_env.list_queues()['data']
        else:
            self._logger.debug("_read_queues: skipping as per job configuration")

    def _read_rules(self):
        """Collect WLM rules (not available on CE), if configured."""
        self._logger.debug("read_rules: started")
        if self._config.wlm_rule_process_mode == 'process' and not self._config.source_ce:
            self._d.rules = self._dremio_env.list_rules()['rules']
        else:
            self._logger.debug("read_rules: skipping as per job configuration")

    def _read_votes(self):
        """Collect dataset votes (not available on CE), if configured."""
        self._logger.debug("read_votes: started")
        if self._config.vote_process_mode == 'process' and not self._config.source_ce:
            self._d.votes = self._dremio_env.list_votes()['data']
        else:
            self._logger.debug("read_votes: skipping as per job configuration")

    def get_errors_count(self):
        """Return the number of errors the logger has recorded so far."""
        return self._logger.errors_encountered