# NOTE(review): this def appears at module level, detached from any class — it
# duplicates DremioWriter.__init__ below and looks like a paste artifact.
# Confirm against upstream whether it should be removed entirely.
def __init__(self, target_dremio, dremio_data, config):
	"""Initialize a writer against the target Dremio environment.

	target_dremio -- Dremio environment object to write to
	dremio_data   -- source Dremio data snapshot to be written
	config        -- DremioCloner configuration object
	"""
	self._config = config
	self._dremio_env = target_dremio
	self._d = dremio_data
	self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
	self._filter = DremioClonerFilter(config)
	self._utils = DremioClonerUtils(config)
# NOTE(review): this def also appears at module level, detached from any class —
# its signature (source_dremio) suggests it belongs to a reader class defined
# elsewhere. Confirm against upstream whether it is a paste artifact.
def __init__(self, source_dremio, config):
	"""Initialize a reader against the source Dremio environment.

	source_dremio -- Dremio environment object to read from
	config        -- DremioCloner configuration object
	"""
	self._config = config
	self._dremio_env = source_dremio
	self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
	self._utils = DremioClonerUtils(config)
	self._filter = DremioClonerFilter(config)
class DremioWriter:
	"""Writes a previously read Dremio environment (sources, PDSs, spaces,
	folders, VDSs, reflections, wikis, tags) into a target Dremio environment,
	applying filters and ACL user/group transformations along the way."""

	# Dremio Cloner Config, Utils, ...
	_config = None
	_utils = None
	_logger = None
	_filter = None
	# Dremio Environment to write to
	_dremio_env = None
	# Dremio Data to write
	_d = None
	# VDS list grouped by hierarchy
	_vds_hierarchy = []
	_hierarchy_depth = 0
	_unresolved_vds = []
	# Referenced Users and Groups in the target environment
	_target_dremio_users = []
	_target_dremio_groups = []
	# Resolved Datasets for Reflections
	_existing_reflections = list()
	# Dry run collections
	_dry_run_processed_vds_list = []
	_dry_run_processed_pds_list = []

	def __init__(self, target_dremio, dremio_data, config):
		"""Set up config, environment handles, and helper objects.

		target_dremio -- Dremio environment object to write to
		dremio_data   -- source Dremio data snapshot to be written
		config        -- DremioCloner configuration object
		"""
		self._config = config
		self._dremio_env = target_dremio
		self._d = dremio_data
		self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
		self._filter = DremioClonerFilter(config)
		self._utils = DremioClonerUtils(config)
		# Fix: shadow the mutable class-level collections with per-instance
		# copies so state is never shared between DremioWriter instances.
		self._vds_hierarchy = []
		self._hierarchy_depth = 0
		self._unresolved_vds = []
		self._target_dremio_users = []
		self._target_dremio_groups = []
		self._existing_reflections = list()
		self._dry_run_processed_vds_list = []
		self._dry_run_processed_pds_list = []

	def write_dremio_environment(self):
		"""Write the entire in-memory environment to the target, honoring the
		per-object-type process_mode configuration (skip/create/update)."""
		self._retrieve_users_groups()
		if self._config.acl_transformation != {} and self._d.referenced_users == [] and self._d.referenced_groups == []:
			self._logger.warn("ACL Transformation has been defined while Referenced Users and Referenced Groups are not present in the Source Dremio Data.")
		if self._config.reflection_process_mode != 'skip':
			self._existing_reflections = self._dremio_env.list_reflections()['data']
		if self._config.source_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source processing due to configuration source.process_mode=skip.")
		else:
			for source in self._d.sources:
				self._write_source(source, self._config.source_process_mode, self._config.source_ignore_missing_acl_user, self._config.source_ignore_missing_acl_group)
		if self._config.pds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source PDS processing due to configuration source.pds.process_mode=skip.")
		else:
			for pds in self._d.pds_list:
				self._write_pds(pds, self._config.pds_process_mode, self._config.pds_ignore_missing_acl_user, self._config.pds_ignore_missing_acl_group)
		if self._config.space_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping space processing due to configuration space.process_mode=skip.")
		else:
			for space in self._d.spaces:
				self._write_space(space, self._config.space_process_mode, self._config.space_ignore_missing_acl_user, self._config.space_ignore_missing_acl_group)
		if self._config.folder_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping folder processing due to configuration folder.process_mode=skip.")
		else:
			for folder in self._d.folders:
				self._write_folder(folder, self._config.folder_process_mode, self._config.folder_ignore_missing_acl_user, self._config.folder_ignore_missing_acl_group)
		if self._config.vds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping VDS processing due to configuration vds.process_mode=skip.")
		else:
			# VDSs must be written parents-first: order them by dependency
			# hierarchy, write the ordered ones, then retry the leftovers.
			self._order_vds(0)
			self._write_vds_hierarchy()
			self._write_remainder_vds()
		if self._config.reflection_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping reflection processing due to configuration reflection.process_mode=skip.")
		else:
			for reflection in self._d.reflections:
				self._write_reflection(reflection, self._config.reflection_process_mode)
		if self._config.reflection_refresh_mode != 'refresh':
			self._logger.info("write_dremio_environment: Skipping reflection refresh due to configuration reflection.refresh_mode=skip.")
		else:
			for pds in self._d.pds_list:
				self._dremio_env.refresh_reflections_by_pds_path(self._utils.normalize_path(pds['path']), self._config.dry_run)
		if self._config.wiki_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping wiki processing due to configuration wiki.process_mode=skip.")
		else:
			for wiki in self._d.wikis:
				self._write_wiki(wiki, self._config.wiki_process_mode)
		if self._config.tag_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping tag processing due to configuration tag.process_mode=skip.")
		else:
			for tags in self._d.tags:
				self._write_tags(tags, self._config.tag_process_mode)

	def _write_space(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		"""Write a space if it matches the space filter; return _write_entity's
		result, or None when filtered out."""
		if self._filter.match_space_filter(entity):
			self._logger.debug("_write_space: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_space: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_source(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		"""Write a source if it matches the source filter; return _write_entity's
		result, or None when filtered out."""
		if self._filter.match_source_filter(entity):
			self._logger.debug("_write_source: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_source: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_folder(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		"""Write a folder. HOME folders (path starting with '@') never carry an
		ACL and bypass the space.folder filter."""
		# Drop ACL for HOME folders
		if entity['path'][0][:1] == '@' and 'accessControlList' in entity:
			entity.pop("accessControlList")
		# Do not apply space.folder.filter to Home folders
		if entity['path'][0][:1] == '@' or self._filter.match_space_folder_filter(entity):
			self._logger.debug("_write_folder: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_folder: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _retrieve_users_groups(self):
		"""Resolve all referenced users/groups (and ACL-transformation targets)
		in the target environment, caching them for ACL processing."""
		for user in self._d.referenced_users:
			target_user = self._dremio_env.get_user_by_name(user['name'])
			if target_user is not None:
				self._target_dremio_users.append(target_user)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve user in target Dremio environment: " + str(user['name']))
		for group in self._d.referenced_groups:
			target_group = self._dremio_env.get_group_by_name(group['name'])
			if target_group is not None:
				self._target_dremio_groups.append(target_group)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve group in target Dremio environment: " + str(group['name']))
		# Retrieve acl transformation target users and groups
		for item in self._config.acl_transformation:
			if 'user' in item['target']:
				user = self._dremio_env.get_user_by_name(item['target']['user'])
				if user is not None:
					# dont worry about dups
					self._target_dremio_users.append(user)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION user in target Dremio environment: " + str(item['target']['user']))
			if 'group' in item['target']:
				group = self._dremio_env.get_group_by_name(item['target']['group'])
				if group is not None:
					# dont worry about dups
					self._target_dremio_groups.append(group)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION group in target Dremio environment: " + str(item['target']['group']))

	def _write_vds_hierarchy(self):
		"""Write ordered VDSs level by level (parents before dependents)."""
		for level in range(0, self._hierarchy_depth):
			for item in self._vds_hierarchy:
				if item[0] == level:
					vds = item[1]
					if self._filter.match_vds_filter(vds):
						self._logger.debug("_write_vds_hierarchy: writing vds: " + self._utils.get_entity_desc(vds))
						self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group)

	def _write_remainder_vds(self):
		"""Retry VDSs that could not be placed into the dependency hierarchy,
		making up to vds_max_hierarchy_depth passes over both leftover lists."""
		if not self._d.vds_list and not self._unresolved_vds:
			return
		else:
			self._logger.info("_write_remainder_vds: Attempt processing VDSs that failed ordering.")
		# Attempt to process max_hierarchy_depth
		for h in range(1, self._config.vds_max_hierarchy_depth):
			# These are VDSs that have all dependencies validated but could not be placed in the hierarchy
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._d.vds_list) - 1, -1, -1):
				vds = self._d.vds_list[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._d.vds_list.remove(vds)
				else:
					self._d.vds_list.remove(vds)
			# Iterate through the remainder of unresolved VDS in the list
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._unresolved_vds) - 1, -1, -1):
				vds = self._unresolved_vds[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._unresolved_vds.remove(vds)
				else:
					self._unresolved_vds.remove(vds)
		if self._d.vds_list != [] or self._unresolved_vds != []:
			self._logger.warn('_write_remainder_vds: After attempting to process VDSs that failed ordering, the following VDSs still failed. Set log level to DEBUG and see prior error messages for more information.')
			for vds in self._d.vds_list:
				self._logger.error("Failed VDS: " + str(vds['path']))
			for vds in self._unresolved_vds:
				self._logger.error("Failed VDS: " + str(vds['path']))
		else:
			self._logger.warn("_write_remainder_vds: Finished processing VDSs that failed ordering. All VDSs have been successfuly processed.")

	def _write_user(self):
		"""User creation is not supported by the API; only honors the skip mode."""
		if self._config.user_process_mode == 'skip':
			self._logger.info("_write_user: Skipping user processing due to configuration user.process_mode=skip.")
			return True
		self._logger.error("_write_user: Cannot create users. API is not implemented.")

	def _write_entity(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag, report_error=True):
		"""Create or update a catalog entity in the target environment.

		Returns True on success (or when skipped per process_mode), False on
		failure, ACL-processing rejection, or dry run.
		"""
		self._logger.debug("_write_entity: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition: strip source-environment identity/metadata
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children' in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.info("_write_entity: Skipping entity due to ignore_missing_acl_user_flag, ignore_missing_acl_group_flag: " + self._utils.get_entity_desc(entity))
			return False
		# Check if the entity already exists
		existing_entity = self._read_entity_definition(entity)
		# Ensure we have not received FOLDER instead of DATASET. See DX-16666
		if existing_entity is not None and 'entityType' in entity and \
				'entityType' in existing_entity and entity['entityType'] != existing_entity['entityType']:
			existing_entity = None
		if existing_entity is None:
			# Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_entity: Skipping entity creation due to configuration process_mode=update_only. " + self._utils.get_entity_desc(entity))
				return True
			# Reset version for proper concurrency
			if 'accessControlList' in entity:
				entity['accessControlList']['version'] = "0"
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Creating entity: " + self._utils.get_entity_desc(entity))
				# For dry run, keep it in a seperate collection to suppress errors
				if self._utils.is_vds(entity):
					self._dry_run_processed_vds_list.append(entity)
				return False
			# Note for the CE target env, the ACL should have been popped out by _process_acl
			new_entity = self._dremio_env.create_catalog_entity(entity, self._config.dry_run)
			if new_entity is None:
				if report_error:
					self._logger.error("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				return False
		else:
			# Entity already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_entity: Found existing entity and process_mode is set to create_only. Skipping entity: " + self._utils.get_entity_desc(entity))
				return True
			self._logger.debug("_write_entity: Overwriting entity definition as per process_mode configuration : " + self._utils.get_entity_desc(entity))
			# Update entity definition with data from entity existing in the target environment
			entity['id'] = existing_entity['id']
			entity['tag'] = existing_entity['tag']  # Tag from the entity existing in the target environment required for proper concurrency control
			# Update ACL version for proper concurrency control, but do not use ACL if not really needed as HOME folders are not allowed to have ACL
			if ('path' in entity and entity['path'][0][:1] == '@') or ('name' in entity and entity['name'][:1] == '@'):
				if 'accessControlList' in entity:
					entity.pop('accessControlList')
			else:
				# Note for the CE target env, the ACL should have been popped out by _process_acl
				if not self._config.target_ce:
					if 'accessControlList' not in entity:
						entity['accessControlList'] = {"version": "0"}
					# API changed behavior around version 4 and may not return version attribute for ACL.
					if 'accessControlList' in existing_entity and 'version' in existing_entity['accessControlList']:
						entity['accessControlList']['version'] = existing_entity['accessControlList']['version']
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Updating entity: " + self._utils.get_entity_desc(entity))
				return False
			updated_entity = self._dremio_env.update_catalog_entity(entity['id'], entity, self._config.dry_run, report_error)
			if updated_entity is None:
				if report_error:
					self._logger.error("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				return False
		return True

	def _write_pds(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		"""Write a PDS, promoting the underlying folder/file first if the
		target does not yet expose it as a PHYSICAL_DATASET."""
		self._logger.debug("_write_pds: processing entity: " + self._utils.get_entity_desc(entity))
		if self._filter.match_pds_filter(entity):
			existing_entity = self._read_entity_definition(entity)
			if existing_entity is None:
				self._logger.error("_write_pds: Cannot find existing entity for PDS Entity. Either Folder, File, or PDS must exist prior to promoting or updating PDS. Source PDS: " + self._utils.get_entity_desc(entity))
				return False
			# Check if PDS needs to be promoted first
			if 'type' not in existing_entity or existing_entity['type'] != 'PHYSICAL_DATASET':
				self._promote_pds(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
			# Update PDS now
			self._logger.debug("_write_pds: writing pds: " + self._utils.get_entity_desc(entity))
			self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			return None

	def _promote_pds(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		"""Promote an existing folder/file in the target to a PDS.

		Returns True on success or dry run, False on any failure.
		"""
		self._logger.debug("_promote_pds: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children' in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.error("_promote_pds: Skipping PDS due to an error in ACL processing: " + self._utils.get_entity_desc(entity))
			return False
		# Read existing folder or file entity
		fs_entity = self._read_entity_definition(entity)
		if fs_entity is None:
			self._logger.error("_promote_pds: Skipping PDS. Cannot find folder or file for PDS Entity: " + self._utils.get_entity_desc(entity))
			return False
		# Add Folder ID to PDS Entity
		entity['id'] = fs_entity['id']
		if 'accessControlList' in entity:
			entity.pop('accessControlList')
		if self._config.dry_run:
			self._logger.warn("_promote_pds: Dry Run, NOT promoting pds: " + self._utils.get_entity_desc(entity))
			return True
		self._logger.debug("_promote_pds: promoting pds: " + self._utils.get_entity_desc(entity))
		new_pds_entity = self._dremio_env.promote_pds(entity, self._config.dry_run)
		if new_pds_entity is None:
			self._logger.error("_promote_pds: Error promoting PDS: " + self._utils.get_entity_desc(entity))
			return False
		return True

	def _write_reflection(self, reflection, process_mode):
		"""Create or update a reflection against its resolved target dataset.

		Returns True on success, False on update failure/dry-run update, and
		None when creation is skipped, fails, or the dataset cannot be resolved.
		"""
		self._logger.debug("_write_reflection: processing reflection: " + self._utils.get_entity_desc(reflection))
		# Clean up the definition: strip source identity and runtime status fields
		if 'id' in reflection:
			reflection.pop("id")
		if 'tag' in reflection:
			reflection.pop("tag")
		if 'createdAt' in reflection:
			reflection.pop("createdAt")
		if 'updatedAt' in reflection:
			reflection.pop("updatedAt")
		if 'currentSizeBytes' in reflection:
			reflection.pop("currentSizeBytes")
		if 'totalSizeBytes' in reflection:
			reflection.pop("totalSizeBytes")
		if 'status' in reflection:
			reflection.pop("status")
		reflection_path = reflection['path']
		# Write Reflection
		reflection.pop("path")
		reflected_dataset = self._dremio_env.get_catalog_entity_by_path(self._utils.normalize_path(reflection_path))
		if reflected_dataset is None:
			self._logger.error("_write_reflection: Could not resolve dataset for " + self._utils.get_entity_desc(reflection))
			return None
		# Match filters if requested
		if self._config.reflection_filter_mode == "apply_vds_pds_filter":
			if not self._filter.match_reflection_path(reflection_path, reflected_dataset):
				return False
		reflection['datasetId'] = reflected_dataset['id']
		# Check if the reflection already exists
		existing_reflection = self._find_existing_reflection(reflection, reflected_dataset)
		if existing_reflection is None:
			# Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_reflection: Skipping reflection creation due to configuration reflection_process_mode. " + self._utils.get_entity_desc(reflection))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_reflection: Dry Run, NOT Creating reflection: " + self._utils.get_entity_desc(reflection))
				return None
			new_reflection = self._dremio_env.create_reflection(reflection, self._config.dry_run)
			if new_reflection is None:
				self._logger.error("_write_reflection: could not create " + self._utils.get_entity_desc(reflection))
				return None
		else:
			# Reflection already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_reflection: Found existing refleciton and reflection_process_mode is set to create_only. Skipping " + self._utils.get_entity_desc(reflection))
				return None
			# make sure there are changes to update as it will invalidate existing reflection data
			if reflection['type'] == existing_reflection['type'] and \
					reflection['name'] == existing_reflection['name'] and \
					('partitionDistributionStrategy' in reflection and reflection['partitionDistributionStrategy'] == existing_reflection['partitionDistributionStrategy']) and \
					('measureFields' in reflection and reflection['measureFields'] == existing_reflection['measureFields']) and \
					('dimensionFields' in reflection and reflection['dimensionFields'] == existing_reflection['dimensionFields']) and \
					('displayFields' in reflection and reflection['displayFields'] == existing_reflection['displayFields']) and \
					('sortFields' in reflection and reflection['sortFields'] == existing_reflection['sortFields']) and \
					('partitionFields' in reflection and reflection['partitionFields'] == existing_reflection['partitionFields']) and \
					('distributionFields' in reflection and reflection['distributionFields'] == existing_reflection['distributionFields']):
				# Nothing to do
				self._logger.debug("_write_reflection: No pending changes. Skipping " + self._utils.get_entity_desc(reflection))
				return None
			if self._config.dry_run:
				# Fix: log message previously said "_write_entity:" (copy-paste slip)
				self._logger.warn("_write_reflection: Dry Run, NOT Updating " + self._utils.get_entity_desc(reflection))
				return False
			self._logger.debug("_write_reflection: Overwriting " + self._utils.get_entity_desc(reflection))
			reflection['tag'] = existing_reflection['tag']
			updated_reflection = self._dremio_env.update_reflection(existing_reflection['id'], reflection, self._config.dry_run)
			if updated_reflection is None:
				self._logger.error("_write_reflection: Error updating " + self._utils.get_entity_desc(reflection))
				return False
		return True

	def _find_existing_reflection(self, reflection, dataset):
		"""Return the target reflection matching by name AND dataset path, or None."""
		for existing_reflection in self._existing_reflections:
			# Match reflections by name
			if reflection['name'] == existing_reflection['name']:
				existing_dataset = self._dremio_env.get_catalog_entity_by_id(existing_reflection['datasetId'])
				# Match reflections by respective dataset's path
				if existing_dataset is not None and existing_dataset['path'] == dataset['path']:
					return existing_reflection
		return None

	def _find_existing_dataset_by_path(self, path):
		"""Thin wrapper over the target catalog path lookup."""
		return self._dremio_env.get_catalog_entity_by_path(path)

	# Searches for Users from entity's ACL in the target environment and either:
	# - removes the user from ACL if not found and ignore_missing_acl_user_flag is set
	# - returns False if not found and ignore_missing_acl_user_flag is not set
	# - updates the ACL with userid from the new environment if User found there
	# NOTE(review): as written the method always returns True; the "returns False"
	# case described above is only logged, never returned — confirm intent.
	def _process_acl(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		"""Rewrite entity's ACL in terms of target-environment principals,
		applying the configured acl_transformation mapping."""
		self._logger.debug("_process_acl: processing entity: " + self._utils.get_entity_desc(entity))
		if 'accessControlList' not in entity:
			return True
		if self._config.target_ce:
			# Community Edition has no ACL support — drop it entirely
			entity.pop('accessControlList')
			return True
		acl = entity['accessControlList']
		transformed_acl = {"users": [], "groups": []}
		# Fix: the version key lives in the ACL, not the entity (was: 'version' in entity)
		if 'version' in acl:
			acl.pop('version')
		if acl == {} or ('users' not in acl and 'groups' not in acl):
			pass
		else:
			if 'users' in acl:
				# Note, taking a copy of the list for proper removal of items
				for user_def in acl['users'][:]:
					new_acl_principal = self._find_matching_principal_for_userid(user_def['id'], user_def['permissions'])
					if new_acl_principal == "REMOVE":
						self._logger.info("_process_acl: Source User " + user_def['id'] + " is removed from ACL definition. " + self._utils.get_entity_desc(entity))
					elif new_acl_principal is None:
						if ignore_missing_acl_user_flag:
							self._logger.warn("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. User is removed from ACL definition as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity))
						else:
							self._logger.error("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity))
					elif "user" in new_acl_principal:
						transformed_acl['users'].append({"id": new_acl_principal["user"], "permissions": new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']})
					elif "group" in new_acl_principal:
						transformed_acl['groups'].append({"id": new_acl_principal["group"], "permissions": new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']})
			if 'groups' in acl:
				# Note, taking a copy of the list for proper removal of items
				for group_def in acl['groups'][:]:
					new_acl_principal = self._find_matching_principal_for_groupid(group_def['id'], group_def['permissions'])
					if new_acl_principal == "REMOVE":
						self._logger.info("_process_acl: Source Group " + group_def['id'] + " is removed from ACL definition. " + self._utils.get_entity_desc(entity))
					elif new_acl_principal is None:
						if ignore_missing_acl_group_flag:
							self._logger.warn("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. Group is removed from ACL definition as per ignore_missing_acl_group configuration. " + self._utils.get_entity_desc(entity))
						else:
							# Flag is not set - return error status
							self._logger.error("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_group configuration. " + self._utils.get_entity_desc(entity))
					elif "user" in new_acl_principal:
						transformed_acl['users'].append({"id": new_acl_principal["user"], "permissions": new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']})
					elif "group" in new_acl_principal:
						transformed_acl['groups'].append({"id": new_acl_principal["group"], "permissions": new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']})
		entity['accessControlList'] = transformed_acl
		return True

	def _transform_permissions(self, source_permissions, acl_mapping):
		"""Map source permissions through acl_mapping's permission-mapping;
		pass through unchanged when no mapping is defined."""
		# if permission mapping not explicitely defined, use source permissions as is
		if 'permission-mapping' not in acl_mapping:
			return source_permissions
		permissions_mapping = acl_mapping['permission-mapping']
		# READ is required for WRITE, so READ is always present in the list of permissions
		permissions = ["READ"]
		for permission in source_permissions:
			for mapping in permissions_mapping:
				# add only once
				if permission in mapping and mapping[permission] not in permissions:
					permissions.append(mapping[permission])
		return permissions

	def _find_matching_principal_for_userid(self, userid, permissions):
		"""Resolve a source user id to a target principal dict, "REMOVE", or None."""
		self._logger.debug("_find_matching_principal_for_userid: processing user_id: " + str(userid))
		for user in self._d.referenced_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_userid: Source User " + user['name'] + " [" + user['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no tranformation is defined for this user
				elif transformed_principal is None:
					for target_user in self._target_dremio_users:
						if target_user['name'] == user['name']:
							return {"user": target_user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for user in self._target_dremio_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal is None:
					return {"user": user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None

	def _find_acl_transformation_by_username(self, username, permissions):
		"""Apply the configured ACL transformation for a username; return a
		principal dict, "REMOVE", an {"error": ...} marker, or None."""
		for item in self._config.acl_transformation:
			if 'user' in item['source'] and item['source']['user'] == username:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user": target_user['id'], "permissions": new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group": target_group['id'], "permissions": new_permissions}
				# The transformation is defined for this user, however, the target principal is not in the target Dremio Environment
				return {"error": "user_transformation_found_but_target_principle_is_not_in_target_dremio_environment"}
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == username:
				for target_user in self._target_dremio_users:
					if target_user['name'] == username:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == username:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None

	def _find_matching_principal_for_groupid(self, groupid, permissions):
		"""Resolve a source group id to a target principal dict, "REMOVE", or None."""
		self._logger.debug("_find_matching_groupid: processing: " + str(groupid))
		for group in self._d.referenced_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_groupid: Source Group " + group['name'] + " [" + group['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no transformation is defined for this group
				elif transformed_principal is None:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == group['name']:
							return {"group": target_group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_groupid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for group in self._target_dremio_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal is None:
					# Fix: was {"user": ...} — a group id must be returned under
					# the "group" key or _process_acl would file it as a user ACL entry
					return {"group": group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					# Fix: log message previously named _find_matching_principal_for_userid
					self._logger.error("_find_matching_principal_for_groupid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None

	def _find_acl_transformation_by_groupname(self, groupname, permissions):
		"""Apply the configured ACL transformation for a group name; return a
		principal dict, "REMOVE", an {"error": ...} marker, or None."""
		for item in self._config.acl_transformation:
			if 'group' in item['source'] and item['source']['group'] == groupname:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user": target_user['id'], "permissions": new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group": target_group['id'], "permissions": new_permissions}
				# The transformation is defined for this group, however, the target principal is not in the target Dremio Environment
				return {"error": "group_transformation_found_but_target_principle_is_not_in_target_dremio_environment"}
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == groupname:
				for target_user in self._target_dremio_users:
					if target_user['name'] == groupname:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == groupname:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None

	def _read_entity_definition(self, entity):
		"""Look up the entity in the target catalog by 'name' (sources) or
		normalized 'path'; return None on bad input."""
		self._logger.debug("_read_entity_definition: processing entity: " + self._utils.get_entity_desc(entity))
		if 'name' in entity:
			return self._dremio_env.get_catalog_entity_by_path(entity['name'])
		elif 'path' in entity:
			return self._dremio_env.get_catalog_entity_by_path(self._utils.normalize_path(entity['path']))
		else:
			self._logger.error("_read_entity_definition: bad data: " + self._utils.get_entity_desc(entity))
			return None

	# Process vds_list and save ordered list of VDSs into _vds_hierarchy. Recursive method.
def _order_vds(self, processing_level=0): # Verify for the Max Hierarchy Depth if processing_level >= self._config.vds_max_hierarchy_depth: self._logger.debug("_order_vds: Finished processing with VDSs left to process:" + str(self._d.vds_list)) return any_vds_leveled = False # Iterate through the remainder VDS in the list # Go with decreasing index so we can remove VDS from the list for i in range(len(self._d.vds_list) - 1, -1, -1): vds = self._d.vds_list[i] self._logger.debug("_order_vds: processing vds " + self._utils.get_entity_desc(vds)) vds_hierarchy_level = processing_level any_dependency_unresolved = False sql_dependency_paths = self._get_vds_dependency_paths(vds) # Iterate through SQL dependencies to determine level of hierarchy for each dependency and the VDS for path in sql_dependency_paths: self._logger.debug("_order_vds: processing sql dependency " + path) # Validate the dependency against VDS and PDS sql_context = self._utils.get_sql_context(vds) dependency_vds = self._find_vds_by_path(self._utils.get_absolute_path(path, sql_context)) if dependency_vds is None: dependency_pds = self._find_pds_by_path(self._utils.get_absolute_path(path, sql_context)) if dependency_pds is None: # Dependency could not be resolved. self._logger.warn("_order_vds: giving up on ordering VDS '" + self._utils.normalize_path(vds['path']) + "'. 
Could not resolve dependency '" + self._utils.get_absolute_path(path, sql_context) + "' Will try to process without ordering.") # Move VDS into unresolved list self._unresolved_vds.append(vds) self._d.vds_list.remove(vds) # Mark as do-not-process any_dependency_unresolved = True break else: # The dependency has been resolved as PDS, continue to the next dependency continue else: # Dependency was found as VDS dependency_hierarchy_level = self._find_vds_level_in_hierarchy(dependency_vds['id']) if dependency_hierarchy_level is None: # Dependency has not been processed yet, push this VDS to the next processing level vds_hierarchy_level = None break # Find the highest level of hierarchy among dependencies elif vds_hierarchy_level < dependency_hierarchy_level + 1: vds_hierarchy_level = dependency_hierarchy_level + 1 if any_dependency_unresolved or vds_hierarchy_level is None: # Do not process this VDS at this recursion self._logger.debug("_order_vds: some dependencies cannot be validated for entity " + vds['id'] + " at processing level " + str(processing_level)) else: # Add the current VDS to the vds_hierarchy_level self._vds_hierarchy.append([vds_hierarchy_level, vds]) # Remove the current VDS from further processing self._d.vds_list.remove(vds) # Mark this hierarchy level as successful any_vds_leveled = True self._logger.debug("_order_vds: dependencies have been validated for entity " + vds['id'] + " for hierarchy level " + str(vds_hierarchy_level)) # Are we done yet with recursion if not any_vds_leveled or len(self._d.vds_list) == 0: self._hierarchy_depth = processing_level + 1 self._logger.debug("_order_vds: finished processing all VDS with hierarchy depth of :" + str(self._hierarchy_depth + 1)) return # Process the next Hierarchy Level recursively self._order_vds(processing_level + 1) def _get_vds_dependency_paths(self, vds): if self._is_source_ce() or not self._d.vds_parents: # CE does not support graph return parse_sql.tables_in_query(vds['sql']) else: for 
vds_entry in self._d.vds_parents: if vds_entry['path'] == vds['path']: return vds_entry['parents'] def _is_source_ce(self): for item in self._d.dremio_get_config: if 'source' in item: for param in item['source']: if 'is_community_edition' in param: return eval(param['is_community_edition']) return False def _find_vds_by_path(self, path): # First, try finding in the VDS list from the source file for vds in self._d.vds_list: if path == self._utils.normalize_path(vds['path']): return vds # For dry run, check processed vds if self._config.dry_run: for vds in self._dry_run_processed_vds_list: if path == self._utils.normalize_path(vds['path']): return vds # Finally, try finding in the target environment entity = self._dremio_env.get_catalog_entity_by_path(path) # Make sure we get VDS and not folder/file if entity is not None and self._utils.is_vds(entity): return entity return None def _find_pds_by_path(self, path): # First, try finding in the PDS list from the source file for pds in self._d.pds_list: if path == self._utils.normalize_path(pds['path']): return pds # For dry run, check processed pds if self._config.dry_run: for pds in self._dry_run_processed_pds_list: if path == self._utils.normalize_path(pds['path']): return pds # Finally, try finding in the target environment entity = self._dremio_env.get_catalog_entity_by_path(path) # Make sure we get promoted PDS and not folder/file if entity is not None and self._utils.is_pds(entity): return entity return None def _find_vds_level_in_hierarchy(self, vds_id): for item in self._vds_hierarchy: if item[1]['id'] == vds_id: return item[0] return None def get_errors_count(self): return self._logger.errors_encountered def _write_wiki(self, wiki, process_mode): self._logger.debug("_write_wiki: processing wiki: " + str(wiki)) new_wiki_text = wiki['text'] wiki_path = wiki['path'] # Check if the wiki already exists existing_wiki_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(wiki_path)) if 
existing_wiki_entity is None: self._logger.error("_write_wiki: Unable to resolve wiki's dataset for " + str(wiki)) return None existing_wiki = self._dremio_env.get_catalog_wiki(existing_wiki_entity['id']) if existing_wiki is None: # Need to create new entity if process_mode == 'update_only': self._logger.info("_write_wiki: Skipping wiki creation due to configuration wiki_process_mode. " + str(wiki)) return None if self._config.dry_run: self._logger.warn("_write_wiki: Dry Run, NOT Creating wiki: " + str(wiki)) return None new_wiki = {"text":new_wiki_text} new_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], new_wiki, self._config.dry_run) if new_wiki is None: self._logger.error("_write_wiki: could not create " + str(wiki)) return None else: # Wiki already exists in the target environment if process_mode == 'create_only': self._logger.info("_write_wiki: Found existing wiki and wiki_process_mode is set to create_only. Skipping " + str(wiki)) return None # make sure there are changes to update as it will invalidate existing wiki data if new_wiki_text == existing_wiki['text']: # Nothing to do self._logger.debug("_write_wiki: No pending changes. 
Skipping " + str(wiki)) return None if self._config.dry_run: self._logger.warn("_write_wiki: Dry Run, NOT Updating " + str(wiki)) return False self._logger.debug("_write_wiki: Overwriting " + str(wiki)) existing_wiki['text'] = new_wiki_text updated_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], existing_wiki, self._config.dry_run) if updated_wiki is None: self._logger.error("_write_wiki: Error updating " + str(wiki)) return False return True def _write_tags(self, tags, process_mode): self._logger.debug("_write_tag: processing tags: " + str(tags)) new_tags = tags['tags'] tags_path = tags['path'] # Check if the tags already exist existing_tags_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(tags_path)) if existing_tags_entity is None: self._logger.error("_write_tags: Unable to resolve tag's dataset for " + str(tags)) return None existing_tags = self._dremio_env.get_catalog_tags(existing_tags_entity['id']) if existing_tags is None: # Need to create new entity if process_mode == 'update_only': self._logger.info("_write_tags: Skipping tags creation due to configuration tag_process_mode. " + str(tags)) return None if self._config.dry_run: self._logger.warn("_write_tags: Dry Run, NOT Creating tags: " + str(tags)) return None new_tags = {"tags":new_tags} new_tags = self._dremio_env.update_tag(existing_tags_entity['id'], new_tags, self._config.dry_run) if new_tags is None: self._logger.error("_write_tags: could not create " + str(tags)) return None else: # Tags already exists in the target environment if process_mode == 'create_only': self._logger.info("_write_tags: Found existing tags and tag_process_mode is set to create_only. Skipping " + str(tags)) return None # make sure there are changes to update as it will invalidate existing tags data if new_tags == existing_tags['tags']: # Nothing to do self._logger.debug("_write_tags: No pending changes. 
Skipping " + str(tags)) return None if self._config.dry_run: self._logger.warn("tags: Dry Run, NOT Updating " + str(tags)) return False self._logger.debug("_write_tags: Overwriting " + str(tags)) existing_tags['tags'] = new_tags updated_tags = self._dremio_env.update_tag(existing_tags_entity['id'], existing_tags, self._config.dry_run) if updated_tags is None: self._logger.error("_write_tags: Error updating " + str(tags)) return False return True
class DremioReader:
    """Reads a source Dremio environment (catalog, ACLs, wikis, tags, reflections,
    WLM rules/queues, votes) into a DremioData object for later processing."""

    # Dremio Cloner Configuration, Utils, ...
    _config = None
    _utils = None
    _logger = None
    _filter = None
    # Dremio object pointing to the source Dremio environment
    _dremio_env = None
    # DremioData object containing data from the Dremio source environment
    _d = DremioData()
    # Current top-level hierarchy context: HOME, SPACE, SOURCE
    _top_level_hierarchy_context = None

    def __init__(self, source_dremio, config):
        self._config = config
        self._dremio_env = source_dremio
        self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
        self._filter = DremioClonerFilter(config)

    def read_dremio_environment(self):
        """Read all data from the source Dremio environment and return a DremioData."""
        self._read_catalog()
        if not self._config.pds_list_useapi and self._filter.is_pds_in_scope():
            self._read_all_pds()
        self._read_reflections()
        self._read_rules()
        self._read_queues()
        self._read_votes()
        # Make sure all VDS dependencies are included as per configuration
        self._process_vds_dependencies()
        return self._d

    def _read_all_pds(self):
        """Bulk-read PDSs via INFORMATION_SCHEMA (non-API path) and filter them."""
        if self._config.pds_list_useapi or not self._filter.is_pds_in_scope():
            self._logger.info("_read_all_pds: skipping PDS reading as per pds.filter configuration.")
        else:
            pds_list = self._dremio_env.list_pds(
                self._d.sources,
                self._config.source_folder_filter,
                self._config.source_folder_exclude_filter,
                self._config.pds_filter,
                self._config.pds_exclude_filter,
                pds_error_list=self._d.pds_error_list)
            for pds in pds_list:
                if self._filter.match_pds_filter(pds):
                    self._d.pds_list.append(pds)

    def _read_catalog(self):
        """Read the source catalog, recursing into containers and their children."""
        containers = self._dremio_env.list_catalog()['data']
        for container in containers:
            self._logger.debug("_read_catalog: processing container " + self._utils.get_entity_desc(container))
            self._process_container(container)

    def _process_container(self, container):
        """Identify a top-level container and delegate processing."""
        self._logger.debug("_process_container: " + self._utils.get_entity_desc(container))
        if container['containerType'] == "HOME":
            self._read_home(container)
        elif container['containerType'] == "SPACE":
            self._read_space(container)
        elif container['containerType'] == "SOURCE":
            self._read_source(container)
        else:
            self._logger.fatal("_process_container: unexpected entity type " + self._utils.get_entity_desc(container))

    def _read_home(self, container):
        """Read a HOME container and its children (subject to home_process_mode)."""
        self._logger.debug("_read_home: processing container: " + self._utils.get_entity_desc(container))
        if self._config.home_process_mode == 'process':
            self._top_level_hierarchy_context = "HOME"
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.info("_read_home: " + self._utils.get_entity_desc(entity))
                self._d.homes.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error("_read_home: error reading entity for container: " + self._utils.get_entity_desc(container))
        else:
            self._logger.debug("_read_home: skipping due to job configuration")

    def _read_space(self, container):
        """Read a SPACE container and its children when it matches the space filter."""
        self._logger.debug("_read_space: processing container: " + self._utils.get_entity_desc(container))
        self._top_level_hierarchy_context = "SPACE"
        if self._filter.match_space_filter(container):
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.debug("_read_space: " + self._utils.get_entity_desc(container))
                self._d.spaces.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error("_read_space: error reading entity for container: " + self._utils.get_entity_desc(container))

    def _read_source(self, container):
        """Read a SOURCE container; recurse into children only on the API path."""
        self._logger.debug("_read_source: processing container: " + self._utils.get_entity_desc(container))
        if self._config.source_process_mode == 'process' or (
                self._config.pds_process_mode == 'process' and self._config.pds_list_useapi):
            self._top_level_hierarchy_context = "SOURCE"
            if self._filter.match_source_filter(container):
                self._d.containers.append(container)
                entity = self._get_entity_definition_by_id(container)
                if entity is not None:
                    # Re-validate the filter with the entity since it carries more detail
                    if self._filter.match_source_filter(entity):
                        self._logger.debug("_read_source: " + self._utils.get_entity_desc(entity))
                        self._d.sources.append(entity)
                        self._read_acl(entity)
                        self._read_wiki(entity)
                        # Depending on the useapi flag, PDSs can instead be collected
                        # via INFORMATION_SCHEMA. See also DX16597.
                        if self._config.pds_list_useapi:
                            self._read_source_children(entity)
                else:
                    self._logger.error("_read_source: error reading entity for container: " + self._utils.get_entity_desc(container))
        else:
            self._logger.debug("_read_source: skipping due to job configuration")

    def _read_space_folder(self, folder):
        """Read a folder inside a HOME/SPACE, back-filling any unsaved parent folders."""
        self._logger.debug("_read_space_folder: processing folder: " + self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context not in ["SPACE", "HOME"]:
            return
        entity = self._get_entity_definition_by_id(folder)
        if entity is None:
            self._logger.error("_read_space_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder))
            return
        if self._top_level_hierarchy_context == "HOME" or self._filter.match_space_folder_filter(folder):
            self._logger.debug("_read_space_folder: " + self._utils.get_entity_desc(folder))
            self._d.folders.append(entity)
            self._read_acl(entity)
            self._read_wiki(entity)
            # Validate that all parent folders in the path have been saved already.
            # NOTE(review): the check compares only the last path segment, which can
            # false-positive across branches with same-named folders — TODO confirm.
            folder_path = entity['path']
            for i in range(1, len(folder_path) - 1):
                folderSaved = False
                for item in self._d.folders:
                    if item['path'][-1] == folder_path[i]:
                        folderSaved = True
                if not folderSaved:
                    parent_entity = self._get_entity_definition_by_path(folder_path[0:i + 1])
                    self._d.folders.append(parent_entity)
            self._read_space_children(entity)

    def _read_space_children(self, parent_entity):
        """Dispatch each child of a HOME/SPACE entity to the appropriate reader."""
        self._logger.debug("_read_space_children: processing parent_entity: " + self._utils.get_entity_desc(parent_entity))
        if 'entityType' not in parent_entity:
            self._logger.error("_read_space_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity))
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_space_folder(child)
            else:
                self._logger.error("_read_space_children: not supported entity type " + child['type'])

    def _read_source_folder(self, folder):
        """Read a folder inside a SOURCE when it matches the source folder filter."""
        self._logger.debug("_read_source_folder: processing folder: " + self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context == "SOURCE" and self._filter.match_source_folder_filter(folder):
            entity = self._get_entity_definition_by_id(folder)
            if entity is not None:
                self._logger.debug("_read_source_folder: " + self._utils.get_entity_desc(folder))
                self._read_source_children(entity)
            else:
                self._logger.error("_read_source_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder))

    def _read_source_children(self, parent_entity):
        """Dispatch each child of a SOURCE entity to the appropriate reader."""
        self._logger.debug("_read_source_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'")
        if 'entityType' not in parent_entity:
            self._logger.error("_read_source_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity))
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_source_folder(child)
            else:
                self._logger.error("_read_source_children: not supported entity type " + child['type'])

    def _read_dataset(self, dataset):
        """Read a dataset and file it as PDS or (filtered) VDS, plus ACL/wiki/tags."""
        self._logger.debug("_read_dataset: processing dataset: " + self._utils.get_entity_desc(dataset))
        entity = self._get_entity_definition_by_id(dataset)
        if entity is not None:
            self._logger.debug("_read_dataset: " + dataset['datasetType'] + " : " + self._utils.get_entity_desc(dataset))
            if dataset['datasetType'] == "PROMOTED" or dataset['datasetType'] == "DIRECT":
                self._d.pds_list.append(entity)
            elif dataset['datasetType'] == "VIRTUAL":
                tags = self._dremio_env.get_catalog_tags(entity['id'])
                if self._filter.match_vds_filter(dataset, tags=tags):
                    self._d.vds_list.append(entity)
            else:
                self._logger.error("_read_dataset: Unexpected dataset type " + dataset['datasetType'] + " for " + self._utils.get_entity_desc(dataset) + ".")
            self._read_acl(entity)
            self._read_wiki(entity)
            self._read_tags(entity)

    def _read_file(self, file_name):
        # Files are intentionally not collected.
        return

    def _read_reflections(self):
        """Read all reflections (EE only), annotating each with its dataset path."""
        self._logger.debug("_read_reflections: starting")
        if self._config.reflection_process_mode == 'process' and not self._config.source_ce:
            reflections = self._dremio_env.list_reflections()['data']
            for reflection in reflections:
                reflection_dataset = self._dremio_env.get_catalog_entity_by_id(reflection['datasetId'])
                if reflection_dataset is None:
                    self._logger.debug("_read_reflections: error processing reflection, cannot get path for dataset: " + reflection['datasetId'])
                    continue
                reflection_path = reflection_dataset['path']
                self._logger.debug("_read_reflections: processing reflection " + reflection['datasetId'] + " path: " + str(reflection_path))
                reflection["path"] = reflection_path
                self._d.reflections.append(reflection)
                # self._read_acl(reflection)
                # self._read_wiki(reflection)
        else:
            self._logger.debug("_read_reflections: skipping reflections processing as per job configuration")

    def _read_tags(self, entity):
        """Collect tags for an entity. Note: tags are only available for datasets."""
        self._logger.debug("_read_tags: for entity " + self._utils.get_entity_desc(entity))
        if self._config.tag_process_mode == 'process':
            tag = self._dremio_env.get_catalog_tags(entity['id'])
            if tag is not None:
                tag['entity_id'] = entity['id']
                if entity['entityType'] == 'space' or entity['entityType'] == 'source':
                    tag['path'] = [entity['name']]
                else:
                    tag['path'] = entity['path']
                if tag not in self._d.tags:
                    self._d.tags.append(tag)
        else:
            self._logger.debug("_read_tags: skipping tags processing as per job configuration")

    def _read_wiki(self, entity):
        """Collect the wiki for an entity (containers keyed by name, datasets by path)."""
        self._logger.debug("_read_wiki: for entity " + self._utils.get_entity_desc(entity))
        if self._config.wiki_process_mode == 'process':
            wiki = self._dremio_env.get_catalog_wiki(entity['id'])
            if wiki is not None:
                wiki["entity_id"] = entity['id']
                if entity['entityType'] == 'space' or entity['entityType'] == 'source' or entity['entityType'] == 'home':
                    wiki['path'] = [entity['name']]
                else:
                    wiki['path'] = entity['path']
                if wiki not in self._d.wikis:
                    self._d.wikis.append(wiki)
        else:
            self._logger.debug("_read_wiki: skipping wiki processing as per job configuration")

    def _read_acl(self, entity):
        """Collect users/groups referenced by an entity's ACL (deduplicated)."""
        self._logger.debug("_read_acl: for entity " + self._utils.get_entity_desc(entity))
        if 'accessControlList' in entity:
            acl = entity['accessControlList']
            if 'users' in acl:
                for user in acl['users']:
                    user_entity = self._dremio_env.get_user(user['id'])
                    if user_entity is not None:
                        if user_entity not in self._d.referenced_users:
                            self._d.referenced_users.append(user_entity)
            if 'groups' in acl:
                for group in acl['groups']:
                    group_entity = self._dremio_env.get_group(group['id'])
                    if group_entity is not None:
                        if group_entity not in self._d.referenced_groups:
                            self._d.referenced_groups.append(group_entity)

    def _process_vds_dependencies(self):
        """Discover and record VDS dependencies per vds_dependencies_process_mode."""
        if self._config.vds_dependencies_process_mode == 'get':
            for vds in self._d.vds_list:
                self._discover_dependencies(vds)
            for vds in self._d.vds_list:
                self._populate_dependencies_graph(vds)

    def _discover_dependencies(self, dataset):
        """Recursively add ``dataset`` and everything it depends on to pds/vds lists."""
        self._logger.debug("_discover_dependencies: processing dataset: " + self._utils.get_entity_desc(dataset))
        if dataset is not None:
            if 'type' not in dataset:
                self._logger.error("_discover_dependencies: Expected Dataset Entity but got: " + self._utils.get_entity_desc(dataset))
                return
            if dataset['type'] == 'PHYSICAL_DATASET':
                if dataset not in self._d.pds_list:
                    self._d.pds_list.append(dataset)
                return
            elif dataset['type'] == 'VIRTUAL_DATASET':
                if dataset not in self._d.vds_list:
                    self._d.vds_list.append(dataset)
                # Process VDS dependencies
                sql_dependency_paths = self._get_vds_dependency_paths(dataset)
                for dependency_path in sql_dependency_paths:
                    dependency_path = self._utils.get_absolute_path(dependency_path, self._utils.get_sql_context(dataset))
                    entity = self._find_entity(dependency_path)
                    if entity is not None:
                        # Entity has already been read.
                        # Fixed: was `return`, which aborted processing of ALL
                        # remaining dependencies; only this one should be skipped.
                        continue
                    dependency_dataset = self._dremio_env.get_catalog_entity_by_path(dependency_path)
                    if dependency_dataset is None:
                        self._logger.warn("_discover_dependencies: unable to resolve dataset likely due to datasource availability: " + dependency_path)
                    else:
                        self._discover_dependencies(dependency_dataset)
            else:
                self._logger.error("_discover_dependencies: Unknown Entity Type: " + dataset['type'])
        else:
            self._logger.error("_discover_dependencies: Could not resolve dependency: None")

    def _populate_dependencies_graph(self, vds):
        """Record a VDS's parent paths into self._d.vds_parents (EE + graph support only)."""
        # Hoisted config check: the result is only recorded for EE sources with
        # graph support, so skip the (potentially expensive) dependency lookup
        # entirely otherwise.
        if self._config.source_ce or not self._config.source_graph_support:
            return
        self._logger.debug("_populate_dependencies_graph: processing vds: " + self._utils.get_entity_desc(vds))
        vds_parent_list = self._get_vds_dependency_paths(vds)
        vds_parent_json = {'id': vds['id'], 'path': vds['path'], 'parents': vds_parent_list}
        self._d.vds_parents.append(vds_parent_json)

    def _get_vds_dependency_paths(self, vds):
        """Return parent dataset paths for ``vds`` via the Graph API, or by SQL parsing."""
        self._logger.debug("_get_vds_dependency_paths: processing vds: " + self._utils.get_entity_desc(vds))
        if self._config.source_ce or not self._config.source_graph_support:
            return parse_sql.tables_in_query(vds['sql'])
        graph = self._dremio_env.get_catalog_entity_graph_by_id(vds['id'])
        if graph is None:
            self._logger.warn("Could not receive Graph via API. Try to set graph_api_support to False in the job configuration.")
            return parse_sql.tables_in_query(vds['sql'])
        vds_parent_list = []
        for parent in graph['parents']:
            vds_parent_list.append(self._utils.normalize_path(parent['path']))
        return vds_parent_list

    def _find_entity(self, path):
        """Find an already-read VDS or PDS by normalized path; None if absent."""
        self._logger.debug("_find_entity: processing path: " + str(path))
        for vds in self._d.vds_list:
            if self._utils.normalize_path(vds['path']) == path:
                return vds
        for pds in self._d.pds_list:
            if self._utils.normalize_path(pds['path']) == path:
                return pds
        return None

    def _get_entity_definition_by_id(self, src):
        """Fetch the full catalog entity for src['id']; helper used by most readers."""
        self._logger.debug("_get_entity_definition_by_id: processing src: " + self._utils.get_entity_desc(src))
        if 'id' not in src:
            # Fixed log tag: was "_read_entity_definition", which misattributed errors.
            self._logger.error("_get_entity_definition_by_id: bad data, skipping entity: " + self._utils.get_entity_desc(src))
            return None
        entity = self._dremio_env.get_catalog_entity_by_id(src['id'])
        if entity is None:
            self._logger.error("_get_entity_definition_by_id: cannot retrieve entity for id: " + src['id'])
        return entity

    def _get_entity_definition_by_path(self, path):
        """Fetch the full catalog entity for a (normalized) path."""
        self._logger.debug("_get_entity_definition_by_path: processing path: " + str(path))
        path = self._utils.normalize_path(path)
        entity = self._dremio_env.get_catalog_entity_by_path(path)
        if entity is None:
            # Fixed log tag: was "_read_entity_definition", which misattributed errors.
            self._logger.error("_get_entity_definition_by_path: cannot retrieve entity for path: " + str(path))
        return entity

    def _read_queues(self):
        """Read WLM queues (EE only, per wlm_queue_process_mode)."""
        self._logger.debug("read_queues: started")
        if self._config.wlm_queue_process_mode == 'process' and not self._config.source_ce:
            self._d.queues = self._dremio_env.list_queues()['data']
        else:
            self._logger.debug("_read_queues: skipping as per job configuration")

    def _read_rules(self):
        """Read WLM rules (EE only, per wlm_rule_process_mode)."""
        self._logger.debug("read_rules: started")
        if self._config.wlm_rule_process_mode == 'process' and not self._config.source_ce:
            self._d.rules = self._dremio_env.list_rules()['rules']
        else:
            self._logger.debug("read_rules: skipping as per job configuration")

    def _read_votes(self):
        """Read dataset votes (EE only, per vote_process_mode)."""
        self._logger.debug("read_votes: started")
        if self._config.vote_process_mode == 'process' and not self._config.source_ce:
            self._d.votes = self._dremio_env.list_votes()['data']
        else:
            self._logger.debug("read_votes: skipping as per job configuration")

    def get_errors_count(self):
        """Number of errors the logger has recorded so far."""
        return self._logger.errors_encountered
class DremioCascadeAcl: # Dremio Cloner Config, Logger, Utils _config = None _logger = None _utils = None _filter = None # Dremio Environment to write to _dremio_env = None # List of PDS for processing _pds_list = None def __init__(self, dremio, config): self._config = config self._dremio_env = dremio self._logger = DremioClonerLogger(self._config.max_errors, self._config.logging_verbose) self._utils = DremioClonerUtils(config) self._filter = DremioClonerFilter(config) def cascade_acl(self): if not self._config.pds_list_useapi: self._pds_list = self._dremio_env.list_pds( self._config.source_filter, self._config.source_exclude_filter, self._config.source_folder_filter, self._config.source_folder_exclude_filter, self._config.pds_filter, self._config.pds_exclude_filter) self._logger.info( "cascade_acl: Not using API for PDS retrieval. Filtered PDS are NOT reported in the log." ) containers = self._dremio_env.list_catalog()['data'] for container in containers: self._logger.debug("cascade_acl: processing container " + self._utils.get_entity_desc(container)) if container[ 'containerType'] == "SPACE" and self._filter.match_space_filter( container): self._process_space(container) elif container[ 'containerType'] == "SOURCE" and self._filter.match_source_filter( container): self._process_source(container) def _process_space(self, space): entity = self._get_entity_definition(space) if entity is None: self._logger.error( "_process_space: error reading entity for container: " + self._utils.get_entity_desc(space)) else: if self._config.space_cascade_acl_origin_override_object is None: # Use Space ACL as an 'origin' self._logger.info( "_process_space: SPACE: '" + str(space['path']) + "' will be used as an ACL Origin for its children FOLDERs and VDSs." 
) acl = self._get_acl(entity) else: # Use ACL from a configured object acl_entity = self._dremio_env.get_catalog_entity_by_path( self._config.space_cascade_acl_origin_override_object) if acl_entity is None: self._logger.error( "_process_space: error reading origin entity for path: " + str(self._config. space_cascade_acl_origin_override_object)) return self._logger.info( "_process_space: SPACE: '" + str(space['path']) + "' Using override origin instead as an ACL Origin for its children FOLDERs and VDSs." ) acl = self._get_acl(acl_entity) self._process_space_children(entity, acl) def _process_source(self, source): entity = self._get_entity_definition(source) if entity is None: self._logger.error( "_process_source: error reading entity for container: " + self._utils.get_entity_desc(source)) else: if self._config.source_cascade_acl_origin_override_object is None: # Use Source ACL as an 'origin' self._logger.info( "_process_source: SOURCE: '" + str(source['path']) + "' will be used as an ACL Origin for its children PDSs.") acl = self._get_acl(entity) else: # Use ACL from a configured object acl_entity = self._dremio_env.get_catalog_entity_by_path( self._config.source_cascade_acl_origin_override_object) if acl_entity is None: self._logger.error( "_process_source: error reading origin entity for path: " + str(self._config. source_cascade_acl_origin_override_object)) return self._logger.info( "_process_source: SOURCE: '" + str(source['path']) + "' Using override origin instead as an ACL Origin for its children PDSs." 
) acl = self._get_acl(acl_entity) # Process PDSs if self._config.pds_list_useapi: self._process_source_children(entity, acl) else: for pds in self._pds_list: # Does the PDS belong to the current Source if pds['path'][0] == source['path'][0]: self._logger.debug("_process_source: pds: " + self._utils.get_entity_desc(pds)) if self._filter.match_pds_filter(pds): self._logger.debug( "_process_source_children: applying ACL to PDS: " + self._utils.get_entity_desc(pds)) self._apply_acl(pds, acl) def _process_source_children(self, parent_entity, acl): # This is a recursive function if 'children' not in parent_entity: return if 'entityType' not in parent_entity: self._logger.error( "_process_source_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity)) return self._logger.debug( "_process_source_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'") for child in parent_entity['children']: child_entity = self._get_entity_definition(child) if child_entity is None: self._logger.error( "_process_source_children: error reading entity for: " + self._utils.get_entity_desc(child)) if child['type'] == "DATASET": if self._filter.match_pds_filter(child_entity): self._logger.debug( "_process_source_children: applying ACL to PDS: " + self._utils.get_entity_desc(child_entity)) self._apply_acl(child_entity, acl) else: self._logger.info( "_process_source_children: skipping PDS: " + str(child_entity['path']) + "as per filter configuration") elif child['type'] == "FILE": self._logger.info("_process_source_children: skipping FILE: " + self._utils.get_entity_desc(child_entity)) elif 'containerType' in child and child[ 'containerType'] == "FOLDER": if self._filter.match_source_folder_filter(child_entity): self._process_source_children(child_entity, acl) else: self._logger.info( "_process_source_children: skipping FOLDER: " + str(child_entity['path']) + "as per filter configuration") def _process_space_children(self, parent_entity, 
acl): # This is a recursive function if 'children' not in parent_entity: return if 'entityType' not in parent_entity: self._logger.error( "_process_space_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity)) return self._logger.debug( "_process_space_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'") for child in parent_entity['children']: child_entity = self._get_entity_definition(child) if child_entity is None: self._logger.error( "_process_space_children: error reading entity for: " + self._utils.get_entity_desc(child)) if child['type'] == "DATASET": if self._filter.match_vds_filter(child_entity): self._logger.debug( "_process_space_children: applying ACL to VDS: " + self._utils.get_entity_desc(child_entity)) self._apply_acl(child_entity, acl) else: self._logger.info( "_process_space_children: skipping VDS: " + self._utils.get_entity_desc(child_entity)) elif child['containerType'] == "FOLDER": if self._filter.match_space_folder_filter(child_entity): if self._filter.match_space_folder_cascade_acl_origin_filter( child_entity): self._logger.info( "_process_space_children: FOLDER: " + str(child_entity['path']) + " will be used as an ACL Origin for its children.") self._process_space_children( child_entity, self._get_acl(child_entity)) else: self._logger.info( "_process_space_children: applying ACL to FOLDER: " + self._utils.get_entity_desc(child_entity)) self._apply_acl(child_entity, acl) self._process_space_children(child_entity, acl) else: self._logger.info( "_process_space_children: skipping FOLDER: " + self._utils.get_entity_desc(child_entity)) self._process_space_children(child_entity, acl) def _get_entity_definition(self, src): if 'id' not in src: self._logger.error( "_read_entity_definition: bad data, skipping entity: " + self._utils.get_entity_desc(src)) return None else: entity = self._dremio_env.get_catalog_entity_by_id(src['id']) if entity is None: self._logger.error( 
"_read_entity_definition: cannot retrieve entity for id: " + src['id']) return entity def _get_acl(self, entity): if 'accessControlList' in entity: return entity['accessControlList'] else: self._logger.fatal("ACL is not defined for " + self._utils.get_entity_desc(entity)) return None def _apply_acl(self, entity, acl): # Clear the current ACL definition if 'accessControlList' not in entity: entity['accessControlList'] = {"version": "0"} if 'users' in entity['accessControlList']: entity['accessControlList'].pop('users') if 'groups' in entity['accessControlList']: entity['accessControlList'].pop('groups') # Apply ACL to entity if 'users' in acl: entity['accessControlList']['users'] = acl['users'] if 'groups' in acl: entity['accessControlList']['groups'] = acl['groups'] if self._config.dry_run: self._logger.warn("_apply_acl: Dry Run, NOT Updating entity: " + self._utils.get_entity_desc(entity)) return False self._logger.info("_apply_acl: updating entity: " + self._utils.get_entity_desc(entity)) updated_entity = self._dremio_env.update_catalog_entity( entity['id'], entity, self._config.dry_run) if updated_entity is None: self._logger.error("_apply_acl: Error updating entity: " + self._utils.get_entity_desc(entity)) return False return True def get_errors_count(self): return self._logger.errors_encountered