Code example #1 (score: 0)
File: DremioWriter.py — Project: tejkm/dremio-cloner
	def __init__(self, target_dremio, dremio_data, config):
		"""Store the target environment handle, the source data and the config, then build helpers."""
		self._config = config
		self._d = dremio_data
		self._dremio_env = target_dremio
		# Helper objects are all derived from the same configuration
		self._logger = DremioClonerLogger(config.max_errors, config.logging_verbose)
		self._filter = DremioClonerFilter(config)
		self._utils = DremioClonerUtils(config)
Code example #2 (score: 0)
File: DremioReader.py — Project: tejkm/dremio-cloner
 def __init__(self, source_dremio, config):
     """Remember the source Dremio environment and configuration, then create helper objects."""
     self._config = config
     self._dremio_env = source_dremio
     # Helpers share the one configuration object
     self._logger = DremioClonerLogger(config.max_errors, config.logging_verbose)
     self._utils = DremioClonerUtils(config)
     self._filter = DremioClonerFilter(config)
Code example #3 (score: 0)
File: DremioWriter.py — Project: tejkm/dremio-cloner
class DremioWriter:
	"""Writes a previously read Dremio environment (sources, PDS/VDS, spaces,
	folders, reflections, wikis, tags) into a target Dremio environment.
	"""

	# Dremio Cloner Config, Utils, ...
	# NOTE(review): the list-valued attributes below are class-level mutable
	# defaults; __init__ only reassigns _config/_utils/_logger/_filter/
	# _dremio_env/_d, so the lists are shared across DremioWriter instances.
	# Fine for one writer per process - confirm before creating several.
	_config = None
	_utils = None
	_logger = None
	_filter = None

	# Dremio Environment to write to
	_dremio_env = None

	# Dremio Data to write
	_d = None

	# VDS list grouped by hierarchy
	_vds_hierarchy = []
	# Number of levels in the ordered VDS hierarchy
	_hierarchy_depth = 0
	# VDSs whose dependencies could not be resolved during ordering
	_unresolved_vds = []

	# Referenced Users and Groups in the target environment
	_target_dremio_users = []
	_target_dremio_groups = []

	# Resolved Datasets for Reflections
	_existing_reflections = list()

	# Dry run collections
	_dry_run_processed_vds_list = []
	_dry_run_processed_pds_list = []

	def __init__(self, target_dremio, dremio_data, config):
		"""Capture the target environment, the data to write and the configuration; build helpers."""
		self._config = config
		self._d = dremio_data
		self._dremio_env = target_dremio
		# All helpers are configured from the same config object
		self._logger = DremioClonerLogger(config.max_errors, config.logging_verbose)
		self._filter = DremioClonerFilter(config)
		self._utils = DremioClonerUtils(config)

	def write_dremio_environment(self):
		"""Top-level driver: write every object category from the source data into
		the target environment, honoring each category's process_mode setting.

		Order matters: containers (sources, spaces, folders) are written before
		the datasets (PDS, VDS) and reflections that live inside them.
		"""
		self._retrieve_users_groups()
		if self._config.acl_transformation != {} and self._d.referenced_users == [] and self._d.referenced_groups == []:
			self._logger.warn("ACL Transformation has been defined while Referenced Users and Referenced Groups are not present in the Source Dremio Data.")

		# Existing reflections are needed later for create-vs-update decisions
		if self._config.reflection_process_mode != 'skip':
			self._existing_reflections = self._dremio_env.list_reflections()['data']
		if self._config.source_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source processing due to configuration source.process_mode=skip.")
		else:
			for source in self._d.sources:
				self._write_source(source, self._config.source_process_mode, self._config.source_ignore_missing_acl_user, self._config.source_ignore_missing_acl_group)
		if self._config.pds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping source PDS processing due to configuration source.pds.process_mode=skip.")
		else:
			for pds in self._d.pds_list:
				self._write_pds(pds, self._config.pds_process_mode, self._config.pds_ignore_missing_acl_user, self._config.pds_ignore_missing_acl_group)
		if self._config.space_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping space processing due to configuration space.process_mode=skip.")
		else:
			for space in self._d.spaces:
				self._write_space(space, self._config.space_process_mode, self._config.space_ignore_missing_acl_user, self._config.space_ignore_missing_acl_group)
		if self._config.folder_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping folder processing due to configuration folder.process_mode=skip.")
		else:
			for folder in self._d.folders:
				self._write_folder(folder, self._config.folder_process_mode, self._config.folder_ignore_missing_acl_user, self._config.folder_ignore_missing_acl_group)
		if self._config.vds_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping VDS processing due to configuration vds.process_mode=skip.")
		else:
			# VDSs must be written parents-first; order, then retry the leftovers
			self._order_vds(0)
			self._write_vds_hierarchy()
			self._write_remainder_vds()
		if self._config.reflection_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping reflection processing due to configuration reflection.process_mode=skip.")
		else:
			for reflection in self._d.reflections:
				self._write_reflection(reflection, self._config.reflection_process_mode)
		# NOTE(review): the log message mentions refresh_mode=skip, but the check
		# actually triggers for any value other than 'refresh'.
		if self._config.reflection_refresh_mode != 'refresh':
			self._logger.info("write_dremio_environment: Skipping reflection refresh due to configuration reflection.refresh_mode=skip.")
		else:
			for pds in self._d.pds_list:
				self._dremio_env.refresh_reflections_by_pds_path(self._utils.normalize_path(pds['path']), self._config.dry_run)
		if self._config.wiki_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping wiki processing due to configuration wiki.process_mode=skip.")
		else:
			for wiki in self._d.wikis:
				self._write_wiki(wiki, self._config.wiki_process_mode)
		if self._config.tag_process_mode == 'skip':
			self._logger.info("write_dremio_environment: Skipping tag processing due to configuration tag.process_mode=skip.")
		else:
			for tags in self._d.tags:
				self._write_tags(tags, self._config.tag_process_mode)

	def _write_space(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		if self._filter.match_space_filter(entity):
			self._logger.debug("_write_space: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_space: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_source(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		if self._filter.match_source_filter(entity):
			self._logger.debug("_write_source: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_source: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _write_folder(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		# Drop ACL for HOME folders
		if entity['path'][0][:1] == '@' and 'accessControlList' in entity:
			entity.pop("accessControlList")
		# Do not apply space.folder.filter to Home folders
		if entity['path'][0][:1] == '@' or self._filter.match_space_folder_filter(entity):
			self._logger.debug("_write_folder: processing entity: " + self._utils.get_entity_desc(entity))
			return self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			self._logger.debug("_write_folder: skipping entity: " + self._utils.get_entity_desc(entity))
			return None

	def _retrieve_users_groups(self):
		for user in self._d.referenced_users:
			target_user = self._dremio_env.get_user_by_name(user['name'])
			if target_user is not None:
				self._target_dremio_users.append(target_user)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve user in target Dremio environment: " + str(user['name']))
		for group in self._d.referenced_groups:
			target_group = self._dremio_env.get_group_by_name(group['name'])
			if target_group is not None:
				self._target_dremio_groups.append(target_group)
			else:
				self._logger.error("_retrieve_users_groups: Unable to resolve group in target Dremio environment: " + str(group['name']))
		# Retrieve acl transformation target users and groups
		for item in self._config.acl_transformation:
			if 'user' in item['target']:
				user = self._dremio_env.get_user_by_name(item['target']['user'])
				if user is not None:
					# dont worry about dups
					self._target_dremio_users.append(user)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION user in target Dremio environment: " + str(item['target']['user']))
			if 'group' in item['target']:
				group = self._dremio_env.get_group_by_name(item['target']['group'])
				if group is not None:
					# dont worry about dups
					self._target_dremio_groups.append(group)
				else:
					self._logger.error("_retrieve_users_groups: Unable to resolve ACL_TRANSFORMATION group in target Dremio environment: " + str(item['target']['group']))

	def _write_vds_hierarchy(self):
		for level in range(0, self._hierarchy_depth):
			for item in self._vds_hierarchy:
				if item[0] == level:
					vds = item[1]
					if self._filter.match_vds_filter(vds):
						self._logger.debug("_write_vds_hierarchy: writing vds: " + self._utils.get_entity_desc(vds))
						self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group)

	def _write_remainder_vds(self):
		"""Retry VDSs that could not be placed into the dependency hierarchy.

		Makes repeated passes over both the leftover vds_list and the unresolved
		list; a VDS is removed from its list once written successfully, or when
		it does not match the VDS filter. Anything left after all passes is
		reported as failed.
		"""
		if not self._d.vds_list and not self._unresolved_vds:
			return
		else:
			self._logger.info("_write_remainder_vds: Attempt processing VDSs that failed ordering.")
		# Attempt to process max_hierarchy_depth
		# NOTE(review): loop variable h is unused; range(1, N) performs N-1 passes.
		for h in range(1, self._config.vds_max_hierarchy_depth):
			# These are VDSs that have all dependencies validated but could not be placed in the hierarchy
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._d.vds_list) - 1, -1, -1):
				vds = self._d.vds_list[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					# report_error=False: failures are expected until parents are written
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._d.vds_list.remove(vds)
				else:
					self._d.vds_list.remove(vds)
			# Iterate through the remainder of unresolved VDS in the list
			# Go with decreasing index so we can remove VDS from the list
			for i in range(len(self._unresolved_vds) - 1, -1, -1):
				vds = self._unresolved_vds[i]
				if self._filter.match_vds_filter(vds):
					self._logger.debug("_write_remainder_vds: writing vds: " + self._utils.get_entity_desc(vds))
					if self._write_entity(vds, self._config.vds_process_mode, self._config.vds_ignore_missing_acl_user, self._config.vds_ignore_missing_acl_group, False):
						self._unresolved_vds.remove(vds)
				else:
					self._unresolved_vds.remove(vds)
		if self._d.vds_list != [] or self._unresolved_vds != []:
			self._logger.warn('_write_remainder_vds: After attempting to process VDSs that failed ordering, the following VDSs still failed. Set log level to DEBUG and see prior error messages for more information.')
			for vds in self._d.vds_list:
				self._logger.error("Failed VDS: " + str(vds['path']))
			for vds in self._unresolved_vds:
				self._logger.error("Failed VDS: " + str(vds['path']))
		else:
			self._logger.warn("_write_remainder_vds: Finished processing VDSs that failed ordering. All VDSs have been successfuly processed.")


	def _write_user(self):
		if self._config.user_process_mode == 'skip':
			self._logger.info("_write_user: Skipping user processing due to configuration user.process_mode=skip.")
			return True
		self._logger.error("_write_user: Cannot create users. API is not implemented.")

	def _write_entity(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag, report_error = True):
		"""Create or update a single catalog entity in the target environment.

		Returns True on success (or when process_mode dictates skipping the
		operation), False when the entity was not written (ACL processing
		failure, dry run, or API error).
		"""
		self._logger.debug("_write_entity: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition: ids/tags/children/timestamps belong to the source environment
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children'in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.info("_write_entity: Skipping entity due to ignore_missing_acl_user_flag, ignore_missing_acl_group_flag: " + self._utils.get_entity_desc(entity))
			return False
		# Check if the entity already exists
		existing_entity = self._read_entity_definition(entity)
		# Ensure we have not received FOLDER instead of DATASET. See DX-16666
		if existing_entity is not None and 'entityType' in entity and \
				'entityType' in existing_entity and entity['entityType'] != existing_entity['entityType']:
			existing_entity = None
		if existing_entity is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_entity: Skipping entity creation due to configuration process_mode=update_only. " + self._utils.get_entity_desc(entity))
				return True
			# Reset version for proper concurrency
			if 'accessControlList' in entity:
				entity['accessControlList']['version'] = "0"
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Creating entity: " + self._utils.get_entity_desc(entity))
				# For dry run, keep it in a seperate collection to suppress errors
				if self._utils.is_vds(entity):
					self._dry_run_processed_vds_list.append(entity)
				return False
			# Note for the CE target env, the ACL should have been popped out by _process_acl
			new_entity = self._dremio_env.create_catalog_entity(entity, self._config.dry_run)
			if new_entity is None:
				# report_error=False is used by retry passes where failure is expected
				if report_error:
					self._logger.error("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: could not create entity: " + self._utils.get_entity_desc(entity))
				return False
		else:  # Entity already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_entity: Found existing entity and process_mode is set to create_only. Skipping entity: " + self._utils.get_entity_desc(entity))
				return True
			self._logger.debug("_write_entity: Overwriting entity definition as per process_mode configuration : " + self._utils.get_entity_desc(entity))
			# Update entity definition with data from entity existing in the target environment
			entity['id'] = existing_entity['id']
			entity['tag'] = existing_entity['tag']  # Tag from the entity existing in the target environment required for proper concurrency control
			# Update ACL version for proper concurrency control, but do not use ACL if not really needed as HOME folders are not allowed to have ACL
			if ('path' in entity and entity['path'][0][:1] == '@') or ('name' in entity and entity['name'][:1] == '@'):
				if 'accessControlList' in entity:
					entity.pop('accessControlList')
			else:
				# Note for the CE target env, the ACL should have been popped out by _process_acl
				if not self._config.target_ce:
					if 'accessControlList' not in entity:
						entity['accessControlList'] = {"version": "0"}
					# API changed behavior around version 4 and may not return version attribute for ACL.
					if 'accessControlList' in existing_entity and 'version' in existing_entity['accessControlList']:
						entity['accessControlList']['version'] = existing_entity['accessControlList']['version']
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Updating entity: " + self._utils.get_entity_desc(entity))
				return False
			updated_entity = self._dremio_env.update_catalog_entity(entity['id'], entity, self._config.dry_run, report_error)
			if updated_entity is None:
				if report_error:
					self._logger.error("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				else:
					self._logger.debug("_write_entity: Error updating entity: " + self._utils.get_entity_desc(entity))
				return False
		return True

	def _write_pds(self, entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_write_pds: processing entity: " + self._utils.get_entity_desc(entity))
		if self._filter.match_pds_filter(entity):
			existing_entity = self._read_entity_definition(entity)
			if existing_entity is None:
				self._logger.error("_write_pds: Cannot find existing entity for PDS Entity. Either Folder, File, or PDS must exist prior to promoting or updating PDS. Source PDS: " + self._utils.get_entity_desc(entity))
				return False	
			# Check if PDS needs to be promoted first
			if 'type' not in existing_entity or existing_entity['type'] != 'PHYSICAL_DATASET':
				self._promote_pds(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
			# Update PDS now
			self._logger.debug("_write_pds: writing pds: " + self._utils.get_entity_desc(entity))
			self._write_entity(entity, process_mode, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag)
		else:
			return None

	def _promote_pds(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_promote_pds: processing entity: " + self._utils.get_entity_desc(entity))
		# Clean up the definition
		if 'id' in entity:
			entity.pop("id")
		if 'tag' in entity:
			entity.pop("tag")
		if 'children'in entity:
			entity.pop("children")
		if 'createdAt' in entity:
			entity.pop("createdAt")
		# Process ACL as needed
		if not self._process_acl(entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
			# Skip this entity due to ACL processing errors
			self._logger.error("_promote_pds: Skipping PDS due to an error in ACL processing: " + self._utils.get_entity_desc(entity))
			return False
		# Read exisitng folder or file entity
		fs_entity = self._read_entity_definition(entity)
		if fs_entity is None:
			self._logger.error("_promote_pds: Skipping PDS. Cannot find folder or file for PDS Entity: " + self._utils.get_entity_desc(entity))
			return False
		# Add Folder ID to PDS Entity	
		entity['id'] = fs_entity['id']
		if 'accessControlList' in entity: 
			entity.pop('accessControlList')
		if self._config.dry_run:
			self._logger.warn("_promote_pds: Dry Run, NOT promoting pds: " + self._utils.get_entity_desc(entity))
			return True
		self._logger.debug("_promote_pds: promoting pds: " + self._utils.get_entity_desc(entity))
		new_pds_entity = self._dremio_env.promote_pds(entity, self._config.dry_run)
		if new_pds_entity is None:
			self._logger.error("_promote_pds: Error promoting PDS: " + self._utils.get_entity_desc(entity))
			return False
		return True


	def _write_reflection(self, reflection, process_mode):
		"""Create or update a reflection in the target environment.

		Returns True after a successful update, False for dry-run/failed updates
		or filter mismatch, None for skips and all creation-path outcomes.
		"""
		self._logger.debug("_write_reflection: processing reflection: " + self._utils.get_entity_desc(reflection))
		# Clean up the definition: ids, tags, timestamps and status are source-environment data
		if 'id' in reflection:
			reflection.pop("id")
		if 'tag' in reflection:
			reflection.pop("tag")
		if 'createdAt' in reflection:
			reflection.pop("createdAt")
		if 'updatedAt' in reflection:
			reflection.pop("updatedAt")
		if 'currentSizeBytes' in reflection:
			reflection.pop("currentSizeBytes")
		if 'totalSizeBytes' in reflection:
			reflection.pop("totalSizeBytes")
		if 'status' in reflection:
			reflection.pop("status")
		reflection_path = reflection['path']
		# Write Reflection
		reflection.pop("path")
		# Resolve the dataset the reflection is defined on in the target environment
		reflected_dataset = self._dremio_env.get_catalog_entity_by_path(self._utils.normalize_path(reflection_path))
		if reflected_dataset is None:
			self._logger.error("_write_reflection: Could not resolve dataset for " + self._utils.get_entity_desc(reflection))
			return None
		# Match filters if requested
		if self._config.reflection_filter_mode == "apply_vds_pds_filter":
			if not self._filter.match_reflection_path(reflection_path, reflected_dataset):
				return False
		reflection['datasetId'] = reflected_dataset['id']
		# Check if the reflection already exists
		existing_reflection = self._find_existing_reflection(reflection, reflected_dataset)
		if existing_reflection is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_reflection: Skipping reflection creation due to configuration reflection_process_mode. " + self._utils.get_entity_desc(reflection))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_reflection: Dry Run, NOT Creating reflection: " + self._utils.get_entity_desc(reflection))
				return None
			new_reflection = self._dremio_env.create_reflection(reflection, self._config.dry_run)
			if new_reflection is None:
				self._logger.error("_write_reflection: could not create " + self._utils.get_entity_desc(reflection))
				return None
		else:  # Reflection already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_reflection: Found existing refleciton and reflection_process_mode is set to create_only. Skipping " + self._utils.get_entity_desc(reflection))
				return None
			# make sure there are changes to update as it will invalidate existing reflection data
			# NOTE(review): each "'field' in reflection and ..." term treats a field missing
			# from the source reflection as a difference (forcing an update); indexing
			# existing_reflection[...] would raise KeyError if the field exists in the
			# source but not in the existing reflection - confirm against API payloads.
			if reflection['type'] == existing_reflection['type'] and \
				reflection['name'] == existing_reflection['name'] and \
				('partitionDistributionStrategy' in reflection and reflection['partitionDistributionStrategy'] == existing_reflection['partitionDistributionStrategy']) and \
				('measureFields' in reflection and reflection['measureFields'] == existing_reflection['measureFields']) and \
				('dimensionFields' in reflection and reflection['dimensionFields'] == existing_reflection['dimensionFields']) and \
				('displayFields' in reflection and reflection['displayFields'] == existing_reflection['displayFields']) and \
				('sortFields' in reflection and reflection['sortFields'] == existing_reflection['sortFields']) and \
				('partitionFields' in reflection and reflection['partitionFields'] == existing_reflection['partitionFields']) and \
				('distributionFields' in reflection and reflection['distributionFields'] == existing_reflection['distributionFields']):
				# Nothing to do
				self._logger.debug("_write_reflection: No pending changes. Skipping " + self._utils.get_entity_desc(reflection))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_entity: Dry Run, NOT Updating " + self._utils.get_entity_desc(reflection))
				return False
			self._logger.debug("_write_reflection: Overwriting " + self._utils.get_entity_desc(reflection))
			# Tag from the existing reflection is required for concurrency control
			reflection['tag'] = existing_reflection['tag']
			updated_reflection = self._dremio_env.update_reflection(existing_reflection['id'], reflection, self._config.dry_run)
			if updated_reflection is None:
				self._logger.error("_write_reflection: Error updating " + self._utils.get_entity_desc(reflection))
				return False
		return True


	def _find_existing_reflection(self, reflection, dataset):
		for existing_reflection in self._existing_reflections:
			# Match reflections by name
			if reflection['name'] == existing_reflection['name']:
				existing_dataset = self._dremio_env.get_catalog_entity_by_id(existing_reflection['datasetId'])
				# Match reflections by respective dataset's path
				if existing_dataset is not None and existing_dataset['path'] == dataset['path']:
					return existing_reflection
		return None


	def _find_existing_dataset_by_path(self, path):
		return self._dremio_env.get_catalog_entity_by_path(path)


# Searches for Users from entity's ACL in the target environment and either:
	# - removes the user from ACL if not found and ignore_missing_acl_user_flag is set 
	# - returns False if if not found and ignore_missing_acl_user_flag is not set
	# - updates the ACL with userid from the new environment if User found there 
	def _process_acl(self, entity, ignore_missing_acl_user_flag, ignore_missing_acl_group_flag):
		self._logger.debug("_process_acl: processing entity: " + self._utils.get_entity_desc(entity))
		if 'accessControlList' not in entity:
			return True
		if self._config.target_ce:
			entity.pop('accessControlList')
			return True
		acl = entity['accessControlList']
		transformed_acl = {"users": [], "groups": []}
		if 'version' in entity:
			acl.pop('version')
		if acl == {} or ('users' not in acl and 'groups' not in acl):
			pass
		else:
			if 'users' in acl:
				# Note, taking a copy of the list for proper removal of items
				for user_def in acl['users'][:]:
					new_acl_principal = self._find_matching_principal_for_userid(user_def['id'], user_def['permissions'])
					if new_acl_principal == "REMOVE":
						self._logger.info("_process_acl: Source User " + user_def['id'] + " is removed from ACL definition. " + self._utils.get_entity_desc(entity))
					elif new_acl_principal is None:
						if ignore_missing_acl_user_flag:
							self._logger.warn("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. User is removed from ACL definition as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity))
						else:
							self._logger.error("_process_acl: Source User " + user_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_user configuration. " + self._utils.get_entity_desc(entity))
					elif "user" in new_acl_principal:
						transformed_acl['users'].append({"id":new_acl_principal["user"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']})
					elif "group" in new_acl_principal:
						transformed_acl['groups'].append({"id":new_acl_principal["group"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else user_def['permissions']})
			if 'groups' in acl:
				# Note, taking a copy of the list for proper removal of items
				for group_def in acl['groups'][:]:
					new_acl_principal = self._find_matching_principal_for_groupid(group_def['id'], group_def['permissions'])
					if new_acl_principal == "REMOVE":
						self._logger.info("_process_acl: Source Group " + group_def['id'] + " is removed from ACL definition. " + self._utils.get_entity_desc(entity))
					elif new_acl_principal is None:
						if ignore_missing_acl_group_flag:
							self._logger.warn("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. Group is removed from ACL definition as per ignore_missing_acl_group configuration. " + self._utils.get_entity_desc(entity))
						else:
							# Flag is not set - return error status
							self._logger.error("_process_acl: Source Group " + group_def['id'] + " not found in the target Dremio Environment. ACL Entry cannot be processed as per ignore_missing_acl_group configuration. " + self._utils.get_entity_desc(entity))
					elif "user" in new_acl_principal:
						transformed_acl['users'].append({"id":new_acl_principal["user"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']})
					elif "group" in new_acl_principal:
						transformed_acl['groups'].append({"id":new_acl_principal["group"],"permissions":new_acl_principal['permissions'] if "permissions" in new_acl_principal else group_def['permissions']})
			entity['accessControlList'] = transformed_acl
		return True

	def _transform_permissions(self, source_permissions, acl_mapping):
		# if permission mapping not explicitely defined, use source permissions as is
		if 'permission-mapping' not in acl_mapping:
			return source_permissions
		permissions_mapping = acl_mapping['permission-mapping']
		# READ is required for WRITE, so READ is always present in the list of permissions
		permissions = ["READ"]
		for permission in source_permissions:
			for mapping in permissions_mapping:
				# add only once
				if permission in mapping and mapping[permission] not in permissions:
					permissions.append(mapping[permission])
		return permissions

	def _find_matching_principal_for_userid(self, userid, permissions):
		"""Map a source user id to a principal in the target environment.

		Returns "REMOVE" when the user is explicitly mapped out, a principal
		dict ({"user"|"group": id[, "permissions": [...]]}) when resolved, or
		None when the user cannot be resolved (including transformation errors).
		"""
		self._logger.debug("_find_matching_principal_for_userid: processing user_id: " + str(userid))
		for user in self._d.referenced_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_userid: Source User " + user['name'] + " [" + user['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no tranformation is defined for this user
				elif transformed_principal is None:
					# Fall back to a straight name match in the target environment
					for target_user in self._target_dremio_users:
						if target_user['name'] == user['name']:
							return {"user":target_user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for user in self._target_dremio_users:
			if user['id'] == userid:
				transformed_principal = self._find_acl_transformation_by_username(user['name'], permissions)
				if transformed_principal is None:
					return {"user": user['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None

	def _find_acl_transformation_by_username(self, username, permissions):
		for item in self._config.acl_transformation:
			if 'user' in item['source'] and item['source']['user'] == username:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user":target_user['id'],"permissions":new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group":target_group['id'],"permissions":new_permissions}
				# The transformation is defined for this user, however, the target principal is not in the target Dremio Environment
				return {"error": "user_transformation_found_but_target_principle_is_not_in_target_dremio_environment"}
		# If the username is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take username straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == username:
				for target_user in self._target_dremio_users:
					if target_user['name'] == username:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == username:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None

	def _find_matching_principal_for_groupid(self, groupid, permissions):
		self._logger.debug("_find_matching_groupid: processing: " + str(groupid))
		for group in self._d.referenced_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal == "REMOVE":
					self._logger.info("_find_matching_principal_for_groupid: Source Group " + group['name'] + " [" + group['id'] + "] is mapped as NONE.")
					return "REMOVE"
				# If no transformation is defined for this group
				elif transformed_principal is None:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == group['name']:
							return {"group":target_group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_groupid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for group in self._target_dremio_groups:
			if group['id'] == groupid:
				transformed_principal = self._find_acl_transformation_by_groupname(group['name'], permissions)
				if transformed_principal is None:
					return {"user": group['id']}
				elif "error" in transformed_principal:
					# Something went wrong
					self._logger.error("_find_matching_principal_for_userid: error " + transformed_principal['error'])
					return None
				else:
					return transformed_principal
		return None


	def _find_acl_transformation_by_groupname(self, groupname, permissions):
		for item in self._config.acl_transformation:
			if 'group' in item['source'] and item['source']['group'] == groupname:
				if "REMOVE" in item['target']:
					return "REMOVE"
				elif "user" in item['target']:
					for target_user in self._target_dremio_users:
						if target_user['name'] == item['target']['user']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"user":target_user['id'],"permissions":new_permissions}
				elif "group" in item['target']:
					for target_group in self._target_dremio_groups:
						if target_group['name'] == item['target']['group']:
							new_permissions = self._transform_permissions(permissions, item)
							return {"group":target_group['id'],"permissions":new_permissions}
				# The transformation is defined for this group, however, the target principal is not in the target Dremio Environment
				return {"error": "group_transformation_found_but_target_principle_is_not_in_target_dremio_environment"}
		# If the group name is already in the target list (i.e. the mapping already happened
		# but the write_entity failed because parent objects were not yet created) then take group name straight from target
		for item in self._config.acl_transformation:
			if 'user' in item['target'] and item['target']['user'] == groupname:
				for target_user in self._target_dremio_users:
					if target_user['name'] == groupname:
						new_permissions = self._transform_permissions(permissions, item)
						return {"user": target_user['id'], "permissions": new_permissions}
			if 'group' in item['target'] and item['target']['group'] == groupname:
				for target_group in self._target_dremio_groups:
					if target_group['name'] == item['target']['group']:
						new_permissions = self._transform_permissions(permissions, item)
						return {"group": target_group['id'], "permissions": new_permissions}
		return None

	def _read_entity_definition(self, entity):
		self._logger.debug("_read_entity_definition: processing entity: " + self._utils.get_entity_desc(entity))
		if 'name' in entity:
			return self._dremio_env.get_catalog_entity_by_path(entity['name'])
		elif 'path' in entity:
			return self._dremio_env.get_catalog_entity_by_path(self._utils.normalize_path(entity['path']))
		else:
			self._logger.error("_read_entity_definition: bad data: " + self._utils.get_entity_desc(entity))
			return None

	# Process vds_list and save ordered list of VDSs into _vds_hierarchy. Recursive method.
	def _order_vds(self, processing_level=0):
		"""Recursively assign a hierarchy level to each VDS in self._d.vds_list.

		A VDS gets level N when every SQL dependency is either a PDS or a VDS
		with level < N. Ordered entries are appended to self._vds_hierarchy as
		[level, vds]; VDSs with unresolvable dependencies are moved to
		self._unresolved_vds for best-effort processing later. Recursion stops
		at self._config.vds_max_hierarchy_depth or when no VDS made progress.
		"""
		# Verify for the Max Hierarchy Depth
		if processing_level >= self._config.vds_max_hierarchy_depth:
			self._logger.debug("_order_vds: Finished processing with VDSs left to process:" + str(self._d.vds_list))
			return
		any_vds_leveled = False
		# Iterate through the remainder VDS in the list
		# Go with decreasing index so we can remove VDS from the list
		for i in range(len(self._d.vds_list) - 1, -1, -1):
			vds = self._d.vds_list[i]
			self._logger.debug("_order_vds: processing vds " + self._utils.get_entity_desc(vds))
			vds_hierarchy_level = processing_level
			any_dependency_unresolved = False
			sql_dependency_paths = self._get_vds_dependency_paths(vds)
			# Iterate through SQL dependencies to determine level of hierarchy for each dependency and the VDS
			for path in sql_dependency_paths:
				self._logger.debug("_order_vds: processing sql dependency " + path)
				# Validate the dependency against VDS and PDS
				# NOTE(review): sql_context is loop-invariant; it could be hoisted above this loop.
				sql_context = self._utils.get_sql_context(vds)
				dependency_vds = self._find_vds_by_path(self._utils.get_absolute_path(path, sql_context))
				if dependency_vds is None:
					dependency_pds = self._find_pds_by_path(self._utils.get_absolute_path(path, sql_context))
					if dependency_pds is None:
						# Dependency could not be resolved.
						self._logger.warn("_order_vds: giving up on ordering VDS '" + self._utils.normalize_path(vds['path']) + "'. Could not resolve dependency '" + self._utils.get_absolute_path(path, sql_context) + "' Will try to process without ordering.")
						# Move VDS into unresolved list
						self._unresolved_vds.append(vds)
						self._d.vds_list.remove(vds)
						# Mark as do-not-process
						any_dependency_unresolved = True
						break
					else:
						# The dependency has been resolved as PDS, continue to the next dependency
						continue
				else:
					# Dependency was found as VDS
					dependency_hierarchy_level = self._find_vds_level_in_hierarchy(dependency_vds['id'])
					if dependency_hierarchy_level is None:
						# Dependency has not been processed yet, push this VDS to the next processing level
						vds_hierarchy_level = None
						break
					# Find the highest level of hierarchy among dependencies
					elif vds_hierarchy_level < dependency_hierarchy_level + 1:
						vds_hierarchy_level = dependency_hierarchy_level + 1
			if any_dependency_unresolved or vds_hierarchy_level is None:
				# Do not process this VDS at this recursion
				self._logger.debug("_order_vds: some dependencies cannot be validated for entity " + vds['id'] + " at processing level " + str(processing_level))
			else:
				# Add the current VDS to the vds_hierarchy_level
				self._vds_hierarchy.append([vds_hierarchy_level, vds])
				# Remove the current VDS from further processing
				self._d.vds_list.remove(vds)
				# Mark this hierarchy level as successful
				any_vds_leveled = True
				self._logger.debug("_order_vds: dependencies have been validated for entity " + vds['id'] + " for hierarchy level " + str(vds_hierarchy_level))
		# Are we done yet with recursion
		if not any_vds_leveled or len(self._d.vds_list) == 0:
			self._hierarchy_depth = processing_level + 1
			# NOTE(review): _hierarchy_depth was just set to processing_level + 1, so the
			# message below (which adds 1 again) overstates the depth by one.
			self._logger.debug("_order_vds: finished processing all VDS with hierarchy depth of :" + str(self._hierarchy_depth + 1))
			return
		# Process the next Hierarchy Level recursively
		self._order_vds(processing_level + 1)

	def _get_vds_dependency_paths(self, vds):
		if self._is_source_ce() or not self._d.vds_parents:
			# CE does not support graph
			return parse_sql.tables_in_query(vds['sql'])
		else:
			for vds_entry in self._d.vds_parents:
				if vds_entry['path'] == vds['path']:
					return vds_entry['parents']

	def _is_source_ce(self):
		for item in self._d.dremio_get_config:
			if 'source' in item:
				for param in item['source']:
					if 'is_community_edition' in param:
						return eval(param['is_community_edition'])
		return False

	def _find_vds_by_path(self, path):
		# First, try finding in the VDS list from the source file
		for vds in self._d.vds_list:
			if path == self._utils.normalize_path(vds['path']):
				return vds
		# For dry run, check processed vds
		if self._config.dry_run:
			for vds in self._dry_run_processed_vds_list:
				if path == self._utils.normalize_path(vds['path']):
					return vds
		# Finally, try finding in the target environment
		entity = self._dremio_env.get_catalog_entity_by_path(path)
		# Make sure we get VDS and not folder/file
		if entity is not None and self._utils.is_vds(entity):
			return entity
		return None

	def _find_pds_by_path(self, path):
		# First, try finding in the PDS list from the source file
		for pds in self._d.pds_list:
			if path == self._utils.normalize_path(pds['path']):
				return pds
		# For dry run, check processed pds
		if self._config.dry_run:
			for pds in self._dry_run_processed_pds_list:
				if path == self._utils.normalize_path(pds['path']):
					return pds
		# Finally, try finding in the target environment
		entity = self._dremio_env.get_catalog_entity_by_path(path)
		# Make sure we get promoted PDS and not folder/file
		if entity is not None and self._utils.is_pds(entity):
			return entity
		return None

	def _find_vds_level_in_hierarchy(self, vds_id):
		for item in self._vds_hierarchy:
			if item[1]['id'] == vds_id:
				return item[0]
		return None

	def get_errors_count(self):
		"""Number of errors the logger has recorded so far."""
		count = self._logger.errors_encountered
		return count


	def _write_wiki(self, wiki, process_mode):
		"""Create or update a dataset wiki in the target environment.

		wiki: dict with 'text' and the owning dataset's 'path'.
		process_mode: 'create_only', 'update_only', or anything else for both.
		Returns True on success, False on a failed (or dry-run) update, and
		None when the operation is skipped or the dataset cannot be resolved.
		"""
		self._logger.debug("_write_wiki: processing wiki: " + str(wiki))
		new_wiki_text = wiki['text']
		wiki_path = wiki['path']
		# Check if the wiki already exists
		existing_wiki_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(wiki_path))
		if existing_wiki_entity is None:
			self._logger.error("_write_wiki: Unable to resolve wiki's dataset for " + str(wiki))
			return None
		existing_wiki = self._dremio_env.get_catalog_wiki(existing_wiki_entity['id'])
		if existing_wiki is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_wiki: Skipping wiki creation due to configuration wiki_process_mode. " + str(wiki))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_wiki: Dry Run, NOT Creating wiki: " + str(wiki))
				return None
			# Creation goes through update_wiki on the owning dataset's id.
			new_wiki = {"text":new_wiki_text}
			new_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], new_wiki, self._config.dry_run)
			if new_wiki is None:
				self._logger.error("_write_wiki: could not create " + str(wiki))
				return None
		else:  # Wiki already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_wiki: Found existing wiki and wiki_process_mode is set to create_only. Skipping " + str(wiki))
				return None
			# make sure there are changes to update as it will invalidate existing wiki data
			if new_wiki_text == existing_wiki['text']:
				# Nothing to do
				self._logger.debug("_write_wiki: No pending changes. Skipping " + str(wiki))
				return None
			if self._config.dry_run:
				# NOTE(review): dry-run update returns False while dry-run create
				# returns None — confirm callers tolerate the asymmetry.
				self._logger.warn("_write_wiki: Dry Run, NOT Updating " + str(wiki))
				return False
			self._logger.debug("_write_wiki: Overwriting " + str(wiki))
			existing_wiki['text'] = new_wiki_text
			updated_wiki = self._dremio_env.update_wiki(existing_wiki_entity['id'], existing_wiki, self._config.dry_run)
			if updated_wiki is None:
				self._logger.error("_write_wiki: Error updating " + str(wiki))
				return False
		return True


	def _write_tags(self, tags, process_mode):
		self._logger.debug("_write_tag: processing tags: " + str(tags))
		new_tags = tags['tags']
		tags_path = tags['path']
		# Check if the tags already exist
		existing_tags_entity = self._find_existing_dataset_by_path(self._utils.normalize_path(tags_path))
		if existing_tags_entity is None:
			self._logger.error("_write_tags: Unable to resolve tag's dataset for " + str(tags))
			return None
		existing_tags = self._dremio_env.get_catalog_tags(existing_tags_entity['id'])
		if existing_tags is None:  # Need to create new entity
			if process_mode == 'update_only':
				self._logger.info("_write_tags: Skipping tags creation due to configuration tag_process_mode. " + str(tags))
				return None
			if self._config.dry_run:
				self._logger.warn("_write_tags: Dry Run, NOT Creating tags: " + str(tags))
				return None
			new_tags = {"tags":new_tags}
			new_tags = self._dremio_env.update_tag(existing_tags_entity['id'], new_tags, self._config.dry_run)
			if new_tags is None:
				self._logger.error("_write_tags: could not create " + str(tags))
				return None
		else:  # Tags already exists in the target environment
			if process_mode == 'create_only':
				self._logger.info("_write_tags: Found existing tags and tag_process_mode is set to create_only. Skipping " + str(tags))
				return None
			# make sure there are changes to update as it will invalidate existing tags data
			if new_tags == existing_tags['tags']:
				# Nothing to do
				self._logger.debug("_write_tags: No pending changes. Skipping " + str(tags))
				return None
			if self._config.dry_run:
				self._logger.warn("tags: Dry Run, NOT Updating " + str(tags))
				return False
			self._logger.debug("_write_tags: Overwriting " + str(tags))
			existing_tags['tags'] = new_tags
			updated_tags = self._dremio_env.update_tag(existing_tags_entity['id'], existing_tags, self._config.dry_run)
			if updated_tags is None:
				self._logger.error("_write_tags: Error updating " + str(tags))
				return False
		return True
# --- Example #4 — file: DremioReader.py (project: tejkm/dremio-cloner) ---
class DremioReader:
    """Read a source Dremio environment into a DremioData object.

    Walks the catalog (homes, spaces, sources, folders, datasets) honoring the
    job configuration filters, and also collects ACL principals, wikis, tags,
    reflections, WLM queues/rules, votes, and VDS dependency information.
    """

    # Dremio Cloner Configuration, Utils, ...
    _config = None
    _utils = None
    _logger = None
    _filter = None

    # Dremio object pointing to the source Dremio environment
    _dremio_env = None

    # DremioData object containing data from Dremio source environment.
    # NOTE: initialized per instance in __init__. Previously this was a
    # class-level DremioData(), a mutable class attribute shared by every
    # DremioReader instance — a latent state-leak bug.
    _d = None

    # Current top-level hierarchy context: Home, Space, Source
    _top_level_hierarchy_context = None

    def __init__(self, source_dremio, config):
        self._config = config
        self._dremio_env = source_dremio
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
        self._filter = DremioClonerFilter(config)
        # Fresh collection object per reader (see note on _d above).
        self._d = DremioData()

    # Read all data from the source Dremio environment
    # Return DremioData
    def read_dremio_environment(self):
        """Entry point: read everything in scope and return the DremioData."""
        self._read_catalog()
        if not self._config.pds_list_useapi and self._filter.is_pds_in_scope():
            self._read_all_pds()
        self._read_reflections()
        self._read_rules()
        self._read_queues()
        self._read_votes()
        # Make sure that all VDS dependencies included as per configuration
        self._process_vds_dependencies()
        return self._d

    def _read_all_pds(self):
        """Bulk-read PDSs (via INFORMATION_SCHEMA rather than the catalog API)."""
        if self._config.pds_list_useapi or not self._filter.is_pds_in_scope():
            self._logger.info("_read_all_pds: skipping PDS reading as per pds.filter configuration.")
        else:
            pds_list = self._dremio_env.list_pds(
                self._d.sources,
                self._config.source_folder_filter,
                self._config.source_folder_exclude_filter,
                self._config.pds_filter,
                self._config.pds_exclude_filter,
                pds_error_list=self._d.pds_error_list)
            for pds in pds_list:
                if self._filter.match_pds_filter(pds):
                    self._d.pds_list.append(pds)

    # Read Dremio catalog from source environment recursively going to containers and their children objects
    def _read_catalog(self):
        containers = self._dremio_env.list_catalog()['data']
        for container in containers:
            self._logger.debug("_read_catalog: processing container " + self._utils.get_entity_desc(container))
            self._process_container(container)

    # Identify a container and delegate processing
    def _process_container(self, container):
        self._logger.debug("_process_container: " + self._utils.get_entity_desc(container))
        if container['containerType'] == "HOME":
            self._read_home(container)
        elif container['containerType'] == "SPACE":
            self._read_space(container)
        elif container['containerType'] == "SOURCE":
            self._read_source(container)
        else:
            self._logger.fatal("_process_container: unexpected entity type " + self._utils.get_entity_desc(container))

    def _read_home(self, container):
        """Read a HOME container, its ACL, wiki and children (if configured)."""
        self._logger.debug("_read_home: processing container: " + self._utils.get_entity_desc(container))
        if self._config.home_process_mode == 'process':
            self._top_level_hierarchy_context = "HOME"
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.info("_read_home: " + self._utils.get_entity_desc(entity))
                self._d.homes.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error("_read_home: error reading entity for container: " + self._utils.get_entity_desc(container))
        else:
            self._logger.debug("_read_home: skipping due to job configuration")

    def _read_space(self, container):
        """Read a SPACE container matching the space filter."""
        self._logger.debug("_read_space: processing container: " + self._utils.get_entity_desc(container))
        self._top_level_hierarchy_context = "SPACE"
        if self._filter.match_space_filter(container):
            self._d.containers.append(container)
            entity = self._get_entity_definition_by_id(container)
            if entity is not None:
                self._logger.debug("_read_space: " + self._utils.get_entity_desc(container))
                self._d.spaces.append(entity)
                self._read_acl(entity)
                self._read_wiki(entity)
                self._read_space_children(entity)
            else:
                self._logger.error("_read_space: error reading entity for container: " + self._utils.get_entity_desc(container))

    def _read_source(self, container):
        """Read a SOURCE container matching the source filter."""
        self._logger.debug("_read_source: processing container: " + self._utils.get_entity_desc(container))
        if self._config.source_process_mode == 'process' or (
                self._config.pds_process_mode == 'process' and self._config.pds_list_useapi):
            self._top_level_hierarchy_context = "SOURCE"
            if self._filter.match_source_filter(container):
                self._d.containers.append(container)
                entity = self._get_entity_definition_by_id(container)
                if entity is not None:
                    # Re-validate the filter with entity since there is more details in entity
                    if self._filter.match_source_filter(entity):
                        self._logger.debug("_read_source: " + self._utils.get_entity_desc(entity))
                        self._d.sources.append(entity)
                        self._read_acl(entity)
                        self._read_wiki(entity)
                        # Depending on the useapi flag, PDSs can be collected via INFORMATION_SCHEMA. See also DX16597
                        if self._config.pds_list_useapi:
                            self._read_source_children(entity)
                else:
                    self._logger.error("_read_source: error reading entity for container: " + self._utils.get_entity_desc(container))
        else:
            self._logger.debug("_read_source: skipping due to job configuration")

    def _read_space_folder(self, folder):
        """Read a folder inside a HOME/SPACE, backfilling parent folders."""
        self._logger.debug("_read_space_folder: processing folder: " + self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context not in ["SPACE", "HOME"]:
            return
        entity = self._get_entity_definition_by_id(folder)
        if entity is None:
            self._logger.error("_read_space_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder))
            return
        if self._top_level_hierarchy_context == "HOME" or self._filter.match_space_folder_filter(folder):
            self._logger.debug("_read_space_folder: " + self._utils.get_entity_desc(folder))
            self._d.folders.append(entity)
            self._read_acl(entity)
            self._read_wiki(entity)
            # Validate all parent folders in the path have been saved already
            folder_path = entity['path']
            for i in range(1, len(folder_path) - 1):
                folderSaved = False
                for item in self._d.folders:
                    if item['path'][-1] == folder_path[i]:
                        folderSaved = True
                if not folderSaved:
                    parent_entity = self._get_entity_definition_by_path(folder_path[0:i + 1])
                    self._d.folders.append(parent_entity)
        # Children are read even when the folder itself is filtered out.
        self._read_space_children(entity)

    def _read_space_children(self, parent_entity):
        """Dispatch each child of a HOME/SPACE entity to the proper reader."""
        self._logger.debug("_read_space_children: processing parent_entity: " + self._utils.get_entity_desc(parent_entity))
        if 'entityType' not in parent_entity:
            self._logger.error("_read_space_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity))
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_space_folder(child)
            else:
                self._logger.error("_read_space_children: not supported entity type " + child['type'])

    def _read_source_folder(self, folder):
        """Read a folder inside a SOURCE when it matches the folder filter."""
        self._logger.debug("_read_source_folder: processing folder: " + self._utils.get_entity_desc(folder))
        if self._top_level_hierarchy_context == "SOURCE" and self._filter.match_source_folder_filter(folder):
            entity = self._get_entity_definition_by_id(folder)
            if entity is not None:
                self._logger.debug("_read_source_folder: " + self._utils.get_entity_desc(folder))
                self._read_source_children(entity)
            else:
                self._logger.error("_read_source_folder: error reading entity for folder: " + self._utils.get_entity_desc(folder))

    def _read_source_children(self, parent_entity):
        """Dispatch each child of a SOURCE entity to the proper reader."""
        self._logger.debug("_read_source_children: processing parent entity '" + self._utils.get_entity_desc(parent_entity) + "'")
        if 'entityType' not in parent_entity:
            self._logger.error("_read_source_children: bad data, skipping entity: " + self._utils.get_entity_desc(parent_entity))
            return
        for child in parent_entity['children']:
            if child['type'] == "DATASET":
                self._read_dataset(child)
            elif child['type'] == "FILE":
                self._read_file(child)
            elif child['containerType'] == "FOLDER":
                self._read_source_folder(child)
            else:
                self._logger.error("_read_source_children: not supported entity type " + child['type'])

    def _read_dataset(self, dataset):
        """Collect a PDS or VDS entity plus its ACL, wiki and tags."""
        self._logger.debug("_read_dataset: processing dataset: " + self._utils.get_entity_desc(dataset))
        entity = self._get_entity_definition_by_id(dataset)
        if entity is not None:
            self._logger.debug("_read_dataset: " + dataset['datasetType'] + " : " + self._utils.get_entity_desc(dataset))
            if dataset['datasetType'] == "PROMOTED" or dataset['datasetType'] == "DIRECT":
                self._d.pds_list.append(entity)
            elif dataset['datasetType'] == "VIRTUAL":
                tags = self._dremio_env.get_catalog_tags(entity['id'])
                if self._filter.match_vds_filter(dataset, tags=tags):
                    self._d.vds_list.append(entity)
            else:
                self._logger.error("_read_dataset: Unexpected dataset type " + dataset['datasetType'] + " for " + self._utils.get_entity_desc(dataset) + ".")
            self._read_acl(entity)
            self._read_wiki(entity)
            self._read_tags(entity)

    def _read_file(self, file_name):
        # Files are intentionally ignored by the reader.
        return

    def _read_reflections(self):
        """Collect reflections (EE only) and attach the owning dataset path."""
        self._logger.debug("_read_reflections: starting")
        if self._config.reflection_process_mode == 'process' and not self._config.source_ce:
            reflections = self._dremio_env.list_reflections()['data']
            for reflection in reflections:
                reflection_dataset = self._dremio_env.get_catalog_entity_by_id(reflection['datasetId'])
                if reflection_dataset is None:
                    self._logger.debug("_read_reflections: error processing reflection, cannot get path for dataset: " + reflection['datasetId'])
                    continue
                reflection_path = reflection_dataset['path']
                self._logger.debug("_read_reflections: processing reflection " + reflection['datasetId'] + " path: " + str(reflection_path))
                reflection["path"] = reflection_path
                self._d.reflections.append(reflection)
        else:
            self._logger.debug("_read_reflections: skipping reflections processing as per job configuration")

    # Note, tags are only available for datasets
    def _read_tags(self, entity):
        """Collect the tags attached to *entity*, recording the entity path."""
        self._logger.debug("_read_tags: for entity " + self._utils.get_entity_desc(entity))
        if self._config.tag_process_mode == 'process':
            tag = self._dremio_env.get_catalog_tags(entity['id'])
            if tag is not None:
                tag['entity_id'] = entity['id']
                if entity['entityType'] == 'space' or entity['entityType'] == 'source':
                    tag['path'] = [entity['name']]
                else:
                    tag['path'] = entity['path']
                if tag not in self._d.tags:
                    self._d.tags.append(tag)
        else:
            self._logger.debug("_read_tags: skipping tags processing as per job configuration")

    def _read_wiki(self, entity):
        """Collect the wiki attached to *entity*, recording the entity path."""
        self._logger.debug("_read_wiki: for entity " + self._utils.get_entity_desc(entity))
        if self._config.wiki_process_mode == 'process':
            wiki = self._dremio_env.get_catalog_wiki(entity['id'])
            if wiki is not None:
                wiki["entity_id"] = entity['id']
                if entity['entityType'] == 'space' or entity['entityType'] == 'source' or entity['entityType'] == 'home':
                    wiki['path'] = [entity['name']]
                else:
                    wiki['path'] = entity['path']
                if wiki not in self._d.wikis:
                    self._d.wikis.append(wiki)
        else:
            self._logger.debug("_read_wiki: skipping wiki processing as per job configuration")

    def _read_acl(self, entity):
        """Record the users/groups referenced by the entity's ACL."""
        self._logger.debug("_read_acl: for entity " + self._utils.get_entity_desc(entity))
        if 'accessControlList' in entity:
            acl = entity['accessControlList']
            if 'users' in acl:
                for user in acl['users']:
                    user_entity = self._dremio_env.get_user(user['id'])
                    if user_entity is not None:
                        if user_entity not in self._d.referenced_users:
                            self._d.referenced_users.append(user_entity)
            if 'groups' in acl:
                for group in acl['groups']:
                    group_entity = self._dremio_env.get_group(group['id'])
                    if group_entity is not None:
                        if group_entity not in self._d.referenced_groups:
                            self._d.referenced_groups.append(group_entity)

    def _process_vds_dependencies(self):
        """Discover and record dependency information for all collected VDSs."""
        if self._config.vds_dependencies_process_mode == 'get':
            for vds in self._d.vds_list:
                self._discover_dependencies(vds)
            for vds in self._d.vds_list:
                self._populate_dependencies_graph(vds)

    # Discovers dependencies for the passed dataset and adds them to the self._d.vds_list
    def _discover_dependencies(self, dataset):
        self._logger.debug("_discover_dependencies: processing dataset: " + self._utils.get_entity_desc(dataset))
        if dataset is None:
            self._logger.error("_discover_dependencies: Could not resolve dependency: None")
            return
        if 'type' not in dataset:
            self._logger.error("_discover_dependencies: Expected Dataset Entity but got: " + self._utils.get_entity_desc(dataset))
            return
        if dataset['type'] == 'PHYSICAL_DATASET':
            if dataset not in self._d.pds_list:
                self._d.pds_list.append(dataset)
            return
        elif dataset['type'] == 'VIRTUAL_DATASET':
            if dataset not in self._d.vds_list:
                self._d.vds_list.append(dataset)
            # Process VDS dependencies
            sql_dependency_paths = self._get_vds_dependency_paths(dataset)
            for dependency_path in sql_dependency_paths:
                dependency_path = self._utils.get_absolute_path(dependency_path, self._utils.get_sql_context(dataset))
                entity = self._find_entity(dependency_path)
                if entity is not None:
                    # Bug fix: this entity has already been read; skip it and keep
                    # processing the remaining dependencies. (Previously this was
                    # 'return', which silently dropped all later dependencies.)
                    continue
                dependency_dataset = self._dremio_env.get_catalog_entity_by_path(dependency_path)
                if dependency_dataset is None:
                    self._logger.warn("_discover_dependencies: unable to resolve dataset likely due to datasource availability: " + dependency_path)
                else:
                    self._discover_dependencies(dependency_dataset)
        else:
            self._logger.error("_discover_dependencies: Unknown Entity Type: " + dataset['type'])

    def _populate_dependencies_graph(self, vds):
        """Record the parent paths of *vds* (EE with graph support only)."""
        self._logger.debug("_populate_dependencies_graph: processing vds: " + self._utils.get_entity_desc(vds))
        vds_parent_list = self._get_vds_dependency_paths(vds)
        vds_parent_json = {
            'id': vds['id'],
            'path': vds['path'],
            'parents': vds_parent_list
        }
        if not self._config.source_ce and self._config.source_graph_support:
            self._d.vds_parents.append(vds_parent_json)

    def _get_vds_dependency_paths(self, vds):
        """Return parent paths of *vds* via the graph API, or by SQL parsing
        when the graph is unsupported/unavailable."""
        self._logger.debug("_get_vds_dependency_paths: processing vds: " + self._utils.get_entity_desc(vds))
        if self._config.source_ce or not self._config.source_graph_support:
            return parse_sql.tables_in_query(vds['sql'])
        else:
            graph = self._dremio_env.get_catalog_entity_graph_by_id(vds['id'])
            if graph is None:
                self._logger.warn("Could not receive Graph via API. Try to set graph_api_support to False in the job configuration.")
                return parse_sql.tables_in_query(vds['sql'])
            vds_parent_list = []
            for parent in graph['parents']:
                vds_parent_list.append(self._utils.normalize_path(parent['path']))
            return vds_parent_list

    def _find_entity(self, path):
        """Return the already-collected VDS or PDS whose normalized path is
        *path*, or None."""
        self._logger.debug("_find_entity: processing path: " + str(path))
        for vds in self._d.vds_list:
            if self._utils.normalize_path(vds['path']) == path:
                return vds
        for pds in self._d.pds_list:
            if self._utils.normalize_path(pds['path']) == path:
                return pds
        return None

    # Helper method, used by most read* methods
    def _get_entity_definition_by_id(self, src):
        self._logger.debug("_get_entity_definition_by_id: processing src: " + self._utils.get_entity_desc(src))
        if 'id' not in src:
            # Bug fix: log prefixes below previously said "_read_entity_definition".
            self._logger.error("_get_entity_definition_by_id: bad data, skipping entity: " + self._utils.get_entity_desc(src))
            return None
        else:
            entity = self._dremio_env.get_catalog_entity_by_id(src['id'])
            if entity is None:
                self._logger.error("_get_entity_definition_by_id: cannot retrieve entity for id: " + src['id'])
            return entity

    def _get_entity_definition_by_path(self, path):
        self._logger.debug("_get_entity_definition_by_path: processing path: " + str(path))
        path = self._utils.normalize_path(path)
        entity = self._dremio_env.get_catalog_entity_by_path(path)
        if entity is None:
            # Bug fix: log prefix previously said "_read_entity_definition".
            self._logger.error("_get_entity_definition_by_path: cannot retrieve entity for path: " + str(path))
        return entity

    def _read_queues(self):
        """Collect WLM queues (EE only)."""
        self._logger.debug("_read_queues: started")
        if self._config.wlm_queue_process_mode == 'process' and not self._config.source_ce:
            self._d.queues = self._dremio_env.list_queues()['data']
        else:
            self._logger.debug("_read_queues: skipping as per job configuration")

    def _read_rules(self):
        """Collect WLM rules (EE only)."""
        self._logger.debug("_read_rules: started")
        if self._config.wlm_rule_process_mode == 'process' and not self._config.source_ce:
            self._d.rules = self._dremio_env.list_rules()['rules']
        else:
            self._logger.debug("_read_rules: skipping as per job configuration")

    def _read_votes(self):
        """Collect reflection votes (EE only)."""
        self._logger.debug("_read_votes: started")
        if self._config.vote_process_mode == 'process' and not self._config.source_ce:
            self._d.votes = self._dremio_env.list_votes()['data']
        else:
            self._logger.debug("_read_votes: skipping as per job configuration")

    def get_errors_count(self):
        """Number of errors the logger has recorded so far."""
        return self._logger.errors_encountered
# --- Example #5 — file: DremioCascadeAcl.py (project: tejkm/dremio-cloner) ---
class DremioCascadeAcl:
    """Cascades Access Control Lists through a Dremio catalog.

    For each Space, an 'origin' ACL (the Space's own ACL or a configured
    override object's ACL) is applied to its child FOLDERs and VDSs; for
    each Source, the origin ACL is applied to its child PDSs. All
    processing honors the job's filter configuration.
    """

    # Dremio Cloner Config, Logger, Utils
    _config = None
    _logger = None
    _utils = None
    _filter = None

    # Dremio Environment to write to
    _dremio_env = None

    # List of PDS for processing (populated when pds_list_useapi is False)
    _pds_list = None

    def __init__(self, dremio, config):
        self._config = config
        self._dremio_env = dremio
        self._logger = DremioClonerLogger(self._config.max_errors,
                                          self._config.logging_verbose)
        self._utils = DremioClonerUtils(config)
        self._filter = DremioClonerFilter(config)

    def cascade_acl(self):
        """Entry point: walk all Spaces and Sources matching the job
        filters and cascade the appropriate origin ACL to their children."""
        if not self._config.pds_list_useapi:
            # Retrieve the PDS list in bulk instead of walking the catalog API.
            self._pds_list = self._dremio_env.list_pds(
                self._config.source_filter, self._config.source_exclude_filter,
                self._config.source_folder_filter,
                self._config.source_folder_exclude_filter,
                self._config.pds_filter, self._config.pds_exclude_filter)
            self._logger.info(
                "cascade_acl: Not using API for PDS retrieval. Filtered PDS are NOT reported in the log."
            )
        containers = self._dremio_env.list_catalog()['data']
        for container in containers:
            self._logger.debug("cascade_acl: processing container " +
                               self._utils.get_entity_desc(container))
            if container[
                    'containerType'] == "SPACE" and self._filter.match_space_filter(
                        container):
                self._process_space(container)
            elif container[
                    'containerType'] == "SOURCE" and self._filter.match_source_filter(
                        container):
                self._process_source(container)

    def _process_space(self, space):
        """Resolve the ACL origin for a Space and cascade it to children."""
        entity = self._get_entity_definition(space)
        if entity is None:
            self._logger.error(
                "_process_space: error reading entity for container: " +
                self._utils.get_entity_desc(space))
        else:
            if self._config.space_cascade_acl_origin_override_object is None:
                # Use Space ACL as an 'origin'
                self._logger.info(
                    "_process_space: SPACE: '" + str(space['path']) +
                    "' will be used as an ACL Origin for its children FOLDERs and VDSs."
                )
                acl = self._get_acl(entity)
            else:
                # Use ACL from a configured object
                acl_entity = self._dremio_env.get_catalog_entity_by_path(
                    self._config.space_cascade_acl_origin_override_object)
                if acl_entity is None:
                    self._logger.error(
                        "_process_space: error reading origin entity for path: "
                        + str(self._config.
                              space_cascade_acl_origin_override_object))
                    return
                self._logger.info(
                    "_process_space: SPACE: '" + str(space['path']) +
                    "' Using override origin instead as an ACL Origin for its children FOLDERs and VDSs."
                )
                acl = self._get_acl(acl_entity)
            self._process_space_children(entity, acl)

    def _process_source(self, source):
        """Resolve the ACL origin for a Source and cascade it to its PDSs."""
        entity = self._get_entity_definition(source)
        if entity is None:
            self._logger.error(
                "_process_source: error reading entity for container: " +
                self._utils.get_entity_desc(source))
        else:
            if self._config.source_cascade_acl_origin_override_object is None:
                # Use Source ACL as an 'origin'
                self._logger.info(
                    "_process_source: SOURCE: '" + str(source['path']) +
                    "' will be used as an ACL Origin for its children PDSs.")
                acl = self._get_acl(entity)
            else:
                # Use ACL from a configured object
                acl_entity = self._dremio_env.get_catalog_entity_by_path(
                    self._config.source_cascade_acl_origin_override_object)
                if acl_entity is None:
                    self._logger.error(
                        "_process_source: error reading origin entity for path: "
                        + str(self._config.
                              source_cascade_acl_origin_override_object))
                    return
                self._logger.info(
                    "_process_source: SOURCE: '" + str(source['path']) +
                    "' Using override origin instead as an ACL Origin for its children PDSs."
                )
                acl = self._get_acl(acl_entity)
            # Process PDSs
            if self._config.pds_list_useapi:
                self._process_source_children(entity, acl)
            else:
                for pds in self._pds_list:
                    # Does the PDS belong to the current Source
                    if pds['path'][0] == source['path'][0]:
                        self._logger.debug("_process_source: pds: " +
                                           self._utils.get_entity_desc(pds))
                        if self._filter.match_pds_filter(pds):
                            # Log tag fixed: this code runs in _process_source.
                            self._logger.debug(
                                "_process_source: applying ACL to PDS: "
                                + self._utils.get_entity_desc(pds))
                            self._apply_acl(pds, acl)

    def _process_source_children(self, parent_entity, acl):
        """Recursively apply the origin ACL to PDSs under a Source,
        descending through FOLDERs that match the folder filter."""
        # This is a recursive function
        if 'children' not in parent_entity:
            return
        if 'entityType' not in parent_entity:
            self._logger.error(
                "_process_source_children: bad data, skipping entity: " +
                self._utils.get_entity_desc(parent_entity))
            return
        self._logger.debug(
            "_process_source_children: processing parent entity '" +
            self._utils.get_entity_desc(parent_entity) + "'")
        for child in parent_entity['children']:
            child_entity = self._get_entity_definition(child)
            if child_entity is None:
                self._logger.error(
                    "_process_source_children: error reading entity for: " +
                    self._utils.get_entity_desc(child))
                # Fixed: originally fell through and passed None into the
                # filter/ACL code below, raising a TypeError.
                continue
            if child['type'] == "DATASET":
                if self._filter.match_pds_filter(child_entity):
                    self._logger.debug(
                        "_process_source_children: applying ACL to PDS: " +
                        self._utils.get_entity_desc(child_entity))
                    self._apply_acl(child_entity, acl)
                else:
                    # Fixed: added missing space before "as per ...".
                    self._logger.info(
                        "_process_source_children: skipping PDS: " +
                        str(child_entity['path']) +
                        " as per filter configuration")
            elif child['type'] == "FILE":
                self._logger.info("_process_source_children: skipping FILE: " +
                                  self._utils.get_entity_desc(child_entity))
            elif 'containerType' in child and child[
                    'containerType'] == "FOLDER":
                if self._filter.match_source_folder_filter(child_entity):
                    self._process_source_children(child_entity, acl)
                else:
                    # Fixed: added missing space before "as per ...".
                    self._logger.info(
                        "_process_source_children: skipping FOLDER: " +
                        str(child_entity['path']) +
                        " as per filter configuration")

    def _process_space_children(self, parent_entity, acl):
        """Recursively apply the origin ACL to VDSs and FOLDERs under a
        Space; FOLDERs matching the cascade-origin filter start a new
        origin ACL for their own subtree."""
        # This is a recursive function
        if 'children' not in parent_entity:
            return
        if 'entityType' not in parent_entity:
            self._logger.error(
                "_process_space_children: bad data, skipping entity: " +
                self._utils.get_entity_desc(parent_entity))
            return
        self._logger.debug(
            "_process_space_children: processing parent entity '" +
            self._utils.get_entity_desc(parent_entity) + "'")
        for child in parent_entity['children']:
            child_entity = self._get_entity_definition(child)
            if child_entity is None:
                self._logger.error(
                    "_process_space_children: error reading entity for: " +
                    self._utils.get_entity_desc(child))
                # Fixed: originally fell through and passed None into the
                # filter/ACL code below, raising a TypeError.
                continue
            if child['type'] == "DATASET":
                if self._filter.match_vds_filter(child_entity):
                    self._logger.debug(
                        "_process_space_children: applying ACL to VDS: " +
                        self._utils.get_entity_desc(child_entity))
                    self._apply_acl(child_entity, acl)
                else:
                    self._logger.info(
                        "_process_space_children: skipping VDS: " +
                        self._utils.get_entity_desc(child_entity))
            # Fixed: guard 'containerType' lookup like _process_source_children
            # does, so children without that key no longer raise KeyError.
            elif 'containerType' in child and child[
                    'containerType'] == "FOLDER":
                if self._filter.match_space_folder_filter(child_entity):
                    if self._filter.match_space_folder_cascade_acl_origin_filter(
                            child_entity):
                        self._logger.info(
                            "_process_space_children: FOLDER: " +
                            str(child_entity['path']) +
                            " will be used as an ACL Origin for its children.")
                        self._process_space_children(
                            child_entity, self._get_acl(child_entity))
                    else:
                        self._logger.info(
                            "_process_space_children: applying ACL to FOLDER: "
                            + self._utils.get_entity_desc(child_entity))
                        self._apply_acl(child_entity, acl)
                        self._process_space_children(child_entity, acl)
                else:
                    self._logger.info(
                        "_process_space_children: skipping FOLDER: " +
                        self._utils.get_entity_desc(child_entity))
                    self._process_space_children(child_entity, acl)

    def _get_entity_definition(self, src):
        """Fetch the full catalog entity for a child/container stub by id.

        Returns None (after logging an error) when the stub has no id or
        the environment cannot resolve it.
        """
        if 'id' not in src:
            # Log tags fixed: this method is _get_entity_definition.
            self._logger.error(
                "_get_entity_definition: bad data, skipping entity: " +
                self._utils.get_entity_desc(src))
            return None
        else:
            entity = self._dremio_env.get_catalog_entity_by_id(src['id'])
            if entity is None:
                self._logger.error(
                    "_get_entity_definition: cannot retrieve entity for id: "
                    + src['id'])
            return entity

    def _get_acl(self, entity):
        """Return the entity's accessControlList, or log fatal and return
        None when the entity carries no ACL."""
        if 'accessControlList' in entity:
            return entity['accessControlList']
        else:
            self._logger.fatal("ACL is not defined for " +
                               self._utils.get_entity_desc(entity))
            return None

    def _apply_acl(self, entity, acl):
        """Replace the entity's ACL users/groups with those from `acl` and
        push the update to the environment.

        Returns True on success, False on dry run or update failure.
        """
        # Clear the current ACL definition
        if 'accessControlList' not in entity:
            entity['accessControlList'] = {"version": "0"}
        if 'users' in entity['accessControlList']:
            entity['accessControlList'].pop('users')
        if 'groups' in entity['accessControlList']:
            entity['accessControlList'].pop('groups')
        # Apply ACL to entity
        if 'users' in acl:
            entity['accessControlList']['users'] = acl['users']
        if 'groups' in acl:
            entity['accessControlList']['groups'] = acl['groups']
        if self._config.dry_run:
            self._logger.warn("_apply_acl: Dry Run, NOT Updating entity: " +
                              self._utils.get_entity_desc(entity))
            return False
        self._logger.info("_apply_acl: updating entity: " +
                          self._utils.get_entity_desc(entity))
        updated_entity = self._dremio_env.update_catalog_entity(
            entity['id'], entity, self._config.dry_run)
        if updated_entity is None:
            self._logger.error("_apply_acl: Error updating entity: " +
                               self._utils.get_entity_desc(entity))
            return False
        return True

    def get_errors_count(self):
        """Return the number of errors the logger recorded during this run."""
        return self._logger.errors_encountered