def deleteGenProcess(data, atlasHost, atlasPort, atlasUser, atlasUserPass):
    """
    Delete the Atlas entity described by ``data``.

    The entity is looked up by its type and by a qualifiedName built as
    ``<typeName>.<name>`` from ``data['fields']``.

    :param data: Dictionary with a ``fields`` entry holding at least
        ``typeName`` and ``name``
    :param atlasHost: Atlas host
    :param atlasPort: Atlas port
    :param atlasUser: Atlas user name
    :param atlasUserPass: Atlas user password
    :return: None
    """
    # Open the connection to Atlas first, exactly as before.
    atlas = Atlas(atlasHost,
                  port=atlasPort,
                  username=atlasUser,
                  password=atlasUserPass)

    # Resolve the target by type and qualifiedName.
    type_name = data['fields']['typeName']
    qualified_name = data['fields']['typeName'] + "." + data['fields']['name']
    target = atlas.entity_unique_attribute(type_name,
                                           qualifiedName=qualified_name)

    # Remove it.
    target.delete()
def updataGenProcces(data, atlasHost, atlasPort, atlasUser, atlasUserPass):
    """
    Append new execution dates to an existing Atlas process entity.

    The entity is looked up by its type and by a qualifiedName built as
    ``<typeName>.<name>`` from ``data['fields']``. The dates found in
    ``data['fields']['executionDates']`` are appended to the entity's
    current ``executionDates`` attribute and the attribute is pushed back
    to Atlas.

    :param data: Dictionary with a ``fields`` entry holding ``typeName``,
        ``name`` and an iterable ``executionDates``
    :param atlasHost: Atlas host
    :param atlasPort: Atlas port
    :param atlasUser: Atlas user name
    :param atlasUserPass: Atlas user password
    :return: None
    """
    # Get connection to Atlas
    client = Atlas(atlasHost,
                   port=atlasPort,
                   username=atlasUser,
                   password=atlasUserPass)

    # Find Process by type and qualifiedName
    fields = data['fields']
    entity = client.entity_unique_attribute(
        fields['typeName'],
        qualifiedName=fields['typeName'] + "." + fields['name'])

    # Work on the attribute dictionary exposed by the entity object; this is
    # the same path the dates are read from, so read and write stay in sync.
    attributes = entity.entity['attributes']

    # A missing or null attribute is treated as "no executions recorded yet".
    execution_dates = list(attributes.get('executionDates') or [])

    # Add the new executionDates coming from the params file
    execution_dates.extend(fields['executionDates'])

    # Assign new value to attribute
    attributes['executionDates'] = execution_dates

    # Call atlas to update attribute (executionDates)
    entity.update(attribute='executionDates')
class AtlasProxy(BaseProxy):
    """
    Atlas Proxy client for the amundsen metadata
    {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/
    """
    TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY']
    DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE']
    STATISTICS_FORMAT_SPEC = app.config['STATISTICS_FORMAT_SPEC']
    BOOKMARK_TYPE = 'Bookmark'
    USER_TYPE = 'User'
    READER_TYPE = 'Reader'
    QN_KEY = 'qualifiedName'
    BOOKMARK_ACTIVE_KEY = 'active'
    ENTITY_ACTIVE_STATUS = 'ACTIVE'
    GUID_KEY = 'guid'
    ATTRS_KEY = 'attributes'
    REL_ATTRS_KEY = 'relationshipAttributes'
    ENTITY_URI_KEY = 'entityUri'
    _CACHE = CacheManager(**parse_cache_config_options(
        {
            'cache.regions': 'atlas_proxy',
            'cache.atlas_proxy.type': 'memory',
            'cache.atlas_proxy.expire': _ATLAS_PROXY_CACHE_EXPIRY_SEC
        }))

    def __init__(self, *,
                 host: str,
                 port: int,
                 user: str = 'admin',
                 password: str = '',
                 encrypted: bool = False,
                 validate_ssl: bool = False) -> None:
        """
        Initiate the Apache Atlas client with the provided credentials

        :param host: Atlas host
        :param port: Atlas port
        :param user: Atlas user name
        :param password: Atlas password
        :param encrypted: When True the client talks https, otherwise http
        :param validate_ssl: Passed through to the Atlas client
        """
        protocol = 'https' if encrypted else 'http'
        self._driver = Atlas(host=host,
                             port=port,
                             username=user,
                             password=password,
                             protocol=protocol,
                             validate_ssl=validate_ssl)

    def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]:
        """
        FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy
        Search for the entities based on the params provided as argument.

        :param params: the dictionary of parameters to be used for the
            basic search
        :return: The flat list of GUIDs of entities found based on the params.
        """
        ids = list()
        search_results = self._driver.search_basic(**params)
        for result in search_results:
            for entity in result.entities:
                ids.append(entity.guid)
        return ids

    def _get_flat_values_from_dsl(self, dsl_param: dict) -> List:
        """
        Makes a DSL query asking for specific attribute, extracts that
        attribute from result (which is a list of list), and converts that
        into a flat list.

        :param dsl_param: A DSL parameter, with SELECT clause
        :return: A Flat list of specified attributes in SELECT clause
        """
        attributes: List = list()
        _search_collection = self._driver.search_dsl(**dsl_param)
        # NOTE(review): only the attributes of the LAST collection are
        # returned, earlier ones are overwritten. Kept as is - confirm that
        # the DSL search is expected to yield a single collection.
        for collection in _search_collection:
            attributes = collection.flatten_attrs()
        return attributes

    def _extract_info_from_uri(self, *, table_uri: str) -> Dict:
        """
        Extracts the table information from table_uri coming from frontend.

        :param table_uri: expected shape ``<entity>://<cluster>.<db>/<name>``
        :return: Dictionary object, containing following information:
            entity: Type of entity example: rdbms_table, hive_table etc.
            cluster: Cluster information
            db: Database Name
            name: Table Name
            An empty dictionary is returned when the uri does not match.
        """
        pattern = re.compile(
            r"""
            ^   (?P<entity>.*?)
            :\/\/
                (?P<cluster>.*)
            \.
                (?P<db>.*?)
            \/
                (?P<name>.*?)
            $
            """, re.X)
        result = pattern.match(table_uri)
        return result.groupdict() if result else dict()

    def _parse_reader_qn(self, reader_qn: str) -> Dict:
        """
        Parse reader qualifiedName and extract the info

        :param reader_qn: expected shape
            ``<db>.<table>.<user_id>.reader@<cluster>``
        :return: Dictionary object containing following information:
            db: Database name
            table: Table name
            user_id: Id of the user
            cluster: cluster information
            An empty dictionary is returned when the name does not match.
        """
        pattern = re.compile(
            r"""
            ^(?P<db>[^.]*)
            \.
            (?P<table>[^.]*)
            \.
            (?P<user_id>[^.]*)\.reader
            \@
            (?P<cluster>.*)
            $
            """, re.X)
        result = pattern.match(reader_qn)
        return result.groupdict() if result else dict()

    def _parse_bookmark_qn(self, bookmark_qn: str) -> Dict:
        """
        Parse bookmark qualifiedName and extract the info

        :param bookmark_qn: Qualified Name of Bookmark entity, expected shape
            ``<db>.<table>.<entity_type>.<user_id>.bookmark@<cluster>``
        :return: Dictionary object containing following information:
            db: Database name
            table: Table name
            entity_type: Type of the bookmarked entity
            user_id: Id of the user
            cluster: cluster information
            An empty dictionary is returned when the name does not match.
        """
        pattern = re.compile(
            r"""
            ^(?P<db>[^.]*)
            \.
            (?P<table>[^.]*)
            \.
            (?P<entity_type>[^.]*)
            \.
            (?P<user_id>[^.]*)\.bookmark
            \@
            (?P<cluster>.*)
            $
            """, re.X)
        result = pattern.match(bookmark_qn)
        return result.groupdict() if result else dict()

    def _get_table_entity(self, *, table_uri: str) -> EntityUniqueAttribute:
        """
        Fetch information from table_uri and then find the appropriate entity
        The reason, we're not returning the entity_unique_attribute().entity
        directly is because the entity_unique_attribute() return entity Object
        that can be used for update purposes,
        while entity_unique_attribute().entity only returns the dictionary

        :param table_uri:
        :return: The table entity object
        :raises NotFoundException: when the lookup fails for any reason
        """
        table_info = self._extract_info_from_uri(table_uri=table_uri)
        table_qn = make_table_qualified_name(table_info.get('name'),
                                             table_info.get('cluster'),
                                             table_info.get('db'))

        try:
            # table_info['entity'] is read inside the try on purpose: a uri
            # that did not match yields an empty dict and must end up as
            # NotFoundException as well.
            return self._driver.entity_unique_attribute(table_info['entity'],
                                                        qualifiedName=table_qn)
        except Exception as ex:
            LOGGER.exception(f'Table not found. {str(ex)}')
            raise NotFoundException(
                'Table URI( {table_uri} ) does not exist'.format(
                    table_uri=table_uri)) from ex

    def _get_user_entity(self, user_id: str) -> EntityUniqueAttribute:
        """
        Fetches an user entity from an id

        :param user_id: Qualified name of the user
        :return: The user entity object
        :raises NotFoundException: when the lookup fails for any reason
        """
        try:
            return self._driver.entity_unique_attribute(self.USER_TYPE,
                                                        qualifiedName=user_id)
        except Exception as ex:
            raise NotFoundException(
                '(User {user_id}) does not exist'.format(
                    user_id=user_id)) from ex

    def _create_bookmark(self, entity: EntityUniqueAttribute, user_guid: str,
                         bookmark_qn: str, table_uri: str) -> None:
        """
        Creates a bookmark entity for a specific user and table uri.

        :param entity: Entity the bookmark points to
        :param user_guid: User's guid
        :param bookmark_qn: Bookmark qualifiedName
        :param table_uri: Uri stored on the bookmark as entityUri
        :return:
        """
        bookmark_entity = {
            'entity': {
                'typeName': self.BOOKMARK_TYPE,
                'attributes': {
                    'qualifiedName': bookmark_qn,
                    self.BOOKMARK_ACTIVE_KEY: True,
                    'entityUri': table_uri,
                    'user': {
                        'guid': user_guid
                    },
                    'entity': {
                        'guid': entity.entity[self.GUID_KEY]
                    }
                }
            }
        }

        self._driver.entity_post.create(data=bookmark_entity)

    def _get_bookmark_entity(self, entity_uri: str,
                             user_id: str) -> EntityUniqueAttribute:
        """
        Fetch a Bookmark entity from parsing table uri and user id.
        If Bookmark is not present, create one for the user.

        :param entity_uri: Uri of the bookmarked table
        :param user_id: Qualified Name of a user
        :return: The bookmark entity object
        :raises NotFoundException: when fetching or creating the bookmark
            fails for any reason
        """
        table_info = self._extract_info_from_uri(table_uri=entity_uri)
        bookmark_qn = '{}.{}.{}.{}.bookmark@{}'.format(
            table_info.get('db'), table_info.get('name'),
            table_info.get('entity'), user_id, table_info.get('cluster'))

        try:
            bookmark_entity = self._driver.entity_unique_attribute(
                self.BOOKMARK_TYPE, qualifiedName=bookmark_qn)

            if not bookmark_entity.entity:
                table_entity = self._get_table_entity(table_uri=entity_uri)
                # Fetch user entity from user_id for relation
                user_entity = self._get_user_entity(user_id)
                # Create bookmark entity with the user relation.
                self._create_bookmark(table_entity,
                                      user_entity.entity[self.GUID_KEY],
                                      bookmark_qn, entity_uri)
                # Fetch bookmark entity after creating it.
                bookmark_entity = self._driver.entity_unique_attribute(
                    self.BOOKMARK_TYPE, qualifiedName=bookmark_qn)

            return bookmark_entity
        except Exception as ex:
            LOGGER.exception(f'Bookmark not found. {str(ex)}')
            raise NotFoundException(
                'Bookmark( {bookmark_qn} ) does not exist'.format(
                    bookmark_qn=bookmark_qn)) from ex

    def _get_column(self, *, table_uri: str, column_name: str) -> Dict:
        """
        Fetch the column information from referredEntities of the table entity

        :param table_uri:
        :param column_name:
        :return: A dictionary containing the column details
        :raises NotFoundException: when no column carries that name or an
            expected key is missing from the table entity
        """
        try:
            table_entity = self._get_table_entity(table_uri=table_uri)
            columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns')
            for column in columns or list():
                col_details = table_entity.referredEntities[column[
                    self.GUID_KEY]]
                if column_name == col_details[self.ATTRS_KEY]['name']:
                    return col_details

            raise NotFoundException(f'Column not found: {column_name}')

        except KeyError as ex:
            LOGGER.exception(f'Column not found: {str(ex)}')
            raise NotFoundException(
                f'Column not found: {column_name}') from ex

    def _serialize_columns(self, *, entity: EntityUniqueAttribute) -> \
            Union[List[Column], List]:
        """
        Helper function to fetch the columns from entity and serialize them
        using Column and Statistics model.

        Columns whose entityStatus is not active are skipped. Statistics are
        renamed, formatted or dropped according to STATISTICS_FORMAT_SPEC.

        :param entity: EntityUniqueAttribute object,
            along with relationshipAttributes
        :return: A list of Column objects sorted by sort_order, if there are
            any columns available, else an empty list.
        """
        columns = list()
        for column in entity.entity[self.REL_ATTRS_KEY].get(
                'columns') or list():
            column_status = column.get('entityStatus', 'inactive').lower()

            if column_status != 'active':
                continue

            col_entity = entity.referredEntities[column[self.GUID_KEY]]
            col_attrs = col_entity[self.ATTRS_KEY]
            statistics = list()

            for stats in col_attrs.get('statistics') or list():
                stats_attrs = stats['attributes']

                stat_type = stats_attrs.get('stat_name')

                stat_format = self.STATISTICS_FORMAT_SPEC.get(
                    stat_type, dict())

                if not stat_format.get('drop', False):
                    stat_type = stat_format.get('new_name', stat_type)

                    stat_val = stats_attrs.get('stat_val')

                    format_val = stat_format.get('format')

                    if format_val:
                        stat_val = format_val.format(stat_val)
                    else:
                        stat_val = str(stat_val)

                    start_epoch = stats_attrs.get('start_epoch')
                    end_epoch = stats_attrs.get('end_epoch')

                    statistics.append(
                        Statistics(
                            stat_type=stat_type,
                            stat_val=stat_val,
                            start_epoch=start_epoch,
                            end_epoch=end_epoch,
                        ))

            columns.append(
                Column(
                    name=col_attrs.get('name'),
                    description=col_attrs.get('description')
                    or col_attrs.get('comment'),
                    col_type=col_attrs.get('type')
                    or col_attrs.get('dataType'),
                    # Columns without a position are pushed to the end.
                    sort_order=col_attrs.get('position') or 9999,
                    stats=statistics,
                ))
        return sorted(columns, key=lambda item: item.sort_order)

    def _get_reports(self, guids: List[str]) -> List[ResourceReport]:
        """
        Fetch the report entities for the given guids and turn the active
        ones into ResourceReport objects.

        :param guids: Guids of the report entities
        :return: The reports, passed through the callable configured as
            RESOURCE_REPORT_CLIENT when one is set
        """
        reports = []
        if guids:
            report_entities_collection = self._driver.entity_bulk(guid=guids)
            for report_entity in extract_entities(report_entities_collection):
                try:
                    if report_entity.status == self.ENTITY_ACTIVE_STATUS:
                        report_attrs = report_entity.attributes
                        reports.append(
                            ResourceReport(name=report_attrs['name'],
                                           url=report_attrs['url']))
                except (KeyError, AttributeError) as ex:
                    # A malformed report must not break the whole table page.
                    LOGGER.exception(
                        'Error while accessing table report: {}. {}'.format(
                            str(report_entity), str(ex)))

        parsed_reports = app.config['RESOURCE_REPORT_CLIENT'](reports) \
            if app.config['RESOURCE_REPORT_CLIENT'] else reports

        return parsed_reports

    def get_user(self, *, id: str) -> Union[UserEntity, None]:
        # Not implemented for Atlas.
        pass

    def get_users(self) -> List[UserEntity]:
        # Not implemented for Atlas.
        pass

    def get_table(self, *, table_uri: str) -> Table:
        """
        Gathers all the information needed for the Table Detail Page.

        :param table_uri:
        :return: A Table object with all the information available
            or gathered from different entities.
        :raises BadRequest: when a required key is missing from the entity
        """
        entity = self._get_table_entity(table_uri=table_uri)
        table_details = entity.entity

        try:
            attrs = table_details[self.ATTRS_KEY]

            programmatic_descriptions = self._get_programmatic_descriptions(
                attrs.get('parameters'))

            table_qn = parse_table_qualified_name(
                qualified_name=attrs.get(self.QN_KEY))

            tags = []
            # Using or in case, if the key 'classifications' is there with a None
            for classification in table_details.get(
                    "classifications") or list():
                tags.append(
                    Tag(tag_name=classification.get('typeName'),
                        tag_type="default"))

            columns = self._serialize_columns(entity=entity)

            reports_guids = [
                report.get("guid")
                for report in attrs.get("reports") or list()
            ]

            table = Table(
                database=table_details.get('typeName'),
                cluster=table_qn.get('cluster_name', ''),
                schema=table_qn.get('db_name', ''),
                name=attrs.get('name') or table_qn.get("table_name", ''),
                tags=tags,
                description=attrs.get('description') or attrs.get('comment'),
                owners=[User(email=attrs.get('owner'))],
                resource_reports=self._get_reports(guids=reports_guids),
                columns=columns,
                table_readers=self._get_readers(attrs.get(self.QN_KEY)),
                last_updated_timestamp=self._parse_date(
                    table_details.get('updateTime')),
                programmatic_descriptions=programmatic_descriptions)

            return table
        except KeyError as ex:
            LOGGER.exception(
                'Error while accessing table information. {}'.format(str(ex)))
            raise BadRequest(
                'Some of the required attributes '
                'are missing in : ( {table_uri} )'.format(
                    table_uri=table_uri)) from ex

    def delete_owner(self, *, table_uri: str, owner: str) -> None:
        # Not implemented for Atlas.
        pass

    def add_owner(self, *, table_uri: str, owner: str) -> None:
        """
        It simply replaces the owner field in atlas with the new string.
        FixMe (Verdan): Implement multiple data owners and
        atlas changes in the documentation if needed to make owner field a list

        :param table_uri:
        :param owner: Email address of the owner
        :return: None, as it simply adds the owner.
        """
        entity = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['owner'] = owner
        entity.update()

    def get_table_description(self, *, table_uri: str) -> Union[str, None]:
        """
        :param table_uri:
        :return: The description of the table as a string
        """
        entity = self._get_table_entity(table_uri=table_uri)
        return entity.entity[self.ATTRS_KEY].get('description')

    def put_table_description(self, *, table_uri: str,
                              description: str) -> None:
        """
        Update the description of the given table.

        :param table_uri:
        :param description: Description string
        :return: None
        """
        entity = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update()

    def add_tag(self, *, id: str, tag: str, tag_type: str,
                resource_type: ResourceType = ResourceType.Table) -> None:
        """
        Assign the tag/classification to the give table
        API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST

        :param id: Table uri
        :param tag: Tag/Classification Name
        :param tag_type: Not used by this implementation
        :param resource_type: Not used by this implementation
        :return: None
        """
        entity = self._get_table_entity(table_uri=id)
        entity_bulk_tag = {
            "classification": {
                "typeName": tag
            },
            "entityGuids": [entity.entity[self.GUID_KEY]]
        }
        self._driver.entity_bulk_classification.create(data=entity_bulk_tag)

    def delete_tag(self, *, id: str, tag: str, tag_type: str,
                   resource_type: ResourceType = ResourceType.Table) -> None:
        """
        Delete the assigned classfication/tag from the given table
        API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE

        :param id: Table uri
        :param tag: Tag/Classification Name
        :param tag_type: Not used by this implementation
        :param resource_type: Not used by this implementation
        :return:
        """
        try:
            entity = self._get_table_entity(table_uri=id)
            guid_entity = self._driver.entity_guid(
                entity.entity[self.GUID_KEY])
            guid_entity.classifications(tag).delete()
        except Exception as ex:
            # FixMe (Verdan): Too broad exception. Please make it specific
            LOGGER.exception('For some reason this deletes the classification '
                             'but also always return exception. {}'.format(
                                 str(ex)))

    def put_column_description(self, *, table_uri: str, column_name: str,
                               description: str) -> None:
        """
        :param table_uri:
        :param column_name: Name of the column to update the description
        :param description: The description string
        :return: None, as it simply updates the description of a column
        """
        column_detail = self._get_column(table_uri=table_uri,
                                         column_name=column_name)
        col_guid = column_detail[self.GUID_KEY]

        entity = self._driver.entity_guid(col_guid)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update(attribute='description')

    def get_column_description(self, *, table_uri: str,
                               column_name: str) -> Union[str, None]:
        """
        :param table_uri:
        :param column_name:
        :return: The column description using the referredEntities
            information of a table entity
        """
        column_detail = self._get_column(table_uri=table_uri,
                                         column_name=column_name)
        return column_detail[self.ATTRS_KEY].get('description')

    def get_popular_tables(self, *, num_entries: int) -> List[PopularTable]:
        """
        :param num_entries: Number of popular tables to fetch
        :return: A List of popular tables instances
        """
        popular_tables = list()
        popular_query_params = {
            'typeName': 'Table',
            'sortBy': 'popularityScore',
            'sortOrder': 'DESCENDING',
            'excludeDeletedEntities': True,
            'limit': num_entries
        }
        search_results = self._driver.search_basic.create(
            data=popular_query_params)
        for table in search_results.entities:
            table_attrs = table.attributes

            table_qn = parse_table_qualified_name(
                qualified_name=table_attrs.get(self.QN_KEY))

            table_name = table_qn.get("table_name") or table_attrs.get('name')
            db_name = table_qn.get("db_name", '')
            db_cluster = table_qn.get("cluster_name", '')

            popular_table = PopularTable(
                database=table.typeName,
                cluster=db_cluster,
                schema=db_name,
                name=table_name,
                description=table_attrs.get('description')
                or table_attrs.get('comment'))
            popular_tables.append(popular_table)

        return popular_tables

    def get_latest_updated_ts(self) -> int:
        # Not implemented for Atlas.
        pass

    def get_tags(self) -> List:
        """
        Fetch all the classification entity definitions from atlas as this
        will be used to generate the autocomplete on the table detail page

        :return: A list of TagDetail Objects
        """
        tags = []
        for metrics in self._driver.admin_metrics:
            tag_stats = metrics.tag
            for tag, count in tag_stats["tagEntities"].items():
                tags.append(TagDetail(tag_name=tag, tag_count=count))
        return tags

    def get_dashboard_by_user_relation(self, *, user_email: str, relation_type: UserResourceRel) \
            -> Dict[str, List[DashboardSummary]]:
        # Not implemented for Atlas.
        pass

    def get_table_by_user_relation(
            self, *, user_email: str,
            relation_type: UserResourceRel) -> Dict[str, Any]:
        """
        Fetch the tables a user has an active bookmark on.

        :param user_email: Id of the user, as used in the bookmark
            qualifiedName
        :param relation_type: Not used by this implementation
        :return: Dictionary with the key 'table' holding PopularTable objects
        """
        params = {
            'typeName': self.BOOKMARK_TYPE,
            'offset': '0',
            'limit': '1000',
            'excludeDeletedEntities': True,
            'entityFilters': {
                'condition': 'AND',
                'criterion': [{
                    'attributeName': self.QN_KEY,
                    'operator': 'contains',
                    'attributeValue': f'.{user_email}.bookmark'
                }, {
                    'attributeName': self.BOOKMARK_ACTIVE_KEY,
                    'operator': 'eq',
                    'attributeValue': 'true'
                }]
            },
            'attributes': ['count', self.QN_KEY, self.ENTITY_URI_KEY]
        }
        # Fetches the bookmark entities based on filters
        search_results = self._driver.search_basic.create(data=params)

        results = []
        for record in search_results.entities:
            table_info = self._extract_info_from_uri(
                table_uri=record.attributes[self.ENTITY_URI_KEY])
            res = self._parse_bookmark_qn(record.attributes[self.QN_KEY])
            results.append(
                PopularTable(database=table_info['entity'],
                             cluster=res['cluster'],
                             schema=res['db'],
                             name=res['table']))
        return {'table': results}

    def get_frequently_used_tables(
            self, *, user_email: str) -> Dict[str, List[PopularTable]]:
        """
        Fetch the tables read by a user, ordered by ascending read count.

        :param user_email: Qualified name of the user entity
        :return: Dictionary with the key 'table' holding PopularTable objects
        """
        user = self._driver.entity_unique_attribute(
            self.USER_TYPE, qualifiedName=user_email).entity

        readers_guids = []
        # 'entityReads' may be absent or None for users without any read.
        for user_reads in user[self.REL_ATTRS_KEY].get(
                'entityReads') or list():
            entity_status = user_reads['entityStatus']
            relationship_status = user_reads['relationshipStatus']

            if entity_status == 'ACTIVE' and relationship_status == 'ACTIVE':
                readers_guids.append(user_reads['guid'])

        # Same guard as the other bulk lookups of this class: do not send a
        # bulk request without guids.
        if not readers_guids:
            return {'table': []}

        readers = extract_entities(
            self._driver.entity_bulk(guid=readers_guids,
                                     ignoreRelationships=True))

        # Kept as (count, details) pairs rather than a dict keyed by count,
        # so that two tables with the same read count are both reported.
        counted_tables = []
        for reader in readers:
            entity_uri = reader.attributes.get(self.ENTITY_URI_KEY)
            count = reader.attributes.get('count')

            if count:
                details = self._extract_info_from_uri(table_uri=entity_uri)

                counted_tables.append(
                    (count,
                     dict(cluster=details.get('cluster'),
                          name=details.get('name'),
                          schema=details.get('db'),
                          database=details.get('entity'))))

        counted_tables.sort(key=lambda item: item[0])

        results = [PopularTable(**data) for _, data in counted_tables]

        return {'table': results}

    def add_resource_relation_by_user(self, *, id: str, user_id: str,
                                      relation_type: UserResourceRel,
                                      resource_type: ResourceType) -> None:
        """
        Activate the bookmark of a user on a table.

        :param id: Table uri
        :param user_id: Id of the user
        :param relation_type: Passed through, not evaluated
        :param resource_type: Only ResourceType.Table is supported
        :raises NotImplementedError: for any other resource type
        """
        if resource_type is not ResourceType.Table:
            raise NotImplementedError(
                'resource type {} is not supported'.format(resource_type))

        self._add_table_relation_by_user(table_uri=id,
                                         user_email=user_id,
                                         relation_type=relation_type)

    def _add_table_relation_by_user(self, *, table_uri: str, user_email: str,
                                    relation_type: UserResourceRel) -> None:
        """
        Fetch (or create) the bookmark of the user and mark it active.
        """
        entity = self._get_bookmark_entity(entity_uri=table_uri,
                                           user_id=user_email)
        entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = True
        entity.update()

    def delete_resource_relation_by_user(self, *, id: str, user_id: str,
                                         relation_type: UserResourceRel,
                                         resource_type: ResourceType) -> None:
        """
        Deactivate the bookmark of a user on a table.

        :param id: Table uri
        :param user_id: Id of the user
        :param relation_type: Passed through, not evaluated
        :param resource_type: Only ResourceType.Table is supported
        :raises NotImplementedError: for any other resource type
        """
        if resource_type is not ResourceType.Table:
            raise NotImplementedError(
                'resource type {} is not supported'.format(resource_type))

        self._delete_table_relation_by_user(table_uri=id,
                                            user_email=user_id,
                                            relation_type=relation_type)

    def _delete_table_relation_by_user(self, *, table_uri: str,
                                       user_email: str,
                                       relation_type: UserResourceRel) -> None:
        """
        Fetch (or create) the bookmark of the user and mark it inactive.
        """
        entity = self._get_bookmark_entity(entity_uri=table_uri,
                                           user_id=user_email)
        entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = False
        entity.update()

    def _parse_date(self, date: int) -> Optional[int]:
        """
        Keep the first ten characters of the given value and return them as
        an int.

        :param date: Value to trim, e.g. the 'updateTime' of an entity
        :return: The trimmed value, or None when fewer than ten characters
            are available or they do not form an integer
        """
        date_trimmed = str(date)[:10]

        # Explicit check instead of an assert, which is removed under -O.
        if len(date_trimmed) != 10:
            return None

        try:
            return int(date_trimmed)
        except ValueError:
            return None

    def _get_readers(self, qualified_name: str,
                     top: Optional[int] = 15) -> List[Reader]:
        """
        Fetch the readers of a table, sorted by descending read count.

        :param qualified_name: Qualified name of the table; the part before
            '@' is used as prefix of the reader qualified names
        :param top: Maximum number of readers to search for
        :return: A list of Reader objects
        """
        params = {
            'typeName': self.READER_TYPE,
            'offset': '0',
            'limit': top,
            'excludeDeletedEntities': True,
            'entityFilters': {
                'condition': 'AND',
                'criterion': [{
                    'attributeName': self.QN_KEY,
                    'operator': 'STARTSWITH',
                    'attributeValue': qualified_name.split('@')[0] + '.'
                }, {
                    'attributeName': 'count',
                    'operator': 'gte',
                    'attributeValue': f'{app.config["POPULAR_TABLE_MINIMUM_READER_COUNT"]}'
                }]
            },
            'attributes': ['count', self.QN_KEY],
            'sortBy': 'count',
            'sortOrder': 'DESCENDING'
        }

        search_results = self._driver.search_basic.create(
            data=params, ignoreRelationships=False)

        readers = []

        for record in search_results.entities:
            readers.append(record.guid)

        results = []

        if readers:
            # The search result is only used for the guids; the full entities
            # are fetched in bulk to read their relationshipAttributes.
            read_entities = extract_entities(
                self._driver.entity_bulk(guid=readers,
                                         ignoreRelationships=False))

            for read_entity in read_entities:
                reader = Reader(user=User(
                    email=read_entity.relationshipAttributes['user']
                    ['displayText'],
                    user_id=read_entity.relationshipAttributes['user']
                    ['displayText']),
                                read_count=read_entity.attributes['count'])

                results.append(reader)

        return results

    def _get_programmatic_descriptions(
            self, parameters: dict) -> List[ProgrammaticDescription]:
        """
        Turn the entity parameters into programmatic descriptions.

        Parameters whose name matches one of the regular expressions
        configured as PROGRAMMATIC_DESCRIPTIONS_EXCLUDE_FILTERS are skipped.
        The remaining names are converted from camelCase to lower case words.

        :param parameters: Mapping of source name to text, may be None
        :return: The descriptions sorted by their source
        """
        # The attribute may be missing on the entity, in which case the
        # caller hands over None.
        if not parameters:
            return []

        # Compiled once, not once per parameter.
        exclude_patterns = [
            re.compile(regex_filter) for regex_filter in
            app.config['PROGRAMMATIC_DESCRIPTIONS_EXCLUDE_FILTERS']
        ]

        programmatic_descriptions: Dict[str, ProgrammaticDescription] = {}

        for source, text in parameters.items():
            if any(pattern.match(source) for pattern in exclude_patterns):
                continue

            source = re.sub(r"([a-z])([A-Z])", r"\g<1> \g<2>", source).lower()
            programmatic_descriptions[source] = ProgrammaticDescription(
                source=source, text=text)

        return [
            programmatic_descriptions[source]
            for source in sorted(programmatic_descriptions)
        ]

    def get_dashboard(
        self,
        dashboard_uri: str,
    ) -> DashboardDetailEntity:
        # Not implemented for Atlas.
        pass

    def get_dashboard_description(self, *, id: str) -> Description:
        # Not implemented for Atlas.
        pass

    def put_dashboard_description(self, *, id: str, description: str) -> None:
        # Not implemented for Atlas.
        pass

    def get_resources_using_table(
            self, *, id: str,
            resource_type: ResourceType) -> Dict[str, List[DashboardSummary]]:
        # Not implemented for Atlas, always an empty result.
        return {}
class AtlasProxy(BaseProxy): """ Atlas Proxy client for the amundsen metadata {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/ """ TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY'] DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE'] READER_TYPE = 'Reader' QN_KEY = 'qualifiedName' BKMARKS_KEY = 'isFollowing' METADATA_KEY = 'metadata' GUID_KEY = 'guid' ATTRS_KEY = 'attributes' REL_ATTRS_KEY = 'relationshipAttributes' ENTITY_URI_KEY = 'entityUri' _CACHE = CacheManager(**parse_cache_config_options({'cache.regions': 'atlas_proxy', 'cache.atlas_proxy.type': 'memory', 'cache.atlas_proxy.expire': _ATLAS_PROXY_CACHE_EXPIRY_SEC})) def __init__(self, *, host: str, port: int, user: str = 'admin', password: str = '') -> None: """ Initiate the Apache Atlas client with the provided credentials """ self._driver = Atlas(host=host, port=port, username=user, password=password) def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]: """ FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy Search for the entities based on the params provided as argument. :param params: the dictionary of parameters to be used for the basic search :return: The flat list of GUIDs of entities founds based on the params. """ ids = list() search_results = self._driver.search_basic(**params) for result in search_results: for entity in result.entities: ids.append(entity.guid) return ids def _get_flat_values_from_dsl(self, dsl_param: dict) -> List: """ Makes a DSL query asking for specific attribute, extracts that attribute from result (which is a list of list, and converts that into a flat list. 
:param dsl_param: A DSL parameter, with SELECT clause :return: A Flat list of specified attributes in SELECT clause """ attributes: List = list() _search_collection = self._driver.search_dsl(**dsl_param) for collection in _search_collection: attributes = collection.flatten_attrs() return attributes def _extract_info_from_uri(self, *, table_uri: str) -> Dict: """ Extracts the table information from table_uri coming from frontend. :param table_uri: :return: Dictionary object, containing following information: entity: Type of entity example: rdbms_table, hive_table etc. cluster: Cluster information db: Database Name name: Table Name """ pattern = re.compile(r""" ^ (?P<entity>.*?) :\/\/ (?P<cluster>.*) \. (?P<db>.*?) \/ (?P<name>.*?) $ """, re.X) result = pattern.match(table_uri) return result.groupdict() if result else dict() def _parse_reader_qn(self, reader_qn: str) -> Dict: """ Parse reader qualifiedName and extract the info :param reader_qn: :return: Dictionary object containing following information: cluster: cluster information db: Database name name: Table name """ pattern = re.compile(r""" ^(?P<db>[^.]*) \. (?P<table>[^.]*)\.metadata \. 
(?P<user_id>[^.]*)\.reader \@ (?P<cluster>.*) $ """, re.X) result = pattern.match(reader_qn) return result.groupdict() if result else dict() def _get_table_entity(self, *, table_uri: str) -> Tuple[EntityUniqueAttribute, Dict]: """ Fetch information from table_uri and then find the appropriate entity The reason, we're not returning the entity_unique_attribute().entity directly is because the entity_unique_attribute() return entity Object that can be used for update purposes, while entity_unique_attribute().entity only returns the dictionary :param table_uri: :return: A tuple of Table entity and parsed information of table qualified name """ table_info = self._extract_info_from_uri(table_uri=table_uri) table_qn = make_table_qualified_name(table_info.get('name'), table_info.get('cluster'), table_info.get('db') ) try: return self._driver.entity_unique_attribute( table_info['entity'], qualifiedName=table_qn), table_info except Exception as ex: LOGGER.exception(f'Table not found. {str(ex)}') raise NotFoundException('Table URI( {table_uri} ) does not exist' .format(table_uri=table_uri)) def _get_user_entity(self, user_id: str) -> EntityUniqueAttribute: """ Fetches an user entity from an id :param user_id: :return: """ try: return self._driver.entity_unique_attribute("User", qualifiedName=user_id) except Exception as ex: raise NotFoundException('(User {user_id}) does not exist' .format(user_id=user_id)) def _create_reader(self, metadata_guid: str, user_guid: str, reader_qn: str, table_uri: str) -> None: """ Creates a reader entity for a specific user and table uri. 
:param metadata_guid: Table's metadata guid :param user_guid: User's guid :param reader_qn: Reader qualifiedName :return: """ reader_entity = { 'typeName': self.READER_TYPE, 'attributes': {'qualifiedName': reader_qn, 'isFollowing': True, 'count': 0, 'entityMetadata': {'guid': metadata_guid}, 'user': {'guid': user_guid}, 'entityUri': table_uri} } self._driver.entity_bulk.create(data={'entities': [reader_entity]}) def _get_reader_entity(self, table_uri: str, user_id: str) -> EntityUniqueAttribute: """ Fetch a Reader entity from parsing table uri and user id. If Reader is not present, create one for the user. :param table_uri: :param user_id: Qualified Name of a user :return: """ table_info = self._extract_info_from_uri(table_uri=table_uri) reader_qn = '{}.{}.metadata.{}.reader@{}'.format(table_info.get('db'), table_info.get('name'), user_id, table_info.get('cluster')) try: reader_entity = self._driver.entity_unique_attribute( self.READER_TYPE, qualifiedName=reader_qn) if not reader_entity.entity: # Fetch the table entity from the uri for obtaining metadata guid. table_entity, table_info = self._get_table_entity(table_uri=table_uri) # Fetch user entity from user_id for relation user_entity = self._get_user_entity(user_id) # Create reader entity with the metadata and user relation. self._create_reader(table_entity.entity[self.ATTRS_KEY][self.METADATA_KEY][self.GUID_KEY], user_entity.entity[self.GUID_KEY], reader_qn, table_uri) # Fetch reader entity after creating it. reader_entity = self._driver.entity_unique_attribute(self.READER_TYPE, qualifiedName=reader_qn) return reader_entity except Exception as ex: LOGGER.exception(f'Reader not found. 
{str(ex)}') raise NotFoundException('Reader( {reader_qn} ) does not exist' .format(reader_qn=reader_qn)) def _get_column(self, *, table_uri: str, column_name: str) -> Dict: """ Fetch the column information from referredEntities of the table entity :param table_uri: :param column_name: :return: A dictionary containing the column details """ try: table_entity, _ = self._get_table_entity(table_uri=table_uri) columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns') for column in columns or list(): col_details = table_entity.referredEntities[column['guid']] if column_name == col_details[self.ATTRS_KEY]['name']: return col_details raise NotFoundException(f'Column not found: {column_name}') except KeyError as ex: LOGGER.exception(f'Column not found: {str(ex)}') raise NotFoundException(f'Column not found: {column_name}') def _serialize_columns(self, *, entity: EntityUniqueAttribute) -> \ Union[List[Column], List]: """ Helper function to fetch the columns from entity and serialize them using Column and Statistics model. :param entity: EntityUniqueAttribute object, along with relationshipAttributes :return: A list of Column objects, if there are any columns available, else an empty list. 
""" columns = list() for column in entity.entity[self.REL_ATTRS_KEY].get('columns') or list(): col_entity = entity.referredEntities[column['guid']] col_attrs = col_entity[self.ATTRS_KEY] col_rel_attrs = col_entity[self.REL_ATTRS_KEY] col_metadata = col_rel_attrs.get('metadata') statistics = list() if col_metadata: col_metadata = entity.referredEntities.get(col_metadata.get('guid')) for stats in col_metadata['attributes'].get('statistics') or list(): stats_attrs = stats['attributes'] statistics.append( Statistics( stat_type=stats_attrs.get('stat_name'), stat_val=stats_attrs.get('stat_val'), start_epoch=stats_attrs.get('start_epoch'), end_epoch=stats_attrs.get('end_epoch'), ) ) columns.append( Column( name=col_attrs.get('name'), description=col_attrs.get('description') or col_attrs.get('comment'), col_type=col_attrs.get('type') or col_attrs.get('dataType'), sort_order=col_attrs.get('position'), stats=statistics, ) ) return sorted(columns, key=lambda item: item.sort_order) def get_user_detail(self, *, user_id: str) -> Union[UserEntity, None]: pass def get_table(self, *, table_uri: str) -> Table: """ Gathers all the information needed for the Table Detail Page. :param table_uri: :return: A Table object with all the information available or gathered from different entities. 
""" entity, table_info = self._get_table_entity(table_uri=table_uri) table_details = entity.entity try: attrs = table_details[self.ATTRS_KEY] table_qn = parse_table_qualified_name( qualified_name=attrs.get(self.QN_KEY) ) tags = [] # Using or in case, if the key 'classifications' is there with a None for classification in table_details.get("classifications") or list(): tags.append( Tag( tag_name=classification.get('typeName'), tag_type="default" ) ) columns = self._serialize_columns(entity=entity) table = Table( database=table_details.get('typeName'), cluster=table_qn.get('cluster_name', ''), schema=table_qn.get('db_name', ''), name=attrs.get('name') or table_qn.get("table_name", ''), tags=tags, description=attrs.get('description') or attrs.get('comment'), owners=[User(email=attrs.get('owner'))], columns=columns, last_updated_timestamp=table_details.get('updateTime')) return table except KeyError as ex: LOGGER.exception('Error while accessing table information. {}' .format(str(ex))) raise BadRequest('Some of the required attributes ' 'are missing in : ( {table_uri} )' .format(table_uri=table_uri)) def delete_owner(self, *, table_uri: str, owner: str) -> None: pass def add_owner(self, *, table_uri: str, owner: str) -> None: """ It simply replaces the owner field in atlas with the new string. FixMe (Verdan): Implement multiple data owners and atlas changes in the documentation if needed to make owner field a list :param table_uri: :param owner: Email address of the owner :return: None, as it simply adds the owner. 
""" entity, _ = self._get_table_entity(table_uri=table_uri) entity.entity[self.ATTRS_KEY]['owner'] = owner entity.update() def get_table_description(self, *, table_uri: str) -> Union[str, None]: """ :param table_uri: :return: The description of the table as a string """ entity, _ = self._get_table_entity(table_uri=table_uri) return entity.entity[self.ATTRS_KEY].get('description') def put_table_description(self, *, table_uri: str, description: str) -> None: """ Update the description of the given table. :param table_uri: :param description: Description string :return: None """ entity, _ = self._get_table_entity(table_uri=table_uri) entity.entity[self.ATTRS_KEY]['description'] = description entity.update() def add_tag(self, *, table_uri: str, tag: str, tag_type: str) -> None: """ Assign the tag/classification to the give table API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST :param table_uri: :param tag: Tag/Classification Name :param tag_type :return: None """ entity, _ = self._get_table_entity(table_uri=table_uri) entity_bulk_tag = {"classification": {"typeName": tag}, "entityGuids": [entity.entity['guid']]} self._driver.entity_bulk_classification.create(data=entity_bulk_tag) def delete_tag(self, *, table_uri: str, tag: str, tag_type: str) -> None: """ Delete the assigned classfication/tag from the given table API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE :param table_uri: :param tag: :return: """ try: entity, _ = self._get_table_entity(table_uri=table_uri) guid_entity = self._driver.entity_guid(entity.entity['guid']) guid_entity.classifications(tag).delete() except Exception as ex: # FixMe (Verdan): Too broad exception. Please make it specific LOGGER.exception('For some reason this deletes the classification ' 'but also always return exception. 
{}'.format(str(ex))) def put_column_description(self, *, table_uri: str, column_name: str, description: str) -> None: """ :param table_uri: :param column_name: Name of the column to update the description :param description: The description string :return: None, as it simply updates the description of a column """ column_detail = self._get_column( table_uri=table_uri, column_name=column_name) col_guid = column_detail['guid'] entity = self._driver.entity_guid(col_guid) entity.entity[self.ATTRS_KEY]['description'] = description entity.update(attribute='description') def get_column_description(self, *, table_uri: str, column_name: str) -> Union[str, None]: """ :param table_uri: :param column_name: :return: The column description using the referredEntities information of a table entity """ column_detail = self._get_column( table_uri=table_uri, column_name=column_name) return column_detail[self.ATTRS_KEY].get('description') @_CACHE.region('atlas_proxy', '_get_metadata_entities') def _get_metadata_entities(self, popular_query_params: dict) -> List: try: popular_tables_guids = list() # Fetch the metadata entities based on popularity score search_results = self._driver.search_basic.create(data=popular_query_params) for metadata in search_results.entities: table_guid = metadata.attributes.get("table").get("guid") popular_tables_guids.append(table_guid) # In order to get comments and other extra fields from table entity table_collection = self._driver.entity_bulk(guid=popular_tables_guids, ignoreRelationships=True) table_entities: List = list() for _collection in table_collection: table_entities.extend(_collection.entities) return table_entities except (KeyError, TypeError) as ex: LOGGER.exception(f'_get_metadata_entities Failed : {ex}') raise NotFoundException('Unable to fetch popular tables. 
' 'Please check your configurations.') def get_popular_tables(self, *, num_entries: int) -> List[PopularTable]: """ :param num_entries: Number of popular tables to fetch :return: A List of popular tables instances """ popular_tables = list() popular_query_params = {'typeName': 'table_metadata', 'sortBy': 'popularityScore', 'sortOrder': 'DESCENDING', 'excludeDeletedEntities': True, 'limit': num_entries, 'attributes': ['table']} table_entities = self._get_metadata_entities(popular_query_params) for table in table_entities: table_attrs = table.attributes table_qn = parse_table_qualified_name( qualified_name=table_attrs.get(self.QN_KEY) ) table_name = table_qn.get("table_name") or table_attrs.get('name') db_name = table_qn.get("db_name", '') db_cluster = table_qn.get("cluster_name", '') popular_table = PopularTable( database=table.typeName, cluster=db_cluster, schema=db_name, name=table_name, description=table_attrs.get('description') or table_attrs.get('comment')) popular_tables.append(popular_table) return popular_tables def get_latest_updated_ts(self) -> int: pass def get_tags(self) -> List: """ Fetch all the classification entity definitions from atlas as this will be used to generate the autocomplete on the table detail page :return: A list of TagDetail Objects """ tags = [] for metrics in self._driver.admin_metrics: tag_stats = metrics.tag for tag, count in tag_stats["tagEntities"].items(): tags.append( TagDetail( tag_name=tag, tag_count=count ) ) return tags def get_table_by_user_relation(self, *, user_email: str, relation_type: UserResourceRel) -> Dict[str, Any]: params = { 'typeName': self.READER_TYPE, 'offset': '0', 'limit': '1000', 'entityFilters': { 'condition': 'AND', 'criterion': [ { 'attributeName': self.QN_KEY, 'operator': 'contains', 'attributeValue': user_email }, { 'attributeName': self.BKMARKS_KEY, 'operator': 'eq', 'attributeValue': 'true' } ] }, 'attributes': ['count', self.QN_KEY, self.ENTITY_URI_KEY] } # Fetches the reader entities based on 
filters search_results = self._driver.search_basic.create(data=params) results = [] for record in search_results.entities: table_info = self._extract_info_from_uri(table_uri=record.attributes[self.ENTITY_URI_KEY]) res = self._parse_reader_qn(record.attributes[self.QN_KEY]) results.append(PopularTable( database=table_info['entity'], cluster=res['cluster'], schema=res['db'], name=res['table'])) return {'table': results} def get_frequently_used_tables(self, *, user_email: str) -> Dict[str, Any]: pass def add_table_relation_by_user(self, *, table_uri: str, user_email: str, relation_type: UserResourceRel) -> None: entity = self._get_reader_entity(table_uri=table_uri, user_id=user_email) entity.entity[self.ATTRS_KEY][self.BKMARKS_KEY] = True entity.update() def delete_table_relation_by_user(self, *, table_uri: str, user_email: str, relation_type: UserResourceRel) -> None: entity = self._get_reader_entity(table_uri=table_uri, user_id=user_email) entity.entity[self.ATTRS_KEY][self.BKMARKS_KEY] = False entity.update()
class AtlasProxy(BaseProxy): """ Atlas Proxy client for the amundsen metadata {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/ """ TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY'] DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE'] STATISTICS_FORMAT_SPEC = app.config['STATISTICS_FORMAT_SPEC'] BOOKMARK_TYPE = 'Bookmark' USER_TYPE = 'User' READER_TYPE = 'Reader' QN_KEY = 'qualifiedName' BOOKMARK_ACTIVE_KEY = 'active' GUID_KEY = 'guid' ATTRS_KEY = 'attributes' REL_ATTRS_KEY = 'relationshipAttributes' ENTITY_URI_KEY = 'entityUri' _CACHE = CacheManager(**parse_cache_config_options( { 'cache.regions': 'atlas_proxy', 'cache.atlas_proxy.type': 'memory', 'cache.atlas_proxy.expire': _ATLAS_PROXY_CACHE_EXPIRY_SEC })) def __init__(self, *, host: str, port: int, user: str = 'admin', password: str = '', encrypted: bool = False, validate_ssl: bool = False) -> None: """ Initiate the Apache Atlas client with the provided credentials """ protocol = 'https' if encrypted else 'http' self._driver = Atlas(host=host, port=port, username=user, password=password, protocol=protocol, validate_ssl=validate_ssl) def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]: """ FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy Search for the entities based on the params provided as argument. :param params: the dictionary of parameters to be used for the basic search :return: The flat list of GUIDs of entities founds based on the params. """ ids = list() search_results = self._driver.search_basic(**params) for result in search_results: for entity in result.entities: ids.append(entity.guid) return ids def _get_flat_values_from_dsl(self, dsl_param: dict) -> List: """ Makes a DSL query asking for specific attribute, extracts that attribute from result (which is a list of list, and converts that into a flat list. 
:param dsl_param: A DSL parameter, with SELECT clause :return: A Flat list of specified attributes in SELECT clause """ attributes: List = list() _search_collection = self._driver.search_dsl(**dsl_param) for collection in _search_collection: attributes = collection.flatten_attrs() return attributes def _extract_info_from_uri(self, *, table_uri: str) -> Dict: """ Extracts the table information from table_uri coming from frontend. :param table_uri: :return: Dictionary object, containing following information: entity: Type of entity example: rdbms_table, hive_table etc. cluster: Cluster information db: Database Name name: Table Name """ pattern = re.compile( r""" ^ (?P<entity>.*?) :\/\/ (?P<cluster>.*) \. (?P<db>.*?) \/ (?P<name>.*?) $ """, re.X) result = pattern.match(table_uri) return result.groupdict() if result else dict() def _parse_reader_qn(self, reader_qn: str) -> Dict: """ Parse reader qualifiedName and extract the info :param reader_qn: :return: Dictionary object containing following information: cluster: cluster information db: Database name name: Table name """ pattern = re.compile( r""" ^(?P<db>[^.]*) \. (?P<table>[^.]*) \. (?P<user_id>[^.]*)\.reader \@ (?P<cluster>.*) $ """, re.X) result = pattern.match(reader_qn) return result.groupdict() if result else dict() def _parse_bookmark_qn(self, bookmark_qn: str) -> Dict: """ Parse bookmark qualifiedName and extract the info :param bookmark_qn: Qualified Name of Bookmark entity :return: Dictionary object containing following information: cluster: cluster information db: Database name name: Table name """ pattern = re.compile( r""" ^(?P<db>[^.]*) \. (?P<table>[^.]*) \. (?P<entity_type>[^.]*) \. 
(?P<user_id>[^.]*)\.bookmark \@ (?P<cluster>.*) $ """, re.X) result = pattern.match(bookmark_qn) return result.groupdict() if result else dict() def _get_user_details(self, user_id: str) -> Dict: """ Helper function to help get the user details if the `USER_DETAIL_METHOD` is configured, else uses the user_id for both email and user_id properties. :param user_id: The Unique user id of a user entity :return: a dictionary of user details """ if app.config.get('USER_DETAIL_METHOD'): user_details = app.config.get('USER_DETAIL_METHOD')( user_id) # type: ignore else: user_details = {'email': user_id, 'user_id': user_id} return user_details def _get_table_entity(self, *, table_uri: str) -> EntityUniqueAttribute: """ Fetch information from table_uri and then find the appropriate entity The reason, we're not returning the entity_unique_attribute().entity directly is because the entity_unique_attribute() return entity Object that can be used for update purposes, while entity_unique_attribute().entity only returns the dictionary :param table_uri: :return: A tuple of Table entity and parsed information of table qualified name """ table_info = self._extract_info_from_uri(table_uri=table_uri) table_qn = make_table_qualified_name(table_info.get('name'), table_info.get('cluster'), table_info.get('db')) try: return self._driver.entity_unique_attribute(table_info['entity'], qualifiedName=table_qn) except Exception as ex: LOGGER.exception(f'Table not found. 
{str(ex)}') raise NotFoundException( 'Table URI( {table_uri} ) does not exist'.format( table_uri=table_uri)) def _get_user_entity(self, user_id: str) -> EntityUniqueAttribute: """ Fetches an user entity from an id :param user_id: :return: """ try: return self._driver.entity_unique_attribute("User", qualifiedName=user_id) except Exception as ex: raise NotFoundException( '(User {user_id}) does not exist'.format(user_id=user_id)) def _create_bookmark(self, entity: EntityUniqueAttribute, user_guid: str, bookmark_qn: str, table_uri: str) -> None: """ Creates a bookmark entity for a specific user and table uri. :param user_guid: User's guid :param bookmark_qn: Bookmark qualifiedName :return: """ bookmark_entity = { 'entity': { 'typeName': self.BOOKMARK_TYPE, 'attributes': { 'qualifiedName': bookmark_qn, self.BOOKMARK_ACTIVE_KEY: True, 'entityUri': table_uri, 'user': { 'guid': user_guid }, 'entity': { 'guid': entity.entity[self.GUID_KEY] } } } } self._driver.entity_post.create(data=bookmark_entity) def _get_bookmark_entity(self, entity_uri: str, user_id: str) -> EntityUniqueAttribute: """ Fetch a Bookmark entity from parsing table uri and user id. If Bookmark is not present, create one for the user. :param table_uri: :param user_id: Qualified Name of a user :return: """ table_info = self._extract_info_from_uri(table_uri=entity_uri) bookmark_qn = '{}.{}.{}.{}.bookmark@{}'.format( table_info.get('db'), table_info.get('name'), table_info.get('entity'), user_id, table_info.get('cluster')) try: bookmark_entity = self._driver.entity_unique_attribute( self.BOOKMARK_TYPE, qualifiedName=bookmark_qn) if not bookmark_entity.entity: table_entity = self._get_table_entity(table_uri=entity_uri) # Fetch user entity from user_id for relation user_entity = self._get_user_entity(user_id) # Create bookmark entity with the user relation. self._create_bookmark(table_entity, user_entity.entity[self.GUID_KEY], bookmark_qn, entity_uri) # Fetch bookmark entity after creating it. 
bookmark_entity = self._driver.entity_unique_attribute( self.BOOKMARK_TYPE, qualifiedName=bookmark_qn) return bookmark_entity except Exception as ex: LOGGER.exception(f'Bookmark not found. {str(ex)}') raise NotFoundException( 'Bookmark( {bookmark_qn} ) does not exist'.format( bookmark_qn=bookmark_qn)) def _get_column(self, *, table_uri: str, column_name: str) -> Dict: """ Fetch the column information from referredEntities of the table entity :param table_uri: :param column_name: :return: A dictionary containing the column details """ try: table_entity = self._get_table_entity(table_uri=table_uri) columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns') for column in columns or list(): col_details = table_entity.referredEntities[column[ self.GUID_KEY]] if column_name == col_details[self.ATTRS_KEY]['name']: return col_details raise NotFoundException(f'Column not found: {column_name}') except KeyError as ex: LOGGER.exception(f'Column not found: {str(ex)}') raise NotFoundException(f'Column not found: {column_name}') def _serialize_columns(self, *, entity: EntityUniqueAttribute) -> \ Union[List[Column], List]: """ Helper function to fetch the columns from entity and serialize them using Column and Stat model. :param entity: EntityUniqueAttribute object, along with relationshipAttributes :return: A list of Column objects, if there are any columns available, else an empty list. 
""" columns = list() for column in entity.entity[self.REL_ATTRS_KEY].get( 'columns') or list(): column_status = column.get('entityStatus', 'inactive').lower() if column_status != 'active': continue col_entity = entity.referredEntities[column[self.GUID_KEY]] col_attrs = col_entity[self.ATTRS_KEY] statistics = list() for stats in col_attrs.get('statistics') or list(): stats_attrs = stats['attributes'] stat_type = stats_attrs.get('stat_name') stat_format = self.STATISTICS_FORMAT_SPEC.get( stat_type, dict()) if not stat_format.get('drop', False): stat_type = stat_format.get('new_name', stat_type) stat_val = stats_attrs.get('stat_val') format_val = stat_format.get('format') if format_val: stat_val = format_val.format(stat_val) else: stat_val = str(stat_val) start_epoch = stats_attrs.get('start_epoch') end_epoch = stats_attrs.get('end_epoch') statistics.append( Stat( stat_type=stat_type, stat_val=stat_val, start_epoch=start_epoch, end_epoch=end_epoch, )) columns.append( Column( name=col_attrs.get('name'), description=col_attrs.get('description') or col_attrs.get('comment'), col_type=col_attrs.get('type') or col_attrs.get('dataType') or col_attrs.get('data_type'), sort_order=col_attrs.get('position') or 9999, stats=statistics, )) return sorted(columns, key=lambda item: item.sort_order) def _get_reports(self, guids: List[str]) -> List[ResourceReport]: reports = [] if guids: report_entities_collection = self._driver.entity_bulk(guid=guids) for report_entity in extract_entities(report_entities_collection): try: if report_entity.status == Status.ACTIVE: report_attrs = report_entity.attributes reports.append( ResourceReport(name=report_attrs['name'], url=report_attrs['url'])) except (KeyError, AttributeError) as ex: LOGGER.exception( 'Error while accessing table report: {}. 
{}'.format( str(report_entity), str(ex))) parsed_reports = app.config['RESOURCE_REPORT_CLIENT'](reports) \ if app.config['RESOURCE_REPORT_CLIENT'] else reports return parsed_reports def _get_owners(self, data_owners: list, fallback_owner: str = None) -> List[User]: owners_detail = list() active_owners_list = list() active_owners = filter( lambda item: item['entityStatus'] == Status.ACTIVE and item[ 'relationshipStatus'] == Status.ACTIVE, data_owners) for owner in active_owners: owner_qn = owner['displayText'] owner_data = self._get_user_details(owner_qn) owners_detail.append(User(**owner_data)) active_owners_list.append(owner_qn) # To avoid the duplication, # we are checking if the fallback is not in data_owners if fallback_owner and (fallback_owner not in active_owners_list): owners_detail.append( User(**self._get_user_details(fallback_owner))) return owners_detail def get_user(self, *, id: str) -> Union[UserEntity, None]: pass def get_users(self) -> List[UserEntity]: pass def get_table(self, *, table_uri: str) -> Table: """ Gathers all the information needed for the Table Detail Page. :param table_uri: :return: A Table object with all the information available or gathered from different entities. 
""" entity = self._get_table_entity(table_uri=table_uri) table_details = entity.entity try: attrs = table_details[self.ATTRS_KEY] programmatic_descriptions = self._get_programmatic_descriptions( attrs.get('parameters', dict())) table_qn = parse_table_qualified_name( qualified_name=attrs.get(self.QN_KEY)) tags = [] # Using or in case, if the key 'classifications' is there with a None for classification in table_details.get( 'classifications') or list(): tags.append( Tag(tag_name=classification.get('typeName'), tag_type="default")) columns = self._serialize_columns(entity=entity) reports_guids = [ report.get("guid") for report in attrs.get("reports") or list() ] table_type = attrs.get('tableType') or 'table' is_view = 'view' in table_type.lower() table = Table( database=table_details.get('typeName'), cluster=table_qn.get('cluster_name', ''), schema=table_qn.get('db_name', ''), name=attrs.get('name') or table_qn.get("table_name", ''), tags=tags, description=attrs.get('description') or attrs.get('comment'), owners=self._get_owners( table_details[self.REL_ATTRS_KEY].get('ownedBy', []), attrs.get('owner')), resource_reports=self._get_reports(guids=reports_guids), columns=columns, is_view=is_view, table_readers=self._get_readers(attrs.get(self.QN_KEY)), last_updated_timestamp=self._parse_date( table_details.get('updateTime')), programmatic_descriptions=programmatic_descriptions, watermarks=self._get_table_watermarks(table_details)) return table except KeyError as ex: LOGGER.exception( 'Error while accessing table information. 
{}'.format(str(ex))) raise BadRequest( 'Some of the required attributes ' 'are missing in : ( {table_uri} )'.format(table_uri=table_uri)) @staticmethod def _validate_date( text_date: str, date_format: str ) -> Tuple[Optional[datetime.datetime], Optional[str]]: try: return datetime.datetime.strptime(text_date, date_format), date_format except (ValueError, TypeError): return None, None @staticmethod def _select_watermark_format(partition_names: List[str]) -> Optional[str]: result = None for partition_name in partition_names: # Assume that all partitions for given table have the same date format. Only thing that needs to be done # is establishing which format out of the supported ones it is and then we validate every partition # against it. for df in app.config['WATERMARK_DATE_FORMATS']: _, result = AtlasProxy._validate_date(partition_name, df) if result: LOGGER.debug('Established date format', extra=dict(date_format=result)) return result return result @staticmethod def _render_partition_key_name( entity: EntityUniqueAttribute) -> Optional[str]: _partition_keys = [] for partition_key in entity.get('attributes', dict()).get('partitionKeys', []): partition_key_column_name = partition_key.get('displayName') if partition_key_column_name: _partition_keys.append(partition_key_column_name) partition_key = ' '.join(_partition_keys).strip() return partition_key def _get_table_watermarks( self, entity: EntityUniqueAttribute) -> List[Watermark]: partition_value_format = '%Y-%m-%d %H:%M:%S' _partitions = entity.get('relationshipAttributes', dict()).get('partitions', list()) names = [ _partition.get('displayText') for _partition in _partitions if _partition.get('entityStatus') == Status.ACTIVE and _partition.get('relationshipStatus') == Status.ACTIVE ] if not names: return [] partition_key = AtlasProxy._render_partition_key_name(entity) watermark_date_format = AtlasProxy._select_watermark_format(names) partitions = {} for _partition in _partitions: partition_name = 
_partition.get('displayText') if partition_name and watermark_date_format: partition_date, _ = AtlasProxy._validate_date( partition_name, watermark_date_format) if partition_date: common_values = { 'partition_value': datetime.datetime.strftime(partition_date, partition_value_format), 'create_time': 0, 'partition_key': partition_key } partitions[partition_date] = common_values if partitions: low_watermark_date = min(partitions.keys()) high_watermark_date = max(partitions.keys()) low_watermark = Watermark(watermark_type='low_watermark', **partitions.get(low_watermark_date)) high_watermark = Watermark(watermark_type='high_watermark', **partitions.get(high_watermark_date)) return [low_watermark, high_watermark] else: return [] def delete_owner(self, *, table_uri: str, owner: str) -> None: """ :param table_uri: :param owner: :return: """ table = self._get_table_entity(table_uri=table_uri) table_entity = table.entity if table_entity[self.REL_ATTRS_KEY].get("ownedBy"): try: active_owners = filter( lambda item: item['relationshipStatus'] == Status.ACTIVE and item['displayText'] == owner, table_entity[self.REL_ATTRS_KEY]['ownedBy']) if list(active_owners): self._driver.relationship_guid( next(active_owners).get('relationshipGuid')).delete() else: raise BadRequest('You can not delete this owner.') except NotFound as ex: LOGGER.exception( 'Error while removing table data owner. {}'.format( str(ex))) def add_owner(self, *, table_uri: str, owner: str) -> None: """ Query on Atlas User entity to find if the entity exist for the owner string in parameter, if not create one. And then use that User entity's GUID and add a relationship between Table and User, on ownedBy field. :param table_uri: :param owner: Email address of the owner :return: None, as it simply adds the owner. 
""" owner_info = self._get_user_details(owner) if not owner_info: raise NotFoundException(f'User "{owner}" does not exist.') user_dict = { "entity": { "typeName": "User", "attributes": { "qualifiedName": owner }, } } # Get or Create a User user_entity = self._driver.entity_post.create(data=user_dict) user_guid = next(iter(user_entity.get("guidAssignments").values())) table = self._get_table_entity(table_uri=table_uri) entity_def = { "typeName": "DataSet_Users_Owner", "end1": { "guid": table.entity.get("guid"), "typeName": "Table", }, "end2": { "guid": user_guid, "typeName": "User", }, } try: self._driver.relationship.create(data=entity_def) except Conflict as ex: LOGGER.exception( 'Error while adding the owner information. {}'.format(str(ex))) raise BadRequest( f'User {owner} is already added as a data owner for ' f'table {table_uri}.') def get_table_description(self, *, table_uri: str) -> Union[str, None]: """ :param table_uri: :return: The description of the table as a string """ entity = self._get_table_entity(table_uri=table_uri) return entity.entity[self.ATTRS_KEY].get('description') def put_table_description(self, *, table_uri: str, description: str) -> None: """ Update the description of the given table. 
:param table_uri: :param description: Description string :return: None """ entity = self._get_table_entity(table_uri=table_uri) entity.entity[self.ATTRS_KEY]['description'] = description entity.update() def add_tag(self, *, id: str, tag: str, tag_type: str, resource_type: ResourceType = ResourceType.Table) -> None: """ Assign the tag/classification to the give table API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST :param table_uri: :param tag: Tag/Classification Name :param tag_type :return: None """ entity = self._get_table_entity(table_uri=id) entity_bulk_tag = { "classification": { "typeName": tag }, "entityGuids": [entity.entity[self.GUID_KEY]] } self._driver.entity_bulk_classification.create(data=entity_bulk_tag) def add_badge(self, *, id: str, badge_name: str, category: str = '', resource_type: ResourceType) -> None: # Not implemented raise NotImplementedError def delete_tag(self, *, id: str, tag: str, tag_type: str, resource_type: ResourceType = ResourceType.Table) -> None: """ Delete the assigned classfication/tag from the given table API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE :param table_uri: :param tag: :return: """ try: entity = self._get_table_entity(table_uri=id) guid_entity = self._driver.entity_guid( entity.entity[self.GUID_KEY]) guid_entity.classifications(tag).delete() except Exception as ex: # FixMe (Verdan): Too broad exception. Please make it specific LOGGER.exception('For some reason this deletes the classification ' 'but also always return exception. 
{}'.format( str(ex))) def delete_badge(self, *, id: str, badge_name: str, category: str, resource_type: ResourceType) -> None: # Not implemented raise NotImplementedError def put_column_description(self, *, table_uri: str, column_name: str, description: str) -> None: """ :param table_uri: :param column_name: Name of the column to update the description :param description: The description string :return: None, as it simply updates the description of a column """ column_detail = self._get_column(table_uri=table_uri, column_name=column_name) col_guid = column_detail[self.GUID_KEY] entity = self._driver.entity_guid(col_guid) entity.entity[self.ATTRS_KEY]['description'] = description entity.update(attribute='description') def get_column_description(self, *, table_uri: str, column_name: str) -> Union[str, None]: """ :param table_uri: :param column_name: :return: The column description using the referredEntities information of a table entity """ column_detail = self._get_column(table_uri=table_uri, column_name=column_name) return column_detail[self.ATTRS_KEY].get('description') def _serialize_popular_tables(self, entities: list) -> List[PopularTable]: """ Gets a list of entities and serialize the popular tables. 
:param entities: List of entities from atlas client :return: a list of PopularTable objects """ popular_tables = list() for table in entities: table_attrs = table.attributes table_qn = parse_table_qualified_name( qualified_name=table_attrs.get(self.QN_KEY)) table_name = table_qn.get("table_name") or table_attrs.get('name') db_name = table_qn.get("db_name", '') db_cluster = table_qn.get("cluster_name", '') popular_table = PopularTable( database=table.typeName, cluster=db_cluster, schema=db_name, name=table_name, description=table_attrs.get('description') or table_attrs.get('comment')) popular_tables.append(popular_table) return popular_tables def get_popular_tables(self, *, num_entries: int) -> List[PopularTable]: """ Generates a list of Popular tables to be shown on the home page of Amundsen. :param num_entries: Number of popular tables to fetch :return: A List of popular tables instances """ popular_query_params = { 'typeName': 'Table', 'sortBy': 'popularityScore', 'sortOrder': 'DESCENDING', 'excludeDeletedEntities': True, 'limit': num_entries } search_results = self._driver.search_basic.create( data=popular_query_params) return self._serialize_popular_tables(search_results.entities) def get_latest_updated_ts(self) -> int: date = None for metrics in self._driver.admin_metrics: try: date = self._parse_date( metrics.general.get( 'stats', {}).get('Notification:lastMessageProcessedTime')) except AttributeError: pass date = date or 0 return date def get_tags(self) -> List: """ Fetch all the classification entity definitions from atlas as this will be used to generate the autocomplete on the table detail page :return: A list of TagDetail Objects """ tags = [] for metrics in self._driver.admin_metrics: tag_stats = metrics.tag for tag, count in tag_stats["tagEntities"].items(): tags.append(TagDetail(tag_name=tag, tag_count=count)) return tags def get_badges(self) -> List: # Not implemented return [] def _get_resources_followed_by_user(self, user_id: str, resource_type: 
str) \ -> List[Union[PopularTable, DashboardSummary]]: """ ToDo (Verdan): Dashboard still needs to be implemented. Helper function to get the resource, table, dashboard etc followed by a user. :param user_id: User ID of a user :param resource_type: Type of a resource that returns, could be table, dashboard etc. :return: A list of PopularTable, DashboardSummary or any other resource. """ params = { 'typeName': self.BOOKMARK_TYPE, 'offset': '0', 'limit': '1000', 'excludeDeletedEntities': True, 'entityFilters': { 'condition': 'AND', 'criterion': [{ 'attributeName': self.QN_KEY, 'operator': 'contains', 'attributeValue': f'.{user_id}.bookmark' }, { 'attributeName': self.BOOKMARK_ACTIVE_KEY, 'operator': 'eq', 'attributeValue': 'true' }] }, 'attributes': ['count', self.QN_KEY, self.ENTITY_URI_KEY] } # Fetches the bookmark entities based on filters search_results = self._driver.search_basic.create(data=params) resources = [] for record in search_results.entities: table_info = self._extract_info_from_uri( table_uri=record.attributes[self.ENTITY_URI_KEY]) res = self._parse_bookmark_qn(record.attributes[self.QN_KEY]) resources.append( PopularTable(database=table_info['entity'], cluster=res['cluster'], schema=res['db'], name=res['table'])) return resources def _get_resources_owned_by_user(self, user_id: str, resource_type: str) \ -> List[Union[PopularTable, DashboardSummary, Any]]: """ ToDo (Verdan): Dashboard still needs to be implemented. Helper function to get the resource, table, dashboard etc owned by a user. :param user_id: User ID of a user :param resource_type: Type of a resource that returns, could be table, dashboard etc. :return: A list of PopularTable, DashboardSummary or any other resource. 
""" resources = list() if resource_type == ResourceType.Table.name: type_regex = "(.*)_table$" # elif resource_type == ResourceType.Dashboard.name: # type_regex = "Dashboard" else: LOGGER.exception( f'Resource Type ({resource_type}) is not yet implemented') raise NotImplemented user_entity = self._driver.entity_unique_attribute( self.USER_TYPE, qualifiedName=user_id).entity if not user_entity: LOGGER.exception(f'User ({user_id}) not found in Atlas') raise NotFoundException(f'User {user_id} not found.') resource_guids = set() for item in user_entity[self.REL_ATTRS_KEY].get('owns') or list(): if (item['entityStatus'] == Status.ACTIVE and item['relationshipStatus'] == Status.ACTIVE and re.compile(type_regex).match(item['typeName'])): resource_guids.add(item[self.GUID_KEY]) params = { 'typeName': self.TABLE_ENTITY, 'excludeDeletedEntities': True, 'entityFilters': { 'condition': 'AND', 'criterion': [{ 'attributeName': 'owner', 'operator': 'startsWith', 'attributeValue': user_id.lower() }] }, 'attributes': [self.GUID_KEY] } table_entities = self._driver.search_basic.create(data=params) for table in table_entities.entities: resource_guids.add(table.guid) if resource_guids: entities = extract_entities( self._driver.entity_bulk(guid=list(resource_guids), ignoreRelationships=True)) if resource_type == ResourceType.Table.name: resources = self._serialize_popular_tables(entities) else: LOGGER.info(f'User ({user_id}) does not own any "{resource_type}"') return resources def get_dashboard_by_user_relation(self, *, user_email: str, relation_type: UserResourceRel) \ -> Dict[str, List[DashboardSummary]]: pass def get_table_by_user_relation( self, *, user_email: str, relation_type: UserResourceRel) -> Dict[str, Any]: tables = list() if relation_type == UserResourceRel.follow: tables = self._get_resources_followed_by_user( user_id=user_email, resource_type=ResourceType.Table.name) elif relation_type == UserResourceRel.own: tables = self._get_resources_owned_by_user( 
user_id=user_email, resource_type=ResourceType.Table.name) return {'table': tables} def get_frequently_used_tables( self, *, user_email: str) -> Dict[str, List[PopularTable]]: user = self._driver.entity_unique_attribute( self.USER_TYPE, qualifiedName=user_email).entity readers_guids = [] for user_reads in user['relationshipAttributes'].get('entityReads'): entity_status = user_reads['entityStatus'] relationship_status = user_reads['relationshipStatus'] if entity_status == Status.ACTIVE and relationship_status == Status.ACTIVE: readers_guids.append(user_reads['guid']) readers = extract_entities( self._driver.entity_bulk(guid=readers_guids, ignoreRelationships=True)) _results = {} for reader in readers: entity_uri = reader.attributes.get(self.ENTITY_URI_KEY) count = reader.attributes.get('count') if count: details = self._extract_info_from_uri(table_uri=entity_uri) _results[count] = dict(cluster=details.get('cluster'), name=details.get('name'), schema=details.get('db'), database=details.get('entity')) sorted_counts = sorted(_results.keys()) results = [] for count in sorted_counts: data: dict = _results.get(count, dict()) table = PopularTable(**data) results.append(table) return {'table': results} def add_resource_relation_by_user(self, *, id: str, user_id: str, relation_type: UserResourceRel, resource_type: ResourceType) -> None: if resource_type is not ResourceType.Table: raise NotImplemented( 'resource type {} is not supported'.format(resource_type)) entity = self._get_bookmark_entity(entity_uri=id, user_id=user_id) entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = True entity.update() def delete_resource_relation_by_user(self, *, id: str, user_id: str, relation_type: UserResourceRel, resource_type: ResourceType) -> None: if resource_type is not ResourceType.Table: raise NotImplemented( 'resource type {} is not supported'.format(resource_type)) entity = self._get_bookmark_entity(entity_uri=id, user_id=user_id) 
entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = False entity.update() def _parse_date(self, date: int) -> Optional[int]: try: date_str = str(date) date_trimmed = date_str[:10] assert len(date_trimmed) == 10 return int(date_trimmed) except Exception: return None def _get_readers(self, qualified_name: str, top: Optional[int] = 15) -> List[Reader]: params = { 'typeName': self.READER_TYPE, 'offset': '0', 'limit': top, 'excludeDeletedEntities': True, 'entityFilters': { 'condition': 'AND', 'criterion': [{ 'attributeName': self.QN_KEY, 'operator': 'STARTSWITH', 'attributeValue': qualified_name.split('@')[0] + '.' }, { 'attributeName': 'count', 'operator': 'gte', 'attributeValue': f'{app.config["POPULAR_TABLE_MINIMUM_READER_COUNT"]}' }] }, 'attributes': ['count', self.QN_KEY], 'sortBy': 'count', 'sortOrder': 'DESCENDING' } search_results = self._driver.search_basic.create( data=params, ignoreRelationships=False) readers = [] for record in search_results.entities: readers.append(record.guid) results = [] if readers: read_entities = extract_entities( self._driver.entity_bulk(guid=readers, ignoreRelationships=False)) for read_entity in read_entities: reader_qn = read_entity.relationshipAttributes['user'][ 'displayText'] reader_details = self._get_user_details(reader_qn) reader = Reader(user=User(**reader_details), read_count=read_entity.attributes['count']) results.append(reader) return results def _get_programmatic_descriptions( self, parameters: dict) -> List[ProgrammaticDescription]: programmatic_descriptions: Dict[str, ProgrammaticDescription] = {} for source, text in parameters.items(): use_parameter = True for regex_filter in app.config[ 'PROGRAMMATIC_DESCRIPTIONS_EXCLUDE_FILTERS']: pattern = re.compile(regex_filter) if pattern.match(source): use_parameter = False break if use_parameter: source = re.sub("([a-z])([A-Z])", "\g<1> \g<2>", source).lower() programmatic_descriptions[source] = ProgrammaticDescription( source=source, text=text) result = 
dict(sorted(programmatic_descriptions.items())) return list(result.values()) def get_dashboard( self, dashboard_uri: str, ) -> DashboardDetailEntity: pass def get_dashboard_description(self, *, id: str) -> Description: pass def put_dashboard_description(self, *, id: str, description: str) -> None: pass def get_resources_using_table( self, *, id: str, resource_type: ResourceType) -> Dict[str, List[DashboardSummary]]: return {}
class AtlasProxy(BaseProxy):
    """
    Atlas Proxy client for the amundsen metadata
    {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/
    """
    TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY']
    DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE']
    NAME_ATTRIBUTE = app.config['ATLAS_NAME_ATTRIBUTE']
    QN_KEY = 'qualifiedName'
    ATTRS_KEY = 'attributes'
    REL_ATTRS_KEY = 'relationshipAttributes'

    # Table Qualified Name Regex: <db_name>.<table_name>@<cluster_name>
    # (the ``pattern`` alias is kept so existing references keep working)
    TABLE_QN_REGEX = pattern = re.compile(r"""
        ^(?P<db_name>.*?)\.(?P<table_name>.*)@(?P<cluster_name>.*?)$
        """, re.X)

    # Table URI Regex: <entity>://<cluster>.<db>/<name>
    # Compiled once here instead of on every _extract_info_from_uri() call.
    _TABLE_URI_REGEX = re.compile(r"""
        ^   (?P<entity>.*?)
        :\/\/
            (?P<cluster>.*)
        \.
            (?P<db>.*?)
        \/
            (?P<name>.*?)
        $
        """, re.X)

    def __init__(self, *,
                 host: str,
                 port: int,
                 user: str = 'admin',
                 password: str = '') -> None:
        """
        Initiate the Apache Atlas client with the provided credentials
        """
        self._driver = Atlas(host=host, port=port, username=user, password=password)

    def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]:
        """
        FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy
        Search for the entities based on the params provided as argument.

        :param params: the dictionary of parameters to be used for the basic search
        :return: The flat list of GUIDs of entities founds based on the params.
        """
        search_results = self._driver.search_basic(**params)
        return [entity.guid
                for result in search_results
                for entity in result.entities]

    def _get_flat_values_from_dsl(self, dsl_param: dict) -> List:
        """
        Makes a DSL query asking for specific attribute, extracts that
        attribute from result (which is a list of list), and converts that
        into a flat list.

        :param dsl_param: A DSL parameter, with SELECT clause
        :return: A Flat list of specified attributes in SELECT clause
        """
        attributes: List = list()
        _search_collection = self._driver.search_dsl(**dsl_param)
        for collection in _search_collection:
            # Accumulate across collections; plain assignment kept only the
            # values of the last collection.
            attributes.extend(collection.flatten_attrs())
        return attributes

    def _extract_info_from_uri(self, *, table_uri: str) -> Dict:
        """
        Extracts the table information from table_uri coming from frontend.

        :param table_uri: URI in the form <entity>://<cluster>.<db>/<name>
        :return: Dictionary object, containing following information:
            entity: Type of entity example: rdbms_table, hive_table etc.
            cluster: Cluster information
            db: Database Name
            name: Unique Table Identifier
            An empty dictionary is returned when the URI does not match.
        """
        result = self._TABLE_URI_REGEX.match(table_uri)
        return result.groupdict() if result else dict()

    def _get_table_entity(self, *, table_uri: str) -> Tuple[EntityUniqueAttribute, Dict]:
        """
        Fetch information from table_uri and then find the appropriate entity
        The reason, we're not returning the entity_unique_attribute().entity
        directly is because the entity_unique_attribute() return entity Object
        that can be used for update purposes,
        while entity_unique_attribute().entity only returns the dictionary

        :param table_uri: URI of the table
        :return: A tuple of the entity object and the parsed URI information
        :raises NotFoundException: if the lookup fails for any reason,
            including a URI that could not be parsed
        """
        table_info = self._extract_info_from_uri(table_uri=table_uri)

        try:
            return self._driver.entity_unique_attribute(
                table_info['entity'],
                qualifiedName=table_info.get('name')), table_info
        except Exception as ex:
            LOGGER.exception(f'Table not found. {str(ex)}')
            raise NotFoundException('Table URI( {table_uri} ) does not exist'
                                    .format(table_uri=table_uri))

    def _get_column(self, *, table_uri: str, column_name: str) -> Dict:
        """
        Fetch the column information from referredEntities of the table entity

        :param table_uri: URI of the table
        :param column_name: Name of the column to look for
        :return: A dictionary containing the column details
        :raises NotFoundException: if no column with that name exists, or a
            required key is missing from the entity
        """
        try:
            table_entity, _ = self._get_table_entity(table_uri=table_uri)
            columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns')
            for column in columns or list():
                col_details = table_entity.referredEntities[column['guid']]
                if column_name == col_details[self.ATTRS_KEY][self.NAME_ATTRIBUTE]:
                    return col_details

            raise NotFoundException(f'Column not found: {column_name}')

        except KeyError as ex:
            LOGGER.exception(f'Column not found: {str(ex)}')
            raise NotFoundException(f'Column not found: {column_name}')

    def _serialize_columns(self, *, entity: EntityUniqueAttribute) -> \
            Union[List[Column], List]:
        """
        Helper function to fetch the columns from entity and serialize them
        using Column and Statistics model.

        :param entity: EntityUniqueAttribute object,
            along with relationshipAttributes
        :return: A list of Column objects, if there are any columns available,
            else an empty list.
        """
        columns = list()
        for column in entity.entity[self.REL_ATTRS_KEY].get('columns') or list():
            col_entity = entity.referredEntities[column['guid']]
            col_attrs = col_entity[self.ATTRS_KEY]

            statistics = [
                Statistics(
                    stat_type=stats['attributes'].get('stat_name'),
                    stat_val=stats['attributes'].get('stat_val'),
                    start_epoch=stats['attributes'].get('start_epoch'),
                    end_epoch=stats['attributes'].get('end_epoch'),
                )
                for stats in col_attrs.get('stats') or list()
            ]

            columns.append(
                Column(
                    name=col_attrs.get(self.NAME_ATTRIBUTE),
                    description=col_attrs.get('description'),
                    col_type=col_attrs.get('type') or col_attrs.get('dataType'),
                    sort_order=col_attrs.get('position'),
                    stats=statistics,
                )
            )
        return columns

    def get_user_detail(self, *, user_id: str) -> Union[UserEntity, None]:
        """
        Placeholder: the body is empty, so this currently returns None.
        """
        pass

    def get_table(self, *, table_uri: str) -> Table:
        """
        Gathers all the information needed for the Table Detail Page.

        :param table_uri: URI of the table
        :return: A Table object with all the information available
            or gathered from different entities.
        :raises BadRequest: if a required key is missing from the entity
        """
        entity, table_info = self._get_table_entity(table_uri=table_uri)
        table_details = entity.entity

        try:
            attrs = table_details[self.ATTRS_KEY]

            # Using or in case, if the key 'classifications' is there with a None
            tags = [
                Tag(tag_name=classification.get('typeName'),
                    tag_type="default")
                for classification in table_details.get("classifications") or list()
            ]

            columns = self._serialize_columns(entity=entity)

            table = Table(database=table_info['entity'],
                          cluster=table_info['cluster'],
                          schema=table_info['db'],
                          name=table_info['name'],
                          tags=tags,
                          description=attrs.get('description'),
                          owners=[User(email=attrs.get('owner'))],
                          columns=columns,
                          last_updated_timestamp=table_details.get('updateTime'))

            return table
        except KeyError as ex:
            LOGGER.exception('Error while accessing table information. {}'
                             .format(str(ex)))
            raise BadRequest('Some of the required attributes '
                             'are missing in : ( {table_uri} )'
                             .format(table_uri=table_uri))

    def delete_owner(self, *, table_uri: str, owner: str) -> None:
        """
        Placeholder: the body is empty, nothing is deleted.
        """
        pass

    def add_owner(self, *, table_uri: str, owner: str) -> None:
        """
        It simply replaces the owner field in atlas with the new string.
        FixMe (Verdan): Implement multiple data owners and
        atlas changes in the documentation if needed to make owner field a list

        :param table_uri: URI of the table
        :param owner: Email address of the owner
        :return: None, as it simply adds the owner.
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['owner'] = owner
        entity.update()

    def get_table_description(self, *,
                              table_uri: str) -> Union[str, None]:
        """
        :param table_uri: URI of the table
        :return: The description of the table as a string
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        return entity.entity[self.ATTRS_KEY].get('description')

    def put_table_description(self, *,
                              table_uri: str,
                              description: str) -> None:
        """
        Update the description of the given table.

        :param table_uri: URI of the table
        :param description: Description string
        :return: None
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update()

    def add_tag(self, *, table_uri: str, tag: str) -> None:
        """
        Assign the tag/classification to the give table
        API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST

        :param table_uri: URI of the table
        :param tag: Tag/Classification Name
        :return: None
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity_bulk_tag = {"classification": {"typeName": tag},
                           "entityGuids": [entity.entity['guid']]}
        self._driver.entity_bulk_classification.create(data=entity_bulk_tag)

    def delete_tag(self, *, table_uri: str, tag: str) -> None:
        """
        Delete the assigned classfication/tag from the given table
        API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE

        Any exception raised on the way is logged and swallowed on purpose
        (see the log message below).

        :param table_uri: URI of the table
        :param tag: Tag/Classification Name
        :return: None
        """
        try:
            entity, _ = self._get_table_entity(table_uri=table_uri)
            guid_entity = self._driver.entity_guid(entity.entity['guid'])
            guid_entity.classifications(tag).delete()
        except Exception as ex:
            # FixMe (Verdan): Too broad exception. Please make it specific
            LOGGER.exception('For some reason this deletes the classification '
                             'but also always return exception. {}'.format(str(ex)))

    def put_column_description(self, *,
                               table_uri: str,
                               column_name: str,
                               description: str) -> None:
        """
        :param table_uri: URI of the table
        :param column_name: Name of the column to update the description
        :param description: The description string
        :return: None, as it simply updates the description of a column
        """
        column_detail = self._get_column(
            table_uri=table_uri,
            column_name=column_name)
        col_guid = column_detail['guid']

        entity = self._driver.entity_guid(col_guid)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update(attribute='description')

    def get_column_description(self, *,
                               table_uri: str,
                               column_name: str) -> Union[str, None]:
        """
        :param table_uri: URI of the table
        :param column_name: Name of the column
        :return: The column description using the referredEntities
            information of a table entity
        """
        column_detail = self._get_column(
            table_uri=table_uri,
            column_name=column_name)
        return column_detail[self.ATTRS_KEY].get('description')

    def get_popular_tables(self, *, num_entries: int) -> List[PopularTable]:
        """
        :param num_entries: Number of popular tables to fetch
        :return: A List of popular tables instances
        :raises BadRequest: if the DSL search fails with a KeyError
        :raises NotFoundException: if the bulk fetch returns nothing
        """
        popular_tables = list()
        try:
            # Fetch the metadata entities based on popularity score
            query_metadata_ids = {'query': f'FROM Table SELECT metadata.__guid '
                                           f'ORDERBY popularityScore desc '
                                           f'LIMIT {num_entries}'}
            metadata_ids = self._get_flat_values_from_dsl(dsl_param=query_metadata_ids)
            metadata_collection = self._driver.entity_bulk(guid=metadata_ids)
        except KeyError as ex:
            LOGGER.exception(f'DSL Search query failed: {ex}')
            raise BadRequest('Unable to fetch popular tables. '
                             'Please check your configurations.')

        if not metadata_collection:
            raise NotFoundException('Unable to fetch popular tables. '
                                    'Please check your configurations.')

        for _collection in metadata_collection:
            metadata_entities = _collection.entities_with_relationships(attributes=["parentEntity"])

            for metadata in metadata_entities:
                table = metadata.relationshipAttributes.get("parentEntity")
                if not table:
                    # Metadata entity without a parent table: nothing to show.
                    continue
                table_attrs = table.get(self.ATTRS_KEY) or dict()

                # A missing qualifiedName is matched as '' so that the regex
                # simply fails instead of raising a TypeError on None.
                _regex_result = self.TABLE_QN_REGEX.match(table_attrs.get(self.QN_KEY) or '')
                table_qn = _regex_result.groupdict() if _regex_result else dict()

                # Hardcoded empty strings as default, because these values are not optional
                table_name = table_attrs.get(self.NAME_ATTRIBUTE) or table_qn.get("table_name", '')
                db_name = table_qn.get("db_name", '')
                db_cluster = table_qn.get("cluster_name", '')

                popular_table = PopularTable(database=table.get("typeName"),
                                             cluster=db_cluster,
                                             schema=db_name,
                                             name=table_name,
                                             description=table_attrs.get('description'))
                popular_tables.append(popular_table)

        return popular_tables

    def get_latest_updated_ts(self) -> int:
        """
        Placeholder: the body is empty, so this currently returns None.
        """
        pass

    def get_tags(self) -> List:
        """
        Fetch all the classification entity definitions from atlas as this
        will be used to generate the autocomplete on the table detail page

        :return: A list of TagDetail Objects
        """
        return [
            TagDetail(
                tag_name=classification.name,
                tag_count=0  # FixMe (Verdan): Implement the tag count
            )
            for type_def in self._driver.typedefs
            for classification in type_def.classificationDefs
        ]

    def get_table_by_user_relation(self, *, user_email: str,
                                   relation_type: UserResourceRel) -> Dict[str, Any]:
        """
        Placeholder: the body is empty, so this currently returns None.
        """
        pass

    def get_frequently_used_tables(self, *, user_email: str) -> Dict[str, Any]:
        """
        Placeholder: the body is empty, so this currently returns None.
        """
        pass

    def add_table_relation_by_user(self, *,
                                   table_uri: str,
                                   user_email: str,
                                   relation_type: UserResourceRel) -> None:
        """
        Placeholder: the body is empty, nothing is stored.
        """
        pass

    def delete_table_relation_by_user(self, *,
                                      table_uri: str,
                                      user_email: str,
                                      relation_type: UserResourceRel) -> None:
        """
        Placeholder: the body is empty, nothing is deleted.
        """
        pass
class AtlasProxy(BaseProxy):
    """
    Atlas Proxy client for the amundsen metadata
    {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/
    """
    TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY']
    DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE']
    NAME_ATTRIBUTE = app.config['ATLAS_NAME_ATTRIBUTE']
    ATTRS_KEY = 'attributes'
    REL_ATTRS_KEY = 'relationshipAttributes'

    # Table URI Regex: <entity>://<cluster>.<db>/<name>
    # Compiled once here instead of on every _extract_info_from_uri() call.
    _TABLE_URI_REGEX = re.compile(r"""
        ^   (?P<entity>.*?)
        :\/\/
            (?P<cluster>.*)
        \.
            (?P<db>.*?)
        \/
            (?P<name>.*?)
        $
        """, re.X)

    def __init__(self, *,
                 host: str,
                 port: int,
                 user: str = 'admin',
                 password: str = '') -> None:
        """
        Initiate the Apache Atlas client with the provided credentials
        """
        self._driver = Atlas(host=host, port=port, username=user, password=password)

    def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]:
        """
        FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy
        Search for the entities based on the params provided as argument.

        :param params: the dictionary of parameters to be used for the basic search
        :return: The flat list of GUIDs of entities founds based on the params.
        """
        search_results = self._driver.search_basic(**params)
        return [entity.guid
                for result in search_results
                for entity in result.entities]

    def _get_rel_attributes_dict(self, *,
                                 entities: List[Entity],
                                 attribute: str) -> Dict:
        """
        Atlas doesn't provide relational in referredEntities when making
        searching on the superTypes entities.
        This function will make a dictionary same as the referredEntities.

        :param entities: The list of entities from which relational attributes
            needed to be fetched
        :param attribute: The name of the relational attribute
        :return: A dictionary of entities details, with GUIDs as keys of each entity
        """
        entities_dict = dict()  # type: Dict

        rel_attribute_ids = list()
        for entity in entities:
            # The attribute may be present with a None value, hence the `or`.
            rel_id = (entity.attributes.get(attribute) or {}).get('guid')
            if rel_id:
                rel_attribute_ids.append(rel_id)

        if not rel_attribute_ids:
            # Nothing to resolve, so skip the bulk request altogether.
            return entities_dict

        _rel_attr_collection = self._driver.entity_bulk(guid=rel_attribute_ids)
        for rel_entities in _rel_attr_collection:
            # Merge every collection; rebinding the dict here kept only the
            # entities of the last collection.
            entities_dict.update((rel_entity.guid, rel_entity)
                                 for rel_entity in rel_entities.entities)

        return entities_dict

    def _extract_info_from_uri(self, *, table_uri: str) -> Dict:
        """
        Extracts the table information from table_uri coming from frontend.

        :param table_uri: URI in the form <entity>://<cluster>.<db>/<name>
        :return: Dictionary object, containing following information:
            entity: Type of entity example: rdbms_table, hive_table etc.
            cluster: Cluster information
            db: Database Name
            name: Unique Table Identifier
            An empty dictionary is returned when the URI does not match.
        """
        result = self._TABLE_URI_REGEX.match(table_uri)
        return result.groupdict() if result else dict()

    def _get_table_entity(self, *, table_uri: str) -> Tuple[EntityUniqueAttribute, Dict]:
        """
        Fetch information from table_uri and then find the appropriate entity
        The reason, we're not returning the entity_unique_attribute().entity
        directly is because the entity_unique_attribute() return entity Object
        that can be used for update purposes,
        while entity_unique_attribute().entity only returns the dictionary

        :param table_uri: URI of the table
        :return: A tuple of the entity object and the parsed URI information
        :raises NotFoundException: if the lookup fails for any reason,
            including a URI that could not be parsed
        """
        table_info = self._extract_info_from_uri(table_uri=table_uri)

        try:
            return self._driver.entity_unique_attribute(
                table_info['entity'],
                qualifiedName=table_info.get('name')), table_info
        except Exception as ex:
            LOGGER.exception(f'Table not found. {str(ex)}')
            raise NotFoundException('Table URI( {table_uri} ) does not exist'
                                    .format(table_uri=table_uri))

    def _get_column(self, *, table_uri: str, column_name: str) -> Dict:
        """
        Fetch the column information from referredEntities of the table entity

        :param table_uri: URI of the table
        :param column_name: Name of the column to look for
        :return: A dictionary containing the column details
        :raises NotFoundException: if no column with that name exists, or a
            required key is missing from the entity
        """
        try:
            table_entity, _ = self._get_table_entity(table_uri=table_uri)
            columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns')
            for column in columns or list():
                col_details = table_entity.referredEntities[column['guid']]
                if column_name == col_details[self.ATTRS_KEY][self.NAME_ATTRIBUTE]:
                    return col_details

            raise NotFoundException(f'Column not found: {column_name}')

        except KeyError as ex:
            LOGGER.exception(f'Column not found: {str(ex)}')
            raise NotFoundException(f'Column not found: {column_name}')

    def get_user_detail(self, *, user_id: str) -> Union[UserEntity, None]:
        """
        Placeholder: the body is empty, so this currently returns None.
        """
        pass

    def get_table(self, *, table_uri: str) -> Table:
        """
        Gathers all the information needed for the Table Detail Page.

        :param table_uri: URI of the table
        :return: A Table object with all the information available
            or gathered from different entities.
        :raises BadRequest: if a required key is missing from the entity
        """
        entity, table_info = self._get_table_entity(table_uri=table_uri)
        table_details = entity.entity

        try:
            attrs = table_details[self.ATTRS_KEY]
            rel_attrs = table_details[self.REL_ATTRS_KEY]

            # Using or in case, if the key 'classifications' is there with a None
            tags = [
                Tag(tag_name=classification.get('typeName'),
                    tag_type="default")
                for classification in table_details.get("classifications") or list()
            ]

            columns = []
            for column in rel_attrs.get('columns') or list():
                col_entity = entity.referredEntities[column['guid']]
                col_attrs = col_entity[self.ATTRS_KEY]
                columns.append(
                    Column(
                        name=col_attrs.get(self.NAME_ATTRIBUTE),
                        description=col_attrs.get('description'),
                        col_type=col_attrs.get('type') or col_attrs.get('dataType'),
                        sort_order=col_attrs.get('position'),
                    )
                )

            table = Table(database=table_info['entity'],
                          cluster=table_info['cluster'],
                          schema=table_info['db'],
                          name=table_info['name'],
                          tags=tags,
                          description=attrs.get('description'),
                          owners=[User(email=attrs.get('owner'))],
                          columns=columns,
                          last_updated_timestamp=table_details.get('updateTime'))

            return table
        except KeyError as ex:
            LOGGER.exception('Error while accessing table information. {}'
                             .format(str(ex)))
            raise BadRequest('Some of the required attributes '
                             'are missing in : ( {table_uri} )'
                             .format(table_uri=table_uri))

    def delete_owner(self, *, table_uri: str, owner: str) -> None:
        """
        Placeholder: the body is empty, nothing is deleted.
        """
        pass

    def add_owner(self, *, table_uri: str, owner: str) -> None:
        """
        It simply replaces the owner field in atlas with the new string.
        FixMe (Verdan): Implement multiple data owners and
        atlas changes in the documentation if needed to make owner field a list

        :param table_uri: URI of the table
        :param owner: Email address of the owner
        :return: None, as it simply adds the owner.
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['owner'] = owner
        entity.update()

    def get_table_description(self, *,
                              table_uri: str) -> Union[str, None]:
        """
        :param table_uri: URI of the table
        :return: The description of the table as a string
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        return entity.entity[self.ATTRS_KEY].get('description')

    def put_table_description(self, *,
                              table_uri: str,
                              description: str) -> None:
        """
        Update the description of the given table.

        :param table_uri: URI of the table
        :param description: Description string
        :return: None
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update()

    def add_tag(self, *, table_uri: str, tag: str) -> None:
        """
        Assign the tag/classification to the give table
        API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST

        :param table_uri: URI of the table
        :param tag: Tag/Classification Name
        :return: None
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity_bulk_tag = {"classification": {"typeName": tag},
                           "entityGuids": [entity.entity['guid']]}
        self._driver.entity_bulk_classification.create(data=entity_bulk_tag)

    def delete_tag(self, *, table_uri: str, tag: str) -> None:
        """
        Delete the assigned classfication/tag from the given table
        API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE

        Any exception raised on the way is logged and swallowed on purpose
        (see the log message below).

        :param table_uri: URI of the table
        :param tag: Tag/Classification Name
        :return: None
        """
        try:
            entity, _ = self._get_table_entity(table_uri=table_uri)
            guid_entity = self._driver.entity_guid(entity.entity['guid'])
            guid_entity.classifications(tag).delete()
        except Exception as ex:
            # FixMe (Verdan): Too broad exception. Please make it specific
            LOGGER.exception('For some reason this deletes the classification '
                             'but also always return exception. {}'.format(str(ex)))

    def put_column_description(self, *,
                               table_uri: str,
                               column_name: str,
                               description: str) -> None:
        """
        :param table_uri: URI of the table
        :param column_name: Name of the column to update the description
        :param description: The description string
        :return: None, as it simply updates the description of a column
        """
        column_detail = self._get_column(
            table_uri=table_uri,
            column_name=column_name)
        col_guid = column_detail['guid']

        entity = self._driver.entity_guid(col_guid)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update(attribute='description')

    def get_column_description(self, *,
                               table_uri: str,
                               column_name: str) -> Union[str, None]:
        """
        :param table_uri: URI of the table
        :param column_name: Name of the column
        :return: The column description using the referredEntities
            information of a table entity
        """
        column_detail = self._get_column(
            table_uri=table_uri,
            column_name=column_name)
        return column_detail[self.ATTRS_KEY].get('description')

    def get_popular_tables(self, *, num_entries: int = 10) -> List[PopularTable]:
        """
        FixMe: For now it simply returns ALL the tables available,
        Need to generate the formula for popular tables only.

        :param num_entries: Currently unused, see the FixMe above
        :return: A list of PopularTable instances, one per table entity found
        :raises BadRequest: if the basic search raises BadRequest
        """
        popular_tables = list()
        params = {'typeName': self.TABLE_ENTITY,
                  'excludeDeletedEntities': True,
                  self.ATTRS_KEY: [self.DB_ATTRIBUTE]}
        try:
            # Fetch all the Popular Tables
            _table_collection = self._driver.search_basic.create(data=params)
            # Inflate the table entities
            table_entities = _table_collection.entities
        except BadRequest as ex:
            LOGGER.exception(f'Please make sure you have assigned the appropriate '
                             f'self.TABLE_ENTITY entity to your atlas tables. {ex}')
            raise BadRequest('Unable to fetch popular tables. '
                             'Please check your configurations.')

        # Make a dictionary of Database Entities to avoid multiple DB calls
        dbs_dict = self._get_rel_attributes_dict(entities=table_entities,
                                                 attribute=self.DB_ATTRIBUTE)

        # Make instances of PopularTable
        for entity in table_entities:
            attrs = entity.attributes

            # DB would be available in attributes
            # because it is in the request parameter.
            # It may still carry a None value, hence the `or`.
            db_id = (attrs.get(self.DB_ATTRIBUTE) or {}).get('guid')
            db_entity = dbs_dict.get(db_id)

            if db_entity:
                db_attrs = db_entity.attributes
                db_name = db_attrs.get(self.NAME_ATTRIBUTE)
                db_cluster = db_attrs.get('clusterName')
            else:
                db_name = ''
                db_cluster = ''

            popular_table = PopularTable(database=entity.typeName,
                                         cluster=db_cluster,
                                         schema=db_name,
                                         name=attrs.get(self.NAME_ATTRIBUTE),
                                         description=attrs.get('description'))
            popular_tables.append(popular_table)
        return popular_tables

    def get_latest_updated_ts(self) -> int:
        """
        Placeholder: the body is empty, so this currently returns None.
        """
        pass

    def get_tags(self) -> List:
        """
        Fetch all the classification entity definitions from atlas as this
        will be used to generate the autocomplete on the table detail page

        :return: A list of TagDetail Objects
        """
        return [
            TagDetail(
                tag_name=classification.name,
                tag_count=0  # FixMe (Verdan): Implement the tag count
            )
            for type_def in self._driver.typedefs
            for classification in type_def.classificationDefs
        ]

    def get_table_by_user_relation(self, *, user_email: str,
                                   relation_type: UserResourceRel) -> Dict[str, Any]:
        """
        Placeholder: the body is empty, so this currently returns None.
        """
        pass

    def add_table_relation_by_user(self, *,
                                   table_uri: str,
                                   user_email: str,
                                   relation_type: UserResourceRel) -> None:
        """
        Placeholder: the body is empty, nothing is stored.
        """
        pass

    def delete_table_relation_by_user(self, *,
                                      table_uri: str,
                                      user_email: str,
                                      relation_type: UserResourceRel) -> None:
        """
        Placeholder: the body is empty, nothing is deleted.
        """
        pass