class AtlasProxy(BaseProxy):
    """
    Atlas Proxy client for the amundsen metadata
    {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/
    """
    TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY']
    DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE']
    NAME_ATTRIBUTE = app.config['ATLAS_NAME_ATTRIBUTE']
    QN_KEY = 'qualifiedName'
    ATTRS_KEY = 'attributes'
    REL_ATTRS_KEY = 'relationshipAttributes'

    # Table Qualified Name Regex: parses "<db>.<table>@<cluster>".
    # FIX: the original read "TABLE_QN_REGEX = pattern = re.compile(...)",
    # which also created a stray class attribute ``pattern``; the unintended
    # alias has been removed.
    TABLE_QN_REGEX = re.compile(r"""
    ^(?P<db_name>.*?)\.(?P<table_name>.*)@(?P<cluster_name>.*?)$
    """, re.X)

    def __init__(self, *, host: str, port: int, user: str = 'admin',
                 password: str = '') -> None:
        """
        Initiate the Apache Atlas client with the provided credentials
        """
        self._driver = Atlas(host=host, port=port,
                             username=user, password=password)

    def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]:
        """
        FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy
        Search for the entities based on the params provided as argument.
        :param params: the dictionary of parameters to be used for the basic
        search
        :return: The flat list of GUIDs of entities founds based on the params.
        """
        ids = list()
        search_results = self._driver.search_basic(**params)
        for result in search_results:
            for entity in result.entities:
                ids.append(entity.guid)
        return ids

    def _get_flat_values_from_dsl(self, dsl_param: dict) -> List:
        """
        Makes a DSL query asking for specific attribute, extracts that
        attribute from result (which is a list of list, and converts that into
        a flat list.
        :param dsl_param: A DSL parameter, with SELECT clause
        :return: A Flat list of specified attributes in SELECT clause
        """
        attributes: List = list()
        _search_collection = self._driver.search_dsl(**dsl_param)
        # NOTE(review): ``attributes`` is reassigned on every iteration, so if
        # search_dsl yields more than one collection (pagination) only the
        # last page survives — confirm atlasclient's pagination behavior.
        for collection in _search_collection:
            attributes = collection.flatten_attrs()
        return attributes

    def _extract_info_from_uri(self, *, table_uri: str) -> Dict:
        """
        Extracts the table information from table_uri coming from frontend.
        :param table_uri:
        :return: Dictionary object, containing following information:
        entity: Type of entity example: rdbms_table, hive_table etc.
        cluster: Cluster information
        db: Database Name
        name: Unique Table Identifier
        """
        pattern = re.compile(r"""
            ^   (?P<entity>.*?)
            :\/\/
                (?P<cluster>.*)
            \.
                (?P<db>.*?)
            \/
                (?P<name>.*?)
            $
        """, re.X)
        result = pattern.match(table_uri)
        return result.groupdict() if result else dict()

    def _get_table_entity(self, *, table_uri: str) -> Tuple[EntityUniqueAttribute, Dict]:
        """
        Fetch information from table_uri and then find the appropriate entity
        The reason, we're not returning the entity_unique_attribute().entity
        directly is because the entity_unique_attribute() return entity Object
        that can be used for update purposes,
        while entity_unique_attribute().entity only returns the dictionary
        :param table_uri:
        :return: A tuple of the updatable entity object and the parsed
        table_uri info dictionary
        """
        table_info = self._extract_info_from_uri(table_uri=table_uri)
        try:
            return self._driver.entity_unique_attribute(
                table_info['entity'],
                qualifiedName=table_info.get('name')), table_info
        except Exception as ex:
            LOGGER.exception(f'Table not found. {str(ex)}')
            raise NotFoundException('Table URI( {table_uri} ) does not exist'
                                    .format(table_uri=table_uri))

    def _get_column(self, *, table_uri: str, column_name: str) -> Dict:
        """
        Fetch the column information from referredEntities of the table entity
        :param table_uri:
        :param column_name:
        :return: A dictionary containing the column details
        """
        try:
            table_entity, _ = self._get_table_entity(table_uri=table_uri)
            columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns')
            for column in columns or list():
                col_details = table_entity.referredEntities[column['guid']]
                if column_name == col_details[self.ATTRS_KEY][self.NAME_ATTRIBUTE]:
                    return col_details
            raise NotFoundException(f'Column not found: {column_name}')
        except KeyError as ex:
            LOGGER.exception(f'Column not found: {str(ex)}')
            raise NotFoundException(f'Column not found: {column_name}')

    def _serialize_columns(self, *, entity: EntityUniqueAttribute) -> \
            Union[List[Column], List]:
        """
        Helper function to fetch the columns from entity and serialize them
        using Column and Statistics model.
        :param entity: EntityUniqueAttribute object,
        along with relationshipAttributes
        :return: A list of Column objects, if there are any columns available,
        else an empty list.
        """
        columns = list()
        for column in entity.entity[self.REL_ATTRS_KEY].get('columns') or list():
            col_entity = entity.referredEntities[column['guid']]
            col_attrs = col_entity[self.ATTRS_KEY]
            statistics = list()
            for stats in col_attrs.get('stats') or list():
                stats_attrs = stats['attributes']
                statistics.append(
                    Statistics(
                        stat_type=stats_attrs.get('stat_name'),
                        stat_val=stats_attrs.get('stat_val'),
                        start_epoch=stats_attrs.get('start_epoch'),
                        end_epoch=stats_attrs.get('end_epoch'),
                    )
                )
            columns.append(
                Column(
                    name=col_attrs.get(self.NAME_ATTRIBUTE),
                    description=col_attrs.get('description'),
                    col_type=col_attrs.get('type') or col_attrs.get('dataType'),
                    sort_order=col_attrs.get('position'),
                    stats=statistics,
                )
            )
        return columns

    def get_user_detail(self, *, user_id: str) -> Union[UserEntity, None]:
        # Not implemented for the Atlas proxy yet.
        pass

    def get_table(self, *, table_uri: str) -> Table:
        """
        Gathers all the information needed for the Table Detail Page.
        :param table_uri:
        :return: A Table object with all the information available
        or gathered from different entities.
        """
        entity, table_info = self._get_table_entity(table_uri=table_uri)
        table_details = entity.entity

        try:
            attrs = table_details[self.ATTRS_KEY]

            tags = []
            # Using or in case, if the key 'classifications' is there with a None
            for classification in table_details.get("classifications") or list():
                tags.append(
                    Tag(
                        tag_name=classification.get('typeName'),
                        tag_type="default"
                    )
                )

            columns = self._serialize_columns(entity=entity)

            table = Table(database=table_info['entity'],
                          cluster=table_info['cluster'],
                          schema=table_info['db'],
                          name=table_info['name'],
                          tags=tags,
                          description=attrs.get('description'),
                          owners=[User(email=attrs.get('owner'))],
                          columns=columns,
                          last_updated_timestamp=table_details.get('updateTime'))

            return table
        except KeyError as ex:
            LOGGER.exception('Error while accessing table information. {}'
                             .format(str(ex)))
            raise BadRequest('Some of the required attributes '
                             'are missing in : ( {table_uri} )'
                             .format(table_uri=table_uri))

    def delete_owner(self, *, table_uri: str, owner: str) -> None:
        # Not implemented for the Atlas proxy yet.
        pass

    def add_owner(self, *, table_uri: str, owner: str) -> None:
        """
        It simply replaces the owner field in atlas with the new string.
        FixMe (Verdan): Implement multiple data owners and
        atlas changes in the documentation if needed to make owner field a list
        :param table_uri:
        :param owner: Email address of the owner
        :return: None, as it simply adds the owner.
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['owner'] = owner
        entity.update()

    def get_table_description(self, *, table_uri: str) -> Union[str, None]:
        """
        :param table_uri:
        :return: The description of the table as a string
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        return entity.entity[self.ATTRS_KEY].get('description')

    def put_table_description(self, *, table_uri: str,
                              description: str) -> None:
        """
        Update the description of the given table.
        :param table_uri:
        :param description: Description string
        :return: None
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update()

    def add_tag(self, *, table_uri: str, tag: str) -> None:
        """
        Assign the tag/classification to the give table
        API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST
        :param table_uri:
        :param tag: Tag/Classification Name
        :return: None
        """
        entity, _ = self._get_table_entity(table_uri=table_uri)
        entity_bulk_tag = {"classification": {"typeName": tag},
                           "entityGuids": [entity.entity['guid']]}
        self._driver.entity_bulk_classification.create(data=entity_bulk_tag)

    def delete_tag(self, *, table_uri: str, tag: str) -> None:
        """
        Delete the assigned classfication/tag from the given table
        API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE
        :param table_uri:
        :param tag:
        :return:
        """
        try:
            entity, _ = self._get_table_entity(table_uri=table_uri)
            guid_entity = self._driver.entity_guid(entity.entity['guid'])
            guid_entity.classifications(tag).delete()
        except Exception as ex:
            # FixMe (Verdan): Too broad exception. Please make it specific
            LOGGER.exception('For some reason this deletes the classification '
                             'but also always return exception. {}'.format(str(ex)))

    def put_column_description(self, *, table_uri: str, column_name: str,
                               description: str) -> None:
        """
        :param table_uri:
        :param column_name: Name of the column to update the description
        :param description: The description string
        :return: None, as it simply updates the description of a column
        """
        column_detail = self._get_column(
            table_uri=table_uri,
            column_name=column_name)
        col_guid = column_detail['guid']

        entity = self._driver.entity_guid(col_guid)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update(attribute='description')

    def get_column_description(self, *, table_uri: str,
                               column_name: str) -> Union[str, None]:
        """
        :param table_uri:
        :param column_name:
        :return: The column description using the referredEntities
        information of a table entity
        """
        column_detail = self._get_column(
            table_uri=table_uri,
            column_name=column_name)
        return column_detail[self.ATTRS_KEY].get('description')

    def get_popular_tables(self, *, num_entries: int) -> List[PopularTable]:
        """
        :param num_entries: Number of popular tables to fetch
        :return: A List of popular tables instances
        """
        popular_tables = list()
        try:
            # Fetch the metadata entities based on popularity score
            query_metadata_ids = {'query': f'FROM Table SELECT metadata.__guid '
                                           f'ORDERBY popularityScore desc '
                                           f'LIMIT {num_entries}'}
            metadata_ids = self._get_flat_values_from_dsl(
                dsl_param=query_metadata_ids)
            metadata_collection = self._driver.entity_bulk(guid=metadata_ids)
        except KeyError as ex:
            LOGGER.exception(f'DSL Search query failed: {ex}')
            raise BadRequest('Unable to fetch popular tables. '
                             'Please check your configurations.')

        if not metadata_collection:
            raise NotFoundException('Unable to fetch popular tables. '
                                    'Please check your configurations.')

        for _collection in metadata_collection:
            metadata_entities = _collection.entities_with_relationships(
                attributes=["parentEntity"])
            for metadata in metadata_entities:
                table = metadata.relationshipAttributes.get("parentEntity")
                table_attrs = table.get(self.ATTRS_KEY)

                _regex_result = self.TABLE_QN_REGEX.match(
                    table_attrs.get(self.QN_KEY))
                table_qn = _regex_result.groupdict() if _regex_result else dict()

                # Hardcoded empty strings as default, because these values are not optional
                table_name = table_attrs.get(self.NAME_ATTRIBUTE) or \
                    table_qn.get("table_name", '')
                db_name = table_qn.get("db_name", '')
                db_cluster = table_qn.get("cluster_name", '')

                popular_table = PopularTable(database=table.get("typeName"),
                                             cluster=db_cluster,
                                             schema=db_name,
                                             name=table_name,
                                             description=table_attrs.get('description'))
                popular_tables.append(popular_table)
        return popular_tables

    def get_latest_updated_ts(self) -> int:
        # Not implemented for the Atlas proxy yet.
        pass

    def get_tags(self) -> List:
        """
        Fetch all the classification entity definitions from atlas  as this
        will be used to generate the autocomplete on the table detail page
        :return: A list of TagDetail Objects
        """
        tags = []
        for type_def in self._driver.typedefs:
            for classification in type_def.classificationDefs:
                tags.append(
                    TagDetail(
                        tag_name=classification.name,
                        tag_count=0     # FixMe (Verdan): Implement the tag count
                    )
                )
        return tags

    def get_table_by_user_relation(self, *, user_email: str,
                                   relation_type: UserResourceRel) -> Dict[str, Any]:
        # Not implemented for the Atlas proxy yet.
        pass

    def get_frequently_used_tables(self, *, user_email: str) -> Dict[str, Any]:
        # Not implemented for the Atlas proxy yet.
        pass

    def add_table_relation_by_user(self, *, table_uri: str,
                                   user_email: str,
                                   relation_type: UserResourceRel) -> None:
        # Not implemented for the Atlas proxy yet.
        pass

    def delete_table_relation_by_user(self, *, table_uri: str,
                                      user_email: str,
                                      relation_type: UserResourceRel) -> None:
        # Not implemented for the Atlas proxy yet.
        pass
class AtlasProxy(BaseProxy): """ Atlas Proxy client for the amundsen metadata {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/ """ TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY'] DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE'] STATISTICS_FORMAT_SPEC = app.config['STATISTICS_FORMAT_SPEC'] BOOKMARK_TYPE = 'Bookmark' USER_TYPE = 'User' READER_TYPE = 'Reader' QN_KEY = 'qualifiedName' BOOKMARK_ACTIVE_KEY = 'active' GUID_KEY = 'guid' ATTRS_KEY = 'attributes' REL_ATTRS_KEY = 'relationshipAttributes' ENTITY_URI_KEY = 'entityUri' _CACHE = CacheManager(**parse_cache_config_options( { 'cache.regions': 'atlas_proxy', 'cache.atlas_proxy.type': 'memory', 'cache.atlas_proxy.expire': _ATLAS_PROXY_CACHE_EXPIRY_SEC })) def __init__(self, *, host: str, port: int, user: str = 'admin', password: str = '', encrypted: bool = False, validate_ssl: bool = False) -> None: """ Initiate the Apache Atlas client with the provided credentials """ protocol = 'https' if encrypted else 'http' self._driver = Atlas(host=host, port=port, username=user, password=password, protocol=protocol, validate_ssl=validate_ssl) def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]: """ FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy Search for the entities based on the params provided as argument. :param params: the dictionary of parameters to be used for the basic search :return: The flat list of GUIDs of entities founds based on the params. """ ids = list() search_results = self._driver.search_basic(**params) for result in search_results: for entity in result.entities: ids.append(entity.guid) return ids def _get_flat_values_from_dsl(self, dsl_param: dict) -> List: """ Makes a DSL query asking for specific attribute, extracts that attribute from result (which is a list of list, and converts that into a flat list. 
:param dsl_param: A DSL parameter, with SELECT clause :return: A Flat list of specified attributes in SELECT clause """ attributes: List = list() _search_collection = self._driver.search_dsl(**dsl_param) for collection in _search_collection: attributes = collection.flatten_attrs() return attributes def _extract_info_from_uri(self, *, table_uri: str) -> Dict: """ Extracts the table information from table_uri coming from frontend. :param table_uri: :return: Dictionary object, containing following information: entity: Type of entity example: rdbms_table, hive_table etc. cluster: Cluster information db: Database Name name: Table Name """ pattern = re.compile( r""" ^ (?P<entity>.*?) :\/\/ (?P<cluster>.*) \. (?P<db>.*?) \/ (?P<name>.*?) $ """, re.X) result = pattern.match(table_uri) return result.groupdict() if result else dict() def _parse_reader_qn(self, reader_qn: str) -> Dict: """ Parse reader qualifiedName and extract the info :param reader_qn: :return: Dictionary object containing following information: cluster: cluster information db: Database name name: Table name """ pattern = re.compile( r""" ^(?P<db>[^.]*) \. (?P<table>[^.]*) \. (?P<user_id>[^.]*)\.reader \@ (?P<cluster>.*) $ """, re.X) result = pattern.match(reader_qn) return result.groupdict() if result else dict() def _parse_bookmark_qn(self, bookmark_qn: str) -> Dict: """ Parse bookmark qualifiedName and extract the info :param bookmark_qn: Qualified Name of Bookmark entity :return: Dictionary object containing following information: cluster: cluster information db: Database name name: Table name """ pattern = re.compile( r""" ^(?P<db>[^.]*) \. (?P<table>[^.]*) \. (?P<entity_type>[^.]*) \. 
(?P<user_id>[^.]*)\.bookmark \@ (?P<cluster>.*) $ """, re.X) result = pattern.match(bookmark_qn) return result.groupdict() if result else dict() def _get_user_details(self, user_id: str) -> Dict: """ Helper function to help get the user details if the `USER_DETAIL_METHOD` is configured, else uses the user_id for both email and user_id properties. :param user_id: The Unique user id of a user entity :return: a dictionary of user details """ if app.config.get('USER_DETAIL_METHOD'): user_details = app.config.get('USER_DETAIL_METHOD')( user_id) # type: ignore else: user_details = {'email': user_id, 'user_id': user_id} return user_details def _get_table_entity(self, *, table_uri: str) -> EntityUniqueAttribute: """ Fetch information from table_uri and then find the appropriate entity The reason, we're not returning the entity_unique_attribute().entity directly is because the entity_unique_attribute() return entity Object that can be used for update purposes, while entity_unique_attribute().entity only returns the dictionary :param table_uri: :return: A tuple of Table entity and parsed information of table qualified name """ table_info = self._extract_info_from_uri(table_uri=table_uri) table_qn = make_table_qualified_name(table_info.get('name'), table_info.get('cluster'), table_info.get('db')) try: return self._driver.entity_unique_attribute(table_info['entity'], qualifiedName=table_qn) except Exception as ex: LOGGER.exception(f'Table not found. 
{str(ex)}') raise NotFoundException( 'Table URI( {table_uri} ) does not exist'.format( table_uri=table_uri)) def _get_user_entity(self, user_id: str) -> EntityUniqueAttribute: """ Fetches an user entity from an id :param user_id: :return: """ try: return self._driver.entity_unique_attribute("User", qualifiedName=user_id) except Exception as ex: raise NotFoundException( '(User {user_id}) does not exist'.format(user_id=user_id)) def _create_bookmark(self, entity: EntityUniqueAttribute, user_guid: str, bookmark_qn: str, table_uri: str) -> None: """ Creates a bookmark entity for a specific user and table uri. :param user_guid: User's guid :param bookmark_qn: Bookmark qualifiedName :return: """ bookmark_entity = { 'entity': { 'typeName': self.BOOKMARK_TYPE, 'attributes': { 'qualifiedName': bookmark_qn, self.BOOKMARK_ACTIVE_KEY: True, 'entityUri': table_uri, 'user': { 'guid': user_guid }, 'entity': { 'guid': entity.entity[self.GUID_KEY] } } } } self._driver.entity_post.create(data=bookmark_entity) def _get_bookmark_entity(self, entity_uri: str, user_id: str) -> EntityUniqueAttribute: """ Fetch a Bookmark entity from parsing table uri and user id. If Bookmark is not present, create one for the user. :param table_uri: :param user_id: Qualified Name of a user :return: """ table_info = self._extract_info_from_uri(table_uri=entity_uri) bookmark_qn = '{}.{}.{}.{}.bookmark@{}'.format( table_info.get('db'), table_info.get('name'), table_info.get('entity'), user_id, table_info.get('cluster')) try: bookmark_entity = self._driver.entity_unique_attribute( self.BOOKMARK_TYPE, qualifiedName=bookmark_qn) if not bookmark_entity.entity: table_entity = self._get_table_entity(table_uri=entity_uri) # Fetch user entity from user_id for relation user_entity = self._get_user_entity(user_id) # Create bookmark entity with the user relation. self._create_bookmark(table_entity, user_entity.entity[self.GUID_KEY], bookmark_qn, entity_uri) # Fetch bookmark entity after creating it. 
bookmark_entity = self._driver.entity_unique_attribute( self.BOOKMARK_TYPE, qualifiedName=bookmark_qn) return bookmark_entity except Exception as ex: LOGGER.exception(f'Bookmark not found. {str(ex)}') raise NotFoundException( 'Bookmark( {bookmark_qn} ) does not exist'.format( bookmark_qn=bookmark_qn)) def _get_column(self, *, table_uri: str, column_name: str) -> Dict: """ Fetch the column information from referredEntities of the table entity :param table_uri: :param column_name: :return: A dictionary containing the column details """ try: table_entity = self._get_table_entity(table_uri=table_uri) columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns') for column in columns or list(): col_details = table_entity.referredEntities[column[ self.GUID_KEY]] if column_name == col_details[self.ATTRS_KEY]['name']: return col_details raise NotFoundException(f'Column not found: {column_name}') except KeyError as ex: LOGGER.exception(f'Column not found: {str(ex)}') raise NotFoundException(f'Column not found: {column_name}') def _serialize_columns(self, *, entity: EntityUniqueAttribute) -> \ Union[List[Column], List]: """ Helper function to fetch the columns from entity and serialize them using Column and Statistics model. :param entity: EntityUniqueAttribute object, along with relationshipAttributes :return: A list of Column objects, if there are any columns available, else an empty list. 
""" columns = list() for column in entity.entity[self.REL_ATTRS_KEY].get( 'columns') or list(): column_status = column.get('entityStatus', 'inactive').lower() if column_status != 'active': continue col_entity = entity.referredEntities[column[self.GUID_KEY]] col_attrs = col_entity[self.ATTRS_KEY] statistics = list() for stats in col_attrs.get('statistics') or list(): stats_attrs = stats['attributes'] stat_type = stats_attrs.get('stat_name') stat_format = self.STATISTICS_FORMAT_SPEC.get( stat_type, dict()) if not stat_format.get('drop', False): stat_type = stat_format.get('new_name', stat_type) stat_val = stats_attrs.get('stat_val') format_val = stat_format.get('format') if format_val: stat_val = format_val.format(stat_val) else: stat_val = str(stat_val) start_epoch = stats_attrs.get('start_epoch') end_epoch = stats_attrs.get('end_epoch') statistics.append( Statistics( stat_type=stat_type, stat_val=stat_val, start_epoch=start_epoch, end_epoch=end_epoch, )) columns.append( Column( name=col_attrs.get('name'), description=col_attrs.get('description') or col_attrs.get('comment'), col_type=col_attrs.get('type') or col_attrs.get('dataType'), sort_order=col_attrs.get('position') or 9999, stats=statistics, )) return sorted(columns, key=lambda item: item.sort_order) def _get_reports(self, guids: List[str]) -> List[ResourceReport]: reports = [] if guids: report_entities_collection = self._driver.entity_bulk(guid=guids) for report_entity in extract_entities(report_entities_collection): try: if report_entity.status == Status.ACTIVE: report_attrs = report_entity.attributes reports.append( ResourceReport(name=report_attrs['name'], url=report_attrs['url'])) except (KeyError, AttributeError) as ex: LOGGER.exception( 'Error while accessing table report: {}. 
{}'.format( str(report_entity), str(ex))) parsed_reports = app.config['RESOURCE_REPORT_CLIENT'](reports) \ if app.config['RESOURCE_REPORT_CLIENT'] else reports return parsed_reports def _get_owners(self, data_owners: list, fallback_owner: str = None) -> List[User]: owners_detail = list() active_owners_list = list() active_owners = filter( lambda item: item['entityStatus'] == Status.ACTIVE and item[ 'relationshipStatus'] == Status.ACTIVE, data_owners) for owner in active_owners: owner_qn = owner['displayText'] owner_data = self._get_user_details(owner_qn) owners_detail.append(User(**owner_data)) active_owners_list.append(owner_qn) # To avoid the duplication, # we are checking if the fallback is not in data_owners if fallback_owner and (fallback_owner not in active_owners_list): owners_detail.append( User(**self._get_user_details(fallback_owner))) return owners_detail def get_user(self, *, id: str) -> Union[UserEntity, None]: pass def get_users(self) -> List[UserEntity]: pass def get_table(self, *, table_uri: str) -> Table: """ Gathers all the information needed for the Table Detail Page. :param table_uri: :return: A Table object with all the information available or gathered from different entities. 
""" entity = self._get_table_entity(table_uri=table_uri) table_details = entity.entity try: attrs = table_details[self.ATTRS_KEY] programmatic_descriptions = self._get_programmatic_descriptions( attrs.get('parameters', dict())) table_qn = parse_table_qualified_name( qualified_name=attrs.get(self.QN_KEY)) tags = [] # Using or in case, if the key 'classifications' is there with a None for classification in table_details.get( 'classifications') or list(): tags.append( Tag(tag_name=classification.get('typeName'), tag_type="default")) columns = self._serialize_columns(entity=entity) reports_guids = [ report.get("guid") for report in attrs.get("reports") or list() ] is_view = True if attrs.get( 'tableType', 'table').lower().find('view') != -1 else False table = Table( database=table_details.get('typeName'), cluster=table_qn.get('cluster_name', ''), schema=table_qn.get('db_name', ''), name=attrs.get('name') or table_qn.get("table_name", ''), tags=tags, description=attrs.get('description') or attrs.get('comment'), owners=self._get_owners( table_details[self.REL_ATTRS_KEY].get('ownedBy', []), attrs.get('owner')), resource_reports=self._get_reports(guids=reports_guids), columns=columns, is_view=is_view, table_readers=self._get_readers(attrs.get(self.QN_KEY)), last_updated_timestamp=self._parse_date( table_details.get('updateTime')), programmatic_descriptions=programmatic_descriptions) return table except KeyError as ex: LOGGER.exception( 'Error while accessing table information. 
{}'.format(str(ex))) raise BadRequest( 'Some of the required attributes ' 'are missing in : ( {table_uri} )'.format(table_uri=table_uri)) def delete_owner(self, *, table_uri: str, owner: str) -> None: """ :param table_uri: :param owner: :return: """ table = self._get_table_entity(table_uri=table_uri) table_entity = table.entity if table_entity[self.REL_ATTRS_KEY].get("ownedBy"): try: active_owners = filter( lambda item: item['relationshipStatus'] == Status.ACTIVE and item['displayText'] == owner, table_entity[self.REL_ATTRS_KEY]['ownedBy']) if list(active_owners): self._driver.relationship_guid( next(active_owners).get('relationshipGuid')).delete() else: raise BadRequest('You can not delete this owner.') except NotFound as ex: LOGGER.exception( 'Error while removing table data owner. {}'.format( str(ex))) def add_owner(self, *, table_uri: str, owner: str) -> None: """ Query on Atlas User entity to find if the entity exist for the owner string in parameter, if not create one. And then use that User entity's GUID and add a relationship between Table and User, on ownedBy field. :param table_uri: :param owner: Email address of the owner :return: None, as it simply adds the owner. """ owner_info = self._get_user_details(owner) if not owner_info: raise NotFoundException(f'User "{owner}" does not exist.') user_dict = { "entity": { "typeName": "User", "attributes": { "qualifiedName": owner }, } } # Get or Create a User user_entity = self._driver.entity_post.create(data=user_dict) user_guid = next(iter(user_entity.get("guidAssignments").values())) table = self._get_table_entity(table_uri=table_uri) entity_def = { "typeName": "DataSet_Users_Owner", "end1": { "guid": table.entity.get("guid"), "typeName": "Table", }, "end2": { "guid": user_guid, "typeName": "User", }, } try: self._driver.relationship.create(data=entity_def) except Conflict as ex: LOGGER.exception( 'Error while adding the owner information. 
{}'.format(str(ex))) raise BadRequest( f'User {owner} is already added as a data owner for ' f'table {table_uri}.') def get_table_description(self, *, table_uri: str) -> Union[str, None]: """ :param table_uri: :return: The description of the table as a string """ entity = self._get_table_entity(table_uri=table_uri) return entity.entity[self.ATTRS_KEY].get('description') def put_table_description(self, *, table_uri: str, description: str) -> None: """ Update the description of the given table. :param table_uri: :param description: Description string :return: None """ entity = self._get_table_entity(table_uri=table_uri) entity.entity[self.ATTRS_KEY]['description'] = description entity.update() def add_tag(self, *, id: str, tag: str, tag_type: str, resource_type: ResourceType = ResourceType.Table) -> None: """ Assign the tag/classification to the give table API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST :param table_uri: :param tag: Tag/Classification Name :param tag_type :return: None """ entity = self._get_table_entity(table_uri=id) entity_bulk_tag = { "classification": { "typeName": tag }, "entityGuids": [entity.entity[self.GUID_KEY]] } self._driver.entity_bulk_classification.create(data=entity_bulk_tag) def add_badge(self, *, id: str, badge_name: str, category: str = '', badge_type: str = '', resource_type: ResourceType) -> None: # Not implemented raise NotImplementedError def delete_tag(self, *, id: str, tag: str, tag_type: str, resource_type: ResourceType = ResourceType.Table) -> None: """ Delete the assigned classfication/tag from the given table API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE :param table_uri: :param tag: :return: """ try: entity = self._get_table_entity(table_uri=id) guid_entity = self._driver.entity_guid( entity.entity[self.GUID_KEY]) guid_entity.classifications(tag).delete() except Exception as ex: # FixMe (Verdan): Too broad exception. 
Please make it specific LOGGER.exception('For some reason this deletes the classification ' 'but also always return exception. {}'.format( str(ex))) def delete_badge(self, *, id: str, badge_name: str, category: str, badge_type: str, resource_type: ResourceType) -> None: # Not implemented raise NotImplementedError def put_column_description(self, *, table_uri: str, column_name: str, description: str) -> None: """ :param table_uri: :param column_name: Name of the column to update the description :param description: The description string :return: None, as it simply updates the description of a column """ column_detail = self._get_column(table_uri=table_uri, column_name=column_name) col_guid = column_detail[self.GUID_KEY] entity = self._driver.entity_guid(col_guid) entity.entity[self.ATTRS_KEY]['description'] = description entity.update(attribute='description') def get_column_description(self, *, table_uri: str, column_name: str) -> Union[str, None]: """ :param table_uri: :param column_name: :return: The column description using the referredEntities information of a table entity """ column_detail = self._get_column(table_uri=table_uri, column_name=column_name) return column_detail[self.ATTRS_KEY].get('description') def _serialize_popular_tables(self, entities: list) -> List[PopularTable]: """ Gets a list of entities and serialize the popular tables. 
:param entities: List of entities from atlas client :return: a list of PopularTable objects """ popular_tables = list() for table in entities: table_attrs = table.attributes table_qn = parse_table_qualified_name( qualified_name=table_attrs.get(self.QN_KEY)) table_name = table_qn.get("table_name") or table_attrs.get('name') db_name = table_qn.get("db_name", '') db_cluster = table_qn.get("cluster_name", '') popular_table = PopularTable( database=table.typeName, cluster=db_cluster, schema=db_name, name=table_name, description=table_attrs.get('description') or table_attrs.get('comment')) popular_tables.append(popular_table) return popular_tables def get_popular_tables(self, *, num_entries: int) -> List[PopularTable]: """ Generates a list of Popular tables to be shown on the home page of Amundsen. :param num_entries: Number of popular tables to fetch :return: A List of popular tables instances """ popular_query_params = { 'typeName': 'Table', 'sortBy': 'popularityScore', 'sortOrder': 'DESCENDING', 'excludeDeletedEntities': True, 'limit': num_entries } search_results = self._driver.search_basic.create( data=popular_query_params) return self._serialize_popular_tables(search_results.entities) def get_latest_updated_ts(self) -> int: date = None for metrics in self._driver.admin_metrics: try: date = self._parse_date( metrics.general.get( 'stats', {}).get('Notification:lastMessageProcessedTime')) except AttributeError: pass date = date or 0 return date def get_tags(self) -> List: """ Fetch all the classification entity definitions from atlas as this will be used to generate the autocomplete on the table detail page :return: A list of TagDetail Objects """ tags = [] for metrics in self._driver.admin_metrics: tag_stats = metrics.tag for tag, count in tag_stats["tagEntities"].items(): tags.append(TagDetail(tag_name=tag, tag_count=count)) return tags def get_badges(self) -> List: # Not implemented return [] def _get_resources_followed_by_user(self, user_id: str, resource_type: 
str) \ -> List[Union[PopularTable, DashboardSummary]]: """ ToDo (Verdan): Dashboard still needs to be implemented. Helper function to get the resource, table, dashboard etc followed by a user. :param user_id: User ID of a user :param resource_type: Type of a resource that returns, could be table, dashboard etc. :return: A list of PopularTable, DashboardSummary or any other resource. """ params = { 'typeName': self.BOOKMARK_TYPE, 'offset': '0', 'limit': '1000', 'excludeDeletedEntities': True, 'entityFilters': { 'condition': 'AND', 'criterion': [{ 'attributeName': self.QN_KEY, 'operator': 'contains', 'attributeValue': f'.{user_id}.bookmark' }, { 'attributeName': self.BOOKMARK_ACTIVE_KEY, 'operator': 'eq', 'attributeValue': 'true' }] }, 'attributes': ['count', self.QN_KEY, self.ENTITY_URI_KEY] } # Fetches the bookmark entities based on filters search_results = self._driver.search_basic.create(data=params) resources = [] for record in search_results.entities: table_info = self._extract_info_from_uri( table_uri=record.attributes[self.ENTITY_URI_KEY]) res = self._parse_bookmark_qn(record.attributes[self.QN_KEY]) resources.append( PopularTable(database=table_info['entity'], cluster=res['cluster'], schema=res['db'], name=res['table'])) return resources def _get_resources_owned_by_user(self, user_id: str, resource_type: str) \ -> List[Union[PopularTable, DashboardSummary, Any]]: """ ToDo (Verdan): Dashboard still needs to be implemented. Helper function to get the resource, table, dashboard etc owned by a user. :param user_id: User ID of a user :param resource_type: Type of a resource that returns, could be table, dashboard etc. :return: A list of PopularTable, DashboardSummary or any other resource. 
""" resources = list() if resource_type == ResourceType.Table.name: type_regex = "(.*)_table$" # elif resource_type == ResourceType.Dashboard.name: # type_regex = "Dashboard" else: LOGGER.exception( f'Resource Type ({resource_type}) is not yet implemented') raise NotImplemented user_entity = self._driver.entity_unique_attribute( self.USER_TYPE, qualifiedName=user_id).entity if not user_entity: LOGGER.exception(f'User ({user_id}) not found in Atlas') raise NotFoundException(f'User {user_id} not found.') resource_guids = set() for item in user_entity[self.REL_ATTRS_KEY].get('owns') or list(): if (item['entityStatus'] == Status.ACTIVE and item['relationshipStatus'] == Status.ACTIVE and re.compile(type_regex).match(item['typeName'])): resource_guids.add(item[self.GUID_KEY]) params = { 'typeName': self.TABLE_ENTITY, 'excludeDeletedEntities': True, 'entityFilters': { 'condition': 'AND', 'criterion': [{ 'attributeName': 'owner', 'operator': 'startsWith', 'attributeValue': user_id.lower() }] }, 'attributes': [self.GUID_KEY] } table_entities = self._driver.search_basic.create(data=params) for table in table_entities.entities: resource_guids.add(table.guid) if resource_guids: entities = extract_entities( self._driver.entity_bulk(guid=list(resource_guids), ignoreRelationships=True)) if resource_type == ResourceType.Table.name: resources = self._serialize_popular_tables(entities) else: LOGGER.info(f'User ({user_id}) does not own any "{resource_type}"') return resources def get_dashboard_by_user_relation(self, *, user_email: str, relation_type: UserResourceRel) \ -> Dict[str, List[DashboardSummary]]: pass def get_table_by_user_relation( self, *, user_email: str, relation_type: UserResourceRel) -> Dict[str, Any]: tables = list() if relation_type == UserResourceRel.follow: tables = self._get_resources_followed_by_user( user_id=user_email, resource_type=ResourceType.Table.name) elif relation_type == UserResourceRel.own: tables = self._get_resources_owned_by_user( 
user_id=user_email, resource_type=ResourceType.Table.name) return {'table': tables} def get_frequently_used_tables( self, *, user_email: str) -> Dict[str, List[PopularTable]]: user = self._driver.entity_unique_attribute( self.USER_TYPE, qualifiedName=user_email).entity readers_guids = [] for user_reads in user['relationshipAttributes'].get('entityReads'): entity_status = user_reads['entityStatus'] relationship_status = user_reads['relationshipStatus'] if entity_status == Status.ACTIVE and relationship_status == Status.ACTIVE: readers_guids.append(user_reads['guid']) readers = extract_entities( self._driver.entity_bulk(guid=readers_guids, ignoreRelationships=True)) _results = {} for reader in readers: entity_uri = reader.attributes.get(self.ENTITY_URI_KEY) count = reader.attributes.get('count') if count: details = self._extract_info_from_uri(table_uri=entity_uri) _results[count] = dict(cluster=details.get('cluster'), name=details.get('name'), schema=details.get('db'), database=details.get('entity')) sorted_counts = sorted(_results.keys()) results = [] for count in sorted_counts: data: dict = _results.get(count, dict()) table = PopularTable(**data) results.append(table) return {'table': results} def add_resource_relation_by_user(self, *, id: str, user_id: str, relation_type: UserResourceRel, resource_type: ResourceType) -> None: if resource_type is not ResourceType.Table: raise NotImplemented( 'resource type {} is not supported'.format(resource_type)) entity = self._get_bookmark_entity(entity_uri=id, user_id=user_id) entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = True entity.update() def delete_resource_relation_by_user(self, *, id: str, user_id: str, relation_type: UserResourceRel, resource_type: ResourceType) -> None: if resource_type is not ResourceType.Table: raise NotImplemented( 'resource type {} is not supported'.format(resource_type)) entity = self._get_bookmark_entity(entity_uri=id, user_id=user_id) 
entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = False entity.update() def _parse_date(self, date: int) -> Optional[int]: try: date_str = str(date) date_trimmed = date_str[:10] assert len(date_trimmed) == 10 return int(date_trimmed) except Exception: return None def _get_readers(self, qualified_name: str, top: Optional[int] = 15) -> List[Reader]: params = { 'typeName': self.READER_TYPE, 'offset': '0', 'limit': top, 'excludeDeletedEntities': True, 'entityFilters': { 'condition': 'AND', 'criterion': [{ 'attributeName': self.QN_KEY, 'operator': 'STARTSWITH', 'attributeValue': qualified_name.split('@')[0] + '.' }, { 'attributeName': 'count', 'operator': 'gte', 'attributeValue': f'{app.config["POPULAR_TABLE_MINIMUM_READER_COUNT"]}' }] }, 'attributes': ['count', self.QN_KEY], 'sortBy': 'count', 'sortOrder': 'DESCENDING' } search_results = self._driver.search_basic.create( data=params, ignoreRelationships=False) readers = [] for record in search_results.entities: readers.append(record.guid) results = [] if readers: read_entities = extract_entities( self._driver.entity_bulk(guid=readers, ignoreRelationships=False)) for read_entity in read_entities: reader_qn = read_entity.relationshipAttributes['user'][ 'displayText'] reader_details = self._get_user_details(reader_qn) reader = Reader(user=User(**reader_details), read_count=read_entity.attributes['count']) results.append(reader) return results def _get_programmatic_descriptions( self, parameters: dict) -> List[ProgrammaticDescription]: programmatic_descriptions: Dict[str, ProgrammaticDescription] = {} for source, text in parameters.items(): use_parameter = True for regex_filter in app.config[ 'PROGRAMMATIC_DESCRIPTIONS_EXCLUDE_FILTERS']: pattern = re.compile(regex_filter) if pattern.match(source): use_parameter = False break if use_parameter: source = re.sub("([a-z])([A-Z])", "\g<1> \g<2>", source).lower() programmatic_descriptions[source] = ProgrammaticDescription( source=source, text=text) result = 
dict(sorted(programmatic_descriptions.items())) return list(result.values()) def get_dashboard( self, dashboard_uri: str, ) -> DashboardDetailEntity: pass def get_dashboard_description(self, *, id: str) -> Description: pass def put_dashboard_description(self, *, id: str, description: str) -> None: pass def get_resources_using_table( self, *, id: str, resource_type: ResourceType) -> Dict[str, List[DashboardSummary]]: return {}
class AtlasProxy(BaseProxy):
    """
    Atlas Proxy client for the amundsen metadata
    {ATLAS_API_DOCS} = https://atlas.apache.org/api/v2/
    """
    TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY']
    DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE']
    STATISTICS_FORMAT_SPEC = app.config['STATISTICS_FORMAT_SPEC']
    BOOKMARK_TYPE = 'Bookmark'
    USER_TYPE = 'User'
    QN_KEY = 'qualifiedName'
    BOOKMARK_ACTIVE_KEY = 'active'
    GUID_KEY = 'guid'
    ATTRS_KEY = 'attributes'
    REL_ATTRS_KEY = 'relationshipAttributes'
    ENTITY_URI_KEY = 'entityUri'

    # In-memory cache region shared by every proxy instance; entries expire
    # after _ATLAS_PROXY_CACHE_EXPIRY_SEC seconds.
    _CACHE = CacheManager(**parse_cache_config_options(
        {
            'cache.regions': 'atlas_proxy',
            'cache.atlas_proxy.type': 'memory',
            'cache.atlas_proxy.expire': _ATLAS_PROXY_CACHE_EXPIRY_SEC
        }))

    def __init__(self, *,
                 host: str,
                 port: int,
                 user: str = 'admin',
                 password: str = '',
                 encrypted: bool = False,
                 validate_ssl: bool = False) -> None:
        """
        Initiate the Apache Atlas client with the provided credentials
        """
        protocol = 'https' if encrypted else 'http'
        self._driver = Atlas(host=host, port=port, username=user,
                             password=password, protocol=protocol,
                             validate_ssl=validate_ssl)

    def _get_ids_from_basic_search(self, *, params: Dict) -> List[str]:
        """
        FixMe (Verdan): UNUSED. Please remove after implementing atlas proxy
        Search for the entities based on the params provided as argument.
        :param params: the dictionary of parameters to be used for the basic
        search
        :return: The flat list of GUIDs of entities founds based on the params.
        """
        ids = list()
        search_results = self._driver.search_basic(**params)
        for result in search_results:
            for entity in result.entities:
                ids.append(entity.guid)
        return ids

    def _get_flat_values_from_dsl(self, dsl_param: dict) -> List:
        """
        Makes a DSL query asking for specific attribute, extracts that
        attribute from result (which is a list of list), and converts that
        into a flat list.
        :param dsl_param: A DSL parameter, with SELECT clause
        :return: A Flat list of specified attributes in SELECT clause
        """
        attributes: List = list()
        _search_collection = self._driver.search_dsl(**dsl_param)
        for collection in _search_collection:
            attributes = collection.flatten_attrs()
        return attributes

    def _extract_info_from_uri(self, *, table_uri: str) -> Dict:
        """
        Extracts the table information from table_uri coming from frontend.
        :param table_uri:
        :return: Dictionary object, containing following information:
        entity: Type of entity example: rdbms_table, hive_table etc.
        cluster: Cluster information
        db: Database Name
        name: Table Name
        """
        pattern = re.compile(r"""
            ^   (?P<entity>.*?)
            :\/\/
                (?P<cluster>.*)
            \.
                (?P<db>.*?)
            \/
                (?P<name>.*?)
            $
        """, re.X)
        result = pattern.match(table_uri)
        return result.groupdict() if result else dict()

    def _parse_reader_qn(self, reader_qn: str) -> Dict:
        """
        Parse reader qualifiedName and extract the info
        :param reader_qn:
        :return: Dictionary object containing following information:
        cluster: cluster information
        db: Database name
        name: Table name
        """
        pattern = re.compile(r"""
            ^(?P<db>[^.]*)
            \.
            (?P<table>[^.]*)
            \.
            (?P<user_id>[^.]*)\.reader
            \@
            (?P<cluster>.*)
            $
        """, re.X)
        result = pattern.match(reader_qn)
        return result.groupdict() if result else dict()

    def _parse_bookmark_qn(self, bookmark_qn: str) -> Dict:
        """
        Parse bookmark qualifiedName and extract the info
        :param bookmark_qn: Qualified Name of Bookmark entity
        :return: Dictionary object containing following information:
        cluster: cluster information
        db: Database name
        name: Table name
        """
        pattern = re.compile(r"""
            ^(?P<db>[^.]*)
            \.
            (?P<table>[^.]*)
            \.
            (?P<entity_type>[^.]*)
            \.
            (?P<user_id>[^.]*)\.bookmark
            \@
            (?P<cluster>.*)
            $
        """, re.X)
        result = pattern.match(bookmark_qn)
        return result.groupdict() if result else dict()

    def _get_table_entity(self, *, table_uri: str) -> EntityUniqueAttribute:
        """
        Fetch information from table_uri and then find the appropriate entity
        The reason, we're not returning the entity_unique_attribute().entity
        directly is because the entity_unique_attribute() return entity Object
        that can be used for update purposes,
        while entity_unique_attribute().entity only returns the dictionary
        :param table_uri:
        :return: A table entity object that can be used for updates as well
        """
        table_info = self._extract_info_from_uri(table_uri=table_uri)
        table_qn = make_table_qualified_name(table_info.get('name'),
                                             table_info.get('cluster'),
                                             table_info.get('db'))
        try:
            return self._driver.entity_unique_attribute(table_info['entity'],
                                                        qualifiedName=table_qn)
        except Exception as ex:
            LOGGER.exception(f'Table not found. {str(ex)}')
            raise NotFoundException(
                'Table URI( {table_uri} ) does not exist'.format(
                    table_uri=table_uri))

    def _get_user_entity(self, user_id: str) -> EntityUniqueAttribute:
        """
        Fetches an user entity from an id
        :param user_id: qualifiedName of the user
        :return: A user entity object
        """
        try:
            return self._driver.entity_unique_attribute("User",
                                                        qualifiedName=user_id)
        except Exception as ex:
            raise NotFoundException(
                '(User {user_id}) does not exist'.format(user_id=user_id))

    def _create_bookmark(self, entity: EntityUniqueAttribute, user_guid: str,
                         bookmark_qn: str, table_uri: str) -> None:
        """
        Creates a bookmark entity for a specific user and table uri.
        :param entity: bookmarked table entity
        :param user_guid: User's guid
        :param bookmark_qn: Bookmark qualifiedName
        :param table_uri: table uri stored on the bookmark for reverse lookup
        :return:
        """
        bookmark_entity = {
            'entity': {
                'typeName': self.BOOKMARK_TYPE,
                'attributes': {
                    'qualifiedName': bookmark_qn,
                    self.BOOKMARK_ACTIVE_KEY: True,
                    'entityUri': table_uri,
                    'user': {'guid': user_guid},
                    'entity': {'guid': entity.entity[self.GUID_KEY]}
                }
            }
        }
        self._driver.entity_post.create(data=bookmark_entity)

    def _get_bookmark_entity(self, entity_uri: str,
                             user_id: str) -> EntityUniqueAttribute:
        """
        Fetch a Bookmark entity from parsing table uri and user id.
        If Bookmark is not present, create one for the user.
        :param entity_uri: table uri of the bookmarked entity
        :param user_id: Qualified Name of a user
        :return:
        """
        table_info = self._extract_info_from_uri(table_uri=entity_uri)
        bookmark_qn = '{}.{}.{}.{}.bookmark@{}'.format(
            table_info.get('db'),
            table_info.get('name'),
            table_info.get('entity'),
            user_id,
            table_info.get('cluster'))
        try:
            bookmark_entity = self._driver.entity_unique_attribute(
                self.BOOKMARK_TYPE, qualifiedName=bookmark_qn)
            if not bookmark_entity.entity:
                table_entity = self._get_table_entity(table_uri=entity_uri)
                # Fetch user entity from user_id for relation
                user_entity = self._get_user_entity(user_id)
                # Create bookmark entity with the user relation.
                self._create_bookmark(table_entity,
                                      user_entity.entity[self.GUID_KEY],
                                      bookmark_qn, entity_uri)
                # Fetch bookmark entity after creating it.
                bookmark_entity = self._driver.entity_unique_attribute(
                    self.BOOKMARK_TYPE, qualifiedName=bookmark_qn)
            return bookmark_entity
        except Exception as ex:
            LOGGER.exception(f'Bookmark not found. {str(ex)}')
            raise NotFoundException(
                'Bookmark( {bookmark_qn} ) does not exist'.format(
                    bookmark_qn=bookmark_qn))

    def _get_column(self, *, table_uri: str, column_name: str) -> Dict:
        """
        Fetch the column information from referredEntities of the table entity
        :param table_uri:
        :param column_name:
        :return: A dictionary containing the column details
        """
        try:
            table_entity = self._get_table_entity(table_uri=table_uri)
            columns = table_entity.entity[self.REL_ATTRS_KEY].get('columns')
            for column in columns or list():
                col_details = table_entity.referredEntities[
                    column[self.GUID_KEY]]
                if column_name == col_details[self.ATTRS_KEY]['name']:
                    return col_details
            raise NotFoundException(f'Column not found: {column_name}')
        except KeyError as ex:
            LOGGER.exception(f'Column not found: {str(ex)}')
            raise NotFoundException(f'Column not found: {column_name}')

    def _serialize_columns(self, *, entity: EntityUniqueAttribute) -> \
            Union[List[Column], List]:
        """
        Helper function to fetch the columns from entity and serialize them
        using Column and Statistics model.
        :param entity: EntityUniqueAttribute object,
        along with relationshipAttributes
        :return: A list of Column objects, if there are any columns available,
        else an empty list.
        """
        columns = list()
        for column in entity.entity[self.REL_ATTRS_KEY].get(
                'columns') or list():
            # Skip columns that are soft-deleted in Atlas.
            column_status = column.get('entityStatus', 'inactive').lower()
            if column_status != 'active':
                continue
            col_entity = entity.referredEntities[column[self.GUID_KEY]]
            col_attrs = col_entity[self.ATTRS_KEY]
            statistics = list()
            for stats in col_attrs.get('statistics') or list():
                stats_attrs = stats['attributes']
                stat_type = stats_attrs.get('stat_name')
                # STATISTICS_FORMAT_SPEC drives renaming/formatting/dropping
                # of individual statistics.
                stat_format = self.STATISTICS_FORMAT_SPEC.get(stat_type,
                                                              dict())
                if not stat_format.get('drop', False):
                    stat_type = stat_format.get('new_name', stat_type)
                    stat_val = stats_attrs.get('stat_val')
                    format_val = stat_format.get('format')
                    if format_val:
                        stat_val = format_val.format(stat_val)
                    else:
                        stat_val = str(stat_val)
                    start_epoch = stats_attrs.get('start_epoch')
                    end_epoch = stats_attrs.get('end_epoch')
                    statistics.append(
                        Statistics(
                            stat_type=stat_type,
                            stat_val=stat_val,
                            start_epoch=start_epoch,
                            end_epoch=end_epoch,
                        ))
            columns.append(
                Column(
                    name=col_attrs.get('name'),
                    description=col_attrs.get('description') or
                    col_attrs.get('comment'),
                    col_type=col_attrs.get('type') or
                    col_attrs.get('dataType'),
                    sort_order=col_attrs.get('position'),
                    stats=statistics,
                ))
        return sorted(columns, key=lambda item: item.sort_order)

    def get_user(self, *, id: str) -> Union[UserEntity, None]:
        pass

    def get_users(self) -> List[UserEntity]:
        pass

    def get_table(self, *, table_uri: str) -> Table:
        """
        Gathers all the information needed for the Table Detail Page.
        :param table_uri:
        :return: A Table object with all the information available
        or gathered from different entities.
        """
        entity = self._get_table_entity(table_uri=table_uri)
        table_details = entity.entity
        try:
            attrs = table_details[self.ATTRS_KEY]
            table_qn = parse_table_qualified_name(
                qualified_name=attrs.get(self.QN_KEY))
            tags = []
            # Using or in case, if the key 'classifications' is there with a
            # None
            for classification in table_details.get(
                    "classifications") or list():
                tags.append(
                    Tag(tag_name=classification.get('typeName'),
                        tag_type="default"))
            columns = self._serialize_columns(entity=entity)
            table = Table(
                database=table_details.get('typeName'),
                cluster=table_qn.get('cluster_name', ''),
                schema=table_qn.get('db_name', ''),
                name=attrs.get('name') or table_qn.get("table_name", ''),
                tags=tags,
                description=attrs.get('description') or attrs.get('comment'),
                owners=[User(email=attrs.get('owner'))],
                columns=columns,
                last_updated_timestamp=self._parse_date(
                    table_details.get('updateTime')))
            return table
        except KeyError as ex:
            LOGGER.exception(
                'Error while accessing table information. {}'.format(str(ex)))
            raise BadRequest(
                'Some of the required attributes '
                'are missing in : ( {table_uri} )'.format(table_uri=table_uri))

    def delete_owner(self, *, table_uri: str, owner: str) -> None:
        pass

    def add_owner(self, *, table_uri: str, owner: str) -> None:
        """
        It simply replaces the owner field in atlas with the new string.
        FixMe (Verdan): Implement multiple data owners and
        atlas changes in the documentation if needed to make owner field a list
        :param table_uri:
        :param owner: Email address of the owner
        :return: None, as it simply adds the owner.
        """
        entity = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['owner'] = owner
        entity.update()

    def get_table_description(self, *, table_uri: str) -> Union[str, None]:
        """
        :param table_uri:
        :return: The description of the table as a string
        """
        entity = self._get_table_entity(table_uri=table_uri)
        return entity.entity[self.ATTRS_KEY].get('description')

    def put_table_description(self, *, table_uri: str,
                              description: str) -> None:
        """
        Update the description of the given table.
        :param table_uri:
        :param description: Description string
        :return: None
        """
        entity = self._get_table_entity(table_uri=table_uri)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update()

    def add_tag(self, *, id: str, tag: str, tag_type: str,
                resource_type: ResourceType = ResourceType.Table) -> None:
        """
        Assign the tag/classification to the give table
        API Ref: /resource_EntityREST.html#resource_EntityREST_addClassification_POST
        :param id: table uri
        :param tag: Tag/Classification Name
        :param tag_type:
        :return: None
        """
        entity = self._get_table_entity(table_uri=id)
        entity_bulk_tag = {
            "classification": {"typeName": tag},
            "entityGuids": [entity.entity[self.GUID_KEY]]
        }
        self._driver.entity_bulk_classification.create(data=entity_bulk_tag)

    def delete_tag(self, *, id: str, tag: str, tag_type: str,
                   resource_type: ResourceType = ResourceType.Table) -> None:
        """
        Delete the assigned classfication/tag from the given table
        API Ref: /resource_EntityREST.html#resource_EntityREST_deleteClassification_DELETE
        :param id: table uri
        :param tag:
        :return:
        """
        try:
            entity = self._get_table_entity(table_uri=id)
            guid_entity = self._driver.entity_guid(
                entity.entity[self.GUID_KEY])
            guid_entity.classifications(tag).delete()
        except Exception as ex:
            # FixMe (Verdan): Too broad exception. Please make it specific
            LOGGER.exception('For some reason this deletes the classification '
                             'but also always return exception. {}'.format(
                                 str(ex)))

    def put_column_description(self, *, table_uri: str, column_name: str,
                               description: str) -> None:
        """
        :param table_uri:
        :param column_name: Name of the column to update the description
        :param description: The description string
        :return: None, as it simply updates the description of a column
        """
        column_detail = self._get_column(table_uri=table_uri,
                                         column_name=column_name)
        col_guid = column_detail[self.GUID_KEY]
        entity = self._driver.entity_guid(col_guid)
        entity.entity[self.ATTRS_KEY]['description'] = description
        entity.update(attribute='description')

    def get_column_description(self, *, table_uri: str,
                               column_name: str) -> Union[str, None]:
        """
        :param table_uri:
        :param column_name:
        :return: The column description using the referredEntities
        information of a table entity
        """
        column_detail = self._get_column(table_uri=table_uri,
                                         column_name=column_name)
        return column_detail[self.ATTRS_KEY].get('description')

    def get_popular_tables(self, *, num_entries: int) -> List[PopularTable]:
        """
        :param num_entries: Number of popular tables to fetch
        :return: A List of popular tables instances
        """
        popular_tables = list()
        popular_query_params = {
            'typeName': 'Table',
            'sortBy': 'popularityScore',
            'sortOrder': 'DESCENDING',
            'excludeDeletedEntities': True,
            'limit': num_entries
        }
        search_results = self._driver.search_basic.create(
            data=popular_query_params)
        for table in search_results.entities:
            table_attrs = table.attributes
            table_qn = parse_table_qualified_name(
                qualified_name=table_attrs.get(self.QN_KEY))
            # Fall back to the plain 'name' attribute when the qualified name
            # cannot be parsed.
            table_name = table_qn.get("table_name") or table_attrs.get('name')
            db_name = table_qn.get("db_name", '')
            db_cluster = table_qn.get("cluster_name", '')
            popular_table = PopularTable(
                database=table.typeName,
                cluster=db_cluster,
                schema=db_name,
                name=table_name,
                description=table_attrs.get('description') or
                table_attrs.get('comment'))
            popular_tables.append(popular_table)
        return popular_tables

    def get_latest_updated_ts(self) -> int:
        pass

    def get_tags(self) -> List:
        """
        Fetch all the classification entity definitions from atlas as this
        will be used to generate the autocomplete on the table detail page
        :return: A list of TagDetail Objects
        """
        tags = []
        for metrics in self._driver.admin_metrics:
            tag_stats = metrics.tag
            for tag, count in tag_stats["tagEntities"].items():
                tags.append(TagDetail(tag_name=tag, tag_count=count))
        return tags

    def get_dashboard_by_user_relation(self, *, user_email: str,
                                       relation_type: UserResourceRel) \
            -> Dict[str, List[DashboardSummary]]:
        pass

    def get_table_by_user_relation(
            self, *, user_email: str,
            relation_type: UserResourceRel) -> Dict[str, Any]:
        params = {
            'typeName': self.BOOKMARK_TYPE,
            'offset': '0',
            'limit': '1000',
            'excludeDeletedEntities': True,
            'entityFilters': {
                'condition': 'AND',
                'criterion': [
                    {
                        'attributeName': self.QN_KEY,
                        'operator': 'contains',
                        'attributeValue': f'.{user_email}.bookmark'
                    },
                    {
                        'attributeName': self.BOOKMARK_ACTIVE_KEY,
                        'operator': 'eq',
                        'attributeValue': 'true'
                    }
                ]
            },
            'attributes': ['count', self.QN_KEY, self.ENTITY_URI_KEY]
        }
        # Fetches the bookmark entities based on filters
        search_results = self._driver.search_basic.create(data=params)
        results = []
        for record in search_results.entities:
            table_info = self._extract_info_from_uri(
                table_uri=record.attributes[self.ENTITY_URI_KEY])
            res = self._parse_bookmark_qn(record.attributes[self.QN_KEY])
            results.append(
                PopularTable(database=table_info['entity'],
                             cluster=res['cluster'],
                             schema=res['db'],
                             name=res['table']))
        return {'table': results}

    def get_frequently_used_tables(self, *,
                                   user_email: str) -> Dict[str, Any]:
        pass

    def add_resource_relation_by_user(self, *, id: str, user_id: str,
                                      relation_type: UserResourceRel,
                                      resource_type: ResourceType) -> None:
        if resource_type is not ResourceType.Table:
            # NotImplementedError (not the non-callable NotImplemented
            # sentinel) is the correct exception type here.
            raise NotImplementedError(
                'resource type {} is not supported'.format(resource_type))
        self._add_table_relation_by_user(table_uri=id,
                                         user_email=user_id,
                                         relation_type=relation_type)

    def _add_table_relation_by_user(self, *, table_uri: str, user_email: str,
                                    relation_type: UserResourceRel) -> None:
        # Marks the user's bookmark for the table as active, creating it
        # first if needed.
        entity = self._get_bookmark_entity(entity_uri=table_uri,
                                           user_id=user_email)
        entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = True
        entity.update()

    def delete_resource_relation_by_user(self, *, id: str, user_id: str,
                                         relation_type: UserResourceRel,
                                         resource_type: ResourceType) -> None:
        if resource_type is not ResourceType.Table:
            # NotImplementedError (not the non-callable NotImplemented
            # sentinel) is the correct exception type here.
            raise NotImplementedError(
                'resource type {} is not supported'.format(resource_type))
        self._delete_table_relation_by_user(table_uri=id,
                                            user_email=user_id,
                                            relation_type=relation_type)

    def _delete_table_relation_by_user(self, *, table_uri: str,
                                       user_email: str,
                                       relation_type: UserResourceRel) -> None:
        # Deactivates the bookmark instead of deleting the entity.
        entity = self._get_bookmark_entity(entity_uri=table_uri,
                                           user_id=user_email)
        entity.entity[self.ATTRS_KEY][self.BOOKMARK_ACTIVE_KEY] = False
        entity.update()

    def _parse_date(self, date: int) -> Optional[int]:
        # Atlas timestamps are in milliseconds; keep the first 10 digits to
        # get epoch seconds. Returns None for anything unparsable.
        try:
            date_str = str(date)
            date_trimmed = date_str[:10]
            assert len(date_trimmed) == 10
            return int(date_trimmed)
        except Exception:
            return None

    def get_dashboard(self,
                      dashboard_uri: str,
                      ) -> DashboardDetailEntity:
        pass

    def get_dashboard_description(self, *, id: str) -> Description:
        pass

    def put_dashboard_description(self, *, id: str,
                                  description: str) -> None:
        pass

    def get_resources_using_table(self, *, id: str,
                                  resource_type: ResourceType) \
            -> Dict[str, List[DashboardSummary]]:
        pass
class AtlasProxy(BaseProxy):
    """
    AtlasSearch connection handler
    """
    TABLE_ENTITY = app.config['ATLAS_TABLE_ENTITY']
    DB_ATTRIBUTE = app.config['ATLAS_DB_ATTRIBUTE']
    NAME_ATTRIBUTE = app.config['ATLAS_NAME_ATTRIBUTE']
    ATTRS_KEY = 'attributes'
    REL_ATTRS_KEY = 'relationshipAttributes'

    atlas: Atlas

    def __init__(self, *,
                 host: str = None,
                 index: str = None,
                 user: str = '',
                 password: str = '',
                 page_size: int = 10) -> None:
        self.atlas = Atlas(host, username=user, password=password)
        self.index = index
        self.page_size = page_size

    @staticmethod
    def _entities(collections: EntityCollection) -> List[Entity]:
        """
        Helper method for flattening all collections from {collections}
        :return: list of all entities
        """
        entities: List[Entity] = []
        for collection in collections:
            entities.extend(collection.entities)
        return entities

    def _parse_results(self, response: EntityCollection) -> List[Table]:
        """
        based on an atlas {response} with table entities, we map the required
        information
        :return: list of tables
        """
        table_results = []
        ids = list()
        for hit in response:
            ids.append(hit.guid)
        # receive all entities
        entities = self._entities(self.atlas.entity_bulk(guid=ids))
        db_ids = []
        for entity in entities:
            relations = entity.relationshipAttributes
            database = relations.get(self.DB_ATTRIBUTE)
            if database:
                db_ids.append(database['guid'])
        # request databases
        dbs_list = self._entities(self.atlas.entity_bulk(
            guid=db_ids)) if len(db_ids) > 0 else []
        dbs_dict: Dict[str, Entity] = {db.guid: db for db in dbs_list}
        for entity in entities:
            relations = entity.relationshipAttributes
            attrs = entity.attributes
            database = relations.get(self.DB_ATTRIBUTE)
            if database and database['guid'] in dbs_dict:
                db_entity = dbs_dict[database['guid']]
                db_attrs = db_entity.attributes
                db_name = db_attrs.get(self.NAME_ATTRIBUTE)
                db_cluster = db_attrs.get("clusterName", "")
            else:
                db_cluster = ''
                db_name = ''
            tags = []
            # Using or in case, if the key 'classifications' is there with
            # attrs None
            for classification in attrs.get("classifications") or list():
                tags.append(classification.get('typeName'))
            # TODO: Implement columns
            columns: List[str] = []
            # for column in attrs.get('columns') or list():
            #     col_entity = entity.referredEntities[column['guid']]
            #     col_attrs = col_entity['attributes']
            #     columns.append(col_attrs.get(self.NAME_KEY))
            table_name = attrs.get(self.NAME_ATTRIBUTE)
            table = Table(
                name=table_name,
                key=f"{entity.typeName}://{db_cluster}.{db_name}/{table_name}",
                description=attrs.get('description'),
                cluster=db_cluster,
                database=entity.typeName or 'Table',
                schema_name=db_name,
                column_names=columns,
                tags=tags,
                last_updated_epoch=attrs.get('updateTime'))
            table_results.append(table)
        return table_results

    @timer_with_counter
    def fetch_search_results_with_field(self, *,
                                        query_term: str,
                                        field_name: str,
                                        field_value: str,
                                        page_index: int = 0) -> SearchResult:
        """
        Query Atlas and return results as list of Table objects.
        Per field name we have a count query and a query for the tables.
        https://atlas.apache.org/Search-Advanced.html
        :param query_term: search query term
        :param field_name: field name to do the searching(e.g schema_name,
        tag_names)
        :param field_value: value for the field for filtering
        :param page_index: index of search page user is currently on
        :return: SearchResult Object
        """
        # Default to an empty result set when the field is not recognized.
        sql = "Table from Table where false"
        count_sql = f"{sql} select count()"
        if field_name == 'tag':
            sql = f"from Table where Table is '{field_value}'"
            count_sql = f"{sql} select count()"
        elif field_name == 'schema':
            sql = f"from Table where db.name like '{field_value}'"
            count_sql = f"{sql} select count()"
        elif field_name == 'table':
            sql = f"from Table where name like '{field_value}'"
            count_sql = f"{sql} select count()"
        elif field_name == 'column':
            sql = f"hive_column where name like '{field_value}' select table"
            # TODO nanne: count tables instead of columns
            count_sql = \
                f"hive_column where name like '{field_value}' select count()"
        LOGGER.debug(f"Used following sql query: {sql}")
        tables: List[Table] = []
        count_value = 0
        try:
            # count results
            count_params = {'query': count_sql}
            count_results = list(self.atlas.search_dsl(**count_params))[0]
            count_value = count_results._data['attributes']['values'][0][0]
            params = {
                'query': f"{sql} "
                         f"limit {self.page_size} "
                         f"offset {page_index * self.page_size}"
            }
            search_results = self.atlas.search_dsl(**params)
            if count_value > 0 and page_index * self.page_size <= count_value:
                # unpack all collections (usually just one collection though)
                for collection in search_results:
                    if hasattr(collection, 'entities'):
                        tables.extend(
                            self._parse_results(response=collection.entities))
        except BadRequest:
            # Use lazy %-formatting so the query actually makes it into the
            # log record (passing it as a bare extra arg is a formatting bug).
            LOGGER.error("Atlas Search DSL error with the following query: %s",
                         sql)
        return SearchResult(total_results=count_value, results=tables)

    @timer_with_counter
    def fetch_search_results(self, *,
                             query_term: str,
                             page_index: int = 0) -> SearchResult:
        """
        Query Atlas and return results as list of Table objects
        We use the Atlas DSL for querying the tables.
        https://atlas.apache.org/Search-Advanced.html
        :param query_term: search query term
        :param page_index: index of search page user is currently on
        :return: SearchResult Object
        """
        if not query_term:
            # return empty result for blank query term
            return SearchResult(total_results=0, results=[])
        # define query
        sql = f"Table from Table " \
              f"where name like '*{query_term}*' or " \
              f"description like '*{query_term}*' "
        # count amount of tables
        count_params = {'query': f"{sql} select count()"}
        count_results = list(self.atlas.search_dsl(**count_params))[0]
        count_value = count_results._data['attributes']['values'][0][0]
        # select tables
        params = {
            'query': f"{sql} "
                     f"limit {self.page_size} "
                     f"offset {page_index * self.page_size}"
        }
        search_results = self.atlas.search_dsl(**params)
        # retrieve results
        tables = []
        if 0 < count_value >= page_index * self.page_size:
            for s in search_results:
                tables.extend(self._parse_results(response=s.entities))
        return SearchResult(total_results=count_value, results=tables)