def search_entities_interactive(self, query_s=None, limit=100, offset=0, facetFields=None, facetPrefix=None, facetRanges=None, filterQueries=None, firstClassEntitiesOnly=None, sources=None):
    """Search Atlas entities via the DSL endpoint, adapted to the Navigator response shape.

    :param query_s: free-text query; ``type:<nav_type>`` terms select the Atlas type to query
    :param limit: max number of entities in the DSL query
    :param facetFields: when it contains 'tags' and there is no query, serves the list_tags endpoint
    :raises CatalogAuthException: on HTTP 401 from Atlas
    :raises CatalogApiException: on any other REST failure
    """
    try:
        response = {"status": 0, "results": [], "facets": {"tags": {}}}

        # This takes care of the list_tags endpoint
        if not query_s and facetFields and 'tags' in facetFields:
            # Classification names from Atlas can contain spaces which doesn't work with the
            # top search at the moment, so for now we return an empty list
            # classification_response = self._root.get('/v2/types/typedefs?type=classification')
            # for classification_def in classification_response['classificationDefs']:
            #   response['facets']['tags'][classification_def['name']] = 0
            return response

        query_s = (query_s.strip() if query_s else '') + '*'
        search_terms = [term for term in query_s.strip().split()] if query_s else []

        query = []
        atlas_type = 'hive_table'  # default target type when no type facet is given
        for term in search_terms:
            if ':' not in term:
                query.append(term)
            else:
                name, val = term.rstrip('*').split(':')
                # Only a recognized Navigator type facet changes the queried Atlas type
                if val and name.lower() == 'type' and self.NAV_TO_ATLAS_TYPE.get(val.lower()):
                    atlas_type = self.NAV_TO_ATLAS_TYPE.get(val.lower())

        atlas_dsl_query = 'from %s where name like \'%s\' limit %s' % (atlas_type, ' '.join(query) or '*', limit)

        atlas_response = self._root.get('/v2/search/dsl?query=%s' % atlas_dsl_query)

        # Adapt Atlas entities to Navigator structure in the results
        return self.parse_atlas_response(atlas_response)
    except RestException as e:  # fixed: 'except X, e' is Python-2-only and inconsistent with siblings
        LOG.error('Failed to search for entities with search query: %s' % atlas_dsl_query)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def search_entities(self, query_s, limit=100, offset=0, raw_query=False, **filters):
    """List databases/tables/columns via Atlas DSL, adapted to Navigator entity dicts.

    Supports two query shapes: ``type:database`` lists hive databases, and a
    ``parentPath:"/db[/table]"`` term lists the tables or columns below that path.
    Any other query returns an empty list.

    :raises CatalogAuthException: on HTTP 401 from Atlas
    :raises CatalogApiException: on any other REST failure
    """
    try:
        found_entities = []

        search_terms = [term for term in query_s.strip().split()] if query_s else []
        parentPath = None
        for term in search_terms:
            if 'parentPath:' in term:
                # maxsplit=1: the path value itself may contain ':' and must not break unpacking
                name, val = term.split(':', 1)
                # '/db/table' -> 'db.table' to match Atlas qualifiedName prefixes
                parentPath = val.strip('"').lstrip('/').replace('/', '.')

        if query_s == 'type:database':
            if get_catalog_search_cluster():
                atlas_dsl_query = 'from hive_db where qualifiedName like \'*@%s\' limit %s' % (get_catalog_search_cluster(), limit)
            else:
                atlas_dsl_query = 'from hive_db limit %s' % limit
        elif not parentPath:
            return found_entities
        else:
            # One path segment -> tables of a db; two -> columns of a table
            atlas_type = 'hive_table' if parentPath.count('.') == 0 else 'hive_column'
            if get_catalog_search_cluster():
                atlas_dsl_query = 'from %s where qualifiedName like \'%s*@%s\' limit %s' % (atlas_type, parentPath, get_catalog_search_cluster(), limit)
            else:
                atlas_dsl_query = 'from %s where qualifiedName like \'%s*\' limit %s' % (atlas_type, parentPath, limit)

        atlas_response = self._root.get('/v2/search/dsl?query=%s' % atlas_dsl_query)

        # Adapt Atlas entities to Navigator structure in the results
        if 'entities' in atlas_response:
            for atlas_entity in atlas_response['entities']:
                found_entities.append(self.adapt_atlas_entity_to_navigator(atlas_entity))

        return found_entities
    except RestException as e:
        LOG.error('Failed to search for entities with search query: %s' % atlas_dsl_query)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def fetch_single_entity(self, dsl_query):
    """Run an Atlas DSL query and return the first matching entity, Navigator-shaped.

    Example request this backs:
    hue:8889/metadata/api/navigator/find_entity?type=database&name=default

    Sample Navigator find_entity entity payload (the shape this adapts to):
    {"customProperties": null, "deleteTime": null,
     "fileSystemPath": "hdfs://nightly6x-1.vpc.cloudera.com:8020/user/hive/warehouse",
     "description": null, "params": null, "type": "DATABASE", "internalType": "hv_database",
     "sourceType": "HIVE", "tags": [], "deleted": false, "technicalProperties": null,
     "userEntity": false, "originalDescription": "Default Hive database",
     "metaClassName": "hv_database",
     "properties": {"__cloudera_internal__hueLink": "https://nightly6x-1.vpc.cloudera.com:8889/hue/metastore/tables/default"},
     "identity": "23", "firstClassParentId": null, "name": null, "extractorRunId": "7##1",
     "sourceId": "7", "packageName": "nav", "parentPath": null, "originalName": "default"}

    :param dsl_query: an Atlas DSL query string
    :raises CatalogEntityDoesNotExistException: when the query matches nothing
    :raises CatalogAuthException: on HTTP 401 from Atlas
    :raises CatalogApiException: on any other REST failure
    """
    try:
        atlas_response = self._root.get('/v2/search/dsl?query=%s' % dsl_query, headers=self.__headers, params=self.__params)

        entities = atlas_response['entities'] if 'entities' in atlas_response else []
        if len(entities) < 1:
            raise CatalogEntityDoesNotExistException('Could not find entity with query: %s' % dsl_query)

        # Adapt every hit, then hand back the first one
        adapted = [self.adapt_atlas_entity_to_navigator(entity) for entity in entities]
        return adapted[0]
    except RestException as e:
        LOG.error('Failed to search for entities with search query: %s' % dsl_query)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def get_database(self, name):
    """Fetch the hive database entity with the given name via the Atlas DSL search API.

    :param name: database name to match exactly
    :raises CatalogAuthException: on HTTP 401 from Atlas
    :raises CatalogApiException: on any other REST failure
    """
    # Search with Atlas API for hive database with specific name.
    # '+' stands in for URL-encoded spaces in the DSL query string.
    try:
        dsl_query = '+'.join(['hive_db', 'where', 'name=%s']) % name
        atlas_response = self._root.get('/v2/search/dsl?query=%s' % dsl_query, headers=self.__headers, params=self.__params)
        return self.parse_atlas_response(atlas_response)
    except RestException as e:  # fixed: 'except X, e' is Python-2-only and inconsistent with siblings
        LOG.error('Failed to search for entities with search query: %s' % dsl_query)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def get_table(self, database_name, table_name, is_view=False):
    """Fetch a hive table entity by database and table name via the Atlas DSL search API.

    :param database_name: database the table lives in
    :param table_name: table (or view) name
    :param is_view: accepted for interface compatibility; not used by the Atlas lookup
    :raises CatalogAuthException: on HTTP 401 from Atlas
    :raises CatalogApiException: on any other REST failure
    """
    # Search with Atlas API for hive tables with specific name
    # TODO: Need figure out way how to identify the cluster info for exact qualifiedName
    # or use startsWith 'db.table.column'
    try:
        qualifiedName = '%s.%s@cl1' % (database_name, table_name)
        dsl_query = '+'.join(['hive_table', 'where', 'qualifiedName=\"%s\"']) % qualifiedName
        atlas_response = self._root.get('/v2/search/dsl?query=%s' % dsl_query, headers=self.__headers, params=self.__params)
        return self.parse_atlas_response(atlas_response)
    except RestException as e:  # fixed: 'except X, e' is Python-2-only and inconsistent with siblings
        LOG.error('Failed to search for entities with search query: %s' % dsl_query)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def search_entities_interactive(self, query_s=None, limit=100, offset=0, facetFields=None, facetPrefix=None, facetRanges=None, filterQueries=None, firstClassEntitiesOnly=None, sources=None):
    """Interactive search against the Atlas basic-search API, Navigator-shaped response.

    Extracts at most one classification, one owner and one type facet from query_s
    (Atlas cannot search on multiples) and matches the remaining text against entity
    name and description.

    :raises CatalogAuthException: on HTTP 401 from Atlas
    :raises CatalogApiException: on any other REST failure
    """
    response = {
        "status": 0,
        "results": [],
        "facets": {
            "tags": {}
        }
    }

    # This takes care of the list_tags endpoint
    if not query_s and facetFields and 'tags' in facetFields:
        classification_response = self._root.get('/v2/types/typedefs?type=classification')
        for classification_def in classification_response['classificationDefs']:
            # Names containing spaces must be quoted to survive the top search syntax
            if ' ' in classification_def['name']:
                response['facets']['tags']['"' + classification_def['name'] + '"'] = -1
            else:
                response['facets']['tags'][classification_def['name']] = -1
        return response

    query_s = (query_s.strip() if query_s else '').replace('*', '')

    atlas_type = None
    classification = None
    owner = None

    # Take the first classification and type facets and ignore other as we can't search multiple in Atlas.
    classification_facets = self.CLASSIFICATION_RE.findall(query_s)
    if classification_facets:
        classification = classification_facets[0][0] or classification_facets[0][1]
        query_s = self.CLASSIFICATION_RE.sub('', query_s).strip()
        atlas_type = 'Asset'  # Filtered below to just contain hive_db, hive_table or hive_column

    owner_facets = self.OWNER_RE.findall(query_s)
    if owner_facets:
        owner = owner_facets[0]
        query_s = self.OWNER_RE.sub('', query_s).strip()

    type_facets = self.TYPE_RE.findall(query_s)
    if type_facets:
        # fixed: .get() instead of [] — an unmapped type previously raised KeyError,
        # defeating the intended fallback to the raw facet value
        atlas_type = self.NAV_TO_ATLAS_TYPE.get(type_facets[0].lower()) or type_facets[0]
        query_s = self.TYPE_RE.sub('', query_s).strip()

    data = {
        'attributes': None,
        'classification': classification,
        'entityFilters': {
            'condition': 'AND',
            'criterion': [{
                'condition': 'OR',
                'criterion': [{
                    'attributeName': 'name',
                    'attributeValue': query_s,
                    'operator': 'contains'
                }, {
                    'attributeName': 'description',
                    'attributeValue': query_s,
                    'operator': 'contains'
                }]
            }]
        },
        'excludeDeletedEntities': True,
        'includeClassificationAttributes': True,
        'includeSubClassifications': True,
        'includeSubTypes': True,
        'limit': limit,
        'offset': 0,
        'tagFilters': None,
        'termName': None,
        'typeName': atlas_type or 'hive_table'
    }

    if get_catalog_search_cluster():
        data['entityFilters']['criterion'].append({
            'attributeName': 'qualifiedName',
            'operator': 'contains',
            'attributeValue': '@' + get_catalog_search_cluster()
        })

    if owner:
        data['entityFilters']['criterion'].append({
            'attributeName': 'owner',
            'operator': 'startsWith',
            'attributeValue': owner
        })

    try:
        atlas_response = self._root.post('/v2/search/basic', data=json.dumps(data), contenttype=_JSON_CONTENT_TYPE)

        # Adapt Atlas entities to Navigator structure in the results
        if 'entities' in atlas_response:
            for atlas_entity in atlas_response['entities']:
                if atlas_type != 'Asset' or atlas_entity['typeName'].lower() in ['hive_db', 'hive_table', 'hive_column']:
                    response['results'].append(self.adapt_atlas_entity_to_navigator(atlas_entity))

        return response
    except RestException as e:
        LOG.error('Failed to search for entities with search query: %s' % data)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def search_entities_interactive(self, query_s=None, limit=100, offset=0, facetFields=None, facetPrefix=None, facetRanges=None, filterQueries=None, firstClassEntitiesOnly=None, sources=None):
    """Interactive search against the Navigator /search/basic API.

    Builds a Solr-style query body: free-text terms are boosted, facet terms
    (type:, tags:, owner:, ...) become filter queries, and anything else becomes a
    User Defined Property lookup ('up_' prefix). Facet fields are auto-suggested
    from the last query term.

    :raises CatalogAuthException: on HTTP 401 from Navigator
    :raises CatalogApiException: on any other REST failure
    """
    try:
        pagination = {
            'offset': offset,
            'limit': CATALOG.FETCH_SIZE_SEARCH_INTERACTIVE.get(),
        }

        # Known Navigator fields and their facet types, used for facet auto-completion
        f = {
            "outputFormat": {"type": "dynamic"},
            "name": {"type": "dynamic"},
            "lastModified": {"type": "date"},
            "sourceType": {"type": "dynamic"},
            "parentPath": {"type": "dynamic"},
            "lastAccessed": {"type": "date"},
            "type": {"type": "dynamic"},
            "sourceId": {"type": "dynamic"},
            "partitionColNames": {"type": "dynamic"},
            "serDeName": {"type": "dynamic"},
            "created": {"type": "date"},
            "fileSystemPath": {"type": "dynamic"},
            "compressed": {"type": "bool"},
            "clusteredByColNames": {"type": "dynamic"},
            "originalName": {"type": "dynamic"},
            "owner": {"type": "dynamic"},
            "extractorRunId": {"type": "dynamic"},
            "userEntity": {"type": "bool"},
            "sortByColNames": {"type": "dynamic"},
            "inputFormat": {"type": "dynamic"},
            "serDeLibName": {"type": "dynamic"},
            "originalDescription": {"type": "dynamic"},
            "lastModifiedBy": {"type": "dynamic"}
        }

        # fixed: list(f.keys()) — 'list + dict_keys' is a TypeError on Python 3
        auto_field_facets = ["tags", "type"] + list(f.keys())
        query_s = (query_s.strip() if query_s else '') + '*'

        last_query_term = [term for term in query_s.split()][-1]

        if last_query_term and last_query_term != '*':
            last_query_term = last_query_term.rstrip('*')
            (fname, fval) = last_query_term.split(':') if ':' in last_query_term else (last_query_term, '')
            # renamed loop variable so it no longer shadows the field dict 'f'
            auto_field_facets = [ff for ff in auto_field_facets if ff.startswith(fname)]

        facetFields = facetFields or auto_field_facets[:5]

        entity_types = []
        fq_type = []
        if filterQueries is None:
            filterQueries = []

        if sources:
            default_entity_types, entity_types = self._get_types_from_sources(sources)

            if 'sql' in sources or 'hive' in sources or 'impala' in sources:
                fq_type = default_entity_types
                filterQueries.append('sourceType:HIVE OR sourceType:IMPALA')
            elif 'hdfs' in sources:
                fq_type = entity_types
            elif 's3' in sources:
                fq_type = default_entity_types
                filterQueries.append('sourceType:s3')

            if query_s.strip().endswith('type:*'):  # To list all available types
                fq_type = entity_types

        search_terms = [term for term in query_s.strip().split()] if query_s else []
        query = []
        for term in search_terms:
            if ':' not in term:
                query.append(self._get_boosted_term(term))
            else:
                name, val = term.split(':')
                if val:  # Allow to type non default types, e.g for SQL: type:FIEL*
                    if name == 'type':  # Make sure type value still makes sense for the source
                        term = '%s:%s' % (name, val.upper())
                        fq_type = entity_types
                    if name.lower() not in ['type', 'tags', 'owner', 'originalname', 'originaldescription', 'lastmodifiedby']:
                        # User Defined Properties are prefixed with 'up_', i.e. "department:sales" -> "up_department:sales"
                        query.append('up_' + term)
                    else:
                        filterQueries.append(term)

        filterQueries.append('deleted:false')

        body = {'query': ' '.join(query) or '*'}
        if fq_type:
            filterQueries += ['{!tag=type} %s' % ' OR '.join(['type:%s' % fq for fq in fq_type])]

        source_ids = self.get_cluster_source_ids()
        if source_ids:
            body['query'] = source_ids + '(' + body['query'] + ')'

        body['facetFields'] = facetFields or []  # Currently mandatory in API
        if facetPrefix:
            body['facetPrefix'] = facetPrefix
        if facetRanges:
            body['facetRanges'] = facetRanges
        if filterQueries:
            body['filterQueries'] = filterQueries
        if firstClassEntitiesOnly:
            body['firstClassEntitiesOnly'] = firstClassEntitiesOnly

        data = json.dumps(body)
        LOG.info(data)
        # ?typeName=hive_db
        # /search/dsl?query=hive_db%20where%20name='default'
        return self._root.post('/search/basic?limit=%(limit)s&offset=%(offset)s' % pagination, data=data, contenttype=_JSON_CONTENT_TYPE, clear_cookies=True)
    except RestException as e:  # fixed: 'except X, e' is Python-2-only and inconsistent with siblings
        LOG.error('Failed to search for entities with search query: %s' % json.dumps(body))
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def search_entities(self, query_s, limit=100, offset=0, raw_query=False, **filters):
    """
    Solr edismax query parser syntax.

    :param query_s: a query string of search terms (e.g. - sales quarterly);
    Currently the search will perform an OR boolean search for all terms (split on whitespace),
    against a whitelist of search_fields.
    :param raw_query: when True, query_s is sent to Navigator verbatim
    :raises CatalogAuthException: on HTTP 401 from Navigator
    :raises CatalogApiException: on any other REST failure
    """
    sources = filters.get('sources', [])
    default_entity_types, entity_types = self._get_types_from_sources(sources)

    try:
        params = self.__params
        if not raw_query:
            # Escape Solr special characters in the user text
            query_s = query_s.replace('{', '\\{').replace('}', '\\}').replace('(', '\\(').replace(')', '\\)').replace('[', '\\[').replace(']', '\\]')

            search_terms = [term for term in query_s.strip().split()]
            query_clauses = []
            user_filters = []
            source_type_filter = []

            for term in search_terms:
                if ':' not in term:
                    if ('sql' in sources or 'hive' in sources or 'impala' in sources):
                        if '.' in term:
                            # 'db.table' -> filter on parentPath:"/db" and search 'table'
                            parent, term = term.rsplit('.', 1)
                            user_filters.append('parentPath:"/%s"' % parent.replace('.', '/'))
                    query_clauses.append(self._get_boosted_term(term))
                else:
                    name, val = term.split(':')
                    if val:
                        if name == 'type':
                            term = '%s:%s' % (name, val.upper().strip('*'))
                            default_entity_types = entity_types  # Make sure type value still makes sense for the source
                        user_filters.append(term + '*')  # Manual filter allowed e.g. type:VIE*

            filter_query = '*'

            if query_clauses:
                filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

            user_filter_clause = 'AND '.join(['(%s)' % f for f in user_filters]) or '*'
            source_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in default_entity_types])

            if 's3' in sources:
                source_type_filter.append('sourceType:s3')
            elif 'sql' in sources or 'hive' in sources or 'impala' in sources:
                source_type_filter.append('sourceType:HIVE OR sourceType:IMPALA')

            filter_query = '%s AND (%s) AND (%s)' % (filter_query, user_filter_clause, source_filter_clause)

            if source_type_filter:
                filter_query += ' AND (%s)' % 'OR '.join(source_type_filter)

            source_ids = self.get_cluster_source_ids()
            if source_ids:
                filter_query = source_ids + '(' + filter_query + ')'
        else:
            filter_query = query_s

        params += (
            ('query', filter_query),
            ('offset', offset),
            ('limit', CATALOG.FETCH_SIZE_SEARCH.get()),
        )

        LOG.info(params)

        return self._root.get('entities', headers=self.__headers, params=params)
    except RestException as e:  # fixed: 'except X, e' is Python-2-only and inconsistent with siblings
        LOG.error('Failed to search for entities with search query: %s' % query_s)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e)
def search_entities_interactive(self, query_s=None, limit=100, offset=0, facetFields=None, facetPrefix=None, facetRanges=None, filterQueries=None, firstClassEntitiesOnly=None, sources=None):
    """Interactive search against the Atlas basic-search API, Navigator-shaped response.

    Recognizes ``type:`` terms (mapped via NAV_TO_ATLAS_TYPE) and ``tag:``/``tags:``/
    ``classification:`` terms (searched as classification names); everything else is
    passed through to the Atlas free-text query.

    :raises CatalogAuthException: on HTTP 401 from Atlas
    :raises CatalogApiException: on any other REST failure
    """
    # fixed: pre-bind 'data' — it is logged in the except handler, which previously
    # raised UnboundLocalError (masking the real error) when the classification GET failed
    data = None
    try:
        response = {"status": 0, "results": [], "facets": {"tags": {}}}

        # This takes care of the list_tags endpoint
        if not query_s and facetFields and 'tags' in facetFields:
            classification_response = self._root.get('/v2/types/typedefs?type=classification')
            for classification_def in classification_response['classificationDefs']:
                response['facets']['tags'][classification_def['name']] = -1
            return response

        query_s = (query_s.strip() if query_s else '') + '*'
        search_terms = [term for term in query_s.strip().split()] if query_s else []

        query = []
        atlas_type = None
        for term in search_terms:
            if ':' not in term:
                query.append(term)
            else:
                name, val = term.rstrip('*').split(':')
                if val and name.lower() == 'type' and self.NAV_TO_ATLAS_TYPE.get(val.lower()):
                    atlas_type = self.NAV_TO_ATLAS_TYPE.get(val.lower())
                if val and name.lower() in ['tag', 'tags', 'classification']:
                    if not atlas_type:
                        atlas_type = 'Asset'  # 'Asset' contains all types of entities so we need to filter below
                    # Atlas filters by classification name on default
                    query.append(val + '*')

        data = json.dumps({
            "attributes": None,
            "classification": None,
            "entityFilters": None,
            "excludeDeletedEntities": True,
            "includeClassificationAttributes": True,
            "includeSubClassifications": True,
            "includeSubTypes": True,
            "limit": limit,
            "offset": 0,
            "query": ' '.join(query),
            "tagFilters": None,
            "termName": None,
            "typeName": atlas_type or 'hive_table'
        })
        atlas_response = self._root.post('/v2/search/basic', data=data, contenttype=_JSON_CONTENT_TYPE)

        # Adapt Atlas entities to Navigator structure in the results
        if 'entities' in atlas_response:
            for atlas_entity in atlas_response['entities']:
                if atlas_type != 'Asset' or atlas_entity['typeName'].lower() in ['hive_db', 'hive_table', 'hive_column']:
                    response['results'].append(self.adapt_atlas_entity_to_navigator(atlas_entity))

        return response
    except RestException as e:
        LOG.error('Failed to search for entities with search query: %s' % data)
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)
def search_entities_interactive(self, query_s=None, limit=100, offset=0, facetFields=None, facetPrefix=None, facetRanges=None, filterQueries=None, firstClassEntitiesOnly=None, sources=None):
    """Interactive search via the basic-search API; terms are passed through verbatim.

    In this variant the per-term boosting, User-Defined-Property prefixing and
    filterQueries handling of the Navigator implementation are disabled: the raw
    terms are joined into a single free-text query. Results are massaged with
    self._massage_entity.

    :raises CatalogAuthException: on HTTP 401 from the service
    :raises CatalogApiException: on any other REST failure
    """
    try:
        query_data = {
            "excludeDeletedEntities": True,
            "includeSubClassifications": True,
            "includeSubTypes": True,
            "includeClassificationAttributes": True,
            "entityFilters": None,
            "tagFilters": None,
            "attributes": None,
            "query": "*",
            "limit": CATALOG.FETCH_SIZE_SEARCH_INTERACTIVE.get(),
            "offset": offset,
            "typeName": None,
            "classification": None,
            "termName": None
        }

        # Known fields and their facet types, used for facet auto-completion
        f = {
            "outputFormat": {"type": "dynamic"},
            "name": {"type": "dynamic"},
            "lastModified": {"type": "date"},
            "sourceType": {"type": "dynamic"},
            "parentPath": {"type": "dynamic"},
            "lastAccessed": {"type": "date"},
            "type": {"type": "dynamic"},
            "sourceId": {"type": "dynamic"},
            "partitionColNames": {"type": "dynamic"},
            "serDeName": {"type": "dynamic"},
            "created": {"type": "date"},
            "fileSystemPath": {"type": "dynamic"},
            "compressed": {"type": "bool"},
            "clusteredByColNames": {"type": "dynamic"},
            "originalName": {"type": "dynamic"},
            "owner": {"type": "dynamic"},
            "extractorRunId": {"type": "dynamic"},
            "userEntity": {"type": "bool"},
            "sortByColNames": {"type": "dynamic"},
            "inputFormat": {"type": "dynamic"},
            "serDeLibName": {"type": "dynamic"},
            "originalDescription": {"type": "dynamic"},
            "lastModifiedBy": {"type": "dynamic"}
        }

        # fixed: list(f.keys()) — 'list + dict_keys' is a TypeError on Python 3
        auto_field_facets = ["tags", "type"] + list(f.keys())
        query_s = (query_s.strip() if query_s else '') + '*'

        last_query_term = [term for term in query_s.split()][-1]

        if last_query_term and last_query_term != '*':
            last_query_term = last_query_term.rstrip('*')
            (fname, fval) = last_query_term.split(':') if ':' in last_query_term else (last_query_term, '')
            # renamed loop variable so it no longer shadows the field dict 'f'
            auto_field_facets = [ff for ff in auto_field_facets if ff.startswith(fname)]

        facetFields = facetFields or auto_field_facets[:5]

        entity_types = []
        fq_type = []
        if filterQueries is None:
            filterQueries = []

        if sources:
            default_entity_types, entity_types = self._get_types_from_sources(sources)

            if 'sql' in sources or 'hive' in sources or 'impala' in sources:
                fq_type = default_entity_types
                filterQueries.append('sourceType:HIVE OR sourceType:IMPALA')
            elif 'hdfs' in sources:
                fq_type = entity_types
            elif 's3' in sources:
                fq_type = default_entity_types
                filterQueries.append('sourceType:s3')

        if query_s.strip().endswith('type:*'):  # To list all available types
            fq_type = entity_types

        search_terms = [term for term in query_s.strip().split()] if query_s else []
        query = []
        for term in search_terms:
            # NOTE: per-term boosting, 'up_' User Defined Property prefixing and the
            # type/tags/owner filterQueries of the Navigator variant are intentionally
            # disabled here; every term is passed through verbatim.
            query.append(term)

        query_data['query'] = ' '.join(query) or '*'

        body = {}  # facet/filter options are not sent in this variant; kept for error logging

        data = json.dumps(query_data)
        LOG.info(data)

        response = self._root.post('/search/basic', data=data, contenttype=_JSON_CONTENT_TYPE)
        response['results'] = [self._massage_entity(entity) for entity in response.pop('entities', [])]

        return response
    except RestException as e:  # fixed: Python-2-only 'except X, e'; dropped stray debug print(e)
        LOG.error('Failed to search for entities with search query: %s' % json.dumps(body))
        if e.code == 401:
            raise CatalogAuthException(_('Failed to authenticate.'))
        else:
            raise CatalogApiException(e.message)