Example #1
  def __init__(self, user=None):
    self._api_url = '%s/%s' % (NAVIGATOR.API_URL.get().strip('/'), VERSION)
    self._username = get_navigator_auth_username()
    self._password = get_navigator_auth_password()

    self.user = user
    # Navigator does not support Kerberos authentication, while other components usually require it
    self._client = UnsecureHttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client, urlencode=False) # For search_entities_interactive

    self.__headers = {}
    self.__params = ()
Example #2
  def __init__(self, user=None):
    super(AtlasApi, self).__init__(user)

    self._api_url = CATALOG.API_URL.get().strip('/')
    self._username = CATALOG.SERVER_USER.get()
    self._password = get_catalog_auth_password()

    # Navigator does not support Kerberos authentication, while other components usually require it
    self._client = UnsecureHttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client, urlencode=False) # For search_entities_interactive

    self.__headers = {}
    self.__params = ()
Example #3
class AtlasApi(Api):
  """
  https://atlas.apache.org
  """
  DEFAULT_SEARCH_FIELDS = (('originalName', 3), ('originalDescription', 1), ('name', 10), ('description', 3), ('tags', 5))
  CATALOG_NAMESPACE = '__cloudera_internal_catalog_hue'

  NAV_TO_ATLAS_TYPE = {
    'table': 'hive_table',
    'database': 'hive_db',
    'field': 'hive_column'
  }

  ATLAS_TO_NAV_TYPE = {
    'hive_table': 'TABLE',
    'hive_db': 'DATABASE',
    'hive_column': 'FIELD'
  }

  CLASSIFICATION_RE = re.compile(r'(?:tag|tags|classification)\s*\:\s*(?:(?:\"([^"]+)\")|([^ ]+))\s*', re.IGNORECASE)
  TYPE_RE = re.compile(r'type\s*\:\s*([^ ]+)\s*', re.IGNORECASE)
  OWNER_RE = re.compile(r'owner\s*\:\s*([^ ]+)\s*', re.IGNORECASE)

  def __init__(self, user=None):
    super(AtlasApi, self).__init__(user)

    self._api_url = CATALOG.API_URL.get().strip('/')
    self._username = CATALOG.SERVER_USER.get()
    self._password = get_catalog_auth_password()

    # Navigator does not support Kerberos authentication, while other components usually require it
    self._client = UnsecureHttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client, urlencode=False) # For search_entities_interactive

    self.__headers = {}
    self.__params = ()

    #self._fillup_properties() # Disabled currently


  def _get_types_from_sources(self, sources):
    default_entity_types = entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'VIEW', 'S3BUCKET', 'OPERATION', 'DIRECTORY')

    if 'sql' in sources or 'hive' in sources or 'impala' in sources:
      entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
      default_entity_types = ('TABLE', 'VIEW')
    elif 'hdfs' in sources:
      entity_types = ('FILE', 'DIRECTORY')
      default_entity_types  = ('FILE', 'DIRECTORY')
    elif 's3' in sources:
      entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
      default_entity_types  = ('DIRECTORY', 'S3BUCKET')

    return default_entity_types, entity_types
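
  # For instance (assumed call): _get_types_from_sources(['hive']) would return
  # (('TABLE', 'VIEW'), ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')),
  # i.e. (default_entity_types, entity_types) narrowed to SQL sources.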

  def adapt_atlas_entity_to_navigator(self, atlas_entity):
    nav_entity = {
      "created": 'createTime' in atlas_entity['attributes'] and atlas_entity['attributes']['createTime'],
      "customProperties": None,
      "description": atlas_entity['attributes'].get('description'),
      "identity": atlas_entity['guid'],
      "internalType": atlas_entity['typeName'],
      "meaningNames": atlas_entity['meaningNames'], # Atlas specific
      "meanings": atlas_entity['meanings'], # Atlas specific
      "name": atlas_entity['attributes'].get('name'),
      "original_name": atlas_entity['attributes'].get('name'),
      "originalDescription": None,
      "originalName": atlas_entity['attributes'].get('name'),
      "owner": atlas_entity['attributes'].get('owner'),
      "parentPath": '', # Set below
      "properties": {}, # Set below
      "sourceType": '', # Set below
      "tags": atlas_entity['classificationNames'],
      "type": self.ATLAS_TO_NAV_TYPE.get(atlas_entity['typeName'].lower()) or atlas_entity['typeName']
    }

    # Convert Atlas qualified name of form db.tbl.col@cluster to parentPath of form /db/tbl
    if atlas_entity['typeName'].lower().startswith('hive_'):
      nav_entity['sourceType'] = 'HIVE'
      qualified_path_parts = re.sub(r'@.*$', '', atlas_entity['attributes'].get('qualifiedName')).split('.')
      qualified_path_parts.pop()  # it's just the parent path we want, so remove the entity name
      nav_entity['parentPath'] = '/' + '/'.join(qualified_path_parts)

    if 'classifications' in atlas_entity:
      for atlas_classification in atlas_entity['classifications']:
        if 'attributes' in atlas_classification:
          for key, value in atlas_classification['attributes'].items():
            nav_entity['properties'][key] = value

    return nav_entity

  def fetch_single_entity(self, dsl_query):
    '''
    REQUEST: hue:8889/metadata/api/navigator/find_entity?type=database&name=default
    SAMPLE response for Navigator find_entity response
    {"status": 0, "entity": {
    "customProperties": null,
    "deleteTime": null,
     "fileSystemPath": "hdfs://nightly6x-1.vpc.cloudera.com:8020/user/hive/warehouse",
     "description": null,
     "params": null,
      "type": "DATABASE",
      "internalType": "hv_database",
      "sourceType": "HIVE",
      "tags": [],
      "deleted": false, "technicalProperties": null,
      "userEntity": false,
      "originalDescription": "Default Hive database",
      "metaClassName": "hv_database",
      "properties": {"__cloudera_internal__hueLink": "https://nightly6x-1.vpc.cloudera.com:8889/hue/metastore/tables/default"},
      "identity": "23",
      "firstClassParentId": null,
      "name": null,
      "extractorRunId": "7##1",
      "sourceId": "7",
       "packageName": "nav",
       "parentPath": null, "originalName": "default"}}
    '''
    response = {
      "status": 0,
      "entity": []
    }

    try:
      atlas_response = self._root.get('/v2/search/dsl?query=%s' % dsl_query, headers=self.__headers,
                                      params=self.__params)
      if 'entities' not in atlas_response or len(atlas_response['entities']) < 1:
        raise CatalogEntityDoesNotExistException('Could not find entity with query: %s' % dsl_query)

      for atlas_entity in atlas_response['entities']:
        response['entity'].append(self.adapt_atlas_entity_to_navigator(atlas_entity))

      return response['entity'][0]
    except RestException as e:
      LOG.error('Failed to search for entities with search query: %s' % dsl_query)
      if e.code == 401:
        raise CatalogAuthException(_('Failed to authenticate.'))
      else:
        raise CatalogApiException(e.message)

  def get_database(self, name):
    # Search with Atlas API for hive database with specific name
    if get_catalog_search_cluster():
      qualifiedNameCriteria = 'qualifiedName=\'%s@%s\'' % (name, get_catalog_search_cluster())
    else:
      qualifiedNameCriteria = 'qualifiedName like \'%s@*\'' % name

    return self.fetch_single_entity('hive_db where %s' % qualifiedNameCriteria)

  def get_table(self, database_name, table_name, is_view=False):
    # Search with Atlas API for hive tables with specific name
    if get_catalog_search_cluster():
      qualifiedNameCriteria = 'qualifiedName=\'%s.%s@%s\'' % (database_name, table_name, get_catalog_search_cluster())
    else:
      qualifiedNameCriteria = 'qualifiedName like \'%s.%s@*\'' % (database_name, table_name)

    return self.fetch_single_entity('hive_table where %s' % qualifiedNameCriteria)

  def get_field(self, database_name, table_name, field_name):
    # Search with Atlas API for hive tables with specific qualified name
    if get_catalog_search_cluster():
      qualifiedNameCriteria = 'qualifiedName=\'%s.%s.%s@%s\'' % (database_name, table_name, field_name,
                                                                 get_catalog_search_cluster())
    else:
      qualifiedNameCriteria = 'qualifiedName like \'%s.%s.%s@*\'' % (database_name, table_name, field_name)

    return self.fetch_single_entity('hive_column where %s' % qualifiedNameCriteria)
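
  # For instance (assumed names): with a configured search cluster 'cl1',
  # get_field('default', 'web_logs', 'ip') issues the DSL query
  #   hive_column where qualifiedName='default.web_logs.ip@cl1'
  # and, without a cluster, falls back to
  #   hive_column where qualifiedName like 'default.web_logs.ip@*'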

  def search_entities_interactive(self, query_s=None, limit=100, offset=0, facetFields=None, facetPrefix=None, facetRanges=None, filterQueries=None, firstClassEntitiesOnly=None, sources=None):
    response = {
      "status": 0,
      "results": [],
      "facets": {
        "tags": {}
      }
    }

    # This takes care of the list_tags endpoint
    if not query_s and facetFields and 'tags' in facetFields:
      classification_response = self._root.get('/v2/types/typedefs?type=classification')
      for classification_def in classification_response['classificationDefs']:
        if ' ' in classification_def['name']:
          response['facets']['tags']['"' + classification_def['name'] + '"'] = -1
        else:
          response['facets']['tags'][classification_def['name']] = -1
      return response

    query_s = (query_s.strip() if query_s else '').replace('*', '')

    atlas_type = None
    classification = None
    owner = None

    # Take the first classification and type facets and ignore the others, as Atlas can't search on multiple.
    classification_facets = self.CLASSIFICATION_RE.findall(query_s)
    if classification_facets:
      classification = classification_facets[0][0] or classification_facets[0][1]
      query_s = self.CLASSIFICATION_RE.sub('', query_s).strip()
      atlas_type = 'Asset'  # Filtered below to just contain hive_db, hive_table or hive_column

    owner_facets = self.OWNER_RE.findall(query_s)
    if owner_facets:
      owner = owner_facets[0]
      query_s = self.OWNER_RE.sub('', query_s).strip()

    type_facets = self.TYPE_RE.findall(query_s)
    if type_facets:
      atlas_type = self.NAV_TO_ATLAS_TYPE[type_facets[0].lower()] or type_facets[0]
      query_s = self.TYPE_RE.sub('', query_s).strip()
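
    # For instance (assumed query): with query_s = 'tags:"pii data" type:table sales',
    # the regexes above yield classification = 'pii data' and atlas_type = 'hive_table',
    # leaving query_s = 'sales' for the name/description criteria built below.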

    data = {
      'attributes': None,
      'classification': classification,
      'entityFilters': {
        'condition': 'AND',
        'criterion': [{
          'condition': 'OR',
          'criterion': [{
            'attributeName': 'name',
            'attributeValue': query_s,
            'operator': 'contains'
          }, {
            'attributeName': 'description',
            'attributeValue': query_s,
            'operator': 'contains'
          }]
        }]
      },
      'excludeDeletedEntities': True,
      'includeClassificationAttributes': True,
      'includeSubClassifications': True,
      'includeSubTypes': True,
      'limit': limit,
      'offset': 0,
      'tagFilters': None,
      'termName': None,
      'typeName': atlas_type or 'hive_table'
    }

    if get_catalog_search_cluster():
      data['entityFilters']['criterion'].append({
        'attributeName': 'qualifiedName',
        'operator': 'contains',
        'attributeValue': '@' + get_catalog_search_cluster()
      })

    if owner:
      data['entityFilters']['criterion'].append({
        'attributeName': 'owner',
        'operator': 'startsWith',
        'attributeValue': owner
      })

    try:
      atlas_response = self._root.post('/v2/search/basic', data=json.dumps(data), contenttype=_JSON_CONTENT_TYPE)

      # Adapt Atlas entities to Navigator structure in the results
      if 'entities' in atlas_response:
        for atlas_entity in atlas_response['entities']:
          if atlas_type != 'Asset' or atlas_entity['typeName'].lower() in ['hive_db', 'hive_table', 'hive_column']:
            response['results'].append(self.adapt_atlas_entity_to_navigator(atlas_entity))

      return response
    except RestException as e:
      LOG.error('Failed to search for entities with search query: %s' % data)
      if e.code == 401:
        raise CatalogAuthException(_('Failed to authenticate.'))
      else:
        raise CatalogApiException(e.message)

  # search_entities is only used by the table browser to fetch child entities of a given table or database.
  def search_entities(self, query_s, limit=100, offset=0, raw_query=False, **filters):
    try:
      found_entities = []

      search_terms = [term for term in query_s.strip().split()] if query_s else []
      parentPath = None
      for term in search_terms:
        if 'parentPath:' in term:
          name, val = term.split(':')
          parentPath = val.strip('"').lstrip('/').replace('/', '.')
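          # For instance (assumed term): 'parentPath:"/default/web_logs"' becomes
          # parentPath = 'default.web_logs', which selects hive_column below.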

      if query_s == 'type:database':
        if get_catalog_search_cluster():
          atlas_dsl_query = 'from hive_db where qualifiedName like \'*@%s\' limit %s' % (
            get_catalog_search_cluster(),
            limit
          )
        else:
          atlas_dsl_query = 'from hive_db limit %s' % limit
      elif not parentPath:
        return found_entities
      else:
        atlas_type = 'hive_table' if parentPath.count('.') == 0 else 'hive_column'
        if get_catalog_search_cluster():
          atlas_dsl_query = 'from %s where qualifiedName like \'%s*@%s\' limit %s' % (
            atlas_type,
            parentPath,
            get_catalog_search_cluster(),
            limit
          )
        else:
          atlas_dsl_query = 'from %s where qualifiedName like \'%s*\' limit %s' % (atlas_type, parentPath, limit)

      atlas_response = self._root.get('/v2/search/dsl?query=%s' % atlas_dsl_query)

      # Adapt Atlas entities to Navigator structure in the results
      if 'entities' in atlas_response:
        for atlas_entity in atlas_response['entities']:
          found_entities.append(self.adapt_atlas_entity_to_navigator(atlas_entity))

      return found_entities
    except RestException as e:
      LOG.error('Failed to search for entities with search query: %s' % atlas_dsl_query)
      if e.code == 401:
        raise CatalogAuthException(_('Failed to authenticate.'))
      else:
        raise CatalogApiException(e.message)

  def suggest(self, prefix=None):
    try:
      return self._root.get('interactive/suggestions?query=%s' % (prefix or '*'))
    except RestException as e:
      msg = 'Failed to search for entities with search query: %s' % prefix
      LOG.error(msg)
      raise CatalogApiException(e.message)

  def get_entity(self, entity_id):
    """
    # TODO: get entity by Atlas __guid or qualifiedName
    GET /v2/search/dsl?query=?
    """
    try:
      return self._root.get('entities/%s' % entity_id, headers=self.__headers, params=self.__params)
    except RestException as e:
      msg = 'Failed to get entity %s: %s' % (entity_id, str(e))
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def update_entity(self, entity, **metadata):
    """
    PUT /api/v3/entities/:id
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities_-id-.html
    """
    try:
      # Works around NAV-6187: if we don't re-send these fields, they would get erased.
      properties = {
        'name': entity['name'],
        'description': entity['description'],
        'properties': entity['properties'] or {},
        'customProperties': entity['customProperties'] or {}
      }
      properties.update(metadata)
      data = json.dumps(properties)

      return self._root.put('entities/%(identity)s' % entity, params=self.__params, data=data, contenttype=_JSON_CONTENT_TYPE, allow_redirects=True, clear_cookies=True)
    except RestException as e:
      msg = 'Failed to update entity %s: %s' % (entity['identity'], e)
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def get_cluster_source_ids(self):
    return []
    # params = (
    #   ('query', 'clusterName:"%s"' % get_navigator_hue_server_name()),
    #   ('limit', 200),
    # )

    # LOG.info(params)
    # return self._root.get('entities', headers=self.__headers, params=params)


  def add_tags(self, entity_id, tags):
    entity = self.get_entity(entity_id)
    new_tags = entity['tags'] or []
    new_tags.extend(tags)
    return self.update_entity(entity, tags=new_tags)
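
  # add_tags and delete_tags follow a read-modify-write pattern: fetch the entity,
  # adjust its tag list, then push it back through update_entity. For instance
  # (hypothetical identity): add_tags('23', ['pii']) appends 'pii' to entity 23's tags.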


  def delete_tags(self, entity_id, tags):
    entity = self.get_entity(entity_id)
    new_tags = entity['tags'] or []
    for tag in tags:
      if tag in new_tags:
        new_tags.remove(tag)
    return self.update_entity(entity, tags=new_tags)


  def update_properties(self, entity_id, properties, modified_custom_metadata=None, deleted_custom_metadata_keys=None):
    entity = self.get_entity(entity_id)

    if modified_custom_metadata:
      properties['properties'] = entity['properties'] or {}
      properties['properties'].update(modified_custom_metadata)
    if deleted_custom_metadata_keys:
      properties['properties'] = entity['properties'] or {}
      for key in deleted_custom_metadata_keys:
        if key in properties['properties']:
          del properties['properties'][key]
    return self.update_entity(entity, **properties)


  def delete_metadata_properties(self, entity_id, property_keys):
    entity = self.get_entity(entity_id)
    new_props = entity['properties'] or {}
    for key in property_keys:
      if key in new_props:
        del new_props[key]
    return self.update_entity(entity, properties=new_props)


  def get_lineage(self, entity_id):
    """
    GET /api/v3/lineage/entityIds=:id
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_lineage.html
    """
    try:
      params = self.__params

      params += (
        ('entityIds', entity_id),
      )

      return self._root.get('lineage', headers=self.__headers, params=params)
    except RestException as e:
      msg = 'Failed to get lineage for entity ID %s: %s' % (entity_id, str(e))
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def create_namespace(self, namespace, description=None):
    try:
      data = json.dumps({'name': namespace, 'description': description})
      return self._root.post('models/namespaces/', data=data, contenttype=_JSON_CONTENT_TYPE, clear_cookies=True)
    except RestException as e:
      msg = 'Failed to create namespace: %s' % namespace
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def get_namespace(self, namespace):
    try:
      return self._root.get('models/namespaces/%(namespace)s' % {'namespace': namespace})
    except RestException as e:
      msg = 'Failed to get namespace: %s' % namespace
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def create_namespace_property(self, namespace, properties):
    try:
      data = json.dumps(properties)
      return self._root.post('models/namespaces/%(namespace)s/properties' % {'namespace': namespace}, data=data, contenttype=_JSON_CONTENT_TYPE, clear_cookies=True)
    except RestException as e:
      msg = 'Failed to create namespace %s property' % namespace
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def get_namespace_properties(self, namespace):
    try:
      return self._root.get('models/namespaces/%(namespace)s/properties' % {'namespace': namespace})
    except RestException as e:
      msg = 'Failed to get namespace %s properties' % namespace
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def map_namespace_property(self, clazz, properties):
    try:
      data = json.dumps(properties)
      return self._root.post('models/packages/nav/classes/%(class)s/properties' % {'class': clazz}, data=data, contenttype=_JSON_CONTENT_TYPE, clear_cookies=True)
    except RestException as e:
      msg = 'Failed to map class %s property' % clazz
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def get_model_properties_mapping(self):
    try:
      return self._root.get('models/properties/mappings')
    except RestException as e:
      msg = 'Failed to get models properties mappings'
      LOG.error(msg)
      raise CatalogApiException(e.message)


  def _fillup_properties(self):
    global _HAS_CATALOG_NAMESPACE

    if _HAS_CATALOG_NAMESPACE is None:
      response = self.get_namespace(namespace=AtlasApi.CATALOG_NAMESPACE)
      if not response:
        self.create_namespace(namespace=AtlasApi.CATALOG_NAMESPACE, description="Set of fields to augment the data catalog")

      properties = self.get_namespace_properties(namespace=AtlasApi.CATALOG_NAMESPACE)

      if not [_property for _property in properties if _property['name'] == 'relatedDocuments']:
        self.create_namespace_property(namespace=AtlasApi.CATALOG_NAMESPACE, properties={
          "name": "relatedDocuments",
          "displayName": "Related documents",
          "description": "List of Hue document UUIDs related to this entity",
          "multiValued": True,
          "maxLength": 36,
          "pattern": ".*", # UUID
          "enumValues": None,
          "type": "TEXT"
         })

        # Might want to check if the mapping is already done
        for clazz in ('hv_table', 'hv_view'):
          self.map_namespace_property(clazz, properties=[{
             "namespace": AtlasApi.CATALOG_NAMESPACE,
             "name": "relatedDocuments"
          }])

      _HAS_CATALOG_NAMESPACE = True


  def _get_boosted_term(self, term):
    return 'AND'.join([
      '(%s)' % 'OR'.join(['(%s:%s*^%s)' % (field, term, weight) for (field, weight) in AtlasApi.DEFAULT_SEARCH_FIELDS]),  # Matching fields
      '(%s)' % 'OR'.join(['(%s:[* TO *])' % field for (field, weight) in AtlasApi.DEFAULT_SEARCH_FIELDS]) # Boost entities with enriched fields
      # Could add certain customProperties and properties
    ])

  def _clean_path(self, path):
    return path.rstrip('/').split('/')[-1], self._escape_slashes(path.rstrip('/'))


  def _escape_slashes(self, s):
    return s.replace('/', '\\/')
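
The parentPath derivation in adapt_atlas_entity_to_navigator above is easy to check in isolation. A minimal sketch, assuming a made-up hive_column qualifiedName (only the re module is needed):

import re

# Hypothetical Atlas qualifiedName for a column: db.table.column@cluster
qualified_name = 'default.web_logs.ip@cl1'

# Same steps as the example: strip the '@cluster' suffix, split on '.',
# drop the entity's own name, then rebuild a '/'-separated parent path.
parts = re.sub(r'@.*$', '', qualified_name).split('.')
parts.pop()
print('/' + '/'.join(parts))  # -> /default/web_logs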
Example #4
class AtlasApi(Api):
    """
  https://atlas.apache.org
  """
    DEFAULT_SEARCH_FIELDS = (('originalName', 3), ('originalDescription', 1),
                             ('name', 10), ('description', 3), ('tags', 5))
    CATALOG_NAMESPACE = '__cloudera_internal_catalog_hue'

    def __init__(self, user=None):
        super(AtlasApi, self).__init__(user)

        self._api_url = CATALOG.API_URL.get().strip('/')
        self._username = get_catalog_auth_username()
        self._password = get_catalog_auth_password()

        # Navigator does not support Kerberos authentication, while other components usually require it
        self._client = UnsecureHttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(
            self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

        #self._fillup_properties() # Disabled currently

    def _get_types_from_sources(self, sources):
        default_entity_types = entity_types = ('DATABASE', 'TABLE',
                                               'PARTITION', 'FIELD', 'FILE',
                                               'VIEW', 'S3BUCKET', 'OPERATION',
                                               'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def search_entities_interactive(self,
                                    query_s=None,
                                    limit=100,
                                    offset=0,
                                    facetFields=None,
                                    facetPrefix=None,
                                    facetRanges=None,
                                    filterQueries=None,
                                    firstClassEntitiesOnly=None,
                                    sources=None):
        try:
            pagination = {
                'offset': offset,
                'limit': CATALOG.FETCH_SIZE_SEARCH_INTERACTIVE.get(),
            }

            f = {
                "outputFormat": {
                    "type": "dynamic"
                },
                "name": {
                    "type": "dynamic"
                },
                "lastModified": {
                    "type": "date"
                },
                "sourceType": {
                    "type": "dynamic"
                },
                "parentPath": {
                    "type": "dynamic"
                },
                "lastAccessed": {
                    "type": "date"
                },
                "type": {
                    "type": "dynamic"
                },
                "sourceId": {
                    "type": "dynamic"
                },
                "partitionColNames": {
                    "type": "dynamic"
                },
                "serDeName": {
                    "type": "dynamic"
                },
                "created": {
                    "type": "date"
                },
                "fileSystemPath": {
                    "type": "dynamic"
                },
                "compressed": {
                    "type": "bool"
                },
                "clusteredByColNames": {
                    "type": "dynamic"
                },
                "originalName": {
                    "type": "dynamic"
                },
                "owner": {
                    "type": "dynamic"
                },
                "extractorRunId": {
                    "type": "dynamic"
                },
                "userEntity": {
                    "type": "bool"
                },
                "sortByColNames": {
                    "type": "dynamic"
                },
                "inputFormat": {
                    "type": "dynamic"
                },
                "serDeLibName": {
                    "type": "dynamic"
                },
                "originalDescription": {
                    "type": "dynamic"
                },
                "lastModifiedBy": {
                    "type": "dynamic"
                }
            }

            auto_field_facets = ["tags", "type"] + list(f.keys())
            query_s = (query_s.strip() if query_s else '') + '*'

            last_query_term = [term for term in query_s.split()][-1]

            if last_query_term and last_query_term != '*':
                last_query_term = last_query_term.rstrip('*')
                (fname, fval) = last_query_term.split(
                    ':') if ':' in last_query_term else (last_query_term, '')
                auto_field_facets = [
                    f for f in auto_field_facets if f.startswith(fname)
                ]

            facetFields = facetFields or auto_field_facets[:5]

            entity_types = []
            fq_type = []
            if filterQueries is None:
                filterQueries = []

            if sources:
                default_entity_types, entity_types = self._get_types_from_sources(
                    sources)

                if 'sql' in sources or 'hive' in sources or 'impala' in sources:
                    fq_type = default_entity_types
                    filterQueries.append(
                        'sourceType:HIVE OR sourceType:IMPALA')
                elif 'hdfs' in sources:
                    fq_type = entity_types
                elif 's3' in sources:
                    fq_type = default_entity_types
                    filterQueries.append('sourceType:s3')

                if query_s.strip().endswith(
                        'type:*'):  # To list all available types
                    fq_type = entity_types

            search_terms = [term for term in query_s.strip().split()
                            ] if query_s else []
            query = []
            for term in search_terms:
                if ':' not in term:
                    query.append(self._get_boosted_term(term))
                else:
                    name, val = term.split(':')
                    if val:  # Allow typing non-default types, e.g. for SQL: type:FIEL*
                        if name == 'type':  # Make sure type value still makes sense for the source
                            term = '%s:%s' % (name, val.upper())
                            fq_type = entity_types
                        if name.lower() not in [
                                'type', 'tags', 'owner', 'originalname',
                                'originaldescription', 'lastmodifiedby'
                        ]:
                            # User Defined Properties are prefixed with 'up_', i.e. "department:sales" -> "up_department:sales"
                            query.append('up_' + term)
                        else:
                            filterQueries.append(term)

            filterQueries.append('deleted:false')

            body = {'query': ' '.join(query) or '*'}
            if fq_type:
                filterQueries += [
                    '{!tag=type} %s' %
                    ' OR '.join(['type:%s' % fq for fq in fq_type])
                ]

            source_ids = self.get_cluster_source_ids()
            if source_ids:
                body['query'] = source_ids + '(' + body['query'] + ')'

            body['facetFields'] = facetFields or [
            ]  # Currently mandatory in API
            if facetPrefix:
                body['facetPrefix'] = facetPrefix
            if facetRanges:
                body['facetRanges'] = facetRanges
            if filterQueries:
                body['filterQueries'] = filterQueries
            if firstClassEntitiesOnly:
                body['firstClassEntitiesOnly'] = firstClassEntitiesOnly

            data = json.dumps(body)
            LOG.info(data)
            # ?typeName=hive_db
            # /search/dsl?query=hive_db%20where%20name='default'
            return self._root.post(
                '/search/basic?limit=%(limit)s&offset=%(offset)s' % pagination,
                data=data,
                contenttype=_JSON_CONTENT_TYPE,
                clear_cookies=True)
        except RestException as e:
            LOG.error('Failed to search for entities with search query: %s' %
                      json.dumps(body))
            if e.code == 401:
                raise CatalogAuthException(_('Failed to authenticate.'))
            else:
                raise CatalogApiException(e.message)
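
search_entities_interactive above feeds each free-text term through _get_boosted_term (defined in Example #3), which expands it into a weighted Solr clause over DEFAULT_SEARCH_FIELDS. A minimal sketch of that expansion, assuming the term 'sales':

DEFAULT_SEARCH_FIELDS = (('originalName', 3), ('originalDescription', 1),
                         ('name', 10), ('description', 3), ('tags', 5))

def get_boosted_term(term):
    # Same construction as AtlasApi._get_boosted_term: boost per-field prefix matches,
    # then prefer entities whose enriched fields are populated at all.
    return 'AND'.join([
        '(%s)' % 'OR'.join(['(%s:%s*^%s)' % (field, term, weight) for field, weight in DEFAULT_SEARCH_FIELDS]),
        '(%s)' % 'OR'.join(['(%s:[* TO *])' % field for field, _ in DEFAULT_SEARCH_FIELDS]),
    ])

print(get_boosted_term('sales'))
# ((originalName:sales*^3)OR(originalDescription:sales*^1)OR(name:sales*^10)OR(description:sales*^3)OR(tags:sales*^5))AND
# ((originalName:[* TO *])OR(originalDescription:[* TO *])OR(name:[* TO *])OR(description:[* TO *])OR(tags:[* TO *]))
# (one string; wrapped across two comment lines here for readability)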
Example #5
class AtlasApi(Api):
    """
  https://atlas.apache.org
  """
    DEFAULT_SEARCH_FIELDS = (('originalName', 3), ('originalDescription', 1),
                             ('name', 10), ('description', 3), ('tags', 5))
    CATALOG_NAMESPACE = '__cloudera_internal_catalog_hue'

    NAV_TO_ATLAS_TYPE = {
        'table': 'hive_table',
        'database': 'hive_db',
        'field': 'hive_column'
    }

    ATLAS_TO_NAV_TYPE = {
        'hive_table': 'TABLE',
        'hive_db': 'DATABASE',
        'hive_column': 'FIELD'
    }

    def __init__(self, user=None):
        super(AtlasApi, self).__init__(user)

        self._api_url = CATALOG.API_URL.get().strip('/')
        self._username = CATALOG.SERVER_USER.get()
        self._password = get_catalog_auth_password()

        # Navigator does not support Kerberos authentication, while other components usually require it
        self._client = UnsecureHttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(
            self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

        #self._fillup_properties() # Disabled currently

    def _get_types_from_sources(self, sources):
        default_entity_types = entity_types = ('DATABASE', 'TABLE',
                                               'PARTITION', 'FIELD', 'FILE',
                                               'VIEW', 'S3BUCKET', 'OPERATION',
                                               'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def adapt_atlas_entity_to_navigator(self, atlas_entity):
        nav_entity = {
            "created":
            'createTime' in atlas_entity['attributes']
            and atlas_entity['attributes']['createTime'],
            "customProperties":
            None,
            "description":
            atlas_entity['attributes'].get('description'),
            "identity":
            atlas_entity['guid'],
            "internalType":
            atlas_entity['typeName'],
            "meaningNames":
            atlas_entity['meaningNames'],  # Atlas specific
            "meanings":
            atlas_entity['meanings'],  # Atlas specific
            "name":
            atlas_entity['attributes'].get('name'),
            "original_name":
            atlas_entity['attributes'].get('name'),
            "originalDescription":
            None,
            "originalName":
            atlas_entity['attributes'].get('name'),
            "owner":
            atlas_entity.get('owner'),
            "parentPath":
            '',  # Set below
            "properties": {},  # Set below
            "sourceType":
            '',  # Set below
            "tags":
            atlas_entity['classificationNames'],
            "type":
            self.ATLAS_TO_NAV_TYPE.get(atlas_entity['typeName'].lower())
            or atlas_entity['typeName']
        }

        # Convert Atlas qualified name of form db.tbl.col@cluster to parentPath of form /db/tbl
        if atlas_entity['typeName'].lower().startswith('hive_'):
            nav_entity['sourceType'] = 'HIVE'
            qualified_path_parts = re.sub(
                r'@.*$', '',
                atlas_entity['attributes'].get('qualifiedName')).split('.')
            qualified_path_parts.pop(
            )  # it's just the parent path we want so remove the entity name
            nav_entity['parentPath'] = '/' + '/'.join(qualified_path_parts)

        if 'classifications' in atlas_entity:
            for atlas_classification in atlas_entity['classifications']:
                if 'attributes' in atlas_classification:
                    for key, value in atlas_classification['attributes'].items():
                        nav_entity['properties'][key] = value

        return nav_entity

    def parse_atlas_response(self, atlas_response):
        '''
    REQUEST: hue:8889/metadata/api/navigator/find_entity?type=database&name=default
    SAMPLE response for Navigator find_entity response
    {"status": 0, "entity": {
    "customProperties": null,
    "deleteTime": null,
     "fileSystemPath": "hdfs://nightly6x-1.vpc.cloudera.com:8020/user/hive/warehouse",
     "description": null,
     "params": null,
      "type": "DATABASE",
      "internalType": "hv_database",
      "sourceType": "HIVE",
      "tags": [],
      "deleted": false, "technicalProperties": null,
      "userEntity": false,
      "originalDescription": "Default Hive database",
      "metaClassName": "hv_database",
      "properties": {"__cloudera_internal__hueLink": "https://nightly6x-1.vpc.cloudera.com:8889/hue/metastore/tables/default"},
      "identity": "23",
      "firstClassParentId": null,
      "name": null,
      "extractorRunId": "7##1",
      "sourceId": "7",
       "packageName": "nav",
       "parentPath": null, "originalName": "default"}}
    '''
        response = {"status": 0, "entity": []}
        if not atlas_response['entities']:
            LOG.error('No entities in atlas response to parse: %s' %
                      json.dumps(atlas_response))
        for atlas_entity in atlas_response['entities']:
            response['entity'].append(
                self.adapt_atlas_entity_to_navigator(atlas_entity))
        return response['entity'][0]

    def get_database(self, name):
        # Search with Atlas API for hive database with specific name
        try:
            dsl_query = '+'.join(['hive_db', 'where', 'name=%s']) % name
            atlas_response = self._root.get('/v2/search/dsl?query=%s' %
                                            dsl_query,
                                            headers=self.__headers,
                                            params=self.__params)
            return self.parse_atlas_response(atlas_response)
        except RestException as e:
            LOG.error('Failed to search for entities with search query: %s' %
                      dsl_query)
            if e.code == 401:
                raise CatalogAuthException(_('Failed to authenticate.'))
            else:
                raise CatalogApiException(e.message)
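
In this variant, get_database builds its DSL query by joining the tokens with '+' before handing the string to /v2/search/dsl, presumably because the Resource is created with urlencode=False. A small sketch of the resulting query string, assuming the database name 'default':

name = 'default'  # hypothetical database name
dsl_query = '+'.join(['hive_db', 'where', 'name=%s']) % name
print(dsl_query)  # -> hive_db+where+name=default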
Example #6
class NavigatorApi(Api):
    """
  http://cloudera.github.io/navigator/apidocs/v3/index.html
  """
    DEFAULT_SEARCH_FIELDS = (('originalName', 3), ('originalDescription', 1),
                             ('name', 10), ('description', 3), ('tags', 5))
    CATALOG_NAMESPACE = '__cloudera_internal_catalog_hue'

    def __init__(self, user=None):
        super(NavigatorApi, self).__init__(user)

        self._api_url = '%s/%s' % (NAVIGATOR.API_URL.get().strip('/'), VERSION)
        self._username = get_navigator_auth_username()
        self._password = get_navigator_auth_password()

        # Navigator does not support Kerberos authentication, while other components usually require it
        self._client = UnsecureHttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(
            self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = {}

        #self._fillup_properties() # Disabled currently

    def _get_types_from_sources(self, sources):
        default_entity_types = entity_types = ('DATABASE', 'TABLE',
                                               'PARTITION', 'FIELD', 'FILE',
                                               'VIEW', 'S3BUCKET', 'OPERATION',
                                               'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def search_entities_interactive(self,
                                    query_s=None,
                                    limit=100,
                                    offset=0,
                                    facetFields=None,
                                    facetPrefix=None,
                                    facetRanges=None,
                                    filterQueries=None,
                                    firstClassEntitiesOnly=None,
                                    sources=None):
        try:
            pagination = {
                'offset': offset,
                'limit': NAVIGATOR.FETCH_SIZE_SEARCH_INTERACTIVE.get(),
            }

            f = {
                "outputFormat": {
                    "type": "dynamic"
                },
                "name": {
                    "type": "dynamic"
                },
                "lastModified": {
                    "type": "date"
                },
                "sourceType": {
                    "type": "dynamic"
                },
                "parentPath": {
                    "type": "dynamic"
                },
                "lastAccessed": {
                    "type": "date"
                },
                "type": {
                    "type": "dynamic"
                },
                "sourceId": {
                    "type": "dynamic"
                },
                "partitionColNames": {
                    "type": "dynamic"
                },
                "serDeName": {
                    "type": "dynamic"
                },
                "created": {
                    "type": "date"
                },
                "fileSystemPath": {
                    "type": "dynamic"
                },
                "compressed": {
                    "type": "bool"
                },
                "clusteredByColNames": {
                    "type": "dynamic"
                },
                "originalName": {
                    "type": "dynamic"
                },
                "owner": {
                    "type": "dynamic"
                },
                "extractorRunId": {
                    "type": "dynamic"
                },
                "userEntity": {
                    "type": "bool"
                },
                "sortByColNames": {
                    "type": "dynamic"
                },
                "inputFormat": {
                    "type": "dynamic"
                },
                "serDeLibName": {
                    "type": "dynamic"
                },
                "originalDescription": {
                    "type": "dynamic"
                },
                "lastModifiedBy": {
                    "type": "dynamic"
                }
            }

            auto_field_facets = ["tags", "type"] + list(f.keys())
            query_s = (query_s.strip() if query_s else '') + '*'

            query_s = query_s.replace('tag:',
                                      'tags:').replace('classification:',
                                                       'tags:')

            last_query_term = [term for term in query_s.split()][-1]

            if last_query_term and last_query_term != '*':
                last_query_term = last_query_term.rstrip('*')
                (fname, fval) = last_query_term.split(
                    ':') if ':' in last_query_term else (last_query_term, '')
                auto_field_facets = [
                    f for f in auto_field_facets if f.startswith(fname)
                ]

            facetFields = facetFields or auto_field_facets[:5]

            entity_types = []
            fq_type = []
            if filterQueries is None:
                filterQueries = []

            if sources:
                default_entity_types, entity_types = self._get_types_from_sources(
                    sources)

                if 'sql' in sources or 'hive' in sources or 'impala' in sources:
                    fq_type = default_entity_types
                    filterQueries.append(
                        'sourceType:HIVE OR sourceType:IMPALA')
                elif 'hdfs' in sources:
                    fq_type = entity_types
                elif 's3' in sources:
                    fq_type = default_entity_types
                    filterQueries.append('sourceType:s3')

                if query_s.strip().endswith(
                        'type:*'):  # To list all available types
                    fq_type = entity_types

            search_terms = [term for term in query_s.strip().split()
                            ] if query_s else []
            query = []
            for term in search_terms:
                if ':' not in term:
                    query.append(self._get_boosted_term(term))
                else:
                    name, val = term.split(':')
                    if val:  # Allow typing non-default types, e.g. for SQL: type:FIEL*
                        if name == 'type':  # Make sure type value still makes sense for the source
                            term = '%s:%s' % (name, val.upper())
                            fq_type = entity_types
                        if name.lower() not in [
                                'type', 'tags', 'owner', 'originalname',
                                'originaldescription', 'lastmodifiedby'
                        ]:
                            # User Defined Properties are prefixed with 'up_', i.e. "department:sales" -> "up_department:sales"
                            query.append('up_' + term)
                        else:
                            filterQueries.append(term)

            filterQueries.append('deleted:false')

            body = {'query': ' '.join(query) or '*'}
            if fq_type:
                filterQueries += [
                    '{!tag=type} %s' %
                    ' OR '.join(['type:%s' % fq for fq in fq_type])
                ]

            source_ids = self.get_cluster_source_ids()
            if source_ids:
                body['query'] = source_ids + '(' + body['query'] + ')'

            body['facetFields'] = facetFields or [
            ]  # Currently mandatory in API
            if facetPrefix:
                body['facetPrefix'] = facetPrefix
            if facetRanges:
                body['facetRanges'] = facetRanges
            if filterQueries:
                body['filterQueries'] = filterQueries
            if firstClassEntitiesOnly:
                body['firstClassEntitiesOnly'] = firstClassEntitiesOnly

            data = json.dumps(body)
            LOG.info(data)
            response = self._root.post(
                'interactive/entities?limit=%(limit)s&offset=%(offset)s' %
                pagination,
                data=data,
                contenttype=_JSON_CONTENT_TYPE,
                clear_cookies=True)

            response['results'] = list(
                islice(self._secure_results(response['results']),
                       limit))  # Apply Sentry perms

            return response
        except RestException as e:
            LOG.error('Failed to search for entities with search query: %s' %
                      json.dumps(body))
            if e.code == 401:
                raise CatalogAuthException(_('Failed to authenticate.'))
            else:
                raise CatalogApiException(e.message)

    def search_entities(self,
                        query_s,
                        limit=100,
                        offset=0,
                        raw_query=False,
                        **filters):
        """
    Solr edismax query parser syntax.

    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms (split on whitespace), against a whitelist of search_fields.
    """
        sources = filters.get('sources', [])
        default_entity_types, entity_types = self._get_types_from_sources(
            sources)

        try:
            params = self.__params
            if not raw_query:
                query_s = query_s.replace('{', '\\{').replace(
                    '}', '\\}').replace('(',
                                        '\\(').replace(')', '\\)').replace(
                                            '[', '\\[').replace(']', '\\]')

                search_terms = [term for term in query_s.strip().split()]

                query_clauses = []
                user_filters = []
                source_type_filter = []

                for term in search_terms:
                    if ':' not in term:
                        if ('sql' in sources or 'hive' in sources
                                or 'impala' in sources):
                            if '.' in term:
                                parent, term = term.rsplit('.', 1)
                                user_filters.append('parentPath:"/%s"' %
                                                    parent.replace('.', '/'))
                        query_clauses.append(self._get_boosted_term(term))
                    else:
                        name, val = term.split(':')
                        if val:
                            if name == 'type':
                                term = '%s:%s' % (name, val.upper().strip('*'))
                                default_entity_types = entity_types  # Make sure type value still makes sense for the source
                            user_filters.append(
                                term +
                                '*')  # Manual filter allowed, e.g. type:VIE*

                filter_query = '*'

                if query_clauses:
                    filter_query = 'OR'.join(
                        ['(%s)' % clause for clause in query_clauses])

                user_filter_clause = 'AND '.join(
                    ['(%s)' % f for f in user_filters]) or '*'
                source_filter_clause = 'OR'.join([
                    '(%s:%s)' % ('type', entity_type)
                    for entity_type in default_entity_types
                ])

                if 's3' in sources:
                    source_type_filter.append('sourceType:s3')
                elif 'sql' in sources or 'hive' in sources or 'impala' in sources:
                    source_type_filter.append(
                        'sourceType:HIVE OR sourceType:IMPALA')

                filter_query = '%s AND (%s) AND (%s)' % (
                    filter_query, user_filter_clause, source_filter_clause)
                if source_type_filter:
                    filter_query += ' AND (%s)' % 'OR '.join(
                        source_type_filter)

                source_ids = self.get_cluster_source_ids()
                if source_ids:
                    filter_query = source_ids + '(' + filter_query + ')'
            else:
                filter_query = query_s

            params.update({
                'query': filter_query,
                'offset': offset,
                'limit': NAVIGATOR.FETCH_SIZE_SEARCH.get()
            })

            LOG.info(params)
            response = self._root.get('entities',
                                      headers=self.__headers,
                                      params=params)

            response = list(islice(self._secure_results(response),
                                   limit))  # Apply Sentry perms

            return response
        except RestException as e:
            LOG.error('Failed to search for entities with search query: %s' %
                      query_s)
            if e.code == 401:
                raise CatalogAuthException(_('Failed to authenticate.'))
            else:
                raise CatalogApiException(e)

    def _secure_results(self, results, checker=None):
        # TODO: to move directly to Catalog API
        if NAVIGATOR.APPLY_SENTRY_PERMISSIONS.get():
            checker = get_checker(self.user, checker)
            action = 'SELECT'

            def getkey(result):
                key = {u'server': get_hive_sentry_provider()}

                if result['type'] == 'TABLE' or result['type'] == 'VIEW':
                    key['db'] = result.get('parentPath', '') and result.get(
                        'parentPath', '').strip('/')
                    key['table'] = result.get('originalName', '')
                elif result['type'] == 'DATABASE':
                    key['db'] = result.get('originalName', '')
                elif result['type'] == 'FIELD':
                    parents = result.get('parentPath',
                                         '').strip('/').split('/')
                    if len(parents) == 2:
                        key['db'], key['table'] = parents
                        key['column'] = result.get('originalName', '')

                return key

            return checker.filter_objects(results, action, key=getkey)
        else:
            return results
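
    # For instance (assumed result shape): a TABLE result with parentPath '/default'
    # and originalName 'web_logs' produces the key
    # {'server': <hive sentry provider>, 'db': 'default', 'table': 'web_logs'},
    # which checker.filter_objects() uses to apply the SELECT check.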

    def suggest(self, prefix=None):
        try:
            return self._root.get('interactive/suggestions?query=%s' %
                                  (prefix or '*'))
        except RestException as e:
            msg = 'Failed to search for entities with search query: %s' % prefix
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def find_entity(self, source_type, type, name, **filters):
        """
    GET /api/v3/entities?query=((sourceType:<source_type>)AND(type:<type>)AND(originalName:<name>))
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities.html
    """
        try:
            params = self.__params

            query_filters = {
                'sourceType': source_type,
                'originalName': name,
                'deleted': 'false'
            }

            for key, value in list(filters.items()):
                query_filters[key] = value

            filter_query = 'AND'.join(
                '(%s:%s)' % (key, value)
                for key, value in list(query_filters.items()))
            filter_query = '%(type)s AND %(filter_query)s' % {
                'type':
                '(type:%s)' % 'TABLE OR type:VIEW' if type == 'TABLE' else
                type,  # Impala don't always say that a table is actually a view
                'filter_query': filter_query
            }

            source_ids = self.get_cluster_source_ids()
            if source_ids:
                filter_query = source_ids + '(' + filter_query + ')'

            params.update({
                'query': filter_query,
                'offset': 0,
                'limit':
                2  # We are looking for single entity, so limit to 2 to check for multiple results
            })

            response = self._root.get('entities',
                                      headers=self.__headers,
                                      params=params)

            if not response:
                raise CatalogEntityDoesNotExistException(
                    'Could not find entity with query filters: %s' %
                    str(query_filters))
            elif len(response) > 1:
                raise CatalogApiException(
                    'Found more than 1 entity with query filters: %s' %
                    str(query_filters))

            return response[0]
        except RestException as e:
            msg = 'Failed to find entity: %s' % str(e)
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def get_entity(self, entity_id):
        """
    GET /api/v3/entities/:id
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities_-id-.html
    """
        try:
            return self._root.get('entities/%s' % entity_id,
                                  headers=self.__headers,
                                  params=self.__params)
        except RestException as e:
            msg = 'Failed to get entity %s: %s' % (entity_id, str(e))
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def update_entity(self, entity, **metadata):
        """
    PUT /api/v3/entities/:id
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities_-id-.html
    """
        try:
            # Works around NAV-6187: if we don't re-send these fields, they would get erased.
            properties = {
                'name': entity['name'],
                'description': entity['description'],
                'properties': entity['properties'] or {},
                'customProperties': entity['customProperties'] or {}
            }
            properties.update(metadata)
            data = json.dumps(properties)

            return self._root.put('entities/%(identity)s' % entity,
                                  params=self.__params,
                                  data=data,
                                  contenttype=_JSON_CONTENT_TYPE,
                                  allow_redirects=True,
                                  clear_cookies=True)

        except RestException as e:
            msg = 'Failed to update entity %s: %s' % (entity['identity'], e)
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def get_cluster_source_ids(self):
        params = {
            'query': 'clusterName:"%s"' % get_navigator_hue_server_name(),
            'limit': 200
        }

        LOG.info(params)
        return self._root.get('entities',
                              headers=self.__headers,
                              params=params)

    def add_tags(self, entity_id, tags):
        entity = self.get_entity(entity_id)
        new_tags = entity['tags'] or []
        new_tags.extend(tags)
        return self.update_entity(entity, tags=new_tags)

    def delete_tags(self, entity_id, tags):
        entity = self.get_entity(entity_id)
        new_tags = entity['tags'] or []
        for tag in tags:
            if tag in new_tags:
                new_tags.remove(tag)
        return self.update_entity(entity, tags=new_tags)
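
    # A minimal usage sketch for the tag helpers above (added; the entity id and
    # tag names are made up for illustration):
    #
    #   api.add_tags('6a2c6baf4b52f08b', ['sensitive', 'finance'])
    #   api.delete_tags('6a2c6baf4b52f08b', ['deprecated'])
    #
    # Both helpers read the entity's current tag list and re-send it in full, so
    # the last writer wins if two users edit tags concurrently.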

    def update_properties(self,
                          entity_id,
                          properties,
                          modified_custom_metadata=None,
                          deleted_custom_metadata_keys=None):
        entity = self.get_entity(entity_id)

        if modified_custom_metadata or deleted_custom_metadata_keys:
            # Start from the entity's current custom properties so updates and
            # deletions below operate on the same dict.
            properties['properties'] = entity['properties'] or {}
        if modified_custom_metadata:
            properties['properties'].update(modified_custom_metadata)
        if deleted_custom_metadata_keys:
            for key in deleted_custom_metadata_keys:
                if key in properties['properties']:
                    del properties['properties'][key]
        return self.update_entity(entity, **properties)

    def delete_metadata_properties(self, entity_id, property_keys):
        entity = self.get_entity(entity_id)
        new_props = entity['properties'] or {}
        for key in property_keys:
            if key in new_props:
                del new_props[key]
        return self.update_entity(entity, properties=new_props)

    def get_lineage(self, entity_id):
        """
    GET /api/v3/lineage?entityIds=:id
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_lineage.html
    """
        try:
            params = self.__params

            params += (('entityIds', entity_id),)  # __params is a tuple of (key, value) pairs, not a dict

            return self._root.get('lineage',
                                  headers=self.__headers,
                                  params=params)
        except RestException as e:
            msg = 'Failed to get lineage for entity ID %s: %s' % (entity_id,
                                                                  str(e))
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def create_namespace(self, namespace, description=None):
        try:
            data = json.dumps({'name': namespace, 'description': description})
            return self._root.post('models/namespaces/',
                                   data=data,
                                   contenttype=_JSON_CONTENT_TYPE,
                                   clear_cookies=True)
        except RestException as e:
            msg = 'Failed to create namespace: %s' % namespace
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def get_namespace(self, namespace):
        try:
            return self._root.get('models/namespaces/%(namespace)s' %
                                  {'namespace': namespace})
        except RestException as e:
            msg = 'Failed to get namespace: %s' % namespace
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def create_namespace_property(self, namespace, properties):
        try:
            data = json.dumps(properties)
            return self._root.post(
                'models/namespaces/%(namespace)s/properties' %
                {'namespace': namespace},
                data=data,
                contenttype=_JSON_CONTENT_TYPE,
                clear_cookies=True)

        except RestException as e:
            msg = 'Failed to create namespace %s property' % namespace
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def get_namespace_properties(self, namespace):
        try:
            return self._root.get(
                'models/namespaces/%(namespace)s/properties' %
                {'namespace': namespace})
        except RestException as e:
            msg = 'Failed to get namespace %s properties' % namespace
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def map_namespace_property(self, clazz, properties):
        try:
            data = json.dumps(properties)
            return self._root.post(
                'models/packages/nav/classes/%(class)s/properties' %
                {'class': clazz},
                data=data,
                contenttype=_JSON_CONTENT_TYPE,
                clear_cookies=True)

        except RestException as e:
            msg = 'Failed to map class %s property' % clazz
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def get_model_properties_mapping(self):
        try:
            return self._root.get('models/properties/mappings')
        except RestException as e:
            msg = 'Failed to get models properties mappings'
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def _fillup_properties(self):
        global _HAS_CATALOG_NAMESPACE

        if _HAS_CATALOG_NAMESPACE is None:
            response = self.get_namespace(
                namespace=NavigatorApi.CATALOG_NAMESPACE)
            if not response:
                self.create_namespace(
                    namespace=NavigatorApi.CATALOG_NAMESPACE,
                    description="Set of fields to augment the data catalog")

            properties = self.get_namespace_properties(
                namespace=NavigatorApi.CATALOG_NAMESPACE)

            if not [
                    _property for _property in properties
                    if _property['name'] == 'relatedDocuments'
            ]:
                self.create_namespace_property(
                    namespace=NavigatorApi.CATALOG_NAMESPACE,
                    properties={
                        "name": "relatedDocuments",
                        "displayName": "Related documents",
                        "description":
                        "List of Hue document UUIDs related to this entity",
                        "multiValued": True,
                        "maxLength": 36,
                        "pattern": ".*",  # UUID
                        "enumValues": None,
                        "type": "TEXT"
                    })

                # Might want to check if the mapping is already done
                for clazz in ('hv_table', 'hv_view'):
                    self.map_namespace_property(
                        clazz,
                        properties=[{
                            "namespace": NavigatorApi.CATALOG_NAMESPACE,
                            "name": "relatedDocuments"
                        }])

            _HAS_CATALOG_NAMESPACE = True
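
    # Added summary: on first use this bootstraps the Hue catalog namespace if it
    # is missing, registers the multi-valued 'relatedDocuments' TEXT property, and
    # maps it onto the hv_table/hv_view model classes; _HAS_CATALOG_NAMESPACE then
    # caches the result for the lifetime of the process.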

    def _get_boosted_term(self, term):
        return 'AND'.join([
            # Matching fields
            '(%s)' % 'OR'.join([
                '(%s:%s*^%s)' % (field, term, weight)
                for (field, weight) in NavigatorApi.DEFAULT_SEARCH_FIELDS
            ]),

            # Boost entities with enriched fields
            '(%s)' % 'OR'.join([
                '(%s:[* TO *])' % field
                for (field, weight) in NavigatorApi.DEFAULT_SEARCH_FIELDS
            ])

            # Could add certain customProperties and properties
        ])
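
    # For illustration (added, not in the original example): assuming the same
    # DEFAULT_SEARCH_FIELDS shown for the Atlas client, _get_boosted_term('sales')
    # produces a single Solr query string shaped like (wrapped here for readability):
    #   ((originalName:sales*^3)OR(originalDescription:sales*^1)OR(name:sales*^10)
    #    OR(description:sales*^3)OR(tags:sales*^5))
    #   AND((originalName:[* TO *])OR(originalDescription:[* TO *])OR(name:[* TO *])
    #    OR(description:[* TO *])OR(tags:[* TO *]))
    # i.e. the term must match at least one whitelisted field (with per-field
    # boosts) and entities with enriched metadata rank higher.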

    def _clean_path(self, path):
        return path.rstrip('/').split('/')[-1], self._escape_slashes(
            path.rstrip('/'))

    def _escape_slashes(self, s):
        return s.replace('/', '\\/')
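
A quick, self-contained sketch (added for clarity; the sample path is made up) of what the two path helpers above return, showing the Solr-escaped form:

def clean_path(path):
    # Same logic as _clean_path/_escape_slashes: basename plus the full path
    # with '/' escaped for use inside a Solr query string.
    stripped = path.rstrip('/')
    return stripped.split('/')[-1], stripped.replace('/', '\\/')

print(clean_path('/user/hive/warehouse/sales/'))
# -> ('sales', '\\/user\\/hive\\/warehouse\\/sales')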
Ejemplo n.º 7
0
class NavigatorApi(object):
    """
  http://cloudera.github.io/navigator/apidocs/v3/index.html
  """
    def __init__(self, user=None):
        self._api_url = '%s/%s' % (NAVIGATOR.API_URL.get().strip('/'), VERSION)
        self._username = get_navigator_auth_username()
        self._password = get_navigator_auth_password()

        self.user = user
        # Navigator does not support Kerberos authentication while other components usually require it
        self._client = UnsecureHttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(
            self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

    def _get_types_from_sources(self, sources):
        default_entity_types = entity_types = ('DATABASE', 'TABLE',
                                               'PARTITION', 'FIELD', 'FILE',
                                               'VIEW', 'S3BUCKET', 'OPERATION',
                                               'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def search_entities(self, query_s, limit=100, offset=0, **filters):
        """
    Solr edismax query parser syntax.

    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms (split on whitespace), against a whitelist
      of search_fields.
    """
        search_fields = ('originalName', 'originalDescription', 'name',
                         'description', 'tags')

        sources = filters.get('sources', [])
        default_entity_types, entity_types = self._get_types_from_sources(
            sources)

        try:
            # Escape Solr special characters in the raw query string
            for char in ('{', '}', '(', ')', '[', ']'):
                query_s = query_s.replace(char, '\\' + char)
            params = self.__params

            search_terms = query_s.strip().split()

            query_clauses = []
            user_filters = []
            source_type_filter = []

            for term in search_terms:
                if ':' not in term:
                    if ('sql' in sources or 'hive' in sources
                            or 'impala' in sources):
                        if '.' in term:
                            parent, term = term.rsplit('.', 1)
                            user_filters.append('parentPath:"/%s"' %
                                                parent.replace('.', '/'))
                    query_clauses.append('OR'.join([
                        '(%s:*%s*)' % (field, term) for field in search_fields
                    ]))
                else:
                    name, val = term.split(':')
                    if val:
                        if name == 'type':
                            term = '%s:%s' % (name, val.upper().strip('*'))
                            default_entity_types = entity_types  # Make sure type value still makes sense for the source
                        user_filters.append(term + '*')  # Manual filters allowed, e.g. type:VIE*

            filter_query = '*'

            if query_clauses:
                filter_query = 'OR'.join(
                    ['(%s)' % clause for clause in query_clauses])

            user_filter_clause = 'AND '.join(
                ['(%s)' % f for f in user_filters]) or '*'
            source_filter_clause = 'OR'.join([
                '(%s:%s)' % ('type', entity_type)
                for entity_type in default_entity_types
            ])

            if 's3' in sources:
                source_type_filter.append('sourceType:s3')
            elif 'sql' in sources or 'hive' in sources or 'impala' in sources:
                source_type_filter.append(
                    'sourceType:HIVE OR sourceType:IMPALA')

            filter_query = '%s AND (%s) AND (%s)' % (
                filter_query, user_filter_clause, source_filter_clause)
            if source_type_filter:
                filter_query += ' AND (%s)' % 'OR '.join(source_type_filter)

            source_ids = get_cluster_source_ids(self)
            if source_ids:
                filter_query = source_ids + '(' + filter_query + ')'

            params += (
                ('query', filter_query),
                ('offset', offset),
                ('limit', NAVIGATOR.FETCH_SIZE_SEARCH.get()),
            )

            LOG.info(params)
            response = self._root.get('entities',
                                      headers=self.__headers,
                                      params=params)

            response = list(islice(self._secure_results(response),
                                   limit))  # Apply Sentry perms

            return response
        except RestException as e:
            LOG.error('Failed to search for entities with search query: %s' %
                      query_s)
            if e.code == 401:
                raise NavigathorAuthException(_('Failed to authenticate.'))
            else:
                raise NavigatorApiException(e)
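
    # A hypothetical caller sketch (added; 'request.user' and the sample query are
    # assumptions, not part of the original example):
    #
    #   api = NavigatorApi(user=request.user)
    #   results = api.search_entities('sales quarterly type:TABLE', limit=25, sources=['hive'])
    #
    # Terms without ':' are OR-ed across the whitelisted search fields, 'type:TABLE'
    # becomes a user filter, and the 'hive' source restricts sourceType to HIVE/IMPALA
    # as built above.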