Example #1
class ManagerApi(object):
    """
    https://cloudera.github.io/cm_api/
    """
    def __init__(self,
                 user=None,
                 security_enabled=False,
                 ssl_cert_ca_verify=False):
        self._api_url = '%s/%s' % (MANAGER.API_URL.get().strip('/'), VERSION)
        self._username = get_navigator_auth_username()
        self._password = get_navigator_auth_password()

        self.user = user
        self._client = HttpClient(self._api_url, logger=LOG)

        if security_enabled:
            self._client.set_kerberos_auth()
        else:
            self._client.set_basic_auth(self._username, self._password)

        self._client.set_verify(ssl_cert_ca_verify)
        self._root = Resource(self._client)

    def tools_echo(self):
        try:
            params = (('message', 'hello'), )

            LOG.info(params)
            return self._root.get('tools/echo', params=params)
        except RestException as e:
            raise ManagerApiException(e)
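
A minimal usage sketch (added for illustration, not part of the original snippet): the constructor reads the API URL and credentials from Hue's MANAGER and Navigator configuration, so the caller only chooses the auth mode and TLS verification.

api = ManagerApi(security_enabled=False, ssl_cert_ca_verify=False)
api.tools_echo()  # GET <api_url>/tools/echo?message=hello, returns the parsed response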
Example #2
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v3/index.html
  """

  def __init__(self, api_url=None, user=None, password=None):
    self._api_url = '%s/%s' % ((api_url or NAVIGATOR.API_URL.get()).strip('/'), VERSION)
    self._username = user or NAVIGATOR.AUTH_USERNAME.get()
    self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client)

    self.__headers = {}
    self.__params = ()


  def search_entities(self, query_s, limit=100, offset=0, **filters):
    """
    GET /api/v3/entities?query=()
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities.html
    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms (split on whitespace), against a whitelist
      of search_fields.
      TODO: support smarter boolean searching with arbitrary ordering and precedence of conditionals
    :param filters: TODO: IMPLEMENT ME, required to support property search
    """
    search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')
    entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'OPERATION')

    try:
      params = self.__params

      search_terms = [term.lower() for term in query_s.strip().split()]

      query_clauses = []
      for term in search_terms:
        query_clauses.append('OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields]))

      filter_query = '(originalName:*.*)'
      if search_terms:
        filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

      type_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in entity_types])
      filter_query = '%sAND(%s)' % (filter_query, type_filter_clause)

      params += (
        ('query', filter_query),
        ('offset', offset),
        ('limit', limit),
      )

      response = self._root.get('entities', headers=self.__headers, params=params)

      return response
    except RestException as e:
      msg = 'Failed to search for entities with search query: %s' % query_s
      LOG.exception(msg)
      raise NavigatorApiException(msg)
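
A hedged usage sketch (the URL and credentials below are placeholders, not taken from the source); omitted arguments fall back to the NAVIGATOR configuration entries.

api = NavigatorApi(api_url='https://nav-host.example.com:7187/api', user='admin', password='secret')
results = api.search_entities('sales quarterly', limit=50, offset=0)  # OR-search over the whitelisted fields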
Example #3
class THttpClient(TTransportBase):
    """
    HTTP transport mode for Thrift.

    HTTPS and Kerberos support with Request.

    e.g.
    mode = THttpClient('http://hbase-thrift-v1.com:9090')
    mode = THttpClient('http://hive-localhost:10001/cliservice')
    """
    def __init__(self, base_url):
        self._base_url = base_url
        self._client = HttpClient(self._base_url, logger=LOG)
        self._data = None
        self._headers = None
        self._wbuf = buffer_writer()

    def open(self):
        pass

    def set_kerberos_auth(self, service="HTTP"):
        self._client.set_kerberos_auth(service=service)

    def set_basic_auth(self, username, password):
        self._client.set_basic_auth(username, password)

    def set_bearer_auth(self, token):
        self._client.set_bearer_auth(token)

    def set_verify(self, verify=True):
        self._client.set_verify(verify)

    def close(self):
        self._headers = None
        # Close session too?

    def isOpen(self):
        return self._client is not None

    def setTimeout(self, ms):
        if not self._headers:
            self._headers = {}
        self._headers.update(timeout=str(int(ms / 1000)))

    def setCustomHeaders(self, headers):
        self._headers = headers

    def read(self, sz):
        return self._data

    def write(self, buf):
        self._wbuf.write(buf)

    def flush(self):
        data = self._wbuf.getvalue()
        self._wbuf = buffer_writer()

        # POST
        self._root = Resource(self._client)
        self._data = self._root.post('', data=data, headers=self._headers)
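
A usage sketch assuming the standard thrift Python package and a Thrift-generated service client, neither of which appears in the snippet: writes are buffered and POSTed to the endpoint on flush(), so the transport slots into an ordinary Thrift protocol stack.

from thrift.protocol import TBinaryProtocol

transport = THttpClient('http://hive-localhost:10001/cliservice')
transport.set_basic_auth('hue', 'secret')              # placeholder credentials
protocol = TBinaryProtocol.TBinaryProtocol(transport)
# client = TCLIService.Client(protocol)                 # hypothetical Thrift-generated client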
Example #4
class ManagerApi(object):
    """
    https://cloudera.github.io/cm_api/
    """
    def __init__(self,
                 user=None,
                 security_enabled=False,
                 ssl_cert_ca_verify=False):
        self._api_url = '%s/%s' % (MANAGER.API_URL.get().strip('/'), VERSION)
        self._username = get_navigator_auth_username()
        self._password = get_navigator_auth_password()

        self.user = user
        self._client = HttpClient(self._api_url, logger=LOG)

        if security_enabled:
            self._client.set_kerberos_auth()
        else:
            self._client.set_basic_auth(self._username, self._password)

        self._client.set_verify(ssl_cert_ca_verify)
        self._root = Resource(self._client)

    def has_service(self, service_name, cluster_name=None):
        cluster = self._get_cluster(cluster_name)
        try:
            services = self._root.get(
                'clusters/%(cluster_name)s/serviceTypes' % {
                    'cluster_name': cluster['name'],
                    'service_name': service_name
                })['items']

            return service_name in services
        except RestException as e:
            raise ManagerApiException(e)
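
A usage sketch (the cluster name is a placeholder): has_service() resolves the cluster, defaulting to the first one when cluster_name is None, and checks the serviceTypes listing for a match.

api = ManagerApi(security_enabled=True)
if api.has_service('SPARK_ON_YARN', cluster_name='Cluster 1'):
    print('Spark on YARN is available')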
Example #5
class THttpClient(TTransportBase):
    """
    HTTP transport mode for Thrift.

    HTTPS and Kerberos support with Request.

    e.g.
    mode = THttpClient('http://hbase-thrift-v1.com:9090')
    mode = THttpClient('http://hive-localhost:10001/cliservice')
    """
    def __init__(self, base_url, cert_validate=True):
        self._base_url = base_url
        self._client = HttpClient(self._base_url,
                                  logger=LOG,
                                  cert_validate=cert_validate)
        self._data = None
        self._headers = None
        self._wbuf = StringIO()

    def open(self):
        pass

    def set_basic_auth(self, username, password):
        self._client.set_basic_auth(username, password)

    def set_kerberos_auth(self):
        self._client.set_kerberos_auth()

    def close(self):
        self._headers = None
        # Close session too?

    def isOpen(self):
        return self._client is not None

    def setTimeout(self, ms):
        pass

    def setCustomHeaders(self, headers):
        self._headers = headers

    def read(self, sz):
        return self._data

    def write(self, buf):
        self._wbuf.write(buf)

    def flush(self):
        if self.isOpen():
            self.close()
        self.open()

        data = self._wbuf.getvalue()
        self._wbuf = StringIO()

        # POST
        self._root = Resource(self._client)
        self._data = self._root.post('', data=data)
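
A short usage sketch (not from the source): this variant differs from Example #3 mainly by exposing cert_validate, so TLS verification can be relaxed for self-signed endpoints.

transport = THttpClient('https://hbase-thrift-v1.com:9090', cert_validate=False)
transport.set_kerberos_auth()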
Example #6
class THttpClient(TTransportBase):
  """
  HTTP transport mode for Thrift.

  HTTPS and Kerberos support with Request.

  e.g.
  mode = THttpClient('http://hbase-thrift-v1.com:9090')
  mode = THttpClient('http://hive-localhost:10001/cliservice')
  """

  def __init__(self, base_url, cert_validate=True):
    self._base_url = base_url
    self._client = HttpClient(self._base_url, logger=LOG, cert_validate=cert_validate)
    self._data = None
    self._headers = None
    self._wbuf = StringIO()

  def open(self):
    pass

  def set_basic_auth(self, username, password):
    self._client.set_basic_auth(username, password)

  def set_kerberos_auth(self):
    self._client.set_kerberos_auth()

  def close(self):
    self._headers = None
    # Close session too?

  def isOpen(self):
    return self._client is not None

  def setTimeout(self, ms):
    pass

  def setCustomHeaders(self, headers):
    self._headers = headers

  def read(self, sz):
    return self._data

  def write(self, buf):
    self._wbuf.write(buf)

  def flush(self):
    if self.isOpen():
      self.close()
    self.open()

    data = self._wbuf.getvalue()
    self._wbuf = StringIO()

    # POST
    self._root = Resource(self._client)
    self._data = self._root.post('', data=data)
Example #7
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v2/index.html
  """

  def __init__(self, api_url=None, user=None, password=None):
    self._api_url = (api_url or NAVIGATOR.API_URL.get()).strip('/')
    self._username = user or NAVIGATOR.AUTH_USERNAME.get()
    self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client)

    self.__headers = {}
    self.__params = ()


  def find_entity(self, source_type, type, name, **filters):
    """
    GET /api/v2/entities?query=((sourceType:<source_type>)AND(type:<type>)AND(originalName:<name>))
    http://cloudera.github.io/navigator/apidocs/v2/path__v2_entities.html
    """
    try:
      params = self.__params

      query_filters = {
        'sourceType': source_type,
        'type': type,
        'originalName': name,
        'deleted': 'false'
      }
      for key, value in filters.items():
        query_filters[key] = value

      filter_query = 'AND'.join('(%s:%s)' % (key, value) for key, value in query_filters.items())

      params += (
        ('query', filter_query),
        ('offset', 0),
        ('limit', 2),  # We are looking for a single entity, so limit to 2 to check for multiple results
      )

      response = self._root.get('entities', headers=self.__headers, params=params)

      if not response:
        raise NavigatorApiException('Could not find entity with query filters: %s' % str(query_filters))
      elif len(response) > 1:
        raise NavigatorApiException('Found more than 1 entity with query filters: %s' % str(query_filters))

      return response[0]
    except RestException as e:
      msg = 'Failed to find entity: %s' % str(e)
      LOG.exception(msg)
      raise NavigatorApiException(msg)
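
A usage sketch (names and the extra filter are placeholders): find_entity() ANDs all filters into a single query and insists on exactly one match, raising NavigatorApiException otherwise.

api = NavigatorApi()
table = api.find_entity('HIVE', 'TABLE', 'customers', parentPath='/default')  # extra kwargs become query filters
entity_id = table['identity']  # Navigator entities carry an 'identity' field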
Example #8
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v2/index.html
  """

  def __init__(self, api_url=None, user=None, password=None):
    self._api_url = (api_url or NAVIGATOR.API_URL.get()).strip('/')
    self._username = user or NAVIGATOR.AUTH_USERNAME.get()
    self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client)

    self.__headers = {}
    self.__params = ()


  def find_entity(self, source_type, type, name, **filters):
    """
    GET /api/v2/entities?query=((sourceType:<source_type>)AND(type:<type>)AND(originalName:<name>))
    http://cloudera.github.io/navigator/apidocs/v2/path__v2_entities.html
    """
    try:
      params = self.__params

      query_filters = {
        'sourceType': source_type,
        'type': type,
        'originalName': name,
        'deleted': 'false'
      }
      for key, value in filters.items():
        query_filters[key] = value

      filter_query = 'AND'.join('(%s:%s)' % (key, value) for key, value in query_filters.items())

      params += (
        ('query', filter_query),
        ('offset', 0),
        ('limit', 2),  # We are looking for a single entity, so limit to 2 to check for multiple results
      )

      response = self._root.get('entities', headers=self.__headers, params=params)

      if not response:
        raise NavigatorApiException('Could not find entity with query filters: %s' % str(query_filters))
      elif len(response) > 1:
        raise NavigatorApiException('Found more than 1 entity with query filters: %s' % str(query_filters))

      return response[0]
    except RestException as e:
      msg = 'Failed to find entity: %s' % str(e)
      LOG.exception(msg)
      raise NavigatorApiException(msg)
Example #9
class ManagerApi(object):
    """
    https://cloudera.github.io/cm_api/
    """
    def __init__(self,
                 user=None,
                 security_enabled=False,
                 ssl_cert_ca_verify=False):
        self._api_url = '%s/%s' % (MANAGER.API_URL.get().strip('/'), VERSION)
        self._username = get_navigator_auth_username()
        self._password = get_navigator_auth_password()

        self.user = user
        self._client = HttpClient(self._api_url, logger=LOG)

        if security_enabled:
            self._client.set_kerberos_auth()
        else:
            self._client.set_basic_auth(self._username, self._password)

        self._client.set_verify(ssl_cert_ca_verify)
        self._root = Resource(self._client)

    def has_service(self, service_name, cluster_name=None):
        cluster = self._get_cluster(cluster_name)
        try:
            services = self._root.get(
                'clusters/%(cluster_name)s/serviceTypes' % {
                    'cluster_name': cluster['name'],
                    'service_name': service_name
                })['items']

            return service_name in services
        except RestException as e:
            raise ManagerApiException(e)

    def get_spark_history_server_configs(self, cluster_name=None):
        service_name = "SPARK_ON_YARN"
        shs_role_type = "SPARK_YARN_HISTORY_SERVER"

        try:
            cluster = self._get_cluster(cluster_name)
            services = self._root.get('clusters/%(cluster_name)s/services' % {
                'cluster_name': cluster['name'],
                'service_name': service_name
            })['items']

            service_display_names = [
                service['displayName'] for service in services
                if service['type'] == service_name
            ]

            if service_display_names:
                spark_service_display_name = service_display_names[0]

                servers = self._root.get(
                    'clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles'
                    % {
                        'cluster_name': cluster['name'],
                        'spark_service_display_name':
                        spark_service_display_name
                    })['items']

                shs_server_names = [
                    server['name'] for server in servers
                    if server['type'] == shs_role_type
                ]
                shs_server_name = shs_server_names[
                    0] if shs_server_names else None
                shs_server_hostRef = [
                    server['hostRef'] for server in servers
                    if server['type'] == shs_role_type
                ]
                shs_server_hostId = shs_server_hostRef[0][
                    'hostId'] if shs_server_hostRef else None

                if shs_server_name and shs_server_hostId:
                    shs_server_configs = self._root.get(
                        'clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles/%(shs_server_name)s/config'
                        % {
                            'cluster_name': cluster['name'],
                            'spark_service_display_name':
                            spark_service_display_name,
                            'shs_server_name': shs_server_name
                        },
                        params={'view': 'full'})['items']
                    return shs_server_hostId, shs_server_configs
        except Exception as e:
            LOG.warning("Check Spark History Server via ManagerApi: %s" % e)

        return None, None

    def get_spark_history_server_url(self, cluster_name=None):
        shs_server_hostId, shs_server_configs = self.get_spark_history_server_configs(
            cluster_name=cluster_name)

        if shs_server_hostId and shs_server_configs:
            shs_ui_port = None
            shs_ssl_port = None
            shs_ssl_enabled = None
            for config in shs_server_configs:
                if 'relatedName' in config and 'default' in config:
                    if config['relatedName'] == 'spark.history.ui.port':
                        shs_ui_port = config['default']
                    if config['relatedName'] == 'spark.ssl.historyServer.port':
                        shs_ssl_port = config['default']
                    if config[
                            'relatedName'] == 'spark.ssl.historyServer.enabled':
                        shs_ssl_enabled = config['default']
            shs_ui_host = self._root.get('hosts/%(hostId)s' %
                                         {'hostId': shs_server_hostId})
            shs_ui_hostname = shs_ui_host['hostname'] if shs_ui_host else None

            return self.assemble_shs_url(shs_ui_hostname, shs_ui_port,
                                         shs_ssl_port, shs_ssl_enabled)

        return None

    def get_spark_history_server_security_enabled(self, cluster_name=None):
        shs_server_hostId, shs_server_configs = self.get_spark_history_server_configs(
            cluster_name=cluster_name)

        if shs_server_configs:
            for config in shs_server_configs:
                if 'relatedName' in config and 'default' in config and config[
                        'relatedName'] == 'history_server_spnego_enabled':
                    shs_security_enabled = config['default']
                    return shs_security_enabled and shs_security_enabled == 'true'

        return False

    def assemble_shs_url(self,
                         shs_ui_hostname,
                         shs_ui_port=None,
                         shs_ssl_port=None,
                         shs_ssl_enabled=None):
        if not shs_ui_hostname or not shs_ui_port or not shs_ssl_port or not shs_ssl_enabled:
            LOG.warning("Spark conf not found!")
            return None

        protocol = 'https' if shs_ssl_enabled.lower() == 'true' else 'http'
        shs_url = '%(protocol)s://%(hostname)s:%(port)s' % {
            'protocol':
            protocol,
            'hostname':
            shs_ui_hostname,
            'port':
            shs_ssl_port if shs_ssl_enabled.lower() == 'true' else shs_ui_port,
        }

        return shs_url

    def tools_echo(self):
        try:
            params = (('message', 'hello'), )

            LOG.info(params)
            return self._root.get('tools/echo', params=params)
        except RestException as e:
            raise ManagerApiException(e)

    def get_kafka_brokers(self, cluster_name=None):
        try:

            hosts = self._get_hosts('KAFKA',
                                    'KAFKA_BROKER',
                                    cluster_name=cluster_name)

            brokers_hosts = [host['hostname'] + ':9092' for host in hosts]

            return ','.join(brokers_hosts)
        except RestException as e:
            raise ManagerApiException(e)

    def get_kudu_master(self, cluster_name=None):
        try:
            cluster = self._get_cluster(cluster_name)
            services = self._root.get('clusters/%(name)s/services' %
                                      cluster)['items']

            service = [
                service for service in services if service['type'] == 'KUDU'
            ][0]
            master = self._get_roles(cluster['name'], service['name'],
                                     'KUDU_MASTER')[0]

            master_host = self._root.get('hosts/%(hostId)s' %
                                         master['hostRef'])

            return master_host['hostname']
        except RestException as e:
            raise ManagerApiException(e)

    def get_kafka_topics(self, broker_host):
        try:
            client = HttpClient('http://%s:24042' % broker_host, logger=LOG)
            root = Resource(client)

            return root.get('/api/topics')
        except RestException as e:
            raise ManagerApiException(e)

    def update_flume_config(self, cluster_name, config_name, config_value):
        service = 'FLUME-1'
        cluster = self._get_cluster(cluster_name)
        roleConfigGroup = [
            role['roleConfigGroupRef']['roleConfigGroupName']
            for role in self._get_roles(cluster['name'], service, 'AGENT')
        ]
        data = {
            u'items': [{
                u'url':
                u'/api/v8/clusters/%(cluster_name)s/services/%(service)s/roleConfigGroups/%(roleConfigGroups)s/config?message=Updated%20service%20and%20role%20type%20configurations.'
                .replace('%(cluster_name)s',
                         urllib_quote(cluster['name'])).replace(
                             '%(service)s',
                             service).replace('%(roleConfigGroups)s',
                                              roleConfigGroup[0]),
                u'body': {
                    u'items': [{
                        u'name': config_name,
                        u'value': config_value
                    }]
                },
                u'contentType':
                u'application/json',
                u'method':
                u'PUT'
            }]
        }

        return self.batch(items=data)

    def get_flume_agents(self, cluster_name=None):
        return [
            host['hostname'] for host in self._get_hosts(
                'FLUME', 'AGENT', cluster_name=cluster_name)
        ]

    def _get_hosts(self, service_name, role_name, cluster_name=None):
        try:
            cluster = self._get_cluster(cluster_name)
            services = self._root.get('clusters/%(name)s/services' %
                                      cluster)['items']

            service = [
                service for service in services
                if service['type'] == service_name
            ][0]
            hosts = self._get_roles(cluster['name'], service['name'],
                                    role_name)
            hosts_ids = [host['hostRef']['hostId'] for host in hosts]

            hosts = self._root.get('hosts')['items']
            return [host for host in hosts if host['hostId'] in hosts_ids]
        except RestException as e:
            raise ManagerApiException(e)

    def refresh_flume(self, cluster_name, restart=False):
        service = 'FLUME-1'
        cluster = self._get_cluster(cluster_name)
        roles = [
            role['name']
            for role in self._get_roles(cluster['name'], service, 'AGENT')
        ]

        if restart:
            return self.restart_services(cluster['name'], service, roles)
        else:
            return self.refresh_configs(cluster['name'], service, roles)

    def refresh_configs(self, cluster_name, service=None, roles=None):
        try:
            if service is None:
                return self._root.post(
                    'clusters/%(cluster_name)s/commands/refresh' %
                    {'cluster_name': cluster_name},
                    contenttype="application/json")
            elif roles is None:
                return self._root.post(
                    'clusters/%(cluster_name)s/services/%(service)s/roleCommands/refresh'
                    % {
                        'cluster_name': cluster_name,
                        'service': service
                    },
                    contenttype="application/json")
            else:
                return self._root.post(
                    'clusters/%(cluster_name)s/services/%(service)s/roleCommands/refresh'
                    % {
                        'cluster_name': cluster_name,
                        'service': service
                    },
                    data=json.dumps({"items": roles}),
                    contenttype="application/json")
        except RestException as e:
            raise ManagerApiException(e)

    def restart_services(self, cluster_name, service=None, roles=None):
        try:
            if service is None:
                return self._root.post(
                    'clusters/%(cluster_name)s/commands/restart' %
                    {'cluster_name': cluster_name},
                    contenttype="application/json")
            elif roles is None:
                return self._root.post(
                    'clusters/%(cluster_name)s/services/%(service)s/roleCommands/restart'
                    % {
                        'cluster_name': cluster_name,
                        'service': service
                    },
                    contenttype="application/json")
            else:
                return self._root.post(
                    'clusters/%(cluster_name)s/services/%(service)s/roleCommands/restart'
                    % {
                        'cluster_name': cluster_name,
                        'service': service
                    },
                    data=json.dumps({"items": roles}),
                    contenttype="application/json")
        except RestException as e:
            raise ManagerApiException(e)

    def batch(self, items):
        try:
            return self._root.post('batch',
                                   data=json.dumps(items),
                                   contenttype='application/json')
        except RestException as e:
            raise ManagerApiException(e)

    def _get_cluster(self, cluster_name=None):
        clusters = self._root.get('clusters/')['items']

        if cluster_name is not None:
            cluster = [
                cluster for cluster in clusters
                if cluster['name'] == cluster_name
            ][0]
        else:
            cluster = clusters[0]

        return cluster

    def _get_roles(self, cluster_name, service_name, role_type):
        roles = self._root.get(
            'clusters/%(cluster_name)s/services/%(service_name)s/roles' % {
                'cluster_name': cluster_name,
                'service_name': service_name
            })['items']
        return [role for role in roles if role['type'] == role_type]

    def get_impalad_config(self,
                           key=None,
                           impalad_host=None,
                           cluster_name=None):
        if not key or not impalad_host:
            return None

        service_name = "IMPALA"
        role_type = 'IMPALAD'

        try:
            cluster = self._get_cluster(cluster_name)
            services = self._root.get('clusters/%(cluster_name)s/services' % {
                'cluster_name': cluster['name'],
                'service_name': service_name
            })['items']

            service_display_names = [
                service['displayName'] for service in services
                if service['type'] == service_name
            ]

            hosts = self._root.get('hosts')['items']
            impalad_hostIds = [
                host['hostId'] for host in hosts
                if host['hostname'] == impalad_host
            ]

            if impalad_hostIds and service_display_names:
                impalad_hostId = impalad_hostIds[0]
                impala_service_display_name = service_display_names[0]

                servers = self._root.get(
                    'clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles'
                    % {
                        'cluster_name': cluster['name'],
                        'spark_service_display_name':
                        impala_service_display_name
                    })['items']

                impalad_server_names = [
                    server['name'] for server in servers
                    if server['type'] == role_type
                    and server['hostRef']['hostId'] == impalad_hostId
                ]
                impalad_server_name = impalad_server_names[
                    0] if impalad_server_names else None

                if impalad_server_name:
                    server_configs = self._root.get(
                        'clusters/%(cluster_name)s/services/%(spark_service_display_name)s/roles/%(shs_server_name)s/config'
                        % {
                            'cluster_name': cluster['name'],
                            'spark_service_display_name':
                            impala_service_display_name,
                            'shs_server_name': impalad_server_name
                        },
                        params={'view': 'full'})['items']

                    for config in server_configs:
                        if 'relatedName' in config and 'value' in config:
                            if config['relatedName'] == key:
                                return config['value']

        except Exception as e:
            LOG.warning(
                "Get Impala Daemon API configurations via ManagerApi: %s" % e)

        return None
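
A usage sketch (added for illustration): the public helpers chain the private _get_cluster() and _get_roles() lookups, so callers normally pass only an optional cluster name.

api = ManagerApi(security_enabled=True, ssl_cert_ca_verify=True)
shs_url = api.get_spark_history_server_url()  # e.g. 'http://shs-host:18088', or None when not configured
brokers = api.get_kafka_brokers()             # comma-separated 'host1:9092,host2:9092'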
Example #10
class ImpalaDaemonApi(object):
    def __init__(self, server_url):
        self._url = server_url
        self._client = HttpClient(self._url, logger=LOG)
        self._root = Resource(self._client)
        self._security_enabled = is_kerberos_enabled()
        self._webserver_spnego_enabled = is_webserver_spnego_enabled()
        self._thread_local = threading.local()

        # You can set username/password for Impala Web UI which overrides kerberos
        if DAEMON_API_USERNAME.get() is not None and DAEMON_API_PASSWORD.get(
        ) is not None:
            if DAEMON_API_AUTH_SCHEME.get().lower() == 'basic':
                self._client.set_basic_auth(DAEMON_API_USERNAME.get(),
                                            DAEMON_API_PASSWORD.get())
                LOG.info(
                    "Using username and password for basic authentication")
            else:
                self._client.set_digest_auth(DAEMON_API_USERNAME.get(),
                                             DAEMON_API_PASSWORD.get())
                LOG.info(
                    'Using username and password for digest authentication')
        elif self._webserver_spnego_enabled or self._security_enabled:
            self._client.set_kerberos_auth()
            LOG.info('Using kerberos principal for authentication')

    def __str__(self):
        return "ImpalaDaemonApi at %s" % self._url

    @property
    def url(self):
        return self._url

    @property
    def security_enabled(self):
        return self._security_enabled

    @property
    def user(self):
        return self._thread_local.user

    def set_user(self, user):
        if hasattr(user, 'username'):
            self._thread_local.user = user.username
        else:
            self._thread_local.user = user

    def get_queries(self):
        params = {'json': 'true'}

        resp = self._root.get('queries', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi did not return valid JSON: %s' % e)

    def get_query(self, query_id):
        params = {'query_id': query_id, 'json': 'true'}

        resp = self._root.get('query_plan', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi did not return valid JSON: %s' % e)

    def get_query_profile(self, query_id):
        params = {'query_id': query_id, 'json': 'true'}

        resp = self._root.get('query_profile', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi query_profile did not return valid JSON: %s' %
                e)

    def get_query_memory(self, query_id):
        params = {'query_id': query_id, 'json': 'true'}

        resp = self._root.get('query_memory', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi query_memory did not return valid JSON: %s' %
                e)

    def kill(self, query_id):
        params = {'query_id': query_id, 'json': 'true'}
        resp = self._root.get('cancel_query', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi kill did not return valid JSON: %s' % e)

    def get_query_backends(self, query_id):
        params = {'query_id': query_id, 'json': 'true'}

        resp = self._root.get('query_backends', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi query_backends did not return valid JSON: %s'
                % e)

    def get_query_finstances(self, query_id):
        params = {'query_id': query_id, 'json': 'true'}

        resp = self._root.get('query_finstances', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi query_finstances did not return valid JSON: %s'
                % e)

    def get_query_summary(self, query_id):
        params = {'query_id': query_id, 'json': 'true'}

        resp = self._root.get('query_summary', params=params)
        try:
            if isinstance(resp, basestring):
                return json.loads(resp)
            else:
                return resp
        except ValueError as e:
            raise ImpalaDaemonApiException(
                'ImpalaDaemonApi query_summary did not return valid JSON: %s' %
                e)

    def get_query_profile_encoded(self, query_id):
        params = {'query_id': query_id}

        return self._root.get('query_profile_encoded', params=params)
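
A usage sketch (the daemon URL and query id are placeholders): every endpoint is requested with json=true and string responses are decoded into Python objects before being returned.

api = ImpalaDaemonApi('http://impalad-host.example.com:25000')
api.set_user('hue')
running = api.get_queries()
profile = api.get_query_profile('d424420e0c44ab9:c1d9b9e600000000')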
Example #11
class NavigatorApi(object):
    """
    http://cloudera.github.io/navigator/apidocs/v3/index.html
    """
    def __init__(self, api_url=None, user=None, password=None):
        self._api_url = '%s/%s' % (
            (api_url or NAVIGATOR.API_URL.get()).strip('/'), VERSION)
        self._username = user or NAVIGATOR.AUTH_USERNAME.get()
        self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

        self._client = HttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(
            self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

    def _get_types_from_sources(self, sources):
        default_entity_types = entity_types = ('DATABASE', 'TABLE',
                                               'PARTITION', 'FIELD', 'FILE',
                                               'VIEW', 'OPERATION',
                                               'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            default_entity_types = ('TABLE', 'VIEW')
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')

        return default_entity_types, entity_types

    def search_entities(self, query_s, limit=100, offset=0, **filters):
        """
    Solr edismax query parser syntax.

    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms (split on whitespace), against a whitelist
      of search_fields.
      TODO: support smarter boolean searching with arbitrary ordering and precedence of conditionals
    """
        search_fields = ('originalName', 'originalDescription', 'name',
                         'description', 'tags')

        sources = filters.get('sources', [])
        default_entity_types, entity_types = self._get_types_from_sources(
            sources)

        try:
            params = self.__params

            search_terms = [term for term in query_s.strip().split()]

            query_clauses = []
            user_filters = []
            for term in search_terms:
                if ':' not in term:
                    query_clauses.append('OR'.join([
                        '(%s:*%s*)' % (field, term) for field in search_fields
                    ]))
                else:
                    name, val = term.split(':')
                    if val and (name != 'type' or val
                                in entity_types):  # Manual filter allowed
                        user_filters.append(term + '*')  # e.g. type:VIEW ca

            filter_query = '*'

            if query_clauses:
                filter_query = 'OR'.join(
                    ['(%s)' % clause for clause in query_clauses])

            user_filter_clause = 'OR '.join(['(%s)' % f
                                             for f in user_filters]) or '*'
            source_filter_clause = 'OR'.join([
                '(%s:%s)' % ('type', entity_type)
                for entity_type in default_entity_types
            ])

            filter_query = '%s AND (%s) AND (%s)' % (
                filter_query, user_filter_clause, source_filter_clause)

            params += (
                ('query', filter_query),
                ('offset', offset),
                ('limit', limit),
            )

            LOG.info(params)
            response = self._root.get('entities',
                                      headers=self.__headers,
                                      params=params)

            return response
        except RestException as e:
            msg = 'Failed to search for entities with search query: %s' % query_s
            LOG.exception(msg)
            raise NavigatorApiException(msg)
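
A usage sketch (added for illustration): the sources filter narrows the default entity types (TABLE and VIEW for SQL sources), while embedded facets such as type:VIEW pass through as manual filters inside the query string.

api = NavigatorApi()
results = api.search_entities('customer type:VIEW', limit=25, sources=['hive'])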
Example #12
class AtlasApi(Api):
    """
    https://atlas.apache.org
    """
    DEFAULT_SEARCH_FIELDS = (('originalName', 3), ('originalDescription', 1),
                             ('name', 10), ('description', 3), ('tags', 5))
    CATALOG_NAMESPACE = '__cloudera_internal_catalog_hue'

    NAV_TO_ATLAS_TYPE = {
        'table': 'hive_table',
        'database': 'hive_db',
        'field': 'hive_column'
    }

    ATLAS_TO_NAV_TYPE = {
        'hive_table': 'TABLE',
        'hive_db': 'DATABASE',
        'hive_column': 'FIELD'
    }

    CLASSIFICATION_RE = re.compile(
        r'(?:tag|tags|classification)\s*\:\s*(?:(?:\"([^"]+)\")|([^ ]+))\s*',
        re.IGNORECASE)
    TYPE_RE = re.compile(r'type\s*\:\s*([^ ]+)\s*', re.IGNORECASE)
    OWNER_RE = re.compile(r'owner\s*\:\s*([^ ]+)\s*', re.IGNORECASE)

    def __init__(self, user=None):
        super(AtlasApi, self).__init__(user)

        self._api_url = CATALOG.API_URL.get().strip('/') + "/api/atlas"
        self._username = CATALOG.SERVER_USER.get()
        self._password = CATALOG.SERVER_PASSWORD.get()

        self._client = HttpClient(self._api_url, logger=LOG)
        if CATALOG.KERBEROS_ENABLED.get():
            self._client.set_kerberos_auth()
        elif self._password:
            self._client.set_basic_auth(self._username, self._password)

        self._root = resource.Resource(
            self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

        #self._fillup_properties() # Disabled currently

    def _get_types_from_sources(self, sources):
        default_entity_types = entity_types = ('DATABASE', 'TABLE',
                                               'PARTITION', 'FIELD', 'FILE',
                                               'VIEW', 'S3BUCKET', 'OPERATION',
                                               'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def adapt_atlas_entity_to_navigator(self, atlas_entity):
        nav_entity = {
            "created":
            'createTime' in atlas_entity['attributes']
            and atlas_entity['attributes']['createTime'],
            "customProperties":
            None,
            "description":
            atlas_entity['attributes'].get('description'),
            "identity":
            atlas_entity['guid'],
            "internalType":
            atlas_entity['typeName'],
            "meaningNames":
            atlas_entity['meaningNames'],  # Atlas specific
            "meanings":
            atlas_entity['meanings'],  # Atlas specific
            "name":
            atlas_entity['attributes'].get('name'),
            "original_name":
            atlas_entity['attributes'].get('name'),
            "originalDescription":
            None,
            "originalName":
            atlas_entity['attributes'].get('name'),
            "owner":
            atlas_entity['attributes'].get('owner'),
            "parentPath":
            '',  # Set below
            "properties": {},  # Set below
            "sourceType":
            '',  # Set below
            "classifications": [],
            "tags":
            atlas_entity['classificationNames'],
            "type":
            self.ATLAS_TO_NAV_TYPE.get(atlas_entity['typeName'].lower())
            or atlas_entity['typeName']
        }

        # Convert Atlas qualified name of form db.tbl.col@cluster to parentPath of form /db/tbl
        if atlas_entity['typeName'].lower().startswith('hive_'):
            nav_entity['sourceType'] = 'HIVE'
            qualified_path_parts = re.sub(
                r'@.*$', '',
                atlas_entity['attributes'].get('qualifiedName')).split('.')
            qualified_path_parts.pop(
            )  # it's just the parent path we want so remove the entity name
            nav_entity['parentPath'] = '/' + '/'.join(qualified_path_parts)
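            # Worked example (added comment, not in the original source): a hive_column
            # with qualifiedName 'default.customers.id@cluster' loses '@cluster', splits
            # into ['default', 'customers', 'id'], drops 'id' via pop(), and ends with
            # parentPath '/default/customers'.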

        if 'classifications' in atlas_entity:
            nav_entity['classifications'] = atlas_entity['classifications']
            for atlas_classification in atlas_entity['classifications']:
                if 'attributes' in atlas_classification:
                    for key, value in atlas_classification[
                            'attributes'].iteritems():
                        nav_entity['properties'][key] = value

        return nav_entity

    def fetch_single_entity(self, dsl_query):
        '''
    REQUEST: hue:8889/metadata/api/navigator/find_entity?type=database&name=default
    SAMPLE response for Navigator find_entity response
    {"status": 0, "entity": {
    "customProperties": null,
    "deleteTime": null,
     "fileSystemPath": "hdfs://nightly6x-1.vpc.cloudera.com:8020/user/hive/warehouse",
     "description": null,
     "params": null,
      "type": "DATABASE",
      "internalType": "hv_database",
      "sourceType": "HIVE",
      "tags": [],
      "deleted": false, "technicalProperties": null,
      "userEntity": false,
      "originalDescription": "Default Hive database",
      "metaClassName": "hv_database",
      "properties": {"__cloudera_internal__hueLink": "https://nightly6x-1.vpc.cloudera.com:8889/hue/metastore/tables/default"},
      "identity": "23",
      "firstClassParentId": null,
      "name": null,
      "extractorRunId": "7##1",
      "sourceId": "7",
       "packageName": "nav",
       "parentPath": null, "originalName": "default"}}
    '''
        response = {"status": 0, "entity": []}

        try:
            atlas_response = self._root.get('/v2/search/dsl?query=%s' %
                                            dsl_query,
                                            headers=self.__headers,
                                            params=self.__params)
            if not 'entities' in atlas_response or len(
                    atlas_response['entities']) < 1:
                raise CatalogEntityDoesNotExistException(
                    'Could not find entity with query: %s' % dsl_query)

            for atlas_entity in atlas_response['entities']:
                response['entity'].append(
                    self.adapt_atlas_entity_to_navigator(atlas_entity))

            return response['entity'][0]
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Hue could not query Atlas',
                                            detail=e)

    def get_database(self, name):
        # Search with Atlas API for hive database with specific name
        if get_catalog_search_cluster():
            qualifiedNameCriteria = 'qualifiedName=\'%s@%s\'' % (
                name, get_catalog_search_cluster())
        else:
            qualifiedNameCriteria = 'qualifiedName like \'%s@*\'' % name

        return self.fetch_single_entity('hive_db where %s' %
                                        qualifiedNameCriteria)

    def get_table(self, database_name, table_name, is_view=False):
        # Search with Atlas API for hive tables with specific name
        if get_catalog_search_cluster():
            qualifiedNameCriteria = 'qualifiedName=\'%s.%s@%s\'' % (
                database_name, table_name, get_catalog_search_cluster())
        else:
            qualifiedNameCriteria = 'qualifiedName like \'%s.%s@*\'' % (
                database_name, table_name)

        return self.fetch_single_entity('hive_table where %s' %
                                        qualifiedNameCriteria)

    def get_field(self, database_name, table_name, field_name):
        # Search with Atlas API for hive tables with specific qualified name
        if get_catalog_search_cluster():
            qualifiedNameCriteria = 'qualifiedName=\'%s.%s.%s@%s\'' % (
                database_name, table_name, field_name,
                get_catalog_search_cluster())
        else:
            qualifiedNameCriteria = 'qualifiedName like \'%s.%s.%s@*\'' % (
                database_name, table_name, field_name)

        return self.fetch_single_entity('hive_column where %s' %
                                        qualifiedNameCriteria)

    def search_entities_interactive(self,
                                    query_s=None,
                                    limit=100,
                                    offset=0,
                                    facetFields=None,
                                    facetPrefix=None,
                                    facetRanges=None,
                                    filterQueries=None,
                                    firstClassEntitiesOnly=None,
                                    sources=None):
        response = {"status": 0, "results": [], "facets": {"tags": {}}}

        # This takes care of the list_tags endpoint
        if not query_s and facetFields and 'tags' in facetFields:
            classification_response = self._root.get(
                '/v2/types/typedefs?type=classification')
            for classification_def in classification_response[
                    'classificationDefs']:
                if ' ' in classification_def['name']:
                    response['facets']['tags']['"' +
                                               classification_def['name'] +
                                               '"'] = -1
                else:
                    response['facets']['tags'][classification_def['name']] = -1
            return response

        query_s = (query_s.strip() if query_s else '').replace('*', '')

        atlas_type = None
        classification = None
        owner = None

        # Take the first classification and type facets and ignore the rest, as we can't search for multiple in Atlas.
        classification_facets = self.CLASSIFICATION_RE.findall(query_s)
        if classification_facets:
            classification = classification_facets[0][
                0] or classification_facets[0][1]
            query_s = self.CLASSIFICATION_RE.sub('', query_s).strip()
            atlas_type = 'Asset'  # Filtered below to just contain hive_db, hive_table or hive_column

        owner_facets = self.OWNER_RE.findall(query_s)
        if owner_facets:
            owner = owner_facets[0]
            query_s = self.OWNER_RE.sub('', query_s).strip()

        type_facets = self.TYPE_RE.findall(query_s)
        if type_facets:
            atlas_type = self.NAV_TO_ATLAS_TYPE[
                type_facets[0].lower()] or type_facets[0]
            query_s = self.TYPE_RE.sub('', query_s).strip()

        data = {
            'attributes': None,
            'classification': classification,
            'entityFilters': {
                'condition':
                'AND',
                'criterion': [{
                    'condition':
                    'OR',
                    'criterion': [{
                        'attributeName': 'name',
                        'attributeValue': query_s,
                        'operator': 'contains'
                    }, {
                        'attributeName': 'description',
                        'attributeValue': query_s,
                        'operator': 'contains'
                    }]
                }]
            },
            'excludeDeletedEntities': True,
            'includeClassificationAttributes': True,
            'includeSubClassifications': True,
            'includeSubTypes': True,
            'limit': limit,
            'offset': 0,
            'tagFilters': None,
            'termName': None,
            'typeName': atlas_type or 'hive_table'
        }

        if get_catalog_search_cluster():
            data['entityFilters']['criterion'].append({
                'attributeName':
                'qualifiedName',
                'operator':
                'contains',
                'attributeValue':
                '@' + get_catalog_search_cluster()
            })

        if owner:
            data['entityFilters']['criterion'].append({
                'attributeName': 'owner',
                'operator': 'startsWith',
                'attributeValue': owner
            })

        try:
            atlas_response = self._root.post('/v2/search/basic',
                                             data=json.dumps(data),
                                             contenttype=_JSON_CONTENT_TYPE)

            # Adapt Atlas entities to Navigator structure in the results
            if 'entities' in atlas_response:
                for atlas_entity in atlas_response['entities']:
                    if atlas_type != 'Asset' or atlas_entity['typeName'].lower(
                    ) in ['hive_db', 'hive_table', 'hive_column']:
                        response['results'].append(
                            self.adapt_atlas_entity_to_navigator(atlas_entity))

            return response
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Hue could not query Atlas',
                                            detail=e)

    # search_entities is only used by the table browser to fetch child entities of a given table or database.
    def search_entities(self,
                        query_s,
                        limit=100,
                        offset=0,
                        raw_query=False,
                        **filters):
        try:
            found_entities = []

            search_terms = [term for term in query_s.strip().split()
                            ] if query_s else []
            parentPath = None
            for term in search_terms:
                if 'parentPath:' in term:
                    name, val = term.split(':')
                    parentPath = val.strip('"').lstrip('/').replace('/', '.')

            if query_s == 'type:database':
                if get_catalog_search_cluster():
                    atlas_dsl_query = 'from hive_db where qualifiedName like \'*@%s\' limit %s' % (
                        get_catalog_search_cluster(), limit)
                else:
                    atlas_dsl_query = 'from hive_db limit %s' % limit
            elif not parentPath:
                return found_entities
            else:
                atlas_type = 'hive_table' if parentPath.count(
                    '.') == 0 else 'hive_column'
                if get_catalog_search_cluster():
                    atlas_dsl_query = 'from %s where qualifiedName like \'%s*@%s\' limit %s' % (
                        atlas_type, parentPath, get_catalog_search_cluster(),
                        limit)
                else:
                    atlas_dsl_query = 'from %s where qualifiedName like \'%s*\' limit %s' % (
                        atlas_type, parentPath, limit)

            atlas_response = self._root.get('/v2/search/dsl?query=%s' %
                                            atlas_dsl_query)

            # Adapt Atlas entities to Navigator structure in the results
            if 'entities' in atlas_response:
                for atlas_entity in atlas_response['entities']:
                    found_entities.append(
                        self.adapt_atlas_entity_to_navigator(atlas_entity))

            return found_entities
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Hue could not query Atlas',
                                            detail=e)

    def suggest(self, prefix=None):
        try:
            return self._root.get('interactive/suggestions?query=%s' %
                                  (prefix or '*'))
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to search for entities',
                                            detail=e)

    def get_entity(self, entity_id):
        """
    # TODO: get entity by Atlas __guid or qualifiedName
    GET /v2/search/dsl?query=?
    """
        try:
            return self._root.get('entities/%s' % entity_id,
                                  headers=self.__headers,
                                  params=self.__params)
        except RestException as e:
            msg = 'Failed to get entity %s: %s' % (entity_id, str(e))
            LOG.error(msg)
            raise CatalogApiException(e.message)

    def update_entity(self, entity, **metadata):
        """
    PUT /api/v3/entities/:id
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities_-id-.html
    """
        try:
            # Works around NAV-6187: if we don't re-send those, they would get erased.
            properties = {
                'name': entity['name'],
                'description': entity['description'],
                'properties': entity['properties'] or {},
                'customProperties': entity['customProperties'] or {}
            }
            properties.update(metadata)
            data = json.dumps(properties)

            return self._root.put('entities/%(identity)s' % entity,
                                  params=self.__params,
                                  data=data,
                                  contenttype=_JSON_CONTENT_TYPE,
                                  allow_redirects=True,
                                  clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to update entity',
                                            detail=e)

    def get_cluster_source_ids(self):
        return []
        # params = (
        #   ('query', 'clusterName:"%s"' % get_navigator_hue_server_name()),
        #   ('limit', 200),
        # )

        # LOG.info(params)
        # return self._root.get('entities', headers=self.__headers, params=params)

    def add_tags(self, entity_id, tags):
        entity = self.get_entity(entity_id)
        new_tags = entity['tags'] or []
        new_tags.extend(tags)
        return self.update_entity(entity, tags=new_tags)

    def delete_tags(self, entity_id, tags):
        entity = self.get_entity(entity_id)
        new_tags = entity['tags'] or []
        for tag in tags:
            if tag in new_tags:
                new_tags.remove(tag)
        return self.update_entity(entity, tags=new_tags)

    def update_properties(self,
                          entity_id,
                          properties,
                          modified_custom_metadata=None,
                          deleted_custom_metadata_keys=None):
        entity = self.get_entity(entity_id)

        if modified_custom_metadata:
            properties['properties'] = entity['properties'] or {}
            properties['properties'].update(modified_custom_metadata)
        if deleted_custom_metadata_keys:
            properties['properties'] = entity['properties'] or {}
            for key in deleted_custom_metadata_keys:
                if key in properties['properties']:
                    del properties['properties'][key]
        return self.update_entity(entity, **properties)

    def delete_metadata_properties(self, entity_id, property_keys):
        entity = self.get_entity(entity_id)
        new_props = entity['properties'] or {}
        for key in property_keys:
            if key in new_props:
                del new_props[key]
        return self.update_entity(entity, properties=new_props)

    def get_lineage(self, entity_id):
        """
    GET /api/v3/lineage/entityIds=:id
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_lineage.html
    """
        try:
            params = self.__params

            params += (('entityIds', entity_id), )

            return self._root.get('lineage',
                                  headers=self.__headers,
                                  params=params)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to get lineage', detail=e)

    def create_namespace(self, namespace, description=None):
        try:
            data = json.dumps({'name': namespace, 'description': description})
            return self._root.post('models/namespaces/',
                                   data=data,
                                   contenttype=_JSON_CONTENT_TYPE,
                                   clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to create namespace',
                                            detail=e)

    def get_namespace(self, namespace):
        try:
            return self._root.get('models/namespaces/%(namespace)s' %
                                  {'namespace': namespace})
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to get namespace',
                                            detail=e)

    def create_namespace_property(self, namespace, properties):
        try:
            data = json.dumps(properties)
            return self._root.post(
                'models/namespaces/%(namespace)s/properties' %
                {'namespace': namespace},
                data=data,
                contenttype=_JSON_CONTENT_TYPE,
                clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to create namespace property',
                                            detail=e)

    def get_namespace_properties(self, namespace):
        try:
            return self._root.get(
                'models/namespaces/%(namespace)s/properties' %
                {'namespace': namespace})
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to get namespace properties',
                                            detail=e)

    def map_namespace_property(self, clazz, properties):
        try:
            data = json.dumps(properties)
            return self._root.post(
                'models/packages/nav/classes/%(class)s/properties' %
                {'class': clazz},
                data=data,
                contenttype=_JSON_CONTENT_TYPE,
                clear_cookies=True)
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception('Failed to map class', detail=e)

    def get_model_properties_mapping(self):
        try:
            return self._root.get('models/properties/mappings')
        except RestException as e:
            if e.code == 401:
                raise raise_popup_exception(
                    'Hue could not authenticate to Atlas', detail=e)
            else:
                raise raise_popup_exception(
                    'Failed to get models properties mappings', detail=e)

    def _fillup_properties(self):
        global _HAS_CATALOG_NAMESPACE

        if _HAS_CATALOG_NAMESPACE is None:
            response = self.get_namespace(namespace=AtlasApi.CATALOG_NAMESPACE)
            if not response:
                self.create_namespace(
                    namespace=AtlasApi.CATALOG_NAMESPACE,
                    description="Set of fields to augment the data catalog")

            properties = self.get_namespace_properties(
                namespace=AtlasApi.CATALOG_NAMESPACE)

            if not [
                    _property for _property in properties
                    if _property['name'] == 'relatedDocuments'
            ]:
                self.create_namespace_property(
                    namespace=AtlasApi.CATALOG_NAMESPACE,
                    properties={
                        "name": "relatedDocuments",
                        "displayName": "Related documents",
                        "description":
                        "List of Hue document UUIDs related to this entity",
                        "multiValued": True,
                        "maxLength": 36,
                        "pattern": ".*",  # UUID
                        "enumValues": None,
                        "type": "TEXT"
                    })

                # Might want to check if the mapping is already done
                for clazz in ('hv_table', 'hv_view'):
                    self.map_namespace_property(clazz,
                                                properties=[{
                                                    "namespace":
                                                    AtlasApi.CATALOG_NAMESPACE,
                                                    "name":
                                                    "relatedDocuments"
                                                }])

            _HAS_CATALOG_NAMESPACE = True

    def _get_boosted_term(self, term):
        return 'AND'.join([
            '(%s)' % 'OR'.join([
                '(%s:%s*^%s)' % (field, term, weight)
                for (field, weight) in AtlasApi.DEFAULT_SEARCH_FIELDS
            ]),  # Matching fields
            '(%s)' % 'OR'.join([
                '(%s:[* TO *])' % field
                for (field, weight) in AtlasApi.DEFAULT_SEARCH_FIELDS
            ])  # Boost entities with enriched fields
            # Could add certain customProperties and properties
        ])

    def _clean_path(self, path):
        return path.rstrip('/').split('/')[-1], self._escape_slashes(
            path.rstrip('/'))

    def _escape_slashes(self, s):
        return s.replace('/', '\/')
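
Before moving on, here is a minimal, hedged sketch of the Atlas DSL strings the fallback search above ends up sending. Only the query-building branches are copied; the HTTP call, the cluster lookup and the Navigator adaptation are omitted, and the cluster name 'cl1', the limit and the parent paths are assumed example values.

def build_atlas_dsl_query(query_s, parent_path=None, cluster='cl1', limit=100):
    # Sketch of the branches in search_entities above; 'cl1' is an assumed cluster name.
    if query_s == 'type:database':
        return "from hive_db where qualifiedName like '*@%s' limit %s" % (cluster, limit)
    if parent_path is None:
        return None  # the real method returns the (still empty) found_entities list here
    # One path component (no '.') means a table search, deeper paths mean columns.
    atlas_type = 'hive_table' if parent_path.count('.') == 0 else 'hive_column'
    return "from %s where qualifiedName like '%s*@%s' limit %s" % (
        atlas_type, parent_path, cluster, limit)

print(build_atlas_dsl_query('type:database'))
# from hive_db where qualifiedName like '*@cl1' limit 100
print(build_atlas_dsl_query('*', parent_path='default'))
# from hive_table where qualifiedName like 'default*@cl1' limit 100
print(build_atlas_dsl_query('*', parent_path='default.web_logs'))
# from hive_column where qualifiedName like 'default.web_logs*@cl1' limit 100

In the class itself, parent_path comes from a parentPath:"/db/table" search term with the quotes and the leading slash stripped and the remaining slashes converted to dots.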
Example #13
0
class NavigatorApi(object):
    """
  http://cloudera.github.io/navigator/apidocs/v3/index.html
  """
    def __init__(self, api_url=None, user=None, password=None):
        self._api_url = '%s/%s' % (
            (api_url or NAVIGATOR.API_URL.get()).strip('/'), VERSION)
        self._username = user or NAVIGATOR.AUTH_USERNAME.get()
        self._password = password or NAVIGATOR.AUTH_PASSWORD.get()

        self._client = HttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(self._client)

        self.__headers = {}
        self.__params = ()

    def search_entities(self, query_s, limit=100, offset=0, **filters):
        """
    GET /api/v3/entities?query=()
    http://cloudera.github.io/navigator/apidocs/v3/path__v3_entities.html
    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms (split on whitespace), against a whitelist
      of search_fields.
      TODO: support smarter boolean searching with arbitrary ordering and precedence of conditionals
    :param filters: TODO: IMPLEMENT ME, required to support property search
    """
        search_fields = ('originalName', 'originalDescription', 'name',
                         'description', 'tags')
        entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE',
                        'OPERATION')

        try:
            params = self.__params

            search_terms = [term.lower() for term in query_s.strip().split()]

            query_clauses = []
            for term in search_terms:
                query_clauses.append('OR'.join(
                    ['(%s:*%s*)' % (field, term) for field in search_fields]))

            filter_query = '(originalName:*.*)'
            if search_terms:
                filter_query = 'OR'.join(
                    ['(%s)' % clause for clause in query_clauses])

            type_filter_clause = 'OR'.join([
                '(%s:%s)' % ('type', entity_type)
                for entity_type in entity_types
            ])
            filter_query = '%sAND(%s)' % (filter_query, type_filter_clause)

            params += (
                ('query', filter_query),
                ('offset', offset),
                ('limit', limit),
            )

            response = self._root.get('entities',
                                      headers=self.__headers,
                                      params=params)

            return response
        except RestException as e:
            msg = 'Failed to search for entities with search query: %s' % query_s
            LOG.exception(msg)
            raise NavigatorApiException(msg)
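
A worked, hedged sketch of the filter string the search_entities method above assembles for a single assumed term, 'sales'. Only the string construction is reproduced; the GET request is omitted.

# Standalone sketch of the query assembly in the example above; no HTTP call.
search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')
entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'OPERATION')

term = 'sales'  # assumed example term
term_clause = 'OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields])
type_clause = 'OR'.join(['(type:%s)' % entity_type for entity_type in entity_types])
filter_query = '(%s)AND(%s)' % (term_clause, type_clause)

print(filter_query)
# One line in reality, wrapped here for readability:
# ((originalName:*sales*)OR(originalDescription:*sales*)OR(name:*sales*)OR
# (description:*sales*)OR(tags:*sales*))AND((type:DATABASE)OR(type:TABLE)OR
# (type:PARTITION)OR(type:FIELD)OR(type:FILE)OR(type:OPERATION))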
Example #14
0
class NavigatorApi(object):
    """
  http://cloudera.github.io/navigator/apidocs/v3/index.html
  """
    def __init__(self, user=None):
        self._api_url = '%s/%s' % (NAVIGATOR.API_URL.get().strip('/'), VERSION)
        self._username = NAVIGATOR.AUTH_USERNAME.get()
        self._password = NAVIGATOR.AUTH_PASSWORD.get()

        self.user = user
        self._client = HttpClient(self._api_url, logger=LOG)
        self._client.set_basic_auth(self._username, self._password)
        self._root = resource.Resource(
            self._client, urlencode=False)  # For search_entities_interactive

        self.__headers = {}
        self.__params = ()

    def _get_types_from_sources(self, sources):
        default_entity_types = entity_types = ('DATABASE', 'TABLE',
                                               'PARTITION', 'FIELD', 'FILE',
                                               'VIEW', 'S3BUCKET', 'OPERATION',
                                               'DIRECTORY')

        if 'sql' in sources or 'hive' in sources or 'impala' in sources:
            entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
            default_entity_types = ('TABLE', 'VIEW')
        elif 'hdfs' in sources:
            entity_types = ('FILE', 'DIRECTORY')
            default_entity_types = ('FILE', 'DIRECTORY')
        elif 's3' in sources:
            entity_types = ('FILE', 'DIRECTORY', 'S3BUCKET')
            default_entity_types = ('DIRECTORY', 'S3BUCKET')

        return default_entity_types, entity_types

    def search_entities(self, query_s, limit=100, offset=0, **filters):
        """
    Solr edismax query parser syntax.

    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms (split on whitespace), against a whitelist
      of search_fields.
    """
        search_fields = ('originalName', 'originalDescription', 'name',
                         'description', 'tags')

        sources = filters.get('sources', [])
        default_entity_types, entity_types = self._get_types_from_sources(
            sources)

        try:
            params = self.__params

            search_terms = [term for term in query_s.strip().split()]

            query_clauses = []
            user_filters = []
            source_type_filter = []

            for term in search_terms:
                if ':' not in term:
                    query_clauses.append('OR'.join([
                        '(%s:*%s*)' % (field, term) for field in search_fields
                    ]))
                else:
                    name, val = term.split(':')
                    if val:
                        if name == 'type':
                            term = '%s:%s' % (name, val.upper().strip('*'))
                            default_entity_types = entity_types  # Make sure type value still makes sense for the source
                        user_filters.append(
                            term +
                            '*')  # Manual filter allowed e.g. type:VIE* ca

            filter_query = '*'

            if query_clauses:
                filter_query = 'OR'.join(
                    ['(%s)' % clause for clause in query_clauses])

            user_filter_clause = 'OR '.join(['(%s)' % f
                                             for f in user_filters]) or '*'
            source_filter_clause = 'OR'.join([
                '(%s:%s)' % ('type', entity_type)
                for entity_type in default_entity_types
            ])
            if 's3' in sources:
                source_type_filter.append('sourceType:s3')

            filter_query = '%s AND (%s) AND (%s)' % (
                filter_query, user_filter_clause, source_filter_clause)
            if source_type_filter:
                filter_query += ' AND (%s)' % 'OR '.join(source_type_filter)
            if get_navigator_hue_server_name():
                filter_query += 'AND clusterName:%s' % get_navigator_hue_server_name()

            params += (
                ('query', filter_query),
                ('offset', offset),
                ('limit', NAVIGATOR.FETCH_SIZE_SEARCH.get()),
            )

            LOG.info(params)
            response = self._root.get('entities',
                                      headers=self.__headers,
                                      params=params)

            response = list(islice(self._secure_results(response),
                                   limit))  # Apply Sentry perms

            return response
        except RestException as e:
            msg = 'Failed to search for entities with search query: %s' % query_s
            LOG.exception(msg)
            raise NavigatorApiException(msg)
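
To make the user-filter and source-narrowing behaviour above concrete, here is a hedged sketch of the edismax filter the method would build for the assumed query 'customer type:VIE*' with sources=['hive'] (no S3 source and no clusterName configured). Only the string is built.

# Standalone sketch; 'customer' and 'type:VIE*' are assumed example terms.
search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')
# An explicit 'type:' term widens the type filter from the 'hive' defaults
# ('TABLE', 'VIEW') to every type valid for SQL sources.
entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')

term_clause = 'OR'.join(['(%s:*%s*)' % (field, 'customer') for field in search_fields])
user_filter_clause = '(type:VIE*)'  # 'VIE*' is upper-cased, '*'-stripped, then '*' is re-appended
source_filter_clause = 'OR'.join(['(type:%s)' % t for t in entity_types])

filter_query = '(%s) AND (%s) AND (%s)' % (
    term_clause, user_filter_clause, source_filter_clause)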
Example #15
0
class NavigatorApi(object):
  """
  http://cloudera.github.io/navigator/apidocs/v3/index.html
  """

  def __init__(self, user=None):
    self._api_url = '%s/%s' % (NAVIGATOR.API_URL.get().strip('/'), VERSION)
    self._username = NAVIGATOR.AUTH_USERNAME.get()
    self._password = NAVIGATOR.AUTH_PASSWORD.get()

    self.user = user
    self._client = HttpClient(self._api_url, logger=LOG)
    self._client.set_basic_auth(self._username, self._password)
    self._root = resource.Resource(self._client, urlencode=False) # For search_entities_interactive

    self.__headers = {}
    self.__params = ()


  def _get_types_from_sources(self, sources):
    default_entity_types = entity_types = ('DATABASE', 'TABLE', 'PARTITION', 'FIELD', 'FILE', 'VIEW', 'OPERATION', 'DIRECTORY')

    if 'sql' in sources or 'hive' in sources or 'impala' in sources:
      default_entity_types = ('TABLE', 'VIEW')
      entity_types = ('TABLE', 'VIEW', 'DATABASE', 'PARTITION', 'FIELD')
    elif 'hdfs' in sources:
      entity_types = ('FILE', 'DIRECTORY')
      default_entity_types  = ('FILE', 'DIRECTORY')

    return default_entity_types, entity_types


  def search_entities(self, query_s, limit=100, offset=0, **filters):
    """
    Solr edismax query parser syntax.

    :param query_s: a query string of search terms (e.g. - sales quarterly);
      Currently the search will perform an OR boolean search for all terms (split on whitespace), against a whitelist
      of search_fields.
    """
    search_fields = ('originalName', 'originalDescription', 'name', 'description', 'tags')

    sources = filters.get('sources', [])
    default_entity_types, entity_types = self._get_types_from_sources(sources)

    try:
      params = self.__params

      search_terms = [term for term in query_s.strip().split()]

      query_clauses = []
      user_filters = []
      for term in search_terms:
        if ':' not in term:
          query_clauses.append('OR'.join(['(%s:*%s*)' % (field, term) for field in search_fields]))
        else:
          name, val = term.split(':')
          if val:
            if name == 'type':
              term = '%s:%s' % (name, val.upper().strip('*'))
              default_entity_types = entity_types # Make sure type value still makes sense for the source
            user_filters.append(term + '*') # Manual filter allowed e.g. type:VIE* ca

      filter_query = '*'

      if query_clauses:
        filter_query = 'OR'.join(['(%s)' % clause for clause in query_clauses])

      user_filter_clause = 'OR '.join(['(%s)' % f for f in user_filters]) or '*'
      source_filter_clause = 'OR'.join(['(%s:%s)' % ('type', entity_type) for entity_type in default_entity_types])

      filter_query = '%s AND (%s) AND (%s)' % (filter_query, user_filter_clause, source_filter_clause)

      params += (
        ('query', filter_query),
        ('offset', offset),
        ('limit', limit),
      )

      LOG.info(params)
      response = self._root.get('entities', headers=self.__headers, params=params)

      self._secure_results(response)

      return response
    except RestException as e:
      msg = 'Failed to search for entities with search query: %s' % query_s
      LOG.exception(msg)
      raise NavigatorApiException(msg)
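
Finally, a hedged usage sketch for the class above. The NAVIGATOR.* settings and the authenticated user object are assumed to be provided by the surrounding Hue deployment, and the response is whatever the Navigator /entities endpoint returns after the Sentry permission check (_secure_results) has run.

# Usage sketch only; `user` is an assumed, already-authenticated Hue user.
api = NavigatorApi(user=user)

# Bare terms are OR-searched across the whitelisted search_fields; 'type:'
# terms are upper-cased and kept as manual user filters, and sources=['hive']
# restricts the default type filter to tables and views.
response = api.search_entities('web logs type:TABLE', sources=['hive'], limit=50)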