Beispiel #1
0
 def rename_star(self, old_dir, new_dir):
   """Move every entry of old_dir into new_dir, i.e. `mv old_dir/* new_dir`."""
   # The source must be an existing directory.
   if not self.isdir(old_dir):
     raise IOError(errno.ENOTDIR, _("'%s' is not a directory") % old_dir)
   # Create the destination on demand, but refuse to clobber a plain file.
   if not self.exists(new_dir):
     self.mkdir(new_dir)
   elif not self.isdir(new_dir):
     raise IOError(errno.ENOTDIR, _("'%s' is not a directory") % new_dir)
   # One rename per child entry.
   for entry in self.listdir(old_dir):
     self.rename(Hdfs.join(old_dir, entry), Hdfs.join(new_dir, entry))
Beispiel #2
0
 def _create_deployment_dir(self):
   """
   Return the job deployment directory in HDFS, creating it if necessary.
   The actual deployment dir should be 0711 owned by the user
   """
   # Namespaced by user and job id, timestamped to avoid collisions.
   dir_name = '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time())
   path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), dir_name)
   self._create_dir(path)
   return path
Beispiel #3
0
 def rename(self, old, new):
   """rename(old, new)"""
   old = Hdfs.normpath(old)
   # A relative destination is resolved against the source's directory.
   if not new.startswith('/'):
     new = Hdfs.join(Hdfs.dirname(old), new)
   new = Hdfs.normpath(new)
   params = self._getparams()
   params['op'] = 'RENAME'
   # Encode `new' because it's in the params
   params['destination'] = smart_str(new)
   result = self._root.put(old, params)
   if result['boolean']:
     return
   # WebHDFS reported failure: distinguish a move into an existing
   # directory from a plain rename for the error message.
   error_message = "Rename failed:"
   if self.isdir(new) and not self.isdir(old):
     error_message = "Move failed:"
     new = Hdfs.join(new, self.basename(old))
   raise IOError(_(error_message + " %s -> %s") %
                 (str(smart_str(old)), str(smart_str(new))))
Beispiel #4
0
 def _create_deployment_dir(self):
   """
   Return the job deployment directory in HDFS, creating it if necessary.
   The actual deployment dir should be 0711 owned by the user
   """
   if self.user != self.job.owner:
     # Shared job: stage a private, timestamped copy of the owner's
     # workspace for the submitting user instead of touching the original.
     path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time()))
     # 0o711 replaces the Python 2-only octal literal 0711 (same mode, Py3-safe).
     self.fs.copy_remote_dir(self.job.deployment_dir, path, owner=self.user, dir_mode=0o711)
   else:
     path = self.job.deployment_dir
     self._create_dir(path)
   return path
Beispiel #5
0
 def rename(self, old, new):
     """rename(old, new)"""
     # Normalize the source path; resolve a relative target against it.
     old = Hdfs.normpath(old)
     if not new.startswith("/"):
         new = Hdfs.join(Hdfs.dirname(old), new)
     new = Hdfs.normpath(new)
     # Build the WebHDFS RENAME request; the destination travels as a param.
     params = self._getparams()
     params["op"] = "RENAME"
     # Encode `new' because it's in the params
     params["destination"] = smart_str(new)
     if not self._root.put(old, params)["boolean"]:
         raise IOError("Rename failed: %s -> %s" % (smart_str(old), smart_str(new)))
Beispiel #6
0
def create_directories(fs, directory_list=None):
  """
  Create the remote home and deployment directories if they are missing.

  `directory_list` is an optional list of extra directories to ensure, on
  top of the configured remote deployment dir. All filesystem calls run
  as `fs.DEFAULT_USER`.
  """
  # Avoid the mutable-default-argument pitfall; the old `directory_list=[]`
  # default behaves identically (the list is only read), but None is safer.
  if directory_list is None:
    directory_list = []
  directories = [REMOTE_DEPLOYMENT_DIR.get()] + directory_list

  for directory in directories:
    if not fs.do_as_user(fs.DEFAULT_USER, fs.exists, directory):
      remote_home_dir = Hdfs.join('/user', fs.DEFAULT_USER)
      if directory.startswith(remote_home_dir):
        # Home is 755
        fs.do_as_user(fs.DEFAULT_USER, fs.create_home_dir, remote_home_dir)
      # Shared by all the users. 0o1777 replaces the Python 2-only literal 01777.
      fs.do_as_user(fs.DEFAULT_USER, fs.mkdir, directory, 0o1777)
      fs.do_as_user(fs.DEFAULT_USER, fs.chmod, directory, 0o1777) # To remove after https://issues.apache.org/jira/browse/HDFS-3491
def create_directories(fs):
  # Ensure the deployment and sample directories exist, creating any that
  # are missing as the 'hdfs' superuser; return the sample dir path.
  for directory in (REMOTE_DEPLOYMENT_DIR.get(), REMOTE_SAMPLE_DIR.get()):
    if not fs.do_as_user("hdfs", fs.exists, directory):
      remote_home_dir = Hdfs.join('/user', "hdfs")
      if directory.startswith(remote_home_dir):
        # Home is 755
        fs.do_as_user("hdfs", fs.create_home_dir, remote_home_dir)
      # Shared by all the users (511 decimal == mode 0o777).
      fs.do_as_user("hdfs", fs.mkdir, directory, 511)
      fs.do_as_user("hdfs", fs.chmod, directory, 511) # To remove after https://issues.apache.org/jira/browse/HDFS-3491

  return REMOTE_SAMPLE_DIR.get()
Beispiel #8
0
def create_directories(fs):
  """
  Create the remote deployment and sample directories if missing.

  Runs every filesystem call as `fs.DEFAULT_USER`. Returns the remote
  sample directory path.
  """
  directories = (REMOTE_DEPLOYMENT_DIR.get(), REMOTE_SAMPLE_DIR.get())

  for directory in directories:
    if not fs.do_as_user(fs.DEFAULT_USER, fs.exists, directory):
      remote_home_dir = Hdfs.join('/user', fs.DEFAULT_USER)
      if directory.startswith(remote_home_dir):
        # Home is 755
        fs.do_as_user(fs.DEFAULT_USER, fs.create_home_dir, remote_home_dir)
      # Shared by all the users. 0o1777 replaces the Python 2-only literal 01777.
      fs.do_as_user(fs.DEFAULT_USER, fs.mkdir, directory, 0o1777)
      fs.do_as_user(fs.DEFAULT_USER, fs.chmod, directory, 0o1777) # To remove after https://issues.apache.org/jira/browse/HDFS-3491

  return REMOTE_SAMPLE_DIR.get()
Beispiel #9
0
 def rename(self, old, new):
     """rename(old, new)"""
     old = Hdfs.normpath(old)
     # Relative destinations are taken relative to the source directory.
     if not new.startswith('/'):
         new = Hdfs.join(Hdfs.dirname(old), new)
     new = Hdfs.normpath(new)
     params = self._getparams()
     params['op'] = 'RENAME'
     # Encode `new' because it's in the params
     params['destination'] = smart_str(new)
     result = self._root.put(old, params)
     if not result['boolean']:
         msg = _("Rename failed: %s -> %s") % (str(smart_str(old)),
                                               str(smart_str(new)))
         raise IOError(msg)
Beispiel #10
0
  def __init__(self, file_status, parent_path):
    """Build a stat-like record from a WebHDFS FileStatus JSON object.

    `file_status` is the decoded FileStatus dict; `parent_path` is the
    directory containing this entry.
    """
    self.path = Hdfs.join(parent_path, decode_fs_path(file_status['pathSuffix']))
    self.isDir = file_status['type'] == 'DIRECTORY'
    # FileStatus times are in milliseconds; convert to seconds.
    self.atime = file_status['accessTime'] / 1000
    self.mtime = file_status['modificationTime'] / 1000
    self.user = file_status['owner']
    self.group = file_status['group']
    self.size = file_status['length']
    self.blockSize = file_status['blockSize']
    self.replication = file_status['replication']

    # 'permission' is an octal string (e.g. "755"); fold in the file-type
    # bits so `mode` looks like a regular stat() st_mode.
    self.mode = int(file_status['permission'], 8)
    if self.isDir:
      self.mode |= stat.S_IFDIR
    else:
      self.mode |= stat.S_IFREG
Beispiel #11
0
 def rename(self, old, new):
     """rename(old, new)"""
     old = self.strip_normpath(old)
     # A relative destination is resolved against the source's directory.
     if not self.is_absolute(new):
         new = Hdfs.join(Hdfs.dirname(old), new)
     new = self.strip_normpath(new)
     params = self._getparams()
     params['op'] = 'RENAME'
     # Encode `new' because it's in the params
     params['destination'] = smart_str(new)
     result = self._root.put(old, params, headers=self._getheaders())
     if not result['boolean']:
         raise IOError(_("Rename failed: %s -> %s") %
                       (smart_str(old, errors='replace'),
                        smart_str(new, errors='replace')))
Beispiel #12
0
 def _create_deployment_dir(self):
     """
     Return the job deployment directory in HDFS, creating it if necessary.
     The actual deployment dir should be 0711 owned by the user
     """
     if self.user != self.job.owner:
         # Shared job: stage a private, timestamped copy of the owner's
         # workspace for the submitting user instead of touching the original.
         path = Hdfs.join(
             REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' %
             (self.user.username, self.job.id, time.time()))
         # 0o711 replaces the Python 2-only octal literal 0711 (same mode, Py3-safe).
         self.fs.copy_remote_dir(self.job.deployment_dir,
                                 path,
                                 owner=self.user,
                                 dir_mode=0o711)
     else:
         path = self.job.deployment_dir
         self._create_dir(path)
     return path
Beispiel #13
0
  def create_data_dir(cls, fs):
    """Ensure the remote data directory exists and return its path.

    Temporarily switches `fs` to its default (super)user for the check and
    creation, restoring the original user afterwards.
    """
    remote_data_dir = conf.REMOTE_DATA_DIR.get()
    user = fs.user

    try:
      fs.setuser(fs.DEFAULT_USER)
      if not fs.exists(remote_data_dir):
        remote_home_dir = Hdfs.join('/user', fs.user)
        if remote_data_dir.startswith(remote_home_dir):
          # Home is 755
          fs.create_home_dir(remote_home_dir)
        # Shared by all the users. 0o1777 replaces the Python 2-only literal 01777.
        fs.mkdir(remote_data_dir, 0o1777)
    finally:
      # Always restore the caller's identity, even if a call above failed.
      fs.setuser(user)

    return remote_data_dir
Beispiel #14
0
    def create_data_dir(cls, fs):
        """Ensure the remote data directory exists and return its path.

        Temporarily switches `fs` to its default (super)user for the check
        and creation, restoring the original user afterwards.
        """
        remote_data_dir = conf.REMOTE_DATA_DIR.get()
        user = fs.user

        try:
            fs.setuser(fs.DEFAULT_USER)
            if not fs.exists(remote_data_dir):
                remote_home_dir = Hdfs.join('/user', fs.user)
                if remote_data_dir.startswith(remote_home_dir):
                    # Home is 755
                    fs.create_home_dir(remote_home_dir)
                # Shared by all the users. 0o1777 replaces the Python 2-only literal 01777.
                fs.mkdir(remote_data_dir, 0o1777)
        finally:
            # Always restore the caller's identity, even if a call above failed.
            fs.setuser(user)

        return remote_data_dir
Beispiel #15
0
    def __init__(self, file_status, parent_path):
        """Build a stat-like record from a WebHDFS FileStatus JSON object.

        `file_status` is the decoded FileStatus dict; `parent_path` is the
        directory containing this entry.
        """
        self.name = decode_fs_path(file_status['pathSuffix'])
        self.path = Hdfs.join(parent_path, self.name)
        self.isDir = file_status['type'] == 'DIRECTORY'
        self.type = file_status['type']
        # FileStatus times are in milliseconds; convert to seconds.
        self.atime = file_status['accessTime'] / 1000
        self.mtime = file_status['modificationTime'] / 1000
        self.user = file_status['owner']
        self.group = file_status['group']
        self.size = file_status['length']
        self.blockSize = file_status['blockSize']
        self.replication = file_status['replication']

        # 'permission' is an octal string (e.g. "755"); fold in the
        # file-type bits so `mode` looks like a regular stat() st_mode.
        self.mode = int(file_status['permission'], 8)
        if self.isDir:
            self.mode |= stat.S_IFDIR
        else:
            self.mode |= stat.S_IFREG
Beispiel #16
0
  def _create_deployment_dir(self):
    """
    Return the job deployment directory in HDFS, creating it if necessary.
    The actual deployment dir should be 0711 owned by the user
    """
    # Automatic setup of the required directories if needed
    create_directories(self.fs)

    # Case of a shared job
    if self.user != self.job.owner:
      path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time()))
      # Shared coords or bundles might not have any existing workspaces
      if self.fs.exists(self.job.deployment_dir):
        # 0o711 replaces the Python 2-only octal literal 0711 (same mode, Py3-safe).
        self.fs.copy_remote_dir(self.job.deployment_dir, path, owner=self.user, dir_mode=0o711)
      else:
        self._create_dir(path)
    else:
      path = self.job.deployment_dir
      self._create_dir(path)
    return path
Beispiel #17
0
  def _create_deployment_dir(self):
    """
    Return the job deployment directory in HDFS, creating it if necessary.
    The actual deployment dir should be 0711 owned by the user
    """
    # Automatic setup of the required directories if needed
    create_directories(self.fs)

    # Case of a shared job
    if self.user != self.job.owner:
      path = Hdfs.join(REMOTE_DEPLOYMENT_DIR.get(), '_%s_-oozie-%s-%s' % (self.user.username, self.job.id, time.time()))
      # Shared coords or bundles might not have any existing workspaces
      if self.fs.exists(self.job.deployment_dir):
        # 0o711 replaces the Python 2-only octal literal 0711 (same mode, Py3-safe).
        self.fs.copy_remote_dir(self.job.deployment_dir, path, owner=self.user, dir_mode=0o711)
      else:
        self._create_dir(path)
    else:
      path = self.job.deployment_dir
      self._create_dir(path)
    return path
Beispiel #18
0
def create_data_dir(fs):
  """
  Create the remote deployment and sample directories if missing.

  Temporarily switches `fs` to its default (super)user for the checks and
  creations, restoring the original user afterwards. Returns the remote
  sample directory path.
  """
  directories = (REMOTE_DEPLOYMENT_DIR.get(), REMOTE_SAMPLE_DIR.get())
  user = fs.user

  try:
    fs.setuser(fs.DEFAULT_USER)
    for directory in directories:
      if not fs.exists(directory):
        remote_home_dir = Hdfs.join('/user', fs.user)
        if directory.startswith(remote_home_dir):
          # Home is 755
          fs.create_home_dir(remote_home_dir)
        # Shared by all the users. 0o1777 replaces the Python 2-only literal 01777.
        fs.mkdir(directory, 0o1777)
        fs.chmod(directory, 0o1777) # To remove after https://issues.apache.org/jira/browse/HDFS-3491
  finally:
    # Always restore the caller's identity, even if a call above failed.
    fs.setuser(user)

  return REMOTE_SAMPLE_DIR.get()
Beispiel #19
0
class Submission(object):
    """
  Represents one unique Oozie submission.

  Actions are:
  - submit
  - rerun
  """
    def __init__(self,
                 user,
                 job=None,
                 fs=None,
                 jt=None,
                 properties=None,
                 oozie_id=None,
                 local_tz=None):
        """Capture the context for one Oozie submission.

        `properties` is the job configuration dict (kept by reference:
        `self.properties` aliases it when provided); `oozie_id` is set when
        wrapping an already-submitted job; `local_tz` is the client
        timezone used to convert coordinator dates to server time.
        """
        self.job = job
        self.user = user
        self.fs = fs
        self.jt = jt  # Deprecated with YARN, we now use logical names only for RM
        self.oozie_id = oozie_id
        self.api = get_oozie(self.user)

        if properties is not None:
            self.properties = properties
        else:
            self.properties = {}

        # Prefer the timezone stored on the job itself, when available.
        # NOTE(review): this raises KeyError/TypeError if job.data has no
        # 'properties' dict with a 'timezone' key — presumably guaranteed
        # by the job model; confirm.
        if local_tz and isinstance(self.job.data, dict):
            local_tz = self.job.data.get('properties')['timezone']

        # Modify start_date & end_date only when it's a coordinator
        from oozie.models2 import Coordinator
        if type(self.job) is Coordinator:
            # Writing through `properties` is safe here: the keys can only be
            # present in self.properties when a properties dict was passed in,
            # and in that case the two names refer to the same dict.
            if 'start_date' in self.properties:
                properties['start_date'] = convert_to_server_timezone(
                    self.properties['start_date'], local_tz)
            if 'end_date' in self.properties:
                properties['end_date'] = convert_to_server_timezone(
                    self.properties['end_date'], local_tz)

        if 'nominal_time' in self.properties:
            properties['nominal_time'] = convert_to_server_timezone(
                self.properties['nominal_time'], local_tz)

        self.properties['security_enabled'] = self.api.security_enabled

    def __str__(self):
        """Human-readable description of this submission."""
        if self.oozie_id:
            description = "Submission for job '%s'." % (self.oozie_id, )
            # The id is appended again as a suffix.
            description += " -- " + self.oozie_id
        else:
            description = "Submission for job '%s' (id %s, owner %s)." % (
                self.job.name, self.job.id, self.user)
        return description

    @submit_dryrun
    def run(self, deployment_dir=None):
        """Submit the job to Oozie, starting it if it is a workflow.

        Returns the oozie job id if all goes well.
        """
        # Default the system libpath on unless the caller already set it.
        if self.properties and 'oozie.use.system.libpath' not in self.properties:
            self.properties['oozie.use.system.libpath'] = 'true'

        self.oozie_id = self.api.submit_job(self.properties)
        LOG.info("Submitted: %s" % (self, ))

        # Only workflows get an explicit 'start' control call here.
        if self._is_workflow():
            self.api.job_control(self.oozie_id, 'start')
            LOG.info("Started: %s" % (self, ))

        return self.oozie_id

    def rerun(self, deployment_dir, fail_nodes=None, skip_nodes=None):
        """Re-run a workflow, optionally restricted to failed or skipped nodes."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties['oozie.wf.application.path'] = deployment_dir

        # A workflow rerun must not carry coordinator/bundle paths along.
        self.properties.pop('oozie.coord.application.path', None)
        self.properties.pop('oozie.bundle.application.path', None)

        if fail_nodes:
            self.properties['oozie.wf.rerun.failnodes'] = fail_nodes
        elif not skip_nodes:
            # Case empty 'skip_nodes' list
            self.properties['oozie.wf.rerun.failnodes'] = 'false'
        else:
            self.properties['oozie.wf.rerun.skip.nodes'] = skip_nodes

        self.api.rerun(self.oozie_id, properties=self.properties)

        LOG.info("Rerun: %s" % (self, ))

        return self.oozie_id

    def rerun_coord(self, deployment_dir, params):
        """Re-run coordinator actions selected by `params`."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties['oozie.coord.application.path'] = deployment_dir

        self.api.job_control(self.oozie_id,
                             action='coord-rerun',
                             properties=self.properties,
                             parameters=params)
        LOG.info("Rerun: %s" % (self, ))

        return self.oozie_id

    def update_coord(self):
        """Push the current properties to a running coordinator."""
        # Switch to the v2 API for the 'update' control call.
        self.api = get_oozie(self.user, api_version="v2")
        self.api.job_control(self.oozie_id,
                             action='update',
                             properties=self.properties,
                             parameters=None)
        LOG.info("Update: %s" % (self, ))

        return self.oozie_id

    def rerun_bundle(self, deployment_dir, params):
        """Re-run a bundle job with the given rerun parameters."""
        jt_address = cluster.get_cluster_addr_for_job_submission()

        self._update_properties(jt_address, deployment_dir)
        self.properties['oozie.bundle.application.path'] = deployment_dir

        self.api.job_control(self.oozie_id,
                             action='bundle-rerun',
                             properties=self.properties,
                             parameters=params)
        LOG.info("Rerun: %s" % (self, ))

        return self.oozie_id

    def deploy(self, deployment_dir=None):
        try:
            if not deployment_dir:
                deployment_dir = self._create_deployment_dir()
        except Exception, ex:
            msg = _("Failed to create deployment directory: %s" % ex)
            LOG.exception(msg)
            raise PopupException(message=msg, detail=str(ex))

        if self.api.security_enabled:
            jt_address = cluster.get_cluster_addr_for_job_submission()
            self._update_properties(
                jt_address
            )  # Needed for coordinator deploying workflows with credentials

        if hasattr(self.job, 'nodes'):
            for action in self.job.nodes:
                # Make sure XML is there
                # Don't support more than one level sub-workflow
                if action.data['type'] == 'subworkflow':
                    from oozie.models2 import Workflow
                    workflow = Workflow(document=Document2.objects.get_by_uuid(
                        user=self.user,
                        uuid=action.data['properties']['workflow']))
                    sub_deploy = Submission(self.user, workflow, self.fs,
                                            self.jt, self.properties)
                    workspace = sub_deploy.deploy()

                    self.job.override_subworkflow_id(
                        action,
                        workflow.id)  # For displaying the correct graph
                    self.properties[
                        'workspace_%s' % workflow.
                        uuid] = workspace  # For pointing to the correct workspace

                elif action.data['type'] == 'altus':
                    service = 'dataeng'  # action.data['properties'].get('script_path')
                    auth_key_id = ALTUS.AUTH_KEY_ID.get()
                    auth_key_secret = ALTUS.AUTH_KEY_SECRET.get().replace(
                        '\\n', '\n')
                    shell_script = self._generate_altus_action_script(
                        service=service,
                        auth_key_id=auth_key_id,
                        auth_key_secret=auth_key_secret)
                    self._create_file(deployment_dir,
                                      action.data['name'] + '.py',
                                      shell_script)
                    self.fs.do_as_user(
                        self.user, self.fs.copyFromLocal,
                        os.path.join(get_desktop_root(), 'core', 'ext-py',
                                     'navoptapi-0.1.0'),
                        self.job.deployment_dir)

                elif action.data['type'] == 'impala' or action.data[
                        'type'] == 'impala-document':
                    from oozie.models2 import _get_impala_url
                    from impala.impala_flags import get_ssl_server_certificate

                    if action.data['type'] == 'impala-document':
                        from notebook.models import Notebook
                        if action.data['properties'].get('uuid'):
                            notebook = Notebook(
                                document=Document2.objects.get_by_uuid(
                                    user=self.user,
                                    uuid=action.data['properties']['uuid']))
                            statements = notebook.get_str()
                            statements = Template(statements).safe_substitute(
                                **self.properties)
                            script_name = action.data['name'] + '.sql'
                            self._create_file(deployment_dir, script_name,
                                              statements)
                    else:
                        script_name = os.path.basename(
                            action.data['properties'].get('script_path'))

                    if self.api.security_enabled:
                        kinit = 'kinit -k -t *.keytab %(user_principal)s' % {
                            'user_principal':
                            self.properties.get(
                                'user_principal', action.data['properties'].
                                get('user_principal'))
                        }
                    else:
                        kinit = ''

                    shell_script = """#!/bin/bash

# Needed to launch impala shell in oozie
export PYTHON_EGG_CACHE=./myeggs

%(kinit)s

impala-shell %(kerberos_option)s %(ssl_option)s -i %(impalad_host)s -f %(query_file)s""" % {
                        'impalad_host':
                        action.data['properties'].get('impalad_host')
                        or _get_impala_url(),
                        'kerberos_option':
                        '-k' if self.api.security_enabled else '',
                        'ssl_option':
                        '--ssl' if get_ssl_server_certificate() else '',
                        'query_file':
                        script_name,
                        'kinit':
                        kinit
                    }

                    self._create_file(deployment_dir,
                                      action.data['name'] + '.sh',
                                      shell_script)

                elif action.data['type'] == 'hive-document':
                    from notebook.models import Notebook
                    if action.data['properties'].get('uuid'):
                        notebook = Notebook(
                            document=Document2.objects.get_by_uuid(
                                user=self.user,
                                uuid=action.data['properties']['uuid']))
                        statements = notebook.get_str()
                    else:
                        statements = action.data['properties'].get(
                            'statements')

                    if self.properties.get('send_result_path'):
                        statements = """
INSERT OVERWRITE DIRECTORY '%s'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   "separatorChar" = "\t",
   "quoteChar"     = "'",
   "escapeChar"    = "\\"
)
STORED AS TEXTFILE %s""" % (self.properties.get('send_result_path'),
                            '\n\n\n'.join([
                                snippet['statement_raw']
                                for snippet in notebook.get_data()['snippets']
                            ]))

                    if statements is not None:
                        self._create_file(deployment_dir,
                                          action.data['name'] + '.sql',
                                          statements)

                elif action.data['type'] in ('java-document', 'java',
                                             'mapreduce-document'):
                    if action.data['type'] == 'java-document' or action.data[
                            'type'] == 'mapreduce-document':
                        from notebook.models import Notebook
                        notebook = Notebook(
                            document=Document2.objects.get_by_uuid(
                                user=self.user,
                                uuid=action.data['properties']['uuid']))
                        properties = notebook.get_data(
                        )['snippets'][0]['properties']
                    else:
                        properties = action.data['properties']

                    if properties.get('app_jar'):
                        LOG.debug("Adding to oozie.libpath %s" %
                                  properties['app_jar'])
                        paths = [properties['app_jar']]
                        if self.properties.get('oozie.libpath'):
                            paths.append(self.properties['oozie.libpath'])
                        self.properties['oozie.libpath'] = ','.join(paths)

                elif action.data['type'] == 'pig-document':
                    from notebook.models import Notebook
                    notebook = Notebook(document=Document2.objects.get_by_uuid(
                        user=self.user, uuid=action.data['properties']
                        ['uuid']))
                    statements = notebook.get_data(
                    )['snippets'][0]['statement_raw']

                    self._create_file(deployment_dir,
                                      action.data['name'] + '.pig', statements)
                elif action.data['type'] == 'spark' or action.data[
                        'type'] == 'spark-document':
                    if not [
                            f for f in action.data.get('properties').get(
                                'files', [])
                            if f.get('value').endswith('hive-site.xml')
                    ]:
                        hive_site_lib = Hdfs.join(deployment_dir + '/lib/',
                                                  'hive-site.xml')
                        hive_site_content = get_hive_site_content()
                        if not self.fs.do_as_user(
                                self.user, self.fs.exists,
                                hive_site_lib) and hive_site_content:
                            self.fs.do_as_user(
                                self.user,
                                self.fs.create,
                                hive_site_lib,
                                overwrite=True,
                                permission=0700,
                                data=smart_str(hive_site_content))

        oozie_xml = self.job.to_xml(self.properties)
        self._do_as(self.user.username, self._copy_files, deployment_dir,
                    oozie_xml, self.properties)

        return deployment_dir