Example #1
 def _mk_tmp_dir(self):
     query_timestamp =\
         datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
     query_dir = os.path.join(
         self.config.HIVE_QUERIES_DIR, self.config.USER, '%s_%s_%s' %
         (query_timestamp, utils.get_random_string(), self.config.NAME))
     hadoop_utils.run_and_check_command_in_hadoop(self.hadoop_host_config,
                                                  command='mkdir -p %s' %
                                                  query_dir)
     return query_dir
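
The pattern here is a collision-resistant scratch directory: a UTC timestamp, a random suffix, and the job name joined under a per-user base path. A minimal standalone sketch of the same idea, with a placeholder base path, user, and name, and uuid standing in for utils.get_random_string():

import datetime
import os
import uuid


def make_query_dir(base_dir, user, name):
    # UTC timestamp keeps directory names sortable and unambiguous across hosts.
    ts = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
    # A random suffix avoids collisions when two queries start in the same second.
    suffix = uuid.uuid4().hex[:8]
    return os.path.join(base_dir, user, '%s_%s_%s' % (ts, suffix, name))


print(make_query_dir('/user/hive/queries', 'alice', 'daily_report'))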
Example #2
 def _mk_tmp_dir(self):
     query_timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
     query_dir = os.path.join(
         self.config.HIVE_QUERIES_DIR,
         self.config.USER,
         "{0!s}_{1!s}_{2!s}".format(query_timestamp, utils.get_random_string(), self.config.NAME),
     )
     hadoop_utils.run_and_check_command_in_hadoop(
         self.hadoop_host_config, command="mkdir -p {0!s}".format(query_dir)
     )
     return query_dir
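
Example #2 differs from #1 and #3 only in using str.format() instead of %-interpolation; for string inputs the two spellings produce the same directory component, as this throwaway check with made-up values shows:

ts, rand, name = '2024-01-01-00-00-00', 'a1b2c3', 'daily_report'
# Both spellings yield the identical '<timestamp>_<random>_<name>' component.
assert '{0!s}_{1!s}_{2!s}'.format(ts, rand, name) == '%s_%s_%s' % (ts, rand, name)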
Example #3
 def _mk_tmp_dir(self):
     query_timestamp =\
         datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
     query_dir = os.path.join(
         self.config.HIVE_QUERIES_DIR,
         self.config.USER,
         '%s_%s_%s' % (query_timestamp,
                       utils.get_random_string(),
                       self.config.NAME))
     hadoop_utils.run_and_check_command_in_hadoop(
         self.hadoop_host_config,
         command='mkdir -p %s' % query_dir)
     return query_dir
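
All three variants delegate the remote mkdir to hadoop_utils.run_and_check_command_in_hadoop, whose implementation is not shown here. A rough stand-in, assuming an SSH transport and raise-on-nonzero-exit semantics (both are assumptions, not confirmed by the examples):

import subprocess


def run_and_check(host, command):
    # Hypothetical stand-in for hadoop_utils.run_and_check_command_in_hadoop;
    # the real helper's transport and error handling are not shown above.
    proc = subprocess.Popen(['ssh', host, command],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError('command %r failed on %s: %s' % (command, host, err))
    return out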
Example #4
    def run_hadoop_job(self,
                       class_name,
                       jobconf_args=None,
                       extra_args=None,
                       extra_jars=None):
        """Run a Hadoop job in Qubole cluster.

        We assume extra_jars are stored on s3 and the path looks like:
            s3://pinball/%{USER}/some_jar_dir/

        We fail the entire command if pulling the JARs down from s3 fails,
        so we use "&&" to connect shell commands.
        """
        jobconf_args = jobconf_args if jobconf_args else {}
        extra_args = extra_args if extra_args else []
        extra_jars = extra_jars if extra_jars else []

        # The place where all jars are stored in s3.
        s3_jar_dirs = self.config.USER_LIBJAR_DIRS + extra_jars
        # The place where all jars will be copied to locally.
        local_jar_dir = '/tmp/hadoop_users/%s/%s' % \
                        (self.config.USER, utils.get_random_string())
        download_jar_cmds = [
            'hadoop fs -get %s %s' % (s3_dir, local_jar_dir)
            for s3_dir in s3_jar_dirs
        ]
        download_jar_cmd = ' && '.join(download_jar_cmds)
        appjar_name = s3_utils.extract_file_name_from_s3_path(
            self.config.USER_APPJAR_PATH)
        download_jar_cmd += ' && hadoop fs -get %s %s/%s' % (
            self.config.USER_APPJAR_PATH, local_jar_dir, appjar_name)

        # Set default JobConf args.
        jobconf_args = {} if jobconf_args is None else jobconf_args.copy()
        if self.config.SCHEDULER_QUEUE:
            jobconf_args[self.config.SCHEDULER_PARAM] = \
                self.config.SCHEDULER_QUEUE
        jobconf_args['mapred.job.name'] = self.job_name

        # Create arguments.
        arguments = ' '.join('-D%s=%s' % (k, v)
                             for k, v in jobconf_args.iteritems())
        arguments += ' '
        arguments += ' '.join(extra_args)

        libjars = self._get_libjars_local_paths(s3_jar_dirs, local_jar_dir)
        hadoop_classpath = '%s/*' % local_jar_dir

        cmd = 'mkdir -p %(local_jar_dir)s && %(download_jar_cmd)s'

        files_to_be_deleted = []
        for qubole_jar in self.config.QUBOLE_JARS_BLACKLIST:
            files_to_be_deleted.append('%s/%s' % (local_jar_dir, qubole_jar))
        if files_to_be_deleted:
            cmd += ' && rm -f %s' % (' && rm -f '.join(files_to_be_deleted))

        # Generate command.
        var_dict = {
            'class_name': class_name,
            'arguments': arguments,
            'appjar_name': appjar_name,
            'download_jar_cmd': download_jar_cmd,
            'local_jar_dir': local_jar_dir,
            'hadoop_classpath': hadoop_classpath,
            'libjars': libjars,
        }
        cmd += (' && export HADOOP_CLASSPATH=%(hadoop_classpath)s'
                ' && hadoop jar %(local_jar_dir)s/%(appjar_name)s'
                ' %(class_name)s'
                ' -libjars %(libjars)s'
                ' %(arguments)s')
        cmd += ';\nEXIT_CODE=$?; \nrm -rf %(local_jar_dir)s; \nexit $EXIT_CODE;'
        cmd = cmd % var_dict

        # Log command messages.
        self.log.info('Full command: %s' % cmd)

        # Run command.
        hc, output, stderr, job_ids = self.run_shell_command(cmd)
        return output, stderr, job_ids
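
Once the %(...)s placeholders are substituted, Example #4 produces a single shell pipeline: create the scratch directory, fetch the JARs from s3, export HADOOP_CLASSPATH, run hadoop jar, then remove the scratch directory while preserving the job's exit code. A sketch of that final string with illustrative placeholder values (the optional rm -f of blacklisted Qubole JARs is omitted):

# Illustrative values only; the real ones come from self.config and the
# helpers above.
var_dict = {
    'local_jar_dir': '/tmp/hadoop_users/alice/a1b2c3',
    'download_jar_cmd': 'hadoop fs -get s3://pinball/alice/libjars/ /tmp/hadoop_users/alice/a1b2c3',
    'hadoop_classpath': '/tmp/hadoop_users/alice/a1b2c3/*',
    'appjar_name': 'app.jar',
    'class_name': 'com.example.WordCount',
    'libjars': '/tmp/hadoop_users/alice/a1b2c3/dep.jar',
    'arguments': '-Dmapred.job.name=wordcount in/ out/',
}
cmd = ('mkdir -p %(local_jar_dir)s && %(download_jar_cmd)s'
       ' && export HADOOP_CLASSPATH=%(hadoop_classpath)s'
       ' && hadoop jar %(local_jar_dir)s/%(appjar_name)s'
       ' %(class_name)s -libjars %(libjars)s %(arguments)s'
       ';\nEXIT_CODE=$?; \nrm -rf %(local_jar_dir)s; \nexit $EXIT_CODE;') % var_dict
print(cmd)
# The tail captures the hadoop jar exit status, removes the scratch directory,
# then exits with that status so cleanup never masks a job failure.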
Example #5
    def run_hadoop_job(self,
                       class_name,
                       jobconf_args=None,
                       extra_args=None,
                       extra_jars=None):
        """Run a Hadoop job in Qubole cluster.

        We assume extra_jars are stored on s3 and the path looks like:
            s3://pinball/%{USER}/some_jar_dir/

        We fail the entire command if pulling the JARs down from s3 fails,
        so we use "&&" to connect shell commands.
        """
        jobconf_args = jobconf_args if jobconf_args else {}
        extra_args = extra_args if extra_args else []
        extra_jars = extra_jars if extra_jars else []

        # The place where all jars are stored in s3.
        s3_jar_dirs = self.config.USER_LIBJAR_DIRS + extra_jars
        # The place where all jars will be copied to locally.
        local_jar_dir = '/tmp/hadoop_users/%s/%s' % \
                        (self.config.USER, utils.get_random_string())
        download_jar_cmds = ['hadoop fs -get %s %s' % (s3_dir, local_jar_dir)
                             for s3_dir in s3_jar_dirs]
        download_jar_cmd = ' && '.join(download_jar_cmds)
        appjar_name = s3_utils.extract_file_name_from_s3_path(
            self.config.USER_APPJAR_PATH)
        download_jar_cmd += ' && hadoop fs -get %s %s/%s' % (
            self.config.USER_APPJAR_PATH,
            local_jar_dir,
            appjar_name
        )

        # Set default JobConf args.
        jobconf_args = {} if jobconf_args is None else jobconf_args.copy()
        if self.config.SCHEDULER_QUEUE:
            jobconf_args[self.config.SCHEDULER_PARAM] = \
                self.config.SCHEDULER_QUEUE
        jobconf_args['mapred.job.name'] = self.job_name

        # Create arguments.
        arguments = ' '.join('-D%s=%s' % (k, v) for k, v in jobconf_args.iteritems())
        arguments += ' '
        arguments += ' '.join(extra_args)

        libjars = self._get_libjars_local_paths(s3_jar_dirs, local_jar_dir)
        hadoop_classpath = '%s/*' % local_jar_dir

        cmd = 'mkdir -p %(local_jar_dir)s && %(download_jar_cmd)s'

        files_to_be_deleted = []
        for qubole_jar in self.config.QUBOLE_JARS_BLACKLIST:
            files_to_be_deleted.append('%s/%s' % (local_jar_dir, qubole_jar))
        if files_to_be_deleted:
            cmd += ' && rm -f %s' % (' && rm -f '.join(files_to_be_deleted))

        # Generate command.
        var_dict = {
            'class_name': class_name,
            'arguments': arguments,
            'appjar_name': appjar_name,
            'download_jar_cmd': download_jar_cmd,
            'local_jar_dir': local_jar_dir,
            'hadoop_classpath': hadoop_classpath,
            'libjars': libjars,
        }
        cmd += (' && export HADOOP_CLASSPATH=%(hadoop_classpath)s'
                ' && hadoop jar %(local_jar_dir)s/%(appjar_name)s'
                ' %(class_name)s'
                ' -libjars %(libjars)s'
                ' %(arguments)s')
        cmd += ';\nEXIT_CODE=$?; \nrm -rf %(local_jar_dir)s; \nexit $EXIT_CODE;'
        cmd = cmd % var_dict

        # Log command messages.
        self.log.info('Full command: %s' % cmd)

        # Run command.
        hc, output, stderr, job_ids = self.run_shell_command(cmd)
        return output, stderr, job_ids
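
A hedged usage sketch of the method as defined above; the caller, job class, jobconf key, and S3 paths are hypothetical:

def launch_wordcount(runner):
    # 'runner' is an instance of the class these methods belong to; the class
    # name, jobconf key, and S3 paths below are illustrative only.
    output, stderr, job_ids = runner.run_hadoop_job(
        class_name='com.example.WordCount',
        jobconf_args={'mapreduce.job.reduces': '8'},
        extra_args=['s3://pinball/alice/input/', 's3://pinball/alice/output/'],
        extra_jars=['s3://pinball/alice/extra_libjars/'],
    )
    # The tuple mirrors run_shell_command's result: stdout, stderr, and the
    # Hadoop job ids parsed from the launched command.
    return output, stderr, job_ids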