Code example #1
File: emr_executor.py Project: runt18/pinball
    def run_hive_query(self, query_str, upload_archive=False):
        full_query_string = self._generate_hive_query_header(upload_archive=upload_archive)
        full_query_string += self._get_scheduler_hive_setting()
        full_query_string += query_str

        self.log.info("Running query:\n {0!s}".format(full_query_string))

        query_dir = self._mk_tmp_dir()
        hadoop_utils.put_string_to_hadoop(
            self.hadoop_host_config, full_query_string, "{0!s}/query.ql".format(query_dir)
        )

        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config,
            "set -o pipefail; /home/hadoop/hive/bin/hive -f %s/query.ql 2>&1 "
            "> %s/out.csv | tee %s/out.err" % (query_dir, query_dir, query_dir),
            log_line_processor=self._hive_query_log_line_processor,
        )

        q_stdout = self._get_raw_query_result("{0!s}/out.csv".format(query_dir))
        q_stderr = self._get_raw_query_result("{0!s}/out.err".format(query_dir))

        self._check_for_hive_failure_message(q_stderr)

        self.log.info(
            "Output has {0:d} rows. First 10 rows:\n\t{1!s}".format(
                len(q_stdout), "\n\t".join([str(o) for o in q_stdout[:10]])
            )
        )

        return q_stdout, q_stderr, self.job_ids
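
A detail worth noting in this example: in the shell command, 2>&1 is applied before > out.csv, so Hive's stderr is sent down the pipe (and through tee into out.err) while its stdout lands in out.csv, and set -o pipefail makes the pipeline fail when hive itself fails rather than only when tee does. Below is a minimal usage sketch, assuming a hypothetical, already-configured executor instance of the class that defines run_hive_query (the constructor is not shown in these examples):

    # Hypothetical usage sketch; `executor` and the query text are illustrative.
    query = "SELECT dt, COUNT(*) FROM events GROUP BY dt;"
    # Returns the query's stdout rows, its stderr lines, and the Hadoop job ids
    # collected by the log line processor while the query ran.
    rows, errors, job_ids = executor.run_hive_query(query, upload_archive=True)
    for row in rows:
        print row  # Python 2 print, matching the iteritems() usage in this file
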
Code example #2
File: emr_executor.py Project: Betterment/pinball
    def run_hive_query(self, query_str, upload_archive=False):
        full_query_string = \
            self._generate_hive_query_header(upload_archive=upload_archive)
        full_query_string += self._get_scheduler_hive_setting()
        full_query_string += query_str

        self.log.info('Running query:\n %s' % full_query_string)

        query_dir = self._mk_tmp_dir()
        hadoop_utils.put_string_to_hadoop(
            self.hadoop_host_config,
            full_query_string,
            '%s/query.ql' % query_dir)

        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config,
            'set -o pipefail; /home/hadoop/hive/bin/hive -f %s/query.ql 2>&1 '
            '> %s/out.csv | tee %s/out.err'
            % (query_dir, query_dir, query_dir),
            log_line_processor=self._hive_query_log_line_processor)

        q_stdout = self._get_raw_query_result('%s/out.csv' % query_dir)
        q_stderr = self._get_raw_query_result('%s/out.err' % query_dir)

        self._check_for_hive_failure_message(q_stderr)

        self.log.info("Output has %d rows. First 10 rows:\n\t%s"
                      % (len(q_stdout),
                         '\n\t'.join([str(o) for o in q_stdout[:10]])))

        return q_stdout, q_stderr, self.job_ids
Code example #3
File: emr_executor.py Project: zhengge2017/pinball
    def run_hive_query(self, query_str, upload_archive=False):
        full_query_string = \
            self._generate_hive_query_header(upload_archive=upload_archive)
        full_query_string += self._get_scheduler_hive_setting()
        full_query_string += query_str

        self.log.info('Running query:\n %s' % full_query_string)

        query_dir = self._mk_tmp_dir()
        hadoop_utils.put_string_to_hadoop(self.hadoop_host_config,
                                          full_query_string,
                                          '%s/query.ql' % query_dir)

        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config,
            'set -o pipefail; /home/hadoop/hive/bin/hive -f %s/query.ql 2>&1 '
            '> %s/out.csv | tee %s/out.err' %
            (query_dir, query_dir, query_dir),
            log_line_processor=self._hive_query_log_line_processor)

        q_stdout = self._get_raw_query_result('%s/out.csv' % query_dir)
        q_stderr = self._get_raw_query_result('%s/out.err' % query_dir)

        self._check_for_hive_failure_message(q_stderr)

        self.log.info(
            "Output has %d rows. First 10 rows:\n\t%s" %
            (len(q_stdout), '\n\t'.join([str(o) for o in q_stdout[:10]])))

        return q_stdout, q_stderr, self.job_ids
Code example #4
File: emr_executor.py Project: runt18/pinball
    def run_hadoop_job(self, class_name, jobconf_args=None, extra_args=None, extra_jars=None):
        jobconf_args = jobconf_args if jobconf_args else {}
        extra_args = extra_args if extra_args else []
        extra_jars = extra_jars if extra_jars else []

        # Set default jobconf args
        jobconf_args = jobconf_args.copy()
        if self.config.SCHEDULER_QUEUE:
            jobconf_args[self.config.SCHEDULER_PARAM] = self.config.SCHEDULER_QUEUE
        jobconf_args["mapred.job.name"] = self.job_name

        # create arguments string
        arguments = " ".join("-D{0!s}={1!s}".format(k, v) for k, v in jobconf_args.iteritems())
        arguments += " "
        arguments += " ".join(extra_args)

        base_dir = self.get_job_resource_dir(self.config.USER)
        libjars_glob = " ".join(["{0!s}/{1!s}/*.jar".format(base_dir, d) for d in self.config.USER_LIBJAR_DIRS])
        libjars = "`echo {0!s} | tr ' ' ','`".format(libjars_glob)

        user_jar_dirs = ["{0!s}/{1!s}/*".format(base_dir, d) for d in self.config.USER_LIBJAR_DIRS]
        hadoop_classpath = ":".join(user_jar_dirs)

        app_jar_path = "{0!s}/{1!s}".format(base_dir, self.config.USER_APPJAR_PATH)

        # temp dir for holding stdout and stderr
        query_dir = self._mk_tmp_dir()

        # generate command
        var_dict = {
            "class_name": class_name,
            "arguments": arguments,
            "query_dir": query_dir,
            "app_jar": app_jar_path,
            "libjars": libjars,
            "hadoop_classpath": hadoop_classpath,
        }
        cmd = (
            "set -o pipefail; HADOOP_CLASSPATH=%(hadoop_classpath)s"
            " hadoop jar %(app_jar)s"
            " %(class_name)s"
            " -libjars %(libjars)s"
            " %(arguments)s"
            " 2>&1 > %(query_dir)s/out.csv | tee %(query_dir)s/out.err"
        ) % var_dict

        self.log.info("Running class:{0!s} with arguments:{1!s}".format(class_name, arguments))
        self.log.info("Full command: {0!s}".format(cmd))

        # run command
        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config, cmd, log_line_processor=self._hadoop_job_log_line_processor
        )

        rows = self._get_raw_query_result("{0!s}/out.csv".format(query_dir))
        stderr = self._get_raw_query_result("{0!s}/out.err".format(query_dir))

        return rows, stderr, self.job_ids
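
Two things stand out in this variant. First, the extra_jars parameter is accepted and defaulted but never referenced in the body, so passing it has no effect in this version. Second, the libjars value is built as a backquoted shell expression (echo ... | tr ' ' ','), so the space-separated jar glob is expanded and comma-joined on the Hadoop host at execution time, which is the comma-separated form that -libjars expects. A hypothetical invocation sketch, with the class name and argument values invented for illustration:

    # Hypothetical usage sketch; the main class and argument values are not
    # from the source and only illustrate the calling convention.
    rows, stderr, job_ids = executor.run_hadoop_job(
        'com.example.MyMapReduceJob',
        jobconf_args={'mapred.reduce.tasks': '10'},
        extra_args=['--input', '/data/in', '--output', '/data/out'])
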
Code example #5
File: emr_executor.py Project: zhengge2017/pinball
    def _mk_tmp_dir(self):
        query_timestamp = \
            datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
        query_dir = os.path.join(
            self.config.HIVE_QUERIES_DIR, self.config.USER, '%s_%s_%s' %
            (query_timestamp, utils.get_random_string(), self.config.NAME))
        hadoop_utils.run_and_check_command_in_hadoop(self.hadoop_host_config,
                                                     command='mkdir -p %s' %
                                                     query_dir)
        return query_dir
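
The generated directory name concatenates a UTC timestamp, a random string, and the configured job name, so concurrent runs of the same job get distinct scratch directories. A standalone sketch of the naming scheme, with utils.get_random_string stubbed out since pinball's utils module is not shown in these examples:

    import datetime
    import os
    import random
    import string

    def get_random_string(length=8):
        # Stand-in for utils.get_random_string, which is not shown here.
        return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))

    query_timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
    # '/mnt/hive_queries', 'some_user', and 'my_job' are stand-ins for the
    # HIVE_QUERIES_DIR, USER, and NAME config values.
    query_dir = os.path.join('/mnt/hive_queries', 'some_user', '%s_%s_%s' % (
        query_timestamp, get_random_string(), 'my_job'))
    # e.g. /mnt/hive_queries/some_user/2016-01-02-03-04-05_qwertyui_my_job
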
Code example #6
File: emr_executor.py Project: runt18/pinball
    def _mk_tmp_dir(self):
        query_timestamp = datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
        query_dir = os.path.join(
            self.config.HIVE_QUERIES_DIR,
            self.config.USER,
            "{0!s}_{1!s}_{2!s}".format(query_timestamp, utils.get_random_string(), self.config.NAME),
        )
        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config, command="mkdir -p {0!s}".format(query_dir)
        )
        return query_dir
Code example #7
File: emr_executor.py Project: Betterment/pinball
    def _mk_tmp_dir(self):
        query_timestamp = \
            datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
        query_dir = os.path.join(
            self.config.HIVE_QUERIES_DIR,
            self.config.USER,
            '%s_%s_%s' % (query_timestamp,
                          utils.get_random_string(),
                          self.config.NAME))
        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config,
            command='mkdir -p %s' % query_dir)
        return query_dir
Code example #8
File: emr_executor.py Project: runt18/pinball
    def kill_job(self, job_id):
        """Kills an EMR job with the given job_id."""
        cmd = "hadoop job -kill {0!s}".format(job_id)
        hadoop_utils.run_and_check_command_in_hadoop(self.hadoop_host_config, cmd)
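
The job ids passed here are presumably the ones that run_hive_query and run_hadoop_job return as their third value, harvested by the log line processors. A minimal sketch, with the id invented to show the classic Hadoop job id format:

    # Hypothetical usage sketch; the job id value is illustrative only.
    executor.kill_job('job_201601020304_0001')
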
Code example #9
File: emr_executor.py Project: Betterment/pinball
    def run_hadoop_job(self,
                       class_name,
                       jobconf_args=None,
                       extra_args=None,
                       extra_jars=None):
        jobconf_args = jobconf_args if jobconf_args else {}
        extra_args = extra_args if extra_args else []
        extra_jars = extra_jars if extra_jars else []

        # Set default jobconf args
        jobconf_args = jobconf_args.copy()
        if self.config.SCHEDULER_QUEUE:
            jobconf_args[self.config.SCHEDULER_PARAM] = \
                self.config.SCHEDULER_QUEUE
        jobconf_args['mapred.job.name'] = self.job_name

        # create arguments string
        arguments = \
            ' '.join('-D%s=%s' % (k, v) for k, v in jobconf_args.iteritems())
        arguments += ' '
        arguments += ' '.join(extra_args)

        base_dir = self.get_job_resource_dir(self.config.USER)
        libjars_glob = ' '.join(
            ['%s/%s/*.jar' % (base_dir, d) for d in self.config.USER_LIBJAR_DIRS])
        libjars = '`echo %s | tr \' \' \',\'`' % libjars_glob

        user_jar_dirs = ['%s/%s/*' % (base_dir, d) for d in self.config.USER_LIBJAR_DIRS]
        hadoop_classpath = ':'.join(user_jar_dirs)

        app_jar_path = '%s/%s' % (base_dir, self.config.USER_APPJAR_PATH)

        # temp dir for holding stdout and stderr
        query_dir = self._mk_tmp_dir()

        # generate command
        var_dict = {
            'class_name': class_name,
            'arguments': arguments,
            'query_dir': query_dir,
            'app_jar': app_jar_path,
            'libjars': libjars,
            'hadoop_classpath': hadoop_classpath,
        }
        cmd = ('set -o pipefail; HADOOP_CLASSPATH=%(hadoop_classpath)s'
               ' hadoop jar %(app_jar)s'
               ' %(class_name)s'
               ' -libjars %(libjars)s'
               ' %(arguments)s'
               ' 2>&1 > %(query_dir)s/out.csv | tee %(query_dir)s/out.err') % \
            var_dict

        self.log.info('Running class:%s with arguments:%s' % (class_name, arguments))
        self.log.info('Full command: %s' % cmd)

        # run command
        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config,
            cmd,
            log_line_processor=self._hadoop_job_log_line_processor)

        rows = self._get_raw_query_result('%s/out.csv' % query_dir)
        stderr = self._get_raw_query_result('%s/out.err' % query_dir)

        return rows, stderr, self.job_ids
Code example #10
File: emr_executor.py Project: Betterment/pinball
    def kill_job(self, job_id):
        """Kills an EMR job with the given job_id."""
        cmd = 'hadoop job -kill %s' % job_id
        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config,
            cmd)
Code example #11
File: emr_executor.py Project: zhengge2017/pinball
    def run_hadoop_job(self,
                       class_name,
                       jobconf_args=None,
                       extra_args=None,
                       extra_jars=None):
        jobconf_args = jobconf_args if jobconf_args else {}
        extra_args = extra_args if extra_args else []
        extra_jars = extra_jars if extra_jars else []

        # Set default jobconf args
        jobconf_args = jobconf_args.copy()
        if self.config.SCHEDULER_QUEUE:
            jobconf_args[self.config.SCHEDULER_PARAM] = \
                self.config.SCHEDULER_QUEUE
        jobconf_args['mapred.job.name'] = self.job_name

        # create arguments string
        arguments = \
            ' '.join('-D%s=%s' % (k, v) for k, v in jobconf_args.iteritems())
        arguments += ' '
        arguments += ' '.join(extra_args)

        base_dir = self.get_job_resource_dir(self.config.USER)
        libjars_glob = ' '.join([
            '%s/%s/*.jar' % (base_dir, d) for d in self.config.USER_LIBJAR_DIRS
        ])
        libjars = '`echo %s | tr \' \' \',\'`' % libjars_glob

        user_jar_dirs = [
            '%s/%s/*' % (base_dir, d) for d in self.config.USER_LIBJAR_DIRS
        ]
        hadoop_classpath = ':'.join(user_jar_dirs)

        app_jar_path = '%s/%s' % (base_dir, self.config.USER_APPJAR_PATH)

        # temp dir for holding stdout and stderr
        query_dir = self._mk_tmp_dir()

        # generate command
        var_dict = {
            'class_name': class_name,
            'arguments': arguments,
            'query_dir': query_dir,
            'app_jar': app_jar_path,
            'libjars': libjars,
            'hadoop_classpath': hadoop_classpath,
        }
        cmd = ('set -o pipefail; HADOOP_CLASSPATH=%(hadoop_classpath)s'
               ' hadoop jar %(app_jar)s'
               ' %(class_name)s'
               ' -libjars %(libjars)s'
               ' %(arguments)s'
               ' 2>&1 > %(query_dir)s/out.csv | tee %(query_dir)s/out.err') % \
            var_dict

        self.log.info('Running class:%s with arguments:%s' %
                      (class_name, arguments))
        self.log.info('Full command: %s' % cmd)

        # run command
        hadoop_utils.run_and_check_command_in_hadoop(
            self.hadoop_host_config,
            cmd,
            log_line_processor=self._hadoop_job_log_line_processor)

        rows = self._get_raw_query_result('%s/out.csv' % query_dir)
        stderr = self._get_raw_query_result('%s/out.err' % query_dir)

        return rows, stderr, self.job_ids
Code example #12
File: emr_executor.py Project: zhengge2017/pinball
    def kill_job(self, job_id):
        """Kills an EMR job with the given job_id."""
        cmd = 'hadoop job -kill %s' % job_id
        hadoop_utils.run_and_check_command_in_hadoop(self.hadoop_host_config,
                                                     cmd)