def run_hive_query(self, query_str, upload_archive=False):
    """Runs a Hive query and returns (stdout rows, stderr rows, job ids)."""
    # Assemble the full query: generated header, scheduler settings,
    # then the caller's query text.
    full_query_string = \
        self._generate_hive_query_header(upload_archive=upload_archive)
    full_query_string += self._get_scheduler_hive_setting()
    full_query_string += query_str
    self.log.info('Running query:\n %s' % full_query_string)

    # Ship the query file to the cluster, then run it. stdout goes to
    # out.csv while stderr is streamed through tee into out.err.
    query_dir = self._mk_tmp_dir()
    hadoop_utils.put_string_to_hadoop(
        self.hadoop_host_config, full_query_string,
        '%s/query.ql' % query_dir)
    hadoop_utils.run_and_check_command_in_hadoop(
        self.hadoop_host_config,
        'set -o pipefail; /home/hadoop/hive/bin/hive -f %s/query.ql 2>&1 '
        '> %s/out.csv | tee %s/out.err' % (query_dir, query_dir, query_dir),
        log_line_processor=self._hive_query_log_line_processor)

    q_stdout = self._get_raw_query_result('%s/out.csv' % query_dir)
    q_stderr = self._get_raw_query_result('%s/out.err' % query_dir)
    self._check_for_hive_failure_message(q_stderr)
    self.log.info(
        'Output has %d rows. First 10 rows:\n\t%s' %
        (len(q_stdout), '\n\t'.join([str(o) for o in q_stdout[:10]])))
    return q_stdout, q_stderr, self.job_ids
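# Usage sketch (illustrative, not part of the original module): assuming
# `runner` is a configured instance of this class and `events` is a
# hypothetical Hive table, a query could be issued like so:
#
#     rows, stderr, job_ids = runner.run_hive_query(
#         'SELECT dt, COUNT(1) FROM events GROUP BY dt;')
#     for row in rows[:10]:
#         print row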
def run_hadoop_job(self, class_name, jobconf_args=None, extra_args=None,
                   extra_jars=None):
    """Runs a Hadoop job and returns (stdout rows, stderr rows, job ids)."""
    jobconf_args = jobconf_args if jobconf_args else {}
    extra_args = extra_args if extra_args else []
    # NOTE: extra_jars is accepted but not used below; user jars are
    # picked up from config.USER_LIBJAR_DIRS instead.
    extra_jars = extra_jars if extra_jars else []

    # Set default jobconf args on a copy so the caller's dict is untouched.
    jobconf_args = jobconf_args.copy()
    if self.config.SCHEDULER_QUEUE:
        jobconf_args[self.config.SCHEDULER_PARAM] = \
            self.config.SCHEDULER_QUEUE
    jobconf_args['mapred.job.name'] = self.job_name

    # Create the arguments string: -D jobconf flags, then extra args.
    arguments = \
        ' '.join('-D%s=%s' % (k, v) for k, v in jobconf_args.iteritems())
    arguments += ' '
    arguments += ' '.join(extra_args)

    base_dir = self.get_job_resource_dir(self.config.USER)
    libjars_glob = ' '.join(
        ['%s/%s/*.jar' % (base_dir, d)
         for d in self.config.USER_LIBJAR_DIRS])
    # -libjars expects a comma-separated list, so expand the glob on the
    # remote host and translate spaces to commas.
    libjars = '`echo %s | tr \' \' \',\'`' % libjars_glob
    user_jar_dirs = ['%s/%s/*' % (base_dir, d)
                     for d in self.config.USER_LIBJAR_DIRS]
    hadoop_classpath = ':'.join(user_jar_dirs)
    app_jar_path = '%s/%s' % (base_dir, self.config.USER_APPJAR_PATH)

    # Temp dir for holding stdout and stderr.
    query_dir = self._mk_tmp_dir()

    # Generate the command: stdout goes to out.csv while stderr is
    # streamed through tee into out.err.
    var_dict = {
        'class_name': class_name,
        'arguments': arguments,
        'query_dir': query_dir,
        'app_jar': app_jar_path,
        'libjars': libjars,
        'hadoop_classpath': hadoop_classpath,
    }
    cmd = ('set -o pipefail; HADOOP_CLASSPATH=%(hadoop_classpath)s'
           ' hadoop jar %(app_jar)s'
           ' %(class_name)s'
           ' -libjars %(libjars)s'
           ' %(arguments)s'
           ' 2>&1 > %(query_dir)s/out.csv | tee %(query_dir)s/out.err') % \
        var_dict
    self.log.info('Running class:%s with arguments:%s' %
                  (class_name, arguments))
    self.log.info('Full command: %s' % cmd)

    # Run the command on the Hadoop host and collect both output files.
    hadoop_utils.run_and_check_command_in_hadoop(
        self.hadoop_host_config, cmd,
        log_line_processor=self._hadoop_job_log_line_processor)
    rows = self._get_raw_query_result('%s/out.csv' % query_dir)
    stderr = self._get_raw_query_result('%s/out.err' % query_dir)
    return rows, stderr, self.job_ids
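# Usage sketch (illustrative, not part of the original module): the class
# name, jobconf key, and input/output paths below are hypothetical values.
#
#     rows, stderr, job_ids = runner.run_hadoop_job(
#         'com.example.WordCount',
#         jobconf_args={'mapred.reduce.tasks': '10'},
#         extra_args=['/input/events', '/output/wordcounts'])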
def _mk_tmp_dir(self):
    query_timestamp = \
        datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
    query_dir = os.path.join(
        self.config.HIVE_QUERIES_DIR,
        self.config.USER,
        '%s_%s_%s' % (query_timestamp, utils.get_random_string(),
                      self.config.NAME))
    hadoop_utils.run_and_check_command_in_hadoop(
        self.hadoop_host_config, command='mkdir -p %s' % query_dir)
    return query_dir
def kill_job(self, job_id):
    """Kills an EMR job with the given job_id."""
    cmd = 'hadoop job -kill %s' % job_id
    hadoop_utils.run_and_check_command_in_hadoop(
        self.hadoop_host_config, cmd)
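# Usage sketch (illustrative, not part of the original module): job ids,
# presumably collected by the log line processors, are returned by
# run_hive_query and run_hadoop_job, so a runaway job can be stopped by
# id, e.g. with a hypothetical id:
#
#     runner.kill_job('job_201501011234_0001')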