def _mk_tmp_dir(self):
    """Create and return a per-query temp directory on the Hadoop host."""
    query_timestamp = \
        datetime.datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S")
    query_dir = os.path.join(
        self.config.HIVE_QUERIES_DIR,
        self.config.USER,
        '%s_%s_%s' % (query_timestamp,
                      utils.get_random_string(),
                      self.config.NAME))
    hadoop_utils.run_and_check_command_in_hadoop(
        self.hadoop_host_config,
        command='mkdir -p %s' % query_dir)
    return query_dir
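
# Example (hypothetical values, for illustration only): with
# HIVE_QUERIES_DIR='/user/hive/queries', USER='alice', NAME='daily_report',
# and a random suffix of 'a1b2c3', _mk_tmp_dir() would create and return a
# directory such as:
#   /user/hive/queries/alice/2015-06-01-12-30-00_a1b2c3_daily_report
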
def run_hadoop_job(self,
                   class_name,
                   jobconf_args=None,
                   extra_args=None,
                   extra_jars=None):
    """Run a Hadoop job on a Qubole cluster.

    We assume extra_jars are stored on s3 and each path looks like:
        s3://pinball/%{USER}/some_jar_dir/

    We fail the entire command if pulling the JARs down from s3 fails,
    so we use "&&" to connect the shell commands.

    Returns:
        A tuple of (output, stderr, job_ids) from the executed command.
    """
    # Copy the caller's dict so the defaults set below do not leak back.
    jobconf_args = dict(jobconf_args) if jobconf_args else {}
    extra_args = extra_args if extra_args else []
    extra_jars = extra_jars if extra_jars else []

    # The place where all jars are stored in s3.
    s3_jar_dirs = self.config.USER_LIBJAR_DIRS + extra_jars

    # The place where all jars will be copied to locally.
    local_jar_dir = '/tmp/hadoop_users/%s/%s' % \
        (self.config.USER, utils.get_random_string())

    download_jar_cmds = [
        'hadoop fs -get %s %s' % (s3_dir, local_jar_dir)
        for s3_dir in s3_jar_dirs
    ]
    download_jar_cmd = ' && '.join(download_jar_cmds)

    appjar_name = s3_utils.extract_file_name_from_s3_path(
        self.config.USER_APPJAR_PATH)
    download_jar_cmd += ' && hadoop fs -get %s %s/%s' % (
        self.config.USER_APPJAR_PATH, local_jar_dir, appjar_name)

    # Set default JobConf args.
    if self.config.SCHEDULER_QUEUE:
        jobconf_args[self.config.SCHEDULER_PARAM] = \
            self.config.SCHEDULER_QUEUE
    jobconf_args['mapred.job.name'] = self.job_name

    # Create arguments.
    arguments = ' '.join('-D%s=%s' % (k, v)
                         for k, v in jobconf_args.iteritems())
    arguments += ' '
    arguments += ' '.join(extra_args)

    libjars = self._get_libjars_local_paths(s3_jar_dirs, local_jar_dir)
    hadoop_classpath = '%s/*' % local_jar_dir

    cmd = 'mkdir -p %(local_jar_dir)s && %(download_jar_cmd)s'

    # Remove blacklisted jars from the local jar dir so they never reach
    # the classpath.
    files_to_be_deleted = []
    for qubole_jar in self.config.QUBOLE_JARS_BLACKLIST:
        files_to_be_deleted.append('%s/%s' % (local_jar_dir, qubole_jar))
    if files_to_be_deleted:
        cmd += ' && rm -f %s' % (' && rm -f '.join(files_to_be_deleted))

    # Generate command.
    var_dict = {
        'class_name': class_name,
        'arguments': arguments,
        'appjar_name': appjar_name,
        'download_jar_cmd': download_jar_cmd,
        'local_jar_dir': local_jar_dir,
        'hadoop_classpath': hadoop_classpath,
        'libjars': libjars,
    }
    cmd += (' && export HADOOP_CLASSPATH=%(hadoop_classpath)s'
            ' && hadoop jar %(local_jar_dir)s/%(appjar_name)s'
            ' %(class_name)s'
            ' -libjars %(libjars)s'
            ' %(arguments)s')
    # Capture the job's exit code so the cleanup below does not mask it.
    cmd += ';\nEXIT_CODE=$?;\nrm -rf %(local_jar_dir)s;\nexit $EXIT_CODE;'
    cmd = cmd % var_dict

    # Log command messages.
    self.log.info('Full command: %s' % cmd)

    # Run command.
    hc, output, stderr, job_ids = self.run_shell_command(cmd)
    return output, stderr, job_ids
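
# Sketch of the shell command run_hadoop_job() assembles, wrapped here for
# readability. All names and paths below are hypothetical, not taken from any
# real config, and the optional "rm -f" of blacklisted jars is omitted:
#
#   mkdir -p /tmp/hadoop_users/alice/a1b2c3
#     && hadoop fs -get s3://pinball/alice/libjars/ /tmp/hadoop_users/alice/a1b2c3
#     && hadoop fs -get s3://pinball/alice/app.jar /tmp/hadoop_users/alice/a1b2c3/app.jar
#     && export HADOOP_CLASSPATH=/tmp/hadoop_users/alice/a1b2c3/*
#     && hadoop jar /tmp/hadoop_users/alice/a1b2c3/app.jar com.example.MyJob
#        -libjars <comma-separated local jar paths>
#        -Dmapred.job.name=<job name> <extra args>;
#   EXIT_CODE=$?;
#   rm -rf /tmp/hadoop_users/alice/a1b2c3;
#   exit $EXIT_CODE;
#
# Chaining with "&&" aborts the whole command as soon as any jar download
# fails, while capturing EXIT_CODE before the final "rm -rf" lets the local
# jar directory be cleaned up without masking the Hadoop job's exit status.
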