def _get_libjars_local_paths(self, s3_jar_dirs, local_jar_dir):
    """Return the local paths of the library jars as one comma-joined string.

    Nothing is downloaded here: the given S3 directories are only listed,
    and every jar found is mapped to the path it would have inside
    ``local_jar_dir``.

    Args:
        s3_jar_dirs: iterable of S3 directories from which JARs are pulled
            down.
        local_jar_dir: local directory on every machine in the Qubole
            cluster into which the jars are pulled down.

    Returns:
        A string of local jar paths delimited by commas (,). Jars listed
        in ``self.config.QUBOLE_JARS_BLACKLIST`` and duplicate jar names
        are dropped; the remaining jars keep the order in which they were
        first listed. Returns the empty string when no jar qualifies.
    """
    blacklist = self.config.QUBOLE_JARS_BLACKLIST
    seen_jar_names = set()
    local_jar_paths = []
    for s3_jar_dir in s3_jar_dirs:
        for file_path in s3_utils.list_s3_directory(s3_jar_dir):
            # Match on the '.jar' extension; a bare 'jar' suffix would also
            # accept unrelated files whose name merely ends in those letters.
            if not str(file_path).endswith('.jar'):
                continue
            jar_name = s3_utils.extract_file_name_from_s3_path(file_path)
            if jar_name in blacklist or jar_name in seen_jar_names:
                continue
            # Dedup while preserving listing order, so the generated
            # -libjars value is stable from one run to the next.
            seen_jar_names.add(jar_name)
            local_jar_paths.append('%s/%s' % (local_jar_dir, jar_name))
    return ','.join(local_jar_paths)
def _get_libjars_local_paths(self, s3_jar_dirs, local_jar_dir):
    """Return the local paths of the library jars as one comma-joined string.

    Nothing is downloaded here: the given S3 directories are only listed,
    and every jar found is mapped to the path it would have inside
    ``local_jar_dir``.

    Args:
        s3_jar_dirs: iterable of S3 directories from which JARs are pulled
            down.
        local_jar_dir: local directory on every machine in the Qubole
            cluster into which the jars are pulled down.

    Returns:
        A string of local jar paths delimited by commas (,). Jars listed
        in ``self.config.QUBOLE_JARS_BLACKLIST`` and duplicate jar names
        are dropped; the remaining jars keep the order in which they were
        first listed. Returns the empty string when no jar qualifies.
    """
    blacklist = self.config.QUBOLE_JARS_BLACKLIST
    seen_jar_names = set()
    local_jar_paths = []
    for s3_jar_dir in s3_jar_dirs:
        for file_path in s3_utils.list_s3_directory(s3_jar_dir):
            # Match on the '.jar' extension; a bare 'jar' suffix would also
            # accept unrelated files whose name merely ends in those letters.
            if not str(file_path).endswith('.jar'):
                continue
            jar_name = s3_utils.extract_file_name_from_s3_path(file_path)
            if jar_name in blacklist or jar_name in seen_jar_names:
                continue
            # Dedup while preserving listing order, so the generated
            # -libjars value is stable from one run to the next.
            seen_jar_names.add(jar_name)
            local_jar_paths.append('%s/%s' % (local_jar_dir, jar_name))
    return ','.join(local_jar_paths)
def run_hadoop_job(self, class_name, jobconf_args=None, extra_args=None,
                   extra_jars=None):
    """Run a Hadoop job in Qubole cluster.

    We assume extra_jars are stored on s3 and the path looks like:
        s3://pinball/%{USER}/some_jar_dir/

    We fail the entire command if pulling the JARs down from s3 fails,
    so we use "&&" to connect shell commands. The local jar directory is
    removed afterwards regardless of the outcome, and the shell exits with
    the exit code of the "&&" chain.

    Args:
        class_name: main class handed to ``hadoop jar``.
        jobconf_args: optional dict of JobConf settings, each rendered as
            ``-Dkey=value``. The caller's dict is not modified.
        extra_args: optional list of strings appended after the JobConf
            settings.
        extra_jars: optional list of additional s3 jar directories,
            appended to ``self.config.USER_LIBJAR_DIRS``.

    Returns:
        The ``(output, stderr, job_ids)`` values reported by
        ``self.run_shell_command``.
    """
    # Work on a copy so that the scheduler/job-name entries added below
    # never leak into the caller's dict.
    jobconf_args = jobconf_args.copy() if jobconf_args else {}
    extra_args = extra_args if extra_args else []
    extra_jars = extra_jars if extra_jars else []

    # The place where all jars are stored in s3. Normalise both operands
    # to lists so a tuple on either side does not break the concatenation.
    s3_jar_dirs = list(self.config.USER_LIBJAR_DIRS) + list(extra_jars)

    # The place where all jars will be copied to locally.
    local_jar_dir = '/tmp/hadoop_users/%s/%s' % (
        self.config.USER, utils.get_random_string())
    appjar_name = s3_utils.extract_file_name_from_s3_path(
        self.config.USER_APPJAR_PATH)

    # Set default JobConf args.
    if self.config.SCHEDULER_QUEUE:
        jobconf_args[self.config.SCHEDULER_PARAM] = \
            self.config.SCHEDULER_QUEUE
    jobconf_args['mapred.job.name'] = self.job_name

    # Create arguments. items() works on both Python 2 and Python 3.
    arguments = ' '.join('-D%s=%s' % (k, v)
                         for k, v in jobconf_args.items())
    arguments += ' '
    arguments += ' '.join(extra_args)

    libjars = self._get_libjars_local_paths(s3_jar_dirs, local_jar_dir)

    # Collect every shell step in a list and join once. This stays valid
    # when there are no lib jar directories (no dangling '&&') and never
    # runs user-controlled paths through another round of %-formatting.
    steps = ['mkdir -p %s' % local_jar_dir]
    steps.extend('hadoop fs -get %s %s' % (s3_dir, local_jar_dir)
                 for s3_dir in s3_jar_dirs)
    steps.append('hadoop fs -get %s %s/%s' % (
        self.config.USER_APPJAR_PATH, local_jar_dir, appjar_name))
    # Blacklisted jars come down with the rest of their directory; remove
    # them before they end up on the classpath.
    steps.extend('rm -f %s/%s' % (local_jar_dir, qubole_jar)
                 for qubole_jar in self.config.QUBOLE_JARS_BLACKLIST)
    steps.append('export HADOOP_CLASSPATH=%s/*' % local_jar_dir)

    hadoop_cmd = 'hadoop jar %s/%s %s' % (
        local_jar_dir, appjar_name, class_name)
    if libjars:
        # Only pass -libjars when there is a value for it; a bare flag
        # would consume the next argument as its value.
        hadoop_cmd += ' -libjars %s' % libjars
    hadoop_cmd += ' %s' % arguments
    steps.append(hadoop_cmd)

    # Generate command: run the chain, remember its exit code, always
    # clean up the local jar directory, then report the remembered code.
    cmd = ' && '.join(steps)
    cmd += ';\nEXIT_CODE=$?; \nrm -rf %s; \nexit $EXIT_CODE;' % local_jar_dir

    # Log command messages.
    self.log.info('Full command: %s' % cmd)

    # Run command.
    _, output, stderr, job_ids = self.run_shell_command(cmd)
    return output, stderr, job_ids
def run_hadoop_job(self, class_name, jobconf_args=None, extra_args=None,
                   extra_jars=None):
    """Run a Hadoop job in Qubole cluster.

    We assume extra_jars are stored on s3 and the path looks like:
        s3://pinball/%{USER}/some_jar_dir/

    We fail the entire command if pulling the JARs down from s3 fails,
    so we use "&&" to connect shell commands. The local jar directory is
    removed afterwards regardless of the outcome, and the shell exits with
    the exit code of the "&&" chain.

    Args:
        class_name: main class handed to ``hadoop jar``.
        jobconf_args: optional dict of JobConf settings, each rendered as
            ``-Dkey=value``. The caller's dict is not modified.
        extra_args: optional list of strings appended after the JobConf
            settings.
        extra_jars: optional list of additional s3 jar directories,
            appended to ``self.config.USER_LIBJAR_DIRS``.

    Returns:
        The ``(output, stderr, job_ids)`` values reported by
        ``self.run_shell_command``.
    """
    # Work on a copy so that the scheduler/job-name entries added below
    # never leak into the caller's dict.
    jobconf_args = jobconf_args.copy() if jobconf_args else {}
    extra_args = extra_args if extra_args else []
    extra_jars = extra_jars if extra_jars else []

    # The place where all jars are stored in s3. Normalise both operands
    # to lists so a tuple on either side does not break the concatenation.
    s3_jar_dirs = list(self.config.USER_LIBJAR_DIRS) + list(extra_jars)

    # The place where all jars will be copied to locally.
    local_jar_dir = '/tmp/hadoop_users/%s/%s' % (
        self.config.USER, utils.get_random_string())
    appjar_name = s3_utils.extract_file_name_from_s3_path(
        self.config.USER_APPJAR_PATH)

    # Set default JobConf args.
    if self.config.SCHEDULER_QUEUE:
        jobconf_args[self.config.SCHEDULER_PARAM] = \
            self.config.SCHEDULER_QUEUE
    jobconf_args['mapred.job.name'] = self.job_name

    # Create arguments. items() works on both Python 2 and Python 3.
    arguments = ' '.join('-D%s=%s' % (k, v)
                         for k, v in jobconf_args.items())
    arguments += ' '
    arguments += ' '.join(extra_args)

    libjars = self._get_libjars_local_paths(s3_jar_dirs, local_jar_dir)

    # Collect every shell step in a list and join once. This stays valid
    # when there are no lib jar directories (no dangling '&&') and never
    # runs user-controlled paths through another round of %-formatting.
    steps = ['mkdir -p %s' % local_jar_dir]
    steps.extend('hadoop fs -get %s %s' % (s3_dir, local_jar_dir)
                 for s3_dir in s3_jar_dirs)
    steps.append('hadoop fs -get %s %s/%s' % (
        self.config.USER_APPJAR_PATH, local_jar_dir, appjar_name))
    # Blacklisted jars come down with the rest of their directory; remove
    # them before they end up on the classpath.
    steps.extend('rm -f %s/%s' % (local_jar_dir, qubole_jar)
                 for qubole_jar in self.config.QUBOLE_JARS_BLACKLIST)
    steps.append('export HADOOP_CLASSPATH=%s/*' % local_jar_dir)

    hadoop_cmd = 'hadoop jar %s/%s %s' % (
        local_jar_dir, appjar_name, class_name)
    if libjars:
        # Only pass -libjars when there is a value for it; a bare flag
        # would consume the next argument as its value.
        hadoop_cmd += ' -libjars %s' % libjars
    hadoop_cmd += ' %s' % arguments
    steps.append(hadoop_cmd)

    # Generate command: run the chain, remember its exit code, always
    # clean up the local jar directory, then report the remembered code.
    cmd = ' && '.join(steps)
    cmd += ';\nEXIT_CODE=$?; \nrm -rf %s; \nexit $EXIT_CODE;' % local_jar_dir

    # Log command messages.
    self.log.info('Full command: %s' % cmd)

    # Run command.
    _, output, stderr, job_ids = self.run_shell_command(cmd)
    return output, stderr, job_ids