def hadoop_jar(stdout, stderr, environ, *args):
    """Fake the ``hadoop jar`` command for the mock Hadoop environment.

    Status/error text is printed to *stderr*, mimicking real Hadoop.  For
    streaming jars, the pre-seeded mock output directory is moved to where
    the step's ``-output`` argument points.  Returns 0 on success and -1 on
    failure, like a process exit code.
    """
    if not args:
        print('RunJar jarFile [mainClass] args...', file=stderr)
        return -1

    jar_path = args[0]
    if not os.path.exists(jar_path):
        print('Exception in thread "main" java.io.IOException: Error opening'
              ' job jar: %s' % jar_path, file=stderr)
        return -1

    # only streaming jars get simulated output handling
    if HADOOP_STREAMING_JAR_RE.match(os.path.basename(jar_path)):
        step_args = args[1:]
        pos = list(step_args).index('-output')
        assert pos != -1
        hdfs_output = step_args[pos + 1]
        local_output = hdfs_path_to_real_path(hdfs_output, environ)

        canned_output = get_mock_hadoop_output()
        if canned_output is None:
            print('Job failed!', file=stderr)
            return -1

        if os.path.isdir(local_output):
            # presumably the existing dir is expected to be empty — rmdir
            # fails otherwise
            os.rmdir(local_output)
        shutil.move(canned_output, local_output)

    now = datetime.datetime.now()
    print(now.strftime('Running job: job_%Y%m%d%H%M_0001'), file=stderr)
    print('Job succeeded!', file=stderr)
    return 0
def hadoop_jar(stdout, stderr, environ, *args):
    """Mock ``hadoop jar``: writes Hadoop-like status lines to *stderr*.

    Streaming jars are simulated by moving the canned mock output directory
    into the location named by the step's ``-output`` switch.  Returns 0 on
    success and -1 on failure (process-exit-code convention).
    """
    if not args:
        stderr.write('RunJar jarFile [mainClass] args...\n')
        return -1

    jar_path = args[0]
    if not os.path.exists(jar_path):
        stderr.write(
            'Exception in thread "main" java.io.IOException: Error opening job'
            ' jar: %s\n' % jar_path)
        return -1

    # streaming steps are the only kind we simulate
    if HADOOP_STREAMING_JAR_RE.match(os.path.basename(jar_path)):
        step_args = args[1:]
        where = list(step_args).index('-output')
        assert where != -1
        out_uri = step_args[where + 1]
        out_path = hdfs_path_to_real_path(out_uri, environ)

        fake_output = get_mock_hadoop_output()
        if fake_output is None:
            # NOTE(review): no trailing newline here, unlike the other
            # messages — looks unintentional, but preserved as-is
            stderr.write('Job failed!')
            return -1

        if os.path.isdir(out_path):
            os.rmdir(out_path)
        shutil.move(fake_output, out_path)

    now = datetime.datetime.now()
    stderr.write(now.strftime('Running job: job_%Y%m%d%H%M_0001\n'))
    stderr.write('Job succeeded!\n')
    return 0
def find_hadoop_streaming_jar(path):
    """Return the path of the hadoop streaming jar inside the given
    directory tree, or None if we can't find it.

    :param path: root of the directory tree to search
    :return: full path to the first matching jar (in ``os.walk`` order),
             or ``None``
    """
    for dirpath, _, filenames in os.walk(path):
        for filename in filenames:
            if HADOOP_STREAMING_JAR_RE.match(filename):
                return os.path.join(dirpath, filename)

    # Walked the whole tree without a match.  The original ended with a
    # `for`/`else: return None`, but with no matching `break` that `else`
    # always runs, so it was just a confusing spelling of this fall-through
    # (and if ever mis-indented onto the inner loop it would wrongly bail
    # after the first directory).
    return None
def hadoop_jar(stdout, stderr, environ, *args):
    """Simulate ``hadoop jar``, including log4j-style log lines and counters.

    For streaming jars, moves the canned mock output directory into the
    location named by the step's ``-output`` switch.  Returns 0 on success
    and -1 on failure (process-exit-code convention).
    """
    if len(args) < 1:
        print('RunJar jarFile [mainClass] args...', file=stderr)
        return -1

    jar_path = args[0]
    if not os.path.exists(jar_path):
        print('Exception in thread "main" java.io.IOException: Error opening'
              ' job jar: %s' % jar_path, file=stderr)
        return -1

    # use this to simulate log4j
    def mock_log4j(message, level='INFO', logger='mapreduce.JOB', now=None):
        now = now or datetime.datetime.now()
        line = '%s %s %s: %s' % (
            now.strftime('%Y/%m/%d %H:%M:%S'), level, logger, message)
        print(line, file=stderr)

    # simulate counters
    counters = next_mock_hadoop_counters()
    if counters:
        num_counters = sum(len(g) for g in counters.values())
        mock_log4j('Counters: %d' % num_counters)
        # subsequent lines are actually part of same log record
        for group, group_counters in sorted(counters.items()):
            print('\t%s' % group, file=stderr)
            for counter, amount in sorted(group_counters.items()):
                print('\t\t%s=%d' % (counter, amount), file=stderr)

    # simulate output for streaming steps
    if HADOOP_STREAMING_JAR_RE.match(os.path.basename(jar_path)):
        streaming_args = args[1:]
        # tuples support .index() directly (no need to copy into a list),
        # and .index() raises ValueError when '-output' is missing, so the
        # original's `assert output_idx != -1` could never fire; dropped.
        output_idx = streaming_args.index('-output')
        output_dir = streaming_args[output_idx + 1]
        real_output_dir = hdfs_uri_to_real_path(output_dir, environ)

        mock_output_dir = get_mock_hadoop_output()
        if mock_output_dir is None:
            mock_log4j('Job failed!')
            return -1

        if os.path.isdir(real_output_dir):
            # rmdir only succeeds on an empty dir — presumably matches how
            # the mock environment pre-creates it
            os.rmdir(real_output_dir)
        shutil.move(mock_output_dir, real_output_dir)

    now = datetime.datetime.now()
    mock_log4j(now.strftime('Running job: job_%Y%m%d%H%M_0001'))
    mock_log4j('Job succeeded!')
    return 0
def is_job_flow_non_streaming(job_flow):
    """Return True if the given job flow has at least one step, but none of
    them are Hadoop streaming steps (for example, if the job flow is
    running Hive).
    """
    steps = job_flow.steps
    # no steps at all doesn't count as "non-streaming"
    if not steps:
        return False

    return not any(
        HADOOP_STREAMING_JAR_RE.match(posixpath.basename(step.jar))
        for step in steps)
def _find_hadoop_streaming_jar(self):
    """Search for the hadoop streaming jar.

    See :py:meth:`_hadoop_streaming_jar_dirs` for where we search.

    :return: path/URI of the best candidate jar in the first search dir
             that contains any, or ``None`` if no dir has one.
    """
    for search_dir in unique(self._hadoop_streaming_jar_dirs()):
        log.info('Looking for Hadoop streaming jar in %s' % search_dir)

        # the original reused the name ``path`` for both the search dir and
        # each listing entry; renamed to avoid the shadowing
        streaming_jars = [
            jar_path for jar_path in self.fs.ls(search_dir)
            if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(jar_path))]

        if streaming_jars:
            # prefer shorter names and shallower paths
            def sort_key(p):
                return (len(p.split('/')),
                        len(posixpath.basename(p)),
                        p)

            streaming_jars.sort(key=sort_key)
            return streaming_jars[0]

    return None