def _find_hadoop_bin(self): """Look for the hadoop binary in any plausible place. If all else fails, return ``['hadoop']``. """ def yield_paths(): for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL': path = os.environ.get(name) if path: yield os.path.join(path, 'bin') # They use $HADOOP_INSTALL/hadoop/bin here: # https://wiki.apache.org/hadoop/GettingStartedWithHadoop if os.environ.get('HADOOP_INSTALL'): yield os.path.join( os.environ['HADOOP_INSTALL'], 'hadoop', 'bin') yield None # use $PATH # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give # up. Don't worry about duplicates; they're de-duplicated below for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield os.path.join(path, 'bin') for path in unique(yield_paths()): log.info('Looking for hadoop binary in %s...' % (path or '$PATH')) hadoop_bin = which('hadoop', path=path) if hadoop_bin: log.info('Found hadoop binary: %s' % hadoop_bin) return [hadoop_bin] else: log.info("Falling back to 'hadoop'") return ['hadoop']
def _find_hadoop_bin(self): """Look for the hadoop binary in any plausible place. If all else fails, return ``['hadoop']``. """ def yield_paths(): for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL': path = os.environ.get(name) if path: yield os.path.join(path, 'bin') # They use $HADOOP_INSTALL/hadoop/bin here: # https://wiki.apache.org/hadoop/GettingStartedWithHadoop if os.environ.get('HADOOP_INSTALL'): yield os.path.join(os.environ['HADOOP_INSTALL'], 'hadoop', 'bin') yield None # use $PATH # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give # up. Don't worry about duplicates; they're de-duplicated below for name, path in sorted(os.environ.items()): if name.startswith('HADOOP_') and name.endswith('_HOME'): yield os.path.join(path, 'bin') for path in unique(yield_paths()): log.info('Looking for hadoop binary in %s...' % (path or '$PATH')) hadoop_bin = which('hadoop', path=path) if hadoop_bin: log.info('Found hadoop binary: %s' % hadoop_bin) return [hadoop_bin] else: log.info("Falling back to 'hadoop'") return ['hadoop']
def _stream_history_log_dirs(self, output_dir=None):
    """Yield lists of directories to look for the history log in."""
    for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
        if _logs_exist(self.fs, log_dir):
            log.info('Looking for history log in %s...' % log_dir)
            # logs aren't always in a subdir named history/
            yield [log_dir]
def stream_history_log_dirs():
    for log_dir in unique(
            self._hadoop_log_dirs(
                output_dir=step_interpretation.get('output_dir'))):
        if self.fs.exists(log_dir):
            log.info('Looking for history log in %s' % log_dir)
            yield [log_dir]
def _pick_error_attempt_ids(log_interpretation):
    """Pick error attempt IDs, so we know which task logs to look at."""
    errors = _pick_errors(log_interpretation)

    errors.sort(key=_is_probably_task_error, reverse=True)

    return list(
        unique(error['attempt_id']
               for error in errors
               if error.get('attempt_id')))
def _stream_task_log_dirs(self, application_id=None, output_dir=None):
    """Yield lists of directories to look for the task logs in."""
    # Note: this is unlikely to be super-helpful on "real" (multi-node)
    # pre-YARN Hadoop because task logs aren't generally shipped to a
    # local directory. It's a start, anyways. See #1201.
    for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
        if application_id:
            path = self.fs.join(log_dir, 'userlogs', application_id)
        else:
            path = self.fs.join(log_dir, 'userlogs')

        if _logs_exist(self.fs, path):
            log.info('Looking for task syslogs in %s...' % path)
            yield [path]
def _find_spark_submit_bin(self):
    # TODO: this is very similar to _find_hadoop_bin() (in fs)
    for path in unique(self._spark_submit_bin_dirs()):
        log.info('Looking for spark-submit binary in %s...' % (
            path or '$PATH'))

        spark_submit_bin = which('spark-submit', path=path)

        if spark_submit_bin:
            log.info('Found spark-submit binary: %s' % spark_submit_bin)
            return [spark_submit_bin]
    else:
        log.info("Falling back to 'spark-submit'")
        return ['spark-submit']
def stream_task_log_dirs():
    for log_dir in unique(
            self._hadoop_log_dirs(output_dir=output_dir)):
        if yarn:
            path = self.fs.join(log_dir, 'userlogs', application_id)
        else:
            # sometimes pre-YARN attempt logs are organized by job_id,
            # sometimes not. Play it safe
            path = self.fs.join(log_dir, 'userlogs')

        if self.fs.exists(path):
            log.info('looking for logs in %s' % path)
            yield [path]
def _pick_error_attempt_ids(log_interpretation):
    """Pick error attempt IDs from step and history logs, so we know
    which task logs to look at (most relevant first)"""
    errors = _extract_errors(log_interpretation)

    attempt_to_container_id = log_interpretation.get('history', {}).get(
        'attempt_to_container_id', {})

    errors = _merge_and_sort_errors(errors, attempt_to_container_id)

    errors.sort(key=_is_probably_task_error, reverse=True)

    return list(
        unique(error['attempt_id']
               for error in errors
               if error.get('attempt_id')))
def stream_task_log_dirs():
    for log_dir in unique(
            self._hadoop_log_dirs(
                output_dir=step_interpretation.get('output_dir'))):
        if yarn:
            path = self.fs.join(log_dir, 'userlogs', application_id)
        else:
            # sometimes pre-YARN attempt logs are organized by job_id,
            # sometimes not. Play it safe
            path = self.fs.join(log_dir, 'userlogs')

        if self.fs.exists(path):
            log.info('Scanning task syslogs in %s' % path)
            yield [path]
def _find_spark_submit_bin(self): """Attempt to find the spark binary. Returns a list of arguments. Defaults to ``['spark-submit']``. Re-define this in your subclass if you already know where to find spark-submit (e.g. on cloud services). """ for path in unique(self._spark_submit_bin_dirs()): log.info('Looking for spark-submit binary in %s...' % ( path or '$PATH')) spark_submit_bin = which('spark-submit', path=path) if spark_submit_bin: log.info('Found spark-submit binary: %s' % spark_submit_bin) return [spark_submit_bin] else: log.info("Falling back to 'spark-submit'") return ['spark-submit']
def _find_hadoop_streaming_jar(self):
    """Search for the hadoop streaming jar. See
    :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
    for path in unique(self._hadoop_streaming_jar_dirs()):
        log.info('Looking for Hadoop streaming jar in %s...' % path)

        streaming_jars = []
        for path in self.fs.ls(path):
            if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                streaming_jars.append(path)

        if streaming_jars:
            # prefer shorter names and shallower paths
            def sort_key(p):
                return (len(p.split('/')),
                        len(posixpath.basename(p)),
                        p)

            streaming_jars.sort(key=sort_key)

            return streaming_jars[0]

    return None
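# Quick illustration of the sort_key above: fewer path components win first,
# then shorter basenames, so the plain streaming jar beats deeper or more
# verbosely named variants. The candidate paths below are made up purely for
# illustration; they aren't taken from any particular Hadoop install.
import posixpath

candidates = [
    '/usr/lib/hadoop-mapreduce/hadoop-streaming-2.7.3-sources.jar',
    '/usr/lib/hadoop-mapreduce/hadoop-streaming.jar',
    '/usr/lib/hadoop-mapreduce/contrib/streaming/hadoop-streaming-2.7.3.jar',
]

def sort_key(p):
    return (len(p.split('/')), len(posixpath.basename(p)), p)

print(sorted(candidates, key=sort_key)[0])
# -> /usr/lib/hadoop-mapreduce/hadoop-streaming.jar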
def test_mixed_types_ok(self):
    self.assertEqual(list(unique(['a', None, 33, 'a'])),
                     ['a', None, 33])

def test_de_duplication(self):
    self.assertEqual(list(unique([1, 2, 1, 5, 1])),
                     [1, 2, 5])

def test_preserves_order(self):
    self.assertEqual(list(unique([6, 7, 2, 0, 7, 1])),
                     [6, 7, 2, 0, 1])

def test_empty(self):
    self.assertEqual(list(unique([])), [])
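# Every snippet above leans on a unique() helper with the behavior the tests
# describe: lazily yield each item the first time it appears, preserve input
# order, and cope with mixed (and even unhashable) values. The library's own
# implementation may differ; this is just a minimal sketch consistent with
# those tests, not the actual helper.

def unique(items):
    """Yield items from ``items``, skipping any we've already seen."""
    seen_hashable = set()    # fast membership test for hashable items
    seen_unhashable = []     # fallback for unhashable items (e.g. lists)

    for item in items:
        try:
            if item in seen_hashable:
                continue
            seen_hashable.add(item)
        except TypeError:
            # item isn't hashable; fall back to a (slower) list scan
            if item in seen_unhashable:
                continue
            seen_unhashable.append(item)

        yield item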