Example #1
File: hadoop.py Project: Yelp/mrjob
    def _find_hadoop_bin(self):
        """Look for the hadoop binary in any plausible place. If all
        else fails, return ``['hadoop']``.
        """
        def yield_paths():
            for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
                path = os.environ.get(name)
                if path:
                    yield os.path.join(path, 'bin')

            # They use $HADOOP_INSTALL/hadoop/bin here:
            # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
            if os.environ.get('HADOOP_INSTALL'):
                yield os.path.join(
                    os.environ['HADOOP_INSTALL'], 'hadoop', 'bin')

            yield None  # use $PATH

            # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give
            # up. Don't worry about duplicates; they're de-duplicated below
            for name, path in sorted(os.environ.items()):
                if name.startswith('HADOOP_') and name.endswith('_HOME'):
                    yield os.path.join(path, 'bin')

        for path in unique(yield_paths()):
            log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

            hadoop_bin = which('hadoop', path=path)

            if hadoop_bin:
                log.info('Found hadoop binary: %s' % hadoop_bin)
                return [hadoop_bin]
        else:
            log.info("Falling back to 'hadoop'")
            return ['hadoop']
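
Note: the snippet above relies on two mrjob helpers, unique() and which(). As a rough standalone sketch of the same search-and-fall-back pattern, using only the standard library (shutil.which) and an inline seen-set in place of unique(), one might write something like the following; the function name and structure here are illustrative, not mrjob's API.

import os
import shutil


def find_hadoop_bin():
    # Illustrative sketch only: probe likely bin/ dirs derived from
    # HADOOP_* environment variables, then fall back to bare 'hadoop'.
    def candidate_dirs():
        for name in ('HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL'):
            path = os.environ.get(name)
            if path:
                yield os.path.join(path, 'bin')
        if os.environ.get('HADOOP_INSTALL'):
            yield os.path.join(os.environ['HADOOP_INSTALL'], 'hadoop', 'bin')
        yield None  # None means: let shutil.which() search $PATH

    seen = set()
    for path in candidate_dirs():
        if path in seen:  # de-duplicate, keeping the first occurrence
            continue
        seen.add(path)
        hadoop_bin = shutil.which('hadoop', path=path)
        if hadoop_bin:
            return [hadoop_bin]
    return ['hadoop']
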
Example #2
    def _find_hadoop_bin(self):
        """Look for the hadoop binary in any plausible place. If all
        else fails, return ``['hadoop']``.
        """
        def yield_paths():
            for name in 'HADOOP_PREFIX', 'HADOOP_HOME', 'HADOOP_INSTALL':
                path = os.environ.get(name)
                if path:
                    yield os.path.join(path, 'bin')

            # They use $HADOOP_INSTALL/hadoop/bin here:
            # https://wiki.apache.org/hadoop/GettingStartedWithHadoop
            if os.environ.get('HADOOP_INSTALL'):
                yield os.path.join(os.environ['HADOOP_INSTALL'], 'hadoop',
                                   'bin')

            yield None  # use $PATH

            # Maybe it's in $HADOOP_MAPRED_HOME? $HADOOP_YARN_HOME? Don't give
            # up. Don't worry about duplicates; they're de-duplicated below
            for name, path in sorted(os.environ.items()):
                if name.startswith('HADOOP_') and name.endswith('_HOME'):
                    yield os.path.join(path, 'bin')

        for path in unique(yield_paths()):
            log.info('Looking for hadoop binary in %s...' % (path or '$PATH'))

            hadoop_bin = which('hadoop', path=path)

            if hadoop_bin:
                log.info('Found hadoop binary: %s' % hadoop_bin)
                return [hadoop_bin]
        else:
            log.info("Falling back to 'hadoop'")
            return ['hadoop']
Example #3
    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]
Example #4
    def _stream_history_log_dirs(self, output_dir=None):
        """Yield lists of directories to look for the history log in."""
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if _logs_exist(self.fs, log_dir):
                log.info('Looking for history log in %s...' % log_dir)
                # logs aren't always in a subdir named history/
                yield [log_dir]
Example #5
            def stream_history_log_dirs():
                for log_dir in unique(
                        self._hadoop_log_dirs(
                            output_dir=step_interpretation.get('output_dir'))):

                    if self.fs.exists(log_dir):
                        log.info('Looking for history log in %s' % log_dir)
                        yield [log_dir]
Example #6
            def stream_history_log_dirs():
                for log_dir in unique(
                        self._hadoop_log_dirs(
                            output_dir=step_interpretation.get('output_dir'))):

                    if self.fs.exists(log_dir):
                        log.info('Looking for history log in %s' % log_dir)
                        yield [log_dir]
Example #7
def _pick_error_attempt_ids(log_interpretation):
    """Pick error attempt IDs, so we know which task logs to look at."""
    errors = _pick_errors(log_interpretation)

    errors.sort(key=_is_probably_task_error, reverse=True)

    return list(
        unique(error['attempt_id'] for error in errors
               if error.get('attempt_id')))
Example #8
def _pick_error_attempt_ids(log_interpretation):
    """Pick error attempt IDs, so we know which task logs to look at."""
    errors = _pick_errors(log_interpretation)

    errors.sort(key=_is_probably_task_error, reverse=True)

    return list(unique(
        error['attempt_id'] for error in errors
        if error.get('attempt_id')))
Example #9
    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]
Example #10
    def _find_spark_submit_bin(self):
        # TODO: this is very similar to _find_hadoop_bin() (in fs)
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' %
                     (path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']
Example #11
    def _stream_task_log_dirs(self, application_id=None, output_dir=None):
        """Yield lists of directories to look for the task logs in."""
        # Note: this is unlikely to be super-helpful on "real" (multi-node)
        # pre-YARN Hadoop because task logs aren't generally shipped to a
        # local directory. It's a start, anyways. See #1201.
        for log_dir in unique(self._hadoop_log_dirs(output_dir=output_dir)):
            if application_id:
                path = self.fs.join(log_dir, 'userlogs', application_id)
            else:
                path = self.fs.join(log_dir, 'userlogs')

            if _logs_exist(self.fs, path):
                log.info('Looking for task syslogs in %s...' % path)
                yield [path]
Example #12
    def _find_spark_submit_bin(self):
        # TODO: this is very similar to _find_hadoop_bin() (in fs)
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' % (
                path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']
Example #13
        def stream_task_log_dirs():
            for log_dir in unique(
                    self._hadoop_log_dirs(output_dir=output_dir)):

                if yarn:
                    path = self.fs.join(log_dir, 'userlogs', application_id)
                else:
                    # sometimes pre-YARN attempt logs are organized by job_id,
                    # sometimes not. Play it safe
                    path = self.fs.join(log_dir, 'userlogs')

                if self.fs.exists(path):
                    log.info('looking for logs in %s' % path)
                    yield [path]
Example #14
        def stream_task_log_dirs():
            for log_dir in unique(
                    self._hadoop_log_dirs(output_dir=output_dir)):

                if yarn:
                    path = self.fs.join(log_dir, 'userlogs', application_id)
                else:
                    # sometimes pre-YARN attempt logs are organized by job_id,
                    # sometimes not. Play it safe
                    path = self.fs.join(log_dir, 'userlogs')

                if self.fs.exists(path):
                    log.info('looking for logs in %s' % path)
                    yield [path]
Example #15
def _pick_error_attempt_ids(log_interpretation):
    """Pick error attempt IDs from step and history logs, so we know which
    task logs to look at (most relevant first)"""
    errors = _extract_errors(log_interpretation)

    attempt_to_container_id = log_interpretation.get('history', {}).get(
        'attempt_to_container_id', {})

    errors = _merge_and_sort_errors(errors, attempt_to_container_id)

    errors.sort(key=_is_probably_task_error, reverse=True)

    return list(
        unique(error['attempt_id'] for error in errors
               if error.get('attempt_id')))
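
A side note on the ordering above: list.sort() is stable, so sorting with reverse=True on _is_probably_task_error() moves probable task errors to the front while keeping the original order within each group, and unique() then returns each attempt ID once, most relevant first. A tiny self-contained illustration with made-up error dicts and a stand-in key function (not mrjob's):

# made-up errors for illustration; only the fields used above appear here
errors = [
    dict(attempt_id='attempt_001_m_000001_0', task_error=False),
    dict(attempt_id='attempt_001_m_000002_0', task_error=True),
    dict(attempt_id='attempt_001_m_000002_0', task_error=True),
    dict(attempt_id=None, task_error=True),
]

# stand-in for _is_probably_task_error(); with reverse=True, errors whose
# key is True come first, and the stable sort keeps their relative order
errors.sort(key=lambda e: e['task_error'], reverse=True)

attempt_ids = []
for error in errors:
    attempt_id = error.get('attempt_id')
    if attempt_id and attempt_id not in attempt_ids:
        attempt_ids.append(attempt_id)  # same effect as list(unique(...))

print(attempt_ids)
# -> ['attempt_001_m_000002_0', 'attempt_001_m_000001_0']
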
Example #16
            def stream_task_log_dirs():
                for log_dir in unique(
                        self._hadoop_log_dirs(
                            output_dir=step_interpretation.get('output_dir'))):

                    if yarn:
                        path = self.fs.join(log_dir, 'userlogs',
                                            application_id)
                    else:
                        # sometimes pre-YARN attempt logs are organized by
                        # job_id, sometimes not. Play it safe
                        path = self.fs.join(log_dir, 'userlogs')

                    if self.fs.exists(path):
                        log.info('Scanning task syslogs in %s' % path)
                        yield [path]
Example #17
            def stream_task_log_dirs():
                for log_dir in unique(
                    self._hadoop_log_dirs(
                        output_dir=step_interpretation.get('output_dir'))):

                    if yarn:
                        path = self.fs.join(
                            log_dir, 'userlogs', application_id)
                    else:
                        # sometimes pre-YARN attempt logs are organized by
                        # job_id, sometimes not. Play it safe
                        path = self.fs.join(log_dir, 'userlogs')

                    if self.fs.exists(path):
                        log.info('Scanning task syslogs in %s' % path)
                        yield [path]
Example #18
File: bin.py Project: Affirm/mrjob
    def _find_spark_submit_bin(self):
        """Attempt to find the spark binary. Returns a list of arguments.
        Defaults to ``['spark-submit']``.

        Re-define this in your subclass if you already know where
        to find spark-submit (e.g. on cloud services).
        """
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' % (
                path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']
Example #19
    def _find_spark_submit_bin(self):
        """Attempt to find the spark binary. Returns a list of arguments.
        Defaults to ``['spark-submit']``.

        Re-define this in your subclass if you already know where
        to find spark-submit (e.g. on cloud services).
        """
        for path in unique(self._spark_submit_bin_dirs()):
            log.info('Looking for spark-submit binary in %s...' % (
                path or '$PATH'))

            spark_submit_bin = which('spark-submit', path=path)

            if spark_submit_bin:
                log.info('Found spark-submit binary: %s' % spark_submit_bin)
                return [spark_submit_bin]
        else:
            log.info("Falling back to 'spark-submit'")
            return ['spark-submit']
Example #20
    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')), len(posixpath.basename(p)), p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None
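
The sort_key above prefers jars at shallower paths, then shorter basenames, then lexicographically smaller paths. A quick standalone check with made-up jar locations (not taken from a real install) shows the effect:

import posixpath


def sort_key(p):
    # shallower path first, then shorter basename, then lexicographic order
    return (len(p.split('/')), len(posixpath.basename(p)), p)


jars = [
    '/opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.3.jar',
    '/opt/hadoop/contrib/streaming/hadoop-streaming.jar',
]

print(sorted(jars, key=sort_key)[0])
# -> '/opt/hadoop/contrib/streaming/hadoop-streaming.jar'
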
Example #21
    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s...' % path)

            streaming_jars = []
            for path in self.fs.ls(path):
                if _HADOOP_STREAMING_JAR_RE.match(posixpath.basename(path)):
                    streaming_jars.append(path)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')),
                            len(posixpath.basename(p)),
                            p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None
Example #22
    def test_mixed_types_ok(self):
        self.assertEqual(list(unique(['a', None, 33, 'a'])),
                         ['a', None, 33])
Example #23
    def test_de_duplication(self):
        self.assertEqual(list(unique([1, 2, 1, 5, 1])), [1, 2, 5])
Example #24
    def test_preserves_order(self):
        self.assertEqual(list(unique([6, 7, 2, 0, 7, 1])), [6, 7, 2, 0, 1])
Example #25
    def test_mixed_types_ok(self):
        self.assertEqual(list(unique(['a', None, 33, 'a'])), ['a', None, 33])
Example #26
    def test_empty(self):
        self.assertEqual(list(unique([])), [])
Example #27
    def test_preserves_order(self):
        self.assertEqual(list(unique([6, 7, 2, 0, 7, 1])),
                         [6, 7, 2, 0, 1])
Example #28
    def test_empty(self):
        self.assertEqual(list(unique([])), [])
Example #29
    def test_de_duplication(self):
        self.assertEqual(list(unique([1, 2, 1, 5, 1])),
                         [1, 2, 5])
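
Taken together, the tests above pin down unique()'s contract: it de-duplicates while preserving order, the first occurrence wins, mixed (hashable) types are fine, and empty input yields an empty result. A minimal generator with that behavior looks roughly like this (a sketch of the contract only; mrjob's actual implementation may differ):

def unique(items):
    # yield items in order, skipping any value already seen;
    # assumes items are hashable, as in the tests above
    seen = set()
    for item in items:
        if item not in seen:
            seen.add(item)
            yield item


assert list(unique([1, 2, 1, 5, 1])) == [1, 2, 5]
assert list(unique(['a', None, 33, 'a'])) == ['a', None, 33]
assert list(unique([6, 7, 2, 0, 7, 1])) == [6, 7, 2, 0, 1]
assert list(unique([])) == []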