Example #1
import datetime
import os
import shutil


def hadoop_jar(stdout, stderr, environ, *args):
    if len(args) < 1:
        print('RunJar jarFile [mainClass] args...', file=stderr)
        return -1

    jar_path = args[0]
    if not os.path.exists(jar_path):
        print('Exception in thread "main" java.io.IOException: Error opening'
              ' job jar: %s' % jar_path, file=stderr)
        return -1

    # only simulate for streaming steps
    if HADOOP_STREAMING_JAR_RE.match(os.path.basename(jar_path)):
        streaming_args = list(args[1:])
        # list.index() raises ValueError if '-output' is missing
        output_idx = streaming_args.index('-output')
        output_dir = streaming_args[output_idx + 1]
        real_output_dir = hdfs_path_to_real_path(output_dir, environ)

        mock_output_dir = get_mock_hadoop_output()
        if mock_output_dir is None:
            print('Job failed!', file=stderr)
            return -1

        # the output dir may already exist (empty); remove it so that
        # shutil.move() renames the mock output to this path rather than
        # moving it inside the existing directory
        if os.path.isdir(real_output_dir):
            os.rmdir(real_output_dir)

        shutil.move(mock_output_dir, real_output_dir)

    now = datetime.datetime.now()
    print(now.strftime('Running job: job_%Y%m%d%H%M_0001'), file=stderr)
    print('Job succeeded!', file=stderr)
    return 0
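A minimal sketch of how a fake 'hadoop' binary might dispatch to this
function; the main() entry point and its handler table are hypothetical
stand-ins, not part of the snippet above:

import os
import sys


def main():
    # hypothetical dispatcher for a mock `hadoop` executable; real
    # subcommand routing may look quite different
    cmd, cmd_args = sys.argv[1], sys.argv[2:]
    handlers = {'jar': hadoop_jar}  # other subcommands would go here
    if cmd not in handlers:
        print('unknown command: %s' % cmd, file=sys.stderr)
        return -1
    return handlers[cmd](sys.stdout, sys.stderr, dict(os.environ), *cmd_args)


if __name__ == '__main__':
    sys.exit(main())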
Example #2
import datetime
import os
import shutil


def hadoop_jar(stdout, stderr, environ, *args):
    if len(args) < 1:
        stderr.write('RunJar jarFile [mainClass] args...\n')
        return -1

    jar_path = args[0]
    if not os.path.exists(jar_path):
        stderr.write(
            'Exception in thread "main" java.io.IOException: Error opening job'
            ' jar: %s\n' % jar_path)
        return -1

    # only simulate for streaming steps
    if HADOOP_STREAMING_JAR_RE.match(os.path.basename(jar_path)):
        streaming_args = list(args[1:])
        # list.index() raises ValueError if '-output' is missing
        output_idx = streaming_args.index('-output')
        output_dir = streaming_args[output_idx + 1]
        real_output_dir = hdfs_path_to_real_path(output_dir, environ)

        mock_output_dir = get_mock_hadoop_output()
        if mock_output_dir is None:
            stderr.write('Job failed!\n')
            return -1

        # the output dir may already exist (empty); remove it so that
        # shutil.move() renames the mock output to this path rather than
        # moving it inside the existing directory
        if os.path.isdir(real_output_dir):
            os.rmdir(real_output_dir)

        shutil.move(mock_output_dir, real_output_dir)

    now = datetime.datetime.now()
    stderr.write(now.strftime('Running job: job_%Y%m%d%H%M_0001\n'))
    stderr.write('Job succeeded!\n')
    return 0
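A snippet like this is easy to exercise directly; a minimal sketch, assuming
the function above is importable (the jar path is deliberately bogus):

import io

stderr = io.StringIO()
# no jar at this path, so we expect the IOException message and -1
returncode = hadoop_jar(io.StringIO(), stderr, {}, '/no/such/streaming.jar')
assert returncode == -1
assert 'Error opening job jar' in stderr.getvalue()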
Example #3
import os


def find_hadoop_streaming_jar(path):
    """Return the path of the hadoop streaming jar inside the given
    directory tree, or None if we can't find it."""
    for (dirpath, _, filenames) in os.walk(path):
        for filename in filenames:
            if HADOOP_STREAMING_JAR_RE.match(filename):
                return os.path.join(dirpath, filename)
    return None
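HADOOP_STREAMING_JAR_RE is defined elsewhere in the module and isn't shown in
these snippets; a plausible stand-in that the examples here would accept,
together with a usage example (the install path is hypothetical):

import re

# assumption: match names like hadoop-streaming.jar or
# hadoop-streaming-2.7.3.jar; the real pattern may differ
HADOOP_STREAMING_JAR_RE = re.compile(r'^hadoop.*streaming.*\.jar$')

jar = find_hadoop_streaming_jar('/usr/lib/hadoop')  # hypothetical path
print(jar or 'no streaming jar found')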
Example #4
import datetime
import os
import shutil


def hadoop_jar(stdout, stderr, environ, *args):
    if len(args) < 1:
        print('RunJar jarFile [mainClass] args...', file=stderr)
        return -1

    jar_path = args[0]
    if not os.path.exists(jar_path):
        print('Exception in thread "main" java.io.IOException: Error opening'
              ' job jar: %s' % jar_path,
              file=stderr)
        return -1

    # use this to simulate log4j
    def mock_log4j(message, level='INFO', logger='mapreduce.JOB', now=None):
        now = now or datetime.datetime.now()
        line = '%s %s %s: %s' % (now.strftime('%Y/%m/%d %H:%M:%S'), level,
                                 logger, message)
        print(line, file=stderr)

    # simulate counters
    counters = next_mock_hadoop_counters()
    if counters:
        num_counters = sum(len(g) for g in counters.values())
        mock_log4j('Counters: %d' % num_counters)
        # subsequent lines are actually part of same log record
        for group, group_counters in sorted(counters.items()):
            print(('\t%s' % group), file=stderr)
            for counter, amount in sorted(group_counters.items()):
                print(('\t\t%s=%d' % (counter, amount)), file=stderr)

    # simulate output for streaming steps
    if HADOOP_STREAMING_JAR_RE.match(os.path.basename(jar_path)):
        streaming_args = list(args[1:])
        # list.index() raises ValueError if '-output' is missing
        output_idx = streaming_args.index('-output')
        output_dir = streaming_args[output_idx + 1]
        real_output_dir = hdfs_uri_to_real_path(output_dir, environ)

        mock_output_dir = get_mock_hadoop_output()
        if mock_output_dir is None:
            mock_log4j('Job failed!')
            return -1

        # the output dir may already exist (empty); remove it so that
        # shutil.move() renames the mock output to this path rather than
        # moving it inside the existing directory
        if os.path.isdir(real_output_dir):
            os.rmdir(real_output_dir)

        shutil.move(mock_output_dir, real_output_dir)

    now = datetime.datetime.now()
    mock_log4j(now.strftime('Running job: job_%Y%m%d%H%M_0001'))
    mock_log4j('Job succeeded!')
    return 0
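The shape next_mock_hadoop_counters() returns isn't shown here, but the loop
above implies a dict mapping group names to dicts of counter names to
amounts. For illustration (hypothetical values), a return value of

counters = {'Job Counters': {'Launched map tasks': 2,
                             'Launched reduce tasks': 1}}

would produce log4j-style output roughly like:

2024/01/01 00:00:00 INFO mapreduce.JOB: Counters: 2
	Job Counters
		Launched map tasks=2
		Launched reduce tasks=1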
Example #5
import posixpath


def is_job_flow_non_streaming(job_flow):
    """Return True if the give job flow has steps, but not of them are
    Hadoop streaming steps (for example, if the job flow is running Hive).
    """
    if not job_flow.steps:
        return False

    for step in job_flow.steps:
        if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(step.jar)):
            return False

    # job has at least one step, and none are streaming steps
    return True
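A minimal sketch of calling this predicate with stubs standing in for boto's
job flow and step objects (the stub classes are hypothetical, for
illustration only):

from collections import namedtuple

JobFlow = namedtuple('JobFlow', ['steps'])
Step = namedtuple('Step', ['jar'])

hive = JobFlow(steps=[Step(jar='s3://elasticmapreduce/libs/hive/hive-script')])
print(is_job_flow_non_streaming(hive))               # True: no streaming steps
print(is_job_flow_non_streaming(JobFlow(steps=[])))  # False: no steps at all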
Example #6
    def _find_hadoop_streaming_jar(self):
        """Search for the hadoop streaming jar. See
        :py:meth:`_hadoop_streaming_jar_dirs` for where we search."""
        for path in unique(self._hadoop_streaming_jar_dirs()):
            log.info('Looking for Hadoop streaming jar in %s' % path)

            streaming_jars = []
            for uri in self.fs.ls(path):
                if HADOOP_STREAMING_JAR_RE.match(posixpath.basename(uri)):
                    streaming_jars.append(uri)

            if streaming_jars:
                # prefer shorter names and shallower paths
                def sort_key(p):
                    return (len(p.split('/')), len(posixpath.basename(p)), p)

                streaming_jars.sort(key=sort_key)

                return streaming_jars[0]

        return None
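The sort key prefers shallower paths first, then shorter basenames, then
lexicographic order, so a bare hadoop-streaming.jar beats a deeply nested,
version-stamped copy. A toy illustration with hypothetical paths:

import posixpath

jars = ['/usr/lib/hadoop-mapreduce/share/hadoop-streaming-2.7.3.jar',
        '/usr/lib/hadoop/hadoop-streaming.jar']
jars.sort(key=lambda p: (len(p.split('/')), len(posixpath.basename(p)), p))
print(jars[0])  # /usr/lib/hadoop/hadoop-streaming.jar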