Example #1
def expand_paths(datapath_uri):
    """
    If a URI contains wildcards, this function expands them.

    Returns a list of URIs.
    """
    # simple case:  the path simply exists
    if phdfs.path.exists(datapath_uri.geturl()):
        return [datapath_uri.geturl()]

    # second case:  the path doesn't exist as it is.  It may contain wildcards, so we try
    # listing the datapath with hadoop dfs.  If we were to list with
    # pydoop.hdfs.ls we'd have to implement hadoop wildcards ourselves (perhaps with fnmatch)

    def process(ls_line):
        path = ls_line[(ls_line.rindex(' ') + 1):]
        url = Uri(urlparse.urlparse(path))
        url.scheme = datapath_uri.scheme
        url.netloc = datapath_uri.netloc
        return url.geturl()

    try:
        # run -ls with hadoop dfs, then process the output.
        # We drop the first line since it's something like "Found xx items".
        ls_output = subprocess.check_output([pydoop.hadoop_exec(), 'dfs', '-ls', datapath_uri.geturl()]).rstrip('\n').split('\n')[1:]
        # for each data line, apply the 'process' function to transform it into a full URI
        return map(process, ls_output)
    except subprocess.CalledProcessError as e:
        print_err("Could not list datapath %s.  Please check whether it exists" % datapath_uri.geturl())
        print_err("Message:", str(e))
        sys.exit(1)
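
A minimal usage sketch, assuming datapath_uri is a parsed URI (e.g. the result of urlparse.urlparse) and that the module-level imports used above (phdfs, pydoop, subprocess, sys) are in scope; the wildcard path shown is hypothetical:

# Hedged usage sketch: expand a hypothetical wildcard HDFS path into concrete URIs.
import urlparse  # Python 2, matching the examples above

datapath_uri = urlparse.urlparse("hdfs://namenode:8020/data/part-*")
for uri in expand_paths(datapath_uri):
    sys.stdout.write(uri + "\n")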
Example #2
def run_hadoop_jar(jar, class_name=None, additional_cp=None, properties=None, args_list=[]):
    """
    Run a jar with "hadoop jar", optionally specifying the main class.
    """
    if not os.path.exists(jar) or not os.access(jar, os.R_OK):
        raise ValueError("Can't read jar file %s" % jar)
    args = [pydoop.hadoop_exec(), 'jar', jar]
    if class_name:
        args.append(class_name)
    if additional_cp:
        env = copy.copy(os.environ)
        if type(additional_cp) == str: # wrap a single class path in a list
            additional_cp = [additional_cp]
        # Pass this classpath string to hadoop through the HADOOP_CLASSPATH
        # environment variable.  If HADOOP_CLASSPATH is already defined, we'll
        # append our values to it.
        if env.has_key('HADOOP_CLASSPATH'):
            additional_cp.insert(0, env['HADOOP_CLASSPATH'])
        env['HADOOP_CLASSPATH'] = ":".join(additional_cp)
    else:
        env = os.environ
    if properties:
        args.extend( __construct_property_args(properties) )
    args.extend(args_list)
    return subprocess.call(args, env=env)
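
A brief usage sketch; the jar path, main class, extra classpath entry, and argument names below are all hypothetical, chosen only to exercise each parameter:

# Hedged usage sketch: run a hypothetical jar with an extra classpath entry,
# one Hadoop property, and two positional arguments.
retcode = run_hadoop_jar(
    "/opt/myapp/myapp.jar",                     # hypothetical jar location
    class_name="com.example.MyTool",            # hypothetical main class
    additional_cp="/opt/myapp/lib/extra.jar",   # a single string is wrapped in a list
    properties={"mapred.reduce.tasks": "4"},
    args_list=["input_dir", "output_dir"],
)
if retcode != 0:
    sys.exit(retcode)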
Example #3
def run_hadoop_jar(jar,
                   class_name=None,
                   additional_cp=None,
                   properties=None,
                   args_list=[]):
    """
    Run a jar with "hadoop jar", optionally specifying the main class.
    """
    if not os.path.exists(jar) or not os.access(jar, os.R_OK):
        raise ValueError("Can't read jar file %s" % jar)
    args = [pydoop.hadoop_exec(), 'jar', jar]
    if class_name:
        args.append(class_name)
    if additional_cp:
        env = copy.copy(os.environ)
        if type(additional_cp) == str:  # wrap a single class path in a list
            additional_cp = [additional_cp]
        # Pass this classpath string to hadoop through the HADOOP_CLASSPATH
        # environment variable.  If HADOOP_CLASSPATH is already defined, we'll
        # append our values to it.
        if env.has_key('HADOOP_CLASSPATH'):
            additional_cp.insert(0, env['HADOOP_CLASSPATH'])
        env['HADOOP_CLASSPATH'] = ":".join(additional_cp)
    else:
        env = os.environ
    if properties:
        args.extend(__construct_property_args(properties))
    args.extend(args_list)
    return subprocess.call(args, env=env)
Example #4
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None, keep_streams=True):
    """
    Run a Hadoop command.

    If ``keep_streams`` is set to :obj:`True` (the default), the
    stdout and stderr of the command will be buffered in memory.  If
    the command succeeds, the former will be returned; if it fails, a
    ``RunCmdError`` will be raised with the latter as the message.
    This mode is appropriate for short-running commands whose "result"
    is represented by their standard output (e.g., ``"dfsadmin",
    ["-safemode", "get"]``).

    If ``keep_streams`` is set to :obj:`False`, the command will write
    directly to the stdout and stderr of the calling process, and the
    return value will be empty.  This mode is appropriate for long
    running commands that do not write their "real" output to stdout
    (such as pipes).

    .. code-block:: python

      >>> hadoop_classpath = run_cmd('classpath')
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args,))
    if keep_streams:
        p = subprocess.Popen(
            _args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        error = ""
        stderr_iterator = iter(p.stderr.readline, b"")
        for line in stderr_iterator:
            error += line
            logger.info("cmd stderr line: " + line.strip())

        output, _ = p.communicate()
    else:
        p = subprocess.Popen(_args, stdout=None, stderr=None, bufsize=1)
        ret = p.wait()
        error = 'command exited with %d status' % ret if ret else ''
        output = ''
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
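
A short usage sketch of the two modes described in the docstring; the dfsadmin call mirrors the docstring's own example, while the fs -ls path and the error handling are only illustrative:

# keep_streams=True (the default) buffers and returns the command's stdout.
safemode = run_cmd("dfsadmin", ["-safemode", "get"])

# keep_streams=False lets a long-running command write directly to our own
# stdout/stderr; the return value is empty and a failure raises RunCmdError.
try:
    run_cmd("fs", ["-ls", "/user"], keep_streams=False)
except RunCmdError as e:
    raise SystemExit(str(e))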
Example #5
def main(class_name, app_name, args):
    print >>sys.stderr, "Using hadoop executable", pydoop.hadoop_exec()
    print >>sys.stderr, "Using seal jar", seal.jar_path()

    retcode = seal_utilities.run_hadoop_jar(seal.jar_path(), class_name, args_list=args)
    if retcode != 0 and retcode != 3: # 3 for usage error
        print >>sys.stderr, "Error running", app_name
    return retcode
Example #6
def run_cmd(cmd,
            args=None,
            properties=None,
            hadoop_home=None,
            hadoop_conf_dir=None,
            logger=None,
            keep_streams=True):
    """
    Run a Hadoop command.

    If ``keep_streams`` is set to :obj:`True` (the default), the
    stdout and stderr of the command will be buffered in memory.  If
    the command succeeds, the former will be returned; if it fails, a
    ``RunCmdError`` will be raised with the latter as the message.
    This mode is appropriate for short-running commands whose "result"
    is represented by their standard output (e.g., ``"dfsadmin",
    ["-safemode", "get"]``).

    If ``keep_streams`` is set to :obj:`False`, the command will write
    directly to the stdout and stderr of the calling process, and the
    return value will be empty.  This mode is appropriate for long
    running commands that do not write their "real" output to stdout
    (such as pipes).

    .. code-block:: python

      >>> hadoop_classpath = run_cmd('classpath')
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args, ))
    if keep_streams:
        p = subprocess.Popen(_args,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
    else:
        p = subprocess.Popen(_args, stdout=None, stderr=None, bufsize=1)
        ret = p.wait()
        error = 'command exited with %d status' % ret if ret else ''
        output = ''
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
Example #7
def run_pipes(executable, input_path, output_path, properties=None, args_list=[]):
    args = [pydoop.hadoop_exec(), "pipes"]
    properties = properties.copy() if properties else {}
    properties['hadoop.pipes.executable'] = executable

    args.extend( __construct_property_args(properties) )
    args.extend(["-input", input_path, "-output", output_path])
    args.extend(args_list)
    return subprocess.call(args)
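
A usage sketch; the HDFS paths are hypothetical, and the two pipes properties are the standard Hadoop Pipes record reader/writer switches (see also the constants in Example #14 below):

# Hedged usage sketch: launch a C++ pipes executable already uploaded to HDFS
# (all paths below are hypothetical).
retcode = run_pipes(
    "hdfs://namenode:8020/user/me/bin/wordcount",   # hypothetical executable
    "hdfs://namenode:8020/user/me/input",
    "hdfs://namenode:8020/user/me/output",
    properties={
        "hadoop.pipes.java.recordreader": "true",
        "hadoop.pipes.java.recordwriter": "true",
    },
)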
Example #8
def test_hadoop():
    """
    Test the hadoop configuration.
    Calls sys.exit if test fails.
    """
    cmd = [pydoop.hadoop_exec(), 'dfs', '-stat', 'file:///']
    try:
        subprocess.check_output(cmd)
    except subprocess.CalledProcessError as e:
        print_err("Error running hadoop program.  Please check your environment (tried %s)" % ' '.join(cmd))
        print_err("Message:", str(e))
        sys.exit(2)
Example #9
def run_cmd(cmd,
            args=None,
            properties=None,
            hadoop_home=None,
            hadoop_conf_dir=None,
            logger=None):
    """
  Run a Hadoop command.

  If the command succeeds, return its output; if it fails, raise a
  ``RunCmdError`` with its error output as the message.

  .. code-block:: python

    >>> import uuid
    >>> properties = {'dfs.block.size': 32*2**20}
    >>> args = ['-put', 'hadut.py', uuid.uuid4().hex]
    >>> res = run_cmd('fs', args, properties)
    >>> res
    ''
    >>> print run_cmd('dfsadmin', ['-help', 'report'])
    -report: Reports basic filesystem information and statistics.
    >>> try:
    ...     run_cmd('foo')
    ... except RunCmdError as e:
    ...     print e
    ...
    Exception in thread "main" java.lang.NoClassDefFoundError: foo
    ...
  """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.info('args %s, cmd %s, properties %s', _args, cmd, properties)
    p = subprocess.Popen(_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
Example #10
def run_pipes(executable,
              input_path,
              output_path,
              properties=None,
              args_list=[]):
    args = [pydoop.hadoop_exec(), "pipes"]
    properties = properties.copy() if properties else {}
    properties['hadoop.pipes.executable'] = executable

    args.extend(__construct_property_args(properties))
    args.extend(["-input", input_path, "-output", output_path])
    args.extend(args_list)
    return subprocess.call(args)
Example #11
def run_cmd(cmd,
            args=None,
            properties=None,
            hadoop_home=None,
            hadoop_conf_dir=None,
            logger=None,
            keep_streams=True):
    tool = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    run_tool_cmd(tool,
                 cmd,
                 args=args,
                 properties=properties,
                 hadoop_conf_dir=hadoop_conf_dir,
                 logger=logger,
                 keep_streams=keep_streams)
Example #12
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None):
  """
  Run a Hadoop command.

  If the command succeeds, return its output; if it fails, raise a
  ``RunCmdError`` with its error output as the message.

  .. code-block:: python

    >>> import uuid
    >>> properties = {'dfs.block.size': 32*2**20}
    >>> args = ['-put', 'hadut.py', uuid.uuid4().hex]
    >>> res = run_cmd('fs', args, properties)
    >>> res
    ''
    >>> print run_cmd('dfsadmin', ['-help', 'report'])
    -report: Reports basic filesystem information and statistics.
    >>> try:
    ...     run_cmd('foo')
    ... except RunCmdError as e:
    ...     print e
    ...
    Exception in thread "main" java.lang.NoClassDefFoundError: foo
    ...
  """
  if logger is None:
    logger = utils.NullLogger()
  hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
  _args = [hadoop]
  if hadoop_conf_dir:
    _args.extend(["--config", hadoop_conf_dir])
  _args.append(cmd)
  if properties:
    _args.extend(_construct_property_args(properties))
  if args:
    if isinstance(args, basestring):
      args = shlex.split(args)
    _merge_csv_args(args)
    gargs = _pop_generic_args(args)
    for seq in gargs, args:
      _args.extend(map(str, seq))
  logger.info('args %s, cmd %s, properties %s', _args, cmd, properties)
  p = subprocess.Popen(_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  output, error = p.communicate()
  if p.returncode:
    raise RunCmdError(p.returncode, ' '.join(_args), error)
  return output
Example #13
def perform_distcp(copy_groups):
    cmd_start = [ pydoop.hadoop_exec(), 'distcp2', '-atomic' ]
    try:
        for output_path, src_paths in copy_groups.iteritems():
            cmd = cmd_start[:]
            cmd.extend(src_paths)
            cmd.append(output_path)
            log.debug("%s", cmd)
            subprocess.check_call(cmd)
            # Hadoop distcp2 doesn't seem to correctly report errors through its
            # exit code. For instance, it exits with 0 even when the job is killed.
            # To verify its success we'll check that the destination directory exists.
            # Since we're using -atomic it should only exist if everything went well.
            if phdfs.path.exists(output_path):
                log.info("Successfully ran distcp")
            else:
                raise RuntimeError("Distcp2 failed to complete. Output path not created: %s" % output_path)
    except (subprocess.CalledProcessError, RuntimeError) as e:
        log.critical("Error running distcp: %s", e.message)
        raise e
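
A sketch of the expected input shape: a dict mapping each destination path to the list of source paths to copy into it (the URIs below are hypothetical):

# Hedged usage sketch: copy two source directories into one destination with
# distcp2 -atomic (paths are hypothetical).
copy_groups = {
    "hdfs://dest-nn:8020/archive/run_42": [
        "hdfs://src-nn:8020/runs/run_42/part_a",
        "hdfs://src-nn:8020/runs/run_42/part_b",
    ],
}
perform_distcp(copy_groups)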
Example #14
The purpose of this example is to demonstrate the usage of
SequenceFileInputFormat and SequenceFileOutputFormat.
"""

import os
import optparse
import logging

logging.basicConfig(level=logging.INFO)

import pydoop
import pydoop.test_support as pts
import pydoop.hadut as hadut


HADOOP = pydoop.hadoop_exec()
HADOOP_CONF_DIR = pydoop.hadoop_conf()
OUTPUT = "output"
LOCAL_WC_SCRIPT = "bin/wordcount.py"
LOCAL_FILTER_SCRIPT = "bin/filter.py"

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
DEFAULT_INPUT = os.path.normpath(os.path.join(THIS_DIR, "../input"))

MR_JOB_NAME = "mapred.job.name"
MR_HOME_DIR = 'mapreduce.admin.user.home.dir'
PIPES_JAVA_RR = "hadoop.pipes.java.recordreader"
PIPES_JAVA_RW = "hadoop.pipes.java.recordwriter"
MR_OUT_COMPRESS_TYPE = "mapred.output.compression.type"
MR_REDUCE_TASKS = "mapred.reduce.tasks"
MR_IN_CLASS = "mapred.input.format.class"
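
A sketch of how these constants might feed a job's property dictionary; the values below are illustrative assumptions, not taken from the original script, and the run_pipes call in the comment is only a pointer to pydoop.hadut:

# Hedged sketch: build a property dict from the constants above for a pipes run
# (the values are assumed defaults, not the original script's settings).
BASE_MR_OPTIONS = {
    MR_JOB_NAME: "wordcount",
    PIPES_JAVA_RR: "false",
    PIPES_JAVA_RW: "false",
    MR_REDUCE_TASKS: "2",
    MR_IN_CLASS: "org.apache.hadoop.mapred.SequenceFileInputFormat",
}
# e.g. hadut.run_pipes(pipes_exe, input_dir, OUTPUT, properties=BASE_MR_OPTIONS)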
Example #15
def run_cmd(cmd, args=None, properties=None, hadoop_home=None,
            hadoop_conf_dir=None, logger=None, keep_streams=True):
    tool = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    run_tool_cmd(tool, cmd, args=args, properties=properties,
                 hadoop_conf_dir=hadoop_conf_dir, logger=logger,
                 keep_streams=keep_streams)