Example #1
def run_class(class_name,
              args=None,
              properties=None,
              classpath=None,
              hadoop_conf_dir=None,
              logger=None):
    """
    Run a class that needs the Hadoop jars in its class path.

    ``args`` and ``properties`` are passed to :func:`run_cmd`.

    .. code-block:: python

      >>> cls = 'org.apache.hadoop.hdfs.tools.DFSAdmin'
      >>> print run_class(cls, args=['-help', 'report'])
      -report: Reports basic filesystem information and statistics.
    """
    if logger is None:
        logger = utils.NullLogger()
    old_classpath = None
    if classpath:
        old_classpath = os.getenv('HADOOP_CLASSPATH', '')
        if isinstance(classpath, basestring):
            classpath = [classpath]
        classpath_list = [cp.strip() for s in classpath for cp in s.split(":")]
        os.environ['HADOOP_CLASSPATH'] = ":".join(classpath_list)
        logger.debug('HADOOP_CLASSPATH %s', os.environ['HADOOP_CLASSPATH'])
    res = run_cmd(class_name,
                  args,
                  properties,
                  hadoop_conf_dir=hadoop_conf_dir,
                  logger=logger)
    if old_classpath is not None:
        os.environ['HADOOP_CLASSPATH'] = old_classpath
    return res
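
A hedged usage sketch for the function above; the extra jar path is a placeholder, and ``run_class`` / ``RunCmdError`` are assumed to be importable from ``pydoop.hadut``:

from pydoop.hadut import run_class, RunCmdError

# '/path/to/extra.jar' is a hypothetical classpath element.
try:
    out = run_class('org.apache.hadoop.hdfs.tools.DFSAdmin',
                    args=['-help', 'report'],
                    classpath=['/path/to/extra.jar'])
except RunCmdError as e:
    out = str(e)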
Example #2
File: hadut.py Project: wtj/pydoop
    def __init__(self, prefix=None, logger=None):
        self.wd = self.exe = self.input = self.output = None
        self.logger = logger or utils.NullLogger()
        if prefix:
            self.wd = utils.make_random_str(prefix=prefix)
            hdfs.mkdir(self.wd)
            for n in "input", "output":
                setattr(self, n, hdfs.path.join(self.wd, n))
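
The enclosing class is not shown in this excerpt; assuming it is ``pydoop.hadut.PipesRunner``, a minimal usage sketch:

from pydoop.hadut import PipesRunner  # assumption: the class this __init__ belongs to

runner = PipesRunner(prefix='pydoop_test_')  # hypothetical prefix
# runner.wd is a random HDFS working directory; runner.input and
# runner.output are '<wd>/input' and '<wd>/output' respectively.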
Example #3
def run_cmd(cmd,
            args=None,
            properties=None,
            hadoop_home=None,
            hadoop_conf_dir=None,
            logger=None,
            keep_streams=True):
    """
    Run a Hadoop command.

    If ``keep_streams`` is set to :obj:`True` (the default), the
    stdout and stderr of the command will be buffered in memory.  If
    the command succeeds, the former will be returned; if it fails, a
    ``RunCmdError`` will be raised with the latter as the message.
    This mode is appropriate for short-running commands whose "result"
    is represented by their standard output (e.g., ``"dfsadmin",
    ["-safemode", "get"]``).

    If ``keep_streams`` is set to :obj:`False`, the command will write
    directly to the stdout and stderr of the calling process, and the
    return value will be empty.  This mode is appropriate for
    long-running commands that do not write their "real" output to stdout
    (such as pipes).

    .. code-block:: python

      >>> hadoop_classpath = run_cmd('classpath')
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args, ))
    if keep_streams:
        p = subprocess.Popen(_args,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
    else:
        p = subprocess.Popen(_args, stdout=None, stderr=None, bufsize=1)
        ret = p.wait()
        error = 'command exited with %d status' % ret if ret else ''
        output = ''
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
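
The two modes described in the docstring could be exercised as in this sketch; the pipes executable and input/output paths are placeholders:

from pydoop.hadut import run_cmd

# Short command: stdout is buffered and returned (keep_streams=True, the default).
safemode = run_cmd('dfsadmin', ['-safemode', 'get'])

# Long-running command: let it write straight to our stdout/stderr.
run_cmd('pipes',
        ['-program', 'hdfs:///user/me/bin/wc',   # placeholder executable
         '-input', 'wc_input', '-output', 'wc_output'],
        keep_streams=False)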
Example #4
def run_class(class_name,
              args=None,
              properties=None,
              classpath=None,
              hadoop_conf_dir=None,
              logger=None,
              keep_streams=True):
    """
    Run a Java class with Hadoop (equivalent of running ``hadoop
    <class_name>`` from the command line).

    Additional ``HADOOP_CLASSPATH`` elements can be provided via
    ``classpath`` (either as a non-string sequence where each element
    is a classpath element or as a ``':'``-separated string).  Other
    arguments are passed to :func:`run_cmd`.

    .. code-block:: python

      >>> cls = 'org.apache.hadoop.fs.FsShell'
      >>> try: out = run_class(cls, args=['-test', '-e', 'file:/tmp'])
      ... except RunCmdError: tmp_exists = False
      ... else: tmp_exists = True

    .. note::

      ``HADOOP_CLASSPATH`` makes dependencies available **only on the
      client side**.  If you are running a MapReduce application, use
      ``args=['-libjars', 'jar1,jar2,...']`` to make them available to
      the server side as well.
    """
    if logger is None:
        logger = utils.NullLogger()
    old_classpath = None
    if classpath:
        old_classpath = os.getenv('HADOOP_CLASSPATH', '')
        if isinstance(classpath, basestring):
            classpath = [classpath]
        # Prepend the classpaths provided by the user to the existing
        # HADOOP_CLASSPATH value.  Order matters.  We could work a little
        # harder to avoid duplicates, but it's not essential
        os.environ['HADOOP_CLASSPATH'] = ":".join(classpath +
                                                  old_classpath.split(':', 1))
        logger.debug('HADOOP_CLASSPATH: %r', os.getenv('HADOOP_CLASSPATH'))
    try:
        res = run_cmd(class_name,
                      args,
                      properties,
                      hadoop_conf_dir=hadoop_conf_dir,
                      logger=logger,
                      keep_streams=keep_streams)
    finally:
        if old_classpath is not None:
            os.environ['HADOOP_CLASSPATH'] = old_classpath
    return res
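
Tying the docstring's note to a concrete call, here is a hedged sketch where the same jar is made visible both client side and server side; the jar path and class name are placeholders:

from pydoop.hadut import run_class

run_class('com.example.MyMapReduceJob',            # hypothetical driver class
          args=['-libjars', '/path/to/deps.jar',   # server side: shipped to the tasks
                'input_dir', 'output_dir'],
          classpath='/path/to/deps.jar')           # client side: added to HADOOP_CLASSPATH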
Example #5
    def __init__(self, prefix=None, logger=None):
        hadoop_version_info = pydoop.hadoop_version_info()
        if hadoop_version_info.is_local():
            raise pydoop.LocalModeNotSupported()

        self.wd = self.exe = self.input = self.output = None
        self.logger = logger or utils.NullLogger()
        if prefix:
            self.wd = utils.make_random_str(prefix=prefix)
            hdfs.mkdir(self.wd)
            for n in "input", "output":
                setattr(self, n, hdfs.path.join(self.wd, n))
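
A sketch of how the local-mode guard surfaces to callers; as in Example #2, the enclosing class is assumed to be ``PipesRunner``:

import pydoop
from pydoop.hadut import PipesRunner  # assumption: the enclosing class

try:
    runner = PipesRunner(prefix='pydoop_test_')
except pydoop.LocalModeNotSupported:
    runner = None  # e.g., skip the run when Hadoop is configured in local mode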
Example #6
File: hadut.py Project: wtj/pydoop
def run_cmd(cmd,
            args=None,
            properties=None,
            hadoop_home=None,
            hadoop_conf_dir=None,
            logger=None):
    """
    Run a Hadoop command.

    If the command succeeds, return its output; if it fails, raise a
    ``RunCmdError`` with its error output as the message.

    .. code-block:: python

      >>> import uuid
      >>> properties = {'dfs.block.size': 32*2**20}
      >>> args = ['-put', 'hadut.py', uuid.uuid4().hex]
      >>> res = run_cmd('fs', args, properties)
      >>> res
      ''
      >>> print run_cmd('dfsadmin', ['-help', 'report'])
      -report: Reports basic filesystem information and statistics.
      >>> try:
      ...     run_cmd('foo')
      ... except RunCmdError as e:
      ...     print e
      ...
      Exception in thread "main" java.lang.NoClassDefFoundError: foo
      ...
    """
    if logger is None:
        logger = utils.NullLogger()
    hadoop = pydoop.hadoop_exec(hadoop_home=hadoop_home)
    _args = [hadoop]
    if hadoop_conf_dir:
        _args.extend(["--config", hadoop_conf_dir])
    _args.append(cmd)
    if properties:
        _args.extend(_construct_property_args(properties))
    if args:
        if isinstance(args, basestring):
            args = shlex.split(args)
        _merge_csv_args(args)
        gargs = _pop_generic_args(args)
        for seq in gargs, args:
            _args.extend(map(str, seq))
    logger.debug('final args: %r' % (_args, ))
    p = subprocess.Popen(_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    output, error = p.communicate()
    if p.returncode:
        raise RunCmdError(p.returncode, ' '.join(_args), error)
    return output
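
Since ``args`` may be given either as a sequence or as a single string (tokenized with ``shlex.split``), the two calls in this sketch should be equivalent:

from pydoop.hadut import run_cmd

listing_a = run_cmd('fs', ['-ls', '/user'])   # args as a list
listing_b = run_cmd('fs', '-ls /user')        # args as a string, split with shlex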
Example #7
File: hadut.py Project: wtj/pydoop
def run_pipes(executable,
              input_path,
              output_path,
              more_args=None,
              properties=None,
              force_pydoop_submitter=False,
              hadoop_conf_dir=None,
              logger=None):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    ``hadoop.pipes.java.recordreader`` and ``hadoop.pipes.java.recordwriter``
    to ``"true"``.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing ``force_pydoop_submitter=True``.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault('hadoop.pipes.java.recordreader', 'true')
    properties.setdefault('hadoop.pipes.java.recordwriter', 'true')
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.is_cdh_mrv2() and hdfs.default_is_local():
                raise RuntimeError("mrv2 on local fs not supported yet")
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program",
        executable,
        "-input",
        input_path,
        "-output",
        output_path,
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter,
                         args,
                         properties,
                         classpath=pydoop_jar,
                         logger=logger)
    else:
        return run_cmd("pipes",
                       args,
                       properties,
                       hadoop_conf_dir=hadoop_conf_dir,
                       logger=logger)
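
A minimal usage sketch for ``run_pipes``; the executable and paths are placeholders and must already exist on the default file system, as required by the checks above:

from pydoop.hadut import run_pipes

run_pipes('hdfs:///user/me/bin/wordcount',   # placeholder pipes executable
          'hdfs:///user/me/wc_input',        # placeholder input path
          'hdfs:///user/me/wc_output',
          properties={'mapred.reduce.tasks': 2})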