Example 1
  def readline_block_boundary(self):
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
      bs = hdfs.fs.hdfs().default_block_size()
    else:
      bs = u.get_bytes_per_checksum()
      kwargs['blocksize'] = bs

    line = "012345678\n"
    path = self._make_random_path()
    with self.fs.open_file(path, flags="w", **kwargs) as f:
      bytes_written = lines_written = 0
      while bytes_written < bs + 1:
        f.write(line)
        lines_written += 1
        bytes_written += len(line)
    with self.fs.open_file(path) as f:
      lines = []
      while 1:
        l = f.readline()
        if l == "":
          break
        lines.append(l)
    self.assertEqual(len(lines), lines_written)
    for i, l in enumerate(lines):
      self.assertEqual(l, line, "line %d: %r != %r" % (i, l, line))
Example 2
  def block_boundary(self):
    path = self._make_random_path()
    CHUNK_SIZE = 10
    N = 2
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        bs = N * get_bytes_per_checksum()
        kwargs['blocksize'] = bs
    total_data_size = 2 * bs
    with self.fs.open_file(path, "w", **kwargs) as f:
      data = make_random_data(total_data_size)
      i = 0
      bufsize = hdfs.common.BUFSIZE
      while i < len(data):
        f.write(data[i:i+bufsize])
        i += bufsize

    with self.fs.open_file(path) as f:
      p = total_data_size - CHUNK_SIZE
      for pos in 0, 1, bs-1, bs, bs+1, p-1, p, p+1, total_data_size-1:
        expected_len = CHUNK_SIZE if pos <= p else total_data_size - pos
        f.seek(pos)
        chunk = f.read(CHUNK_SIZE)
        self.assertEqual(len(chunk), expected_len)
Example 3
def run_pipes(
    executable,
    input_path,
    output_path,
    more_args=None,
    properties=None,
    force_pydoop_submitter=False,
    hadoop_conf_dir=None,
    logger=None,
    keep_streams=False,
):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    ``hadoop.pipes.java.recordreader`` and ``hadoop.pipes.java.recordwriter``
    to ``"true"``.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing the argument ``force_pydoop_submitter=True``.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault("hadoop.pipes.java.recordreader", "true")
    properties.setdefault("hadoop.pipes.java.recordwriter", "true")
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.is_cdh_mrv2() and hdfs.default_is_local():
                raise RuntimeError("mrv2 on local fs not supported yet")
            use_pydoop_submit = hdfs.default_is_local()
    args = ["-program", executable, "-input", input_path, "-output", output_path]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties, classpath=pydoop_jar, logger=logger, keep_streams=keep_streams)
    else:
        return run_cmd(
            "pipes", args, properties, hadoop_conf_dir=hadoop_conf_dir, logger=logger, keep_streams=keep_streams
        )
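
A rough usage sketch for run_pipes (the HDFS paths, reducer count, and job
name below are hypothetical placeholders, shown only to illustrate how
more_args and properties are forwarded to the underlying pipes command):

import logging

logging.basicConfig()
logger = logging.getLogger("pipes-demo")

# Hypothetical HDFS locations; adapt to the target cluster.
exe = "hdfs://namenode:8020/user/demo/bin/wc_pipes"
in_dir = "hdfs://namenode:8020/user/demo/input"
out_dir = "hdfs://namenode:8020/user/demo/output"

ret = run_pipes(
    exe,
    in_dir,
    out_dir,
    more_args=["-reduces", "2"],                      # appended after -input/-output
    properties={"mapred.job.name": "wc_pipes_demo"},  # merged with the record reader/writer defaults
    logger=logger,
)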
Example 4
    def block_boundary(self):
        path = self._make_random_path()
        CHUNK_SIZE = 10
        N = 2
        kwargs = {}
        if pydoop.hadoop_version_info().has_deprecated_bs():
            bs = hdfs.fs.hdfs().default_block_size()
        else:
            bs = N * get_bytes_per_checksum()
            kwargs['blocksize'] = bs
        total_data_size = 2 * bs
        with self.fs.open_file(path, "w", **kwargs) as f:
            data = make_random_data(total_data_size)
            i = 0
            bufsize = hdfs.common.BUFSIZE
            while i < len(data):
                f.write(data[i:i + bufsize])
                i += bufsize

        with self.fs.open_file(path) as f:
            p = total_data_size - CHUNK_SIZE
            for pos in 0, 1, bs - 1, bs, bs + 1, p - 1, p, p + 1, total_data_size - 1:
                expected_len = CHUNK_SIZE if pos <= p else total_data_size - pos
                f.seek(pos)
                chunk = f.read(CHUNK_SIZE)
                self.assertEqual(len(chunk), expected_len)
Example 5
    def block_boundary(self):
        hd_info = pydoop.hadoop_version_info()
        path = self._make_random_path()
        CHUNK_SIZE = 10
        N = 2
        kwargs = {}
        if hd_info.has_deprecated_bs():
            bs = hdfs.fs.hdfs().default_block_size()
        else:
            # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
            bs = max(1048576, N * utils.get_bytes_per_checksum())
            kwargs['blocksize'] = bs
        total_data_size = 2 * bs
        with self.fs.open_file(path, "w", **kwargs) as f:
            i = 0
            bufsize = 12 * 1024 * 1024
            while i < total_data_size:
                data = 'X' * min(bufsize, total_data_size - i)
                f.write(data)
                i += bufsize

        with self.fs.open_file(path) as f:
            p = total_data_size - CHUNK_SIZE
            for pos in (0, 1, bs - 1, bs, bs + 1, p - 1, p, p + 1,
                        total_data_size - 1):
                expected_len = (CHUNK_SIZE if pos <= p else total_data_size -
                                pos)
                f.seek(pos)
                chunk = f.read(CHUNK_SIZE)
                self.assertEqual(len(chunk), expected_len)
Example 6
    def block_boundary(self):
        hd_info = pydoop.hadoop_version_info()
        path = self._make_random_path()
        CHUNK_SIZE = 10
        N = 2
        kwargs = {}
        if hd_info.has_deprecated_bs():
            bs = hdfs.fs.hdfs().default_block_size()
        else:
            # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
            bs = max(1048576, N * utils.get_bytes_per_checksum())
            kwargs["blocksize"] = bs
        total_data_size = 2 * bs
        with self.fs.open_file(path, "w", **kwargs) as f:
            i = 0
            bufsize = 24 * 1024 * 1024
            while i < total_data_size:
                data = "X" * min(bufsize, total_data_size - i)
                f.write(data)
                i += bufsize

        with self.fs.open_file(path) as f:
            p = total_data_size - CHUNK_SIZE
            for pos in 0, 1, bs - 1, bs, bs + 1, p - 1, p, p + 1, total_data_size - 1:
                expected_len = CHUNK_SIZE if pos <= p else total_data_size - pos
                f.seek(pos)
                chunk = f.read(CHUNK_SIZE)
                self.assertEqual(len(chunk), expected_len)
Example 7
    def readline_block_boundary(self):
        kwargs = {}
        if pydoop.hadoop_version_info().has_deprecated_bs():
            bs = hdfs.fs.hdfs().default_block_size()
        else:
            bs = u.get_bytes_per_checksum()
            kwargs["blocksize"] = bs

        line = "012345678\n"
        path = self._make_random_path()
        with self.fs.open_file(path, flags="w", **kwargs) as f:
            bytes_written = lines_written = 0
            while bytes_written < bs + 1:
                f.write(line)
                lines_written += 1
                bytes_written += len(line)
        with self.fs.open_file(path) as f:
            lines = []
            while 1:
                l = f.readline()
                if l == "":
                    break
                lines.append(l)
        self.assertEqual(len(lines), lines_written)
        for i, l in enumerate(lines):
            self.assertEqual(l, line, "line %d: %r != %r" % (i, l, line))
Example 8
 def __init__(self, data):
     stream = StringIO(data)
     if hadoop_version_info().has_variable_isplit_encoding():
         self.filename = deserialize_text(stream)
     else:
         self.filename = deserialize_old_style_filename(stream)
     self.offset = deserialize_long(stream)
     self.length = deserialize_long(stream)
Example 9
File: pipes.py Project: wtj/pydoop
 def __init__(self, data):
     stream = StringIO(data)
     if hadoop_version_info().has_variable_isplit_encoding():
         self.filename = deserialize_text(stream)
     else:
         self.filename = deserialize_old_style_filename(stream)
     self.offset = deserialize_long(stream)
     self.length = deserialize_long(stream)
Example 10
File: pipes.py Project: wtj/pydoop
 def to_string(cls, filename, offset, length):
     stream = StringIO()
     if hadoop_version_info().has_variable_isplit_encoding():
         serialize_text(filename, stream)
     else:
         serialize_old_style_filename(filename, stream)
     serialize_long(offset, stream)
     serialize_long(length, stream)
     return stream.getvalue()
Example 11
 def to_string(cls, filename, offset, length):
     stream = StringIO()
     if hadoop_version_info().has_variable_isplit_encoding():
         serialize_text(filename, stream)
     else:
         serialize_old_style_filename(filename, stream)
     serialize_long(offset, stream)
     serialize_long(length, stream)
     return stream.getvalue()
Example 12
File: hadut.py Project: crs4/pydoop
    def __init__(self, prefix=None, logger=None):
        hadoop_version_info = pydoop.hadoop_version_info()
        if hadoop_version_info.is_local():
            raise pydoop.LocalModeNotSupported()

        self.wd = self.exe = self.input = self.output = None
        self.logger = logger or utils.NullLogger()
        if prefix:
            self.wd = utils.make_random_str(prefix=prefix)
            hdfs.mkdir(self.wd)
            for n in "input", "output":
                setattr(self, n, hdfs.path.join(self.wd, n))
Example 13
    def __init__(self, prefix=None, logger=None):
        hadoop_version_info = pydoop.hadoop_version_info()
        if hadoop_version_info.is_local():
            raise pydoop.LocalModeNotSupported()

        self.wd = self.exe = self.input = self.output = None
        self.logger = logger or utils.NullLogger()
        if prefix:
            self.wd = utils.make_random_str(prefix=prefix)
            hdfs.mkdir(self.wd)
            for n in "input", "output":
                setattr(self, n, hdfs.path.join(self.wd, n))
Example 14
    def convert_args(self, args, unknown_args):
        zip_filename = utils.make_random_str(prefix="pydoop_script_",
                                             postfix='.zip')
        mr_module = utils.make_random_str(prefix="pydoop_script_module_")
        mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
        with ZipFile(zip_filename, 'w') as zipf:
            zipf.write(args.module, arcname=mr_module+'.py')
            zipf.writestr(mr_driver+'.py',
                          self.generate_driver(mr_module, args))
        if args.python_zip is None:
            args.python_zip = [zip_filename]
        else:
            args.python_zip.append(zip_filename)
        args.module = mr_driver
        args.entry_point = 'main'
        args.program = mr_driver
        args.do_not_use_java_record_reader = False
        args.do_not_use_java_record_writer = False
        args.input_format = None
        args.output_format = None
        args.cache_file = None
        args.cache_archive = None
        args.upload_to_cache = None
        args.libjars = None
        args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
        args.local_fs = False
        args.conf = None
        args.disable_property_name_conversion = True
        args.job_conf = [('mapred.textoutputformat.separator',
                          args.kv_separator)]
        args.avro_input = None
        args.avro_output = None

        # despicable hack...
        properties = dict(args.D or [])
        properties.update(dict(args.job_conf))
        output_format = properties.get('mapred.output.format.class',
                                       DEFAULT_OUTPUT_FORMAT)
        if output_format == DEFAULT_OUTPUT_FORMAT:
            if properties['mapred.textoutputformat.separator'] == '':
                pydoop_jar = pydoop.jar_path()
                if pydoop_jar is not None:
                    args.output_format = NOSEP_OUTPUT_FORMAT
                    args.libjars = [pydoop_jar]
                else:
                    warnings.warn(("Can't find pydoop.jar, output will "
                                   "probably be tab-separated"))
        self.args, self.unknown_args = args, unknown_args
        self.zip_filename = zip_filename
Example 15
    def convert_args(self, args, unknown_args):
        zip_filename = utils.make_random_str(prefix="pydoop_script_",
                                             postfix='.zip')
        mr_module = utils.make_random_str(prefix="pydoop_script_module_")
        mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
        with ZipFile(zip_filename, 'w') as zipf:
            zipf.write(args.module, arcname=mr_module + '.py')
            zipf.writestr(mr_driver + '.py',
                          self.generate_driver(mr_module, args))
        if args.python_zip is None:
            args.python_zip = [zip_filename]
        else:
            args.python_zip.append(zip_filename)
        args.module = mr_driver
        args.entry_point = 'main'
        args.program = mr_driver
        args.do_not_use_java_record_reader = False
        args.do_not_use_java_record_writer = False
        args.input_format = None
        args.output_format = None
        args.cache_file = None
        args.cache_archive = None
        args.upload_to_cache = None
        args.libjars = None
        args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
        args.local_fs = False
        args.conf = None
        args.disable_property_name_conversion = True
        args.job_conf = [('mapred.textoutputformat.separator',
                          args.kv_separator)]
        args.avro_input = None
        args.avro_output = None

        # despicable hack...
        properties = dict(args.D or [])
        properties.update(dict(args.job_conf))
        output_format = properties.get('mapred.output.format.class',
                                       DEFAULT_OUTPUT_FORMAT)
        if output_format == DEFAULT_OUTPUT_FORMAT:
            if properties['mapred.textoutputformat.separator'] == '':
                pydoop_jar = pydoop.jar_path()
                if pydoop_jar is not None:
                    args.output_format = NOSEP_OUTPUT_FORMAT
                    args.libjars = [pydoop_jar]
                else:
                    warnings.warn(("Can't find pydoop.jar, output will "
                                   "probably be tab-separated"))
        self.args, self.unknown_args = args, unknown_args
        self.zip_filename = zip_filename
Example 16
 def get_hosts(self):
   kwargs = {}
   if pydoop.hadoop_version_info().has_deprecated_bs():
     blocksize = hdfs.fs.hdfs().default_block_size()
   else:
     blocksize = 4096
     kwargs['blocksize'] = blocksize
   N = 4
   content = "x" * blocksize * N
   path = self._make_random_file(content=content, **kwargs)
   start = 0
   for i in xrange(N):
     length = blocksize * i + 1
     hosts_per_block = self.fs.get_hosts(path, start, length)
     self.assertEqual(len(hosts_per_block), i+1)
Example 17
 def get_hosts(self):
     kwargs = {}
     if pydoop.hadoop_version_info().has_deprecated_bs():
         blocksize = hdfs.fs.hdfs().default_block_size()
     else:
         blocksize = 4096
         kwargs["blocksize"] = blocksize
     N = 4
     content = "x" * blocksize * N
     path = self._make_random_file(content=content, **kwargs)
     start = 0
     for i in xrange(N):
         length = blocksize * i + 1
         hosts_per_block = self.fs.get_hosts(path, start, length)
         self.assertEqual(len(hosts_per_block), i + 1)
Example 18
 def get_hosts(self):
     hd_info = pydoop.hadoop_version_info()
     kwargs = {}
     if hd_info.has_deprecated_bs() and not hd_info.is_cdh_v5():
         blocksize = hdfs.fs.hdfs().default_block_size()
     else:
         # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
         blocksize = 1048576
         kwargs['blocksize'] = blocksize
     N = 4
     content = "x" * blocksize * N
     path = self._make_random_file(content=content, **kwargs)
     start = 0
     for i in xrange(N):
         length = blocksize * i + 1
         hosts_per_block = self.fs.get_hosts(path, start, length)
         self.assertEqual(len(hosts_per_block), i + 1)
Example 19
 def get_hosts(self):
     hd_info = pydoop.hadoop_version_info()
     kwargs = {}
     if hd_info.has_deprecated_bs() and not hd_info.is_cdh_v5():
         blocksize = hdfs.fs.hdfs().default_block_size()
     else:
         #(dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
         blocksize = 1048576
         kwargs['blocksize'] = blocksize
     N = 4
     content = "x" * blocksize * N
     path = self._make_random_file(content=content, **kwargs)
     start = 0
     for i in xrange(N):
         length = blocksize * i + 1
         hosts_per_block = self.fs.get_hosts(path, start, length)
         self.assertEqual(len(hosts_per_block), i + 1)
Example 20
def treegen(fs, root, depth, span):
  if isdir(fs, root) and depth > 0:
    for i in xrange(span):
      path = "%s/%d_%d" % (root, depth, i)
      kind = 'file' if i else 'directory'
      if kind == 'file':
        kwargs = {}
        if pydoop.hadoop_version_info().has_deprecated_bs():
          bs = hdfs.fs.hdfs().default_block_size()
        else:
          bs = random.sample(BS_RANGE, 1)[0]
          kwargs['blocksize'] = bs
        sys.stderr.write("%s %s %d\n" % (kind[0].upper(), path, (bs/MB)))
        with fs.open_file(path, "w", **kwargs) as f:
          f.write(path)
      else:
        sys.stderr.write("%s %s 0\n" % (kind[0].upper(), path))
        fs.create_directory(path)
        treegen(fs, path, depth-1, span)
Example 21
    def readline_block_boundary(self):

        def _write_prefix(f, size, bs):
            # Avoid memory problem with JVM
            chunk_size = min(bs, 12 * 1048576)
            written = 0
            while written < size:
                data = 'X' * min(chunk_size, size - written)
                written += f.write(data)

        hd_info = pydoop.hadoop_version_info()
        kwargs = {}
        if hd_info.has_deprecated_bs():
            bs = hdfs.fs.hdfs().default_block_size()
        else:
            # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
            bs = 1048576
            kwargs['blocksize'] = bs

        line = "012345678\n"
        offset = bs - (10 * len(line) + 5)
        path = self._make_random_path()
        with self.fs.open_file(path, flags="w", **kwargs) as f:
            bytes_written = lines_written = 0
            _write_prefix(f, offset, bs)
            bytes_written = offset
            while bytes_written < bs + 1:
                f.write(line)
                lines_written += 1
                bytes_written += len(line)
        with self.fs.open_file(path) as f:
            f.seek(offset)
            lines = []
            while 1:
                l = f.readline()
                if l == "":
                    break
                lines.append(l)
        self.assertEqual(len(lines), lines_written)
        for i, l in enumerate(lines):
            self.assertEqual(l, line, "line %d: %r != %r" % (i, l, line))
Example 22
    def readline_block_boundary(self):

        def _write_prefix(f, size, bs):
            # Avoid memory problem with JVM
            chunk_size = min(bs, 12 * 1048576)
            written = 0
            while written < size:
                data = b'X' * min(chunk_size, size - written)
                written += f.write(data)

        hd_info = pydoop.hadoop_version_info()
        kwargs = {}
        if hd_info.has_deprecated_bs():
            bs = hdfs.fs.hdfs().default_block_size()
        else:
            # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
            bs = 1048576
            kwargs['blocksize'] = bs

        line = b"012345678\n"
        offset = bs - (10 * len(line) + 5)
        path = self._make_random_path()
        with self.fs.open_file(path, mode="w", **kwargs) as f:
            bytes_written = lines_written = 0
            _write_prefix(f, offset, bs)
            bytes_written = offset
            while bytes_written < bs + 1:
                f.write(line)
                lines_written += 1
                bytes_written += len(line)
        with self.fs.open_file(path) as f:
            f.seek(offset)
            lines = []
            while 1:
                L = f.readline()
                if not L:
                    break
                lines.append(L)
        self.assertEqual(len(lines), lines_written)
        for i, L in enumerate(lines):
            self.assertEqual(L, line, "line %d: %r != %r" % (i, L, line))
Example 23
    def __init__(self):
        hadoop_version_info = pydoop.hadoop_version_info()
        if hadoop_version_info.is_local():
            raise pydoop.LocalModeNotSupported()

        self.logger = logging.getLogger("PydoopSubmitter")
        self.properties = {
            CACHE_FILES: '',
            CACHE_ARCHIVES: '',
            'mapred.create.symlink': 'yes',  # backward compatibility
            COMPRESS_MAP_OUTPUT: 'true',
            'bl.libhdfs.opts': '-Xmx48m'
        }
        self.args = None
        self.requested_env = dict()
        self.remote_wd = None
        self.remote_module = None
        self.remote_module_bn = None
        self.remote_exe = None
        self.pipes_code = None
        self.files_to_upload = []
        self.unknown_args = None
Example 24
    def __init__(self):
        hadoop_version_info = pydoop.hadoop_version_info()
        if hadoop_version_info.is_local():
            raise pydoop.LocalModeNotSupported()

        self.logger = logging.getLogger("PydoopSubmitter")
        self.properties = {
            CACHE_FILES: '',
            CACHE_ARCHIVES: '',
            'mapred.create.symlink': 'yes',  # backward compatibility
            COMPRESS_MAP_OUTPUT: 'true',
            'bl.libhdfs.opts': '-Xmx48m'
        }
        self.args = None
        self.requested_env = dict()
        self.remote_wd = None
        self.remote_module = None
        self.remote_module_bn = None
        self.remote_exe = None
        self.pipes_code = None
        self.files_to_upload = []
        self.unknown_args = None
        self._use_mrv2 = None
Example 25
 def _get_hadoop_version_str():
     import pydoop
     h = pydoop.hadoop_version_info()
     return '.'.join(map(str, h.main))
Example 26

from setuptools import setup, find_packages, Extension
from distutils.command.build import build
from distutils.command.clean import clean
from distutils.errors import DistutilsSetupError
from distutils import log

import pydoop
import pydoop.utils.jvm as jvm

JAVA_HOME = jvm.get_java_home()
JVM_LIB_PATH, JVM_LIB_NAME = jvm.get_jvm_lib_path_and_name(JAVA_HOME)

HADOOP_HOME = pydoop.hadoop_home()
HADOOP_VERSION_INFO = pydoop.hadoop_version_info()

EXTENSION_MODULES = []
VERSION_FN = "VERSION"
GIT_REV_FN = "GIT_REV"
EXTRA_COMPILE_ARGS = ["-Wno-write-strings"]  # http://bugs.python.org/issue6952

# properties file.  Since the source is in the root dir, filename = basename
PROP_FN = PROP_BN = pydoop.__propfile_basename__

CONSOLE_SCRIPTS = ['pydoop = pydoop.app.main:main']
if sys.version_info[0] == 3:
    CONSOLE_SCRIPTS.append('pydoop3 = pydoop.app.main:main')
else:
    CONSOLE_SCRIPTS.append('pydoop2 = pydoop.app.main:main')
Example 27
 def block_size(self):
     if not pydoop.hadoop_version_info().has_deprecated_bs():
         for bs_MB in xrange(100, 500, 50):
             bs = bs_MB * 2**20
             path = self._make_random_file(blocksize=bs)
             self.assertEqual(self.fs.get_path_info(path)["block_size"], bs)
Example 28
from distutils.command.build_ext import build_ext
from distutils.command.build_py import build_py
from distutils.command.clean import clean
from distutils.errors import DistutilsSetupError
from distutils import log

import pydoop
import pydoop.hadoop_utils as hu


try:
  JAVA_HOME = os.environ["JAVA_HOME"]
except KeyError:
  raise RuntimeError("java home not found, try setting JAVA_HOME")
HADOOP_HOME = pydoop.hadoop_home(fallback=None)
HADOOP_VERSION_INFO = pydoop.hadoop_version_info()
BOOST_PYTHON = os.getenv("BOOST_PYTHON", "boost_python")
PIPES_SRC = ["src/%s.cpp" % n for n in (
  "pipes",
  "pipes_context",
  "pipes_test_support",
  "pipes_serial_utils",
  "exceptions",
  "pipes_input_split",
  )]
HDFS_SRC = ["src/%s.cpp" % n for n in (
  "hdfs_fs",
  "hdfs_file",
  "hdfs_common",
  )]
Example 29
 def block_size(self):
     if not pydoop.hadoop_version_info().has_deprecated_bs():
         for bs_MB in xrange(100, 500, 50):
             bs = bs_MB * 2 ** 20
             path = self._make_random_file(blocksize=bs)
             self.assertEqual(self.fs.get_path_info(path)["block_size"], bs)
Example 30
File: setup.py Project: crs4/seal
 def _get_hadoop_version_str():
     import pydoop
     h = pydoop.hadoop_version_info()
     return '.'.join(map(str, h.main))
Example 31
class DummyUpLink(object):
    pass


example_input_splits = [
    (b'/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     b'\x08h(\x00\x00\x00\x00\x00\x08h\x05',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 550952, 550917),
    (b'/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     b'\x00\x00\x00\x00\x00\x00\x00\x00\x08h(',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 0, 550952),
    (b'1hdfs://localhost:9000/user/zag/in-dir/images_list\x00\x00\x00\x00\x00'
     b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$',
     'hdfs://localhost:9000/user/zag/in-dir/images_list', 0, 36)
]
if not pydoop.hadoop_version_info().has_variable_isplit_encoding():
    example_input_splits = [(b"\x00" + raw_split, fn, o, l)
                            for (raw_split, fn, o, l) in example_input_splits]


class taskcontext_tc(unittest.TestCase):

    def test_input_split(self):
        for s in example_input_splits:
            i = InputSplit(s[0])
            self.assertEqual(i.filename, s[1])
            self.assertEqual(i.offset, s[2])
            self.assertEqual(i.length, s[3])

    def test_get_input_split(self):
        ctx = TaskContext(DummyUpLink())
Example 32
def run_pipes(executable,
              input_path,
              output_path,
              more_args=None,
              properties=None,
              force_pydoop_submitter=False,
              hadoop_conf_dir=None,
              logger=None):
    """
  Run a pipes command.

  ``more_args`` (after setting input/output path) and ``properties``
  are passed to :func:`run_cmd`.

  If not specified otherwise, this function sets the properties
  hadoop.pipes.java.recordreader and hadoop.pipes.java.recordwriter to 'true'.

  This function works around a bug in Hadoop pipes that affects versions of
  Hadoop with security when the local file system is used as the default FS
  (no HDFS); see https://issues.apache.org/jira/browse/MAPREDUCE-4000.
  In those set-ups, the function uses Pydoop's own pipes submitter application.
  You can force the use of Pydoop's submitter by passing the argument
  force_pydoop_submitter=True.
  """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault('hadoop.pipes.java.recordreader', 'true')
    properties.setdefault('hadoop.pipes.java.recordwriter', 'true')
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if (ver.cdh >= (4, 0, 0) and not ver.ext
                    and hdfs.default_is_local()):
                raise RuntimeError(
                    "mrv2 on local fs not supported yet")  # FIXME
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program", executable, "-input", input_path, "-output", output_path
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter,
                         args,
                         properties,
                         classpath=pydoop_jar,
                         logger=logger)
    else:
        return run_cmd("pipes",
                       args,
                       properties,
                       hadoop_conf_dir=hadoop_conf_dir,
                       logger=logger)
Example 33
import unittest
import pydoop
from pydoop.pipes import InputSplit

example_input_splits = [
    ('/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     '\x08h(\x00\x00\x00\x00\x00\x08h\x05',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 550952, 550917),
    ('/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     '\x00\x00\x00\x00\x00\x00\x00\x00\x08h(',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 0, 550952),
    ('1hdfs://localhost:9000/user/zag/in-dir/images_list\x00\x00\x00\x00\x00'
     '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$',
     'hdfs://localhost:9000/user/zag/in-dir/images_list', 0, 36)
]
if not pydoop.hadoop_version_info().has_variable_isplit_encoding():
    example_input_splits = [("\x00" + raw_split, fn, o, l)
                            for (raw_split, fn, o, l) in example_input_splits]


class taskcontext_tc(unittest.TestCase):
    def test_input_split(self):
        for s in example_input_splits:
            i = InputSplit(s[0])
            self.assertEqual(i.filename, s[1])
            self.assertEqual(i.offset, s[2])
            self.assertEqual(i.length, s[3])


def suite():
    suite = unittest.TestSuite()