def readline_block_boundary(self):
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        bs = u.get_bytes_per_checksum()
        kwargs['blocksize'] = bs
    line = "012345678\n"
    path = self._make_random_path()
    with self.fs.open_file(path, flags="w", **kwargs) as f:
        bytes_written = lines_written = 0
        while bytes_written < bs + 1:
            f.write(line)
            lines_written += 1
            bytes_written += len(line)
    with self.fs.open_file(path) as f:
        lines = []
        while 1:
            l = f.readline()
            if l == "":
                break
            lines.append(l)
    self.assertEqual(len(lines), lines_written)
    for i, l in enumerate(lines):
        self.assertEqual(l, line, "line %d: %r != %r" % (i, l, line))
def block_boundary(self):
    path = self._make_random_path()
    CHUNK_SIZE = 10
    N = 2
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        bs = N * get_bytes_per_checksum()
        kwargs['blocksize'] = bs
    total_data_size = 2 * bs
    with self.fs.open_file(path, "w", **kwargs) as f:
        data = make_random_data(total_data_size)
        i = 0
        bufsize = hdfs.common.BUFSIZE
        while i < len(data):
            f.write(data[i:i+bufsize])
            i += bufsize
    with self.fs.open_file(path) as f:
        p = total_data_size - CHUNK_SIZE
        for pos in 0, 1, bs-1, bs, bs+1, p-1, p, p+1, total_data_size-1:
            expected_len = CHUNK_SIZE if pos <= p else total_data_size - pos
            f.seek(pos)
            chunk = f.read(CHUNK_SIZE)
            self.assertEqual(len(chunk), expected_len)
def run_pipes(
        executable, input_path, output_path, more_args=None, properties=None,
        force_pydoop_submitter=False, hadoop_conf_dir=None, logger=None,
        keep_streams=False,
):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    ``hadoop.pipes.java.recordreader`` and ``hadoop.pipes.java.recordwriter``
    to ``"true"``.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing the argument force_pydoop_submitter=True.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault("hadoop.pipes.java.recordreader", "true")
    properties.setdefault("hadoop.pipes.java.recordwriter", "true")
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.is_cdh_mrv2() and hdfs.default_is_local():
                raise RuntimeError("mrv2 on local fs not supported yet")
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program", executable,
        "-input", input_path,
        "-output", output_path,
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties, classpath=pydoop_jar,
                         logger=logger, keep_streams=keep_streams)
    else:
        return run_cmd(
            "pipes", args, properties, hadoop_conf_dir=hadoop_conf_dir,
            logger=logger, keep_streams=keep_streams
        )
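For orientation, a minimal usage sketch of the function above, assuming it is the `run_pipes` exposed by `pydoop.hadut` and that the executable and input directory already exist on the default filesystem; all paths and property values below are illustrative, not taken from the source.

# Hedged usage sketch: assumes this run_pipes is importable from pydoop.hadut
# and that the HDFS paths below already exist; names are illustrative only.
from pydoop import hadut

result = hadut.run_pipes(
    "hdfs:///user/me/bin/wordcount",   # hypothetical pipes executable
    "hdfs:///user/me/wc/input",        # hypothetical input directory
    "hdfs:///user/me/wc/output",       # hypothetical output directory
    properties={"mapred.reduce.tasks": "2"},  # example extra job property
)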
def block_boundary(self):
    path = self._make_random_path()
    CHUNK_SIZE = 10
    N = 2
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        bs = N * get_bytes_per_checksum()
        kwargs['blocksize'] = bs
    total_data_size = 2 * bs
    with self.fs.open_file(path, "w", **kwargs) as f:
        data = make_random_data(total_data_size)
        i = 0
        bufsize = hdfs.common.BUFSIZE
        while i < len(data):
            f.write(data[i:i + bufsize])
            i += bufsize
    with self.fs.open_file(path) as f:
        p = total_data_size - CHUNK_SIZE
        for pos in 0, 1, bs - 1, bs, bs + 1, p - 1, p, p + 1, total_data_size - 1:
            expected_len = CHUNK_SIZE if pos <= p else total_data_size - pos
            f.seek(pos)
            chunk = f.read(CHUNK_SIZE)
            self.assertEqual(len(chunk), expected_len)
def block_boundary(self):
    hd_info = pydoop.hadoop_version_info()
    path = self._make_random_path()
    CHUNK_SIZE = 10
    N = 2
    kwargs = {}
    if hd_info.has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
        bs = max(1048576, N * utils.get_bytes_per_checksum())
        kwargs['blocksize'] = bs
    total_data_size = 2 * bs
    with self.fs.open_file(path, "w", **kwargs) as f:
        i = 0
        bufsize = 12 * 1024 * 1024
        while i < total_data_size:
            data = 'X' * min(bufsize, total_data_size - i)
            f.write(data)
            i += bufsize
    with self.fs.open_file(path) as f:
        p = total_data_size - CHUNK_SIZE
        for pos in (0, 1, bs - 1, bs, bs + 1, p - 1, p, p + 1,
                    total_data_size - 1):
            expected_len = (CHUNK_SIZE if pos <= p
                            else total_data_size - pos)
            f.seek(pos)
            chunk = f.read(CHUNK_SIZE)
            self.assertEqual(len(chunk), expected_len)
def block_boundary(self):
    hd_info = pydoop.hadoop_version_info()
    path = self._make_random_path()
    CHUNK_SIZE = 10
    N = 2
    kwargs = {}
    if hd_info.has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
        bs = max(1048576, N * utils.get_bytes_per_checksum())
        kwargs["blocksize"] = bs
    total_data_size = 2 * bs
    with self.fs.open_file(path, "w", **kwargs) as f:
        i = 0
        bufsize = 24 * 1024 * 1024
        while i < total_data_size:
            data = "X" * min(bufsize, total_data_size - i)
            f.write(data)
            i += bufsize
    with self.fs.open_file(path) as f:
        p = total_data_size - CHUNK_SIZE
        for pos in 0, 1, bs - 1, bs, bs + 1, p - 1, p, p + 1, total_data_size - 1:
            expected_len = CHUNK_SIZE if pos <= p else total_data_size - pos
            f.seek(pos)
            chunk = f.read(CHUNK_SIZE)
            self.assertEqual(len(chunk), expected_len)
def readline_block_boundary(self):
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        bs = u.get_bytes_per_checksum()
        kwargs["blocksize"] = bs
    line = "012345678\n"
    path = self._make_random_path()
    with self.fs.open_file(path, flags="w", **kwargs) as f:
        bytes_written = lines_written = 0
        while bytes_written < bs + 1:
            f.write(line)
            lines_written += 1
            bytes_written += len(line)
    with self.fs.open_file(path) as f:
        lines = []
        while 1:
            l = f.readline()
            if l == "":
                break
            lines.append(l)
    self.assertEqual(len(lines), lines_written)
    for i, l in enumerate(lines):
        self.assertEqual(l, line, "line %d: %r != %r" % (i, l, line))
def __init__(self, data):
    stream = StringIO(data)
    if hadoop_version_info().has_variable_isplit_encoding():
        self.filename = deserialize_text(stream)
    else:
        self.filename = deserialize_old_style_filename(stream)
    self.offset = deserialize_long(stream)
    self.length = deserialize_long(stream)
def to_string(cls, filename, offset, length):
    stream = StringIO()
    if hadoop_version_info().has_variable_isplit_encoding():
        serialize_text(filename, stream)
    else:
        serialize_old_style_filename(filename, stream)
    serialize_long(offset, stream)
    serialize_long(length, stream)
    return stream.getvalue()
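Read together with the constructor above, `to_string` is its inverse: it writes a (filename, offset, length) triple using the encoding selected by `has_variable_isplit_encoding()`, and the constructor parses it back. A hedged round-trip sketch, assuming both methods belong to the same `InputSplit` class used elsewhere in this listing:

# Hedged round-trip sketch: assumes to_string (a classmethod) and __init__
# above are both defined on InputSplit, as the rest of this listing suggests.
fn = "hdfs://localhost:9000/user/zag/in-dir/data.txt"  # illustrative path
raw = InputSplit.to_string(fn, 0, 4096)
split = InputSplit(raw)
assert (split.filename, split.offset, split.length) == (fn, 0, 4096)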
def __init__(self, prefix=None, logger=None):
    hadoop_version_info = pydoop.hadoop_version_info()
    if hadoop_version_info.is_local():
        raise pydoop.LocalModeNotSupported()
    self.wd = self.exe = self.input = self.output = None
    self.logger = logger or utils.NullLogger()
    if prefix:
        self.wd = utils.make_random_str(prefix=prefix)
        hdfs.mkdir(self.wd)
        for n in "input", "output":
            setattr(self, n, hdfs.path.join(self.wd, n))
def convert_args(self, args, unknown_args):
    zip_filename = utils.make_random_str(prefix="pydoop_script_",
                                         postfix='.zip')
    mr_module = utils.make_random_str(prefix="pydoop_script_module_")
    mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
    with ZipFile(zip_filename, 'w') as zipf:
        zipf.write(args.module, arcname=mr_module+'.py')
        zipf.writestr(mr_driver+'.py', self.generate_driver(mr_module, args))
    if args.python_zip is None:
        args.python_zip = [zip_filename]
    else:
        args.python_zip.append(zip_filename)
    args.module = mr_driver
    args.entry_point = 'main'
    args.program = mr_driver
    args.do_not_use_java_record_reader = False
    args.do_not_use_java_record_writer = False
    args.input_format = None
    args.output_format = None
    args.cache_file = None
    args.cache_archive = None
    args.upload_to_cache = None
    args.libjars = None
    args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
    args.local_fs = False
    args.conf = None
    args.disable_property_name_conversion = True
    args.job_conf = [('mapred.textoutputformat.separator', args.kv_separator)]
    args.avro_input = None
    args.avro_output = None
    # despicable hack...
    properties = dict(args.D or [])
    properties.update(dict(args.job_conf))
    output_format = properties.get('mapred.output.format.class',
                                   DEFAULT_OUTPUT_FORMAT)
    if output_format == DEFAULT_OUTPUT_FORMAT:
        if properties['mapred.textoutputformat.separator'] == '':
            pydoop_jar = pydoop.jar_path()
            if pydoop_jar is not None:
                args.output_format = NOSEP_OUTPUT_FORMAT
                args.libjars = [pydoop_jar]
            else:
                warnings.warn(("Can't find pydoop.jar, output will "
                               "probably be tab-separated"))
    self.args, self.unknown_args = args, unknown_args
    self.zip_filename = zip_filename
def convert_args(self, args, unknown_args):
    zip_filename = utils.make_random_str(prefix="pydoop_script_",
                                         postfix='.zip')
    mr_module = utils.make_random_str(prefix="pydoop_script_module_")
    mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
    with ZipFile(zip_filename, 'w') as zipf:
        zipf.write(args.module, arcname=mr_module + '.py')
        zipf.writestr(mr_driver + '.py',
                      self.generate_driver(mr_module, args))
    if args.python_zip is None:
        args.python_zip = [zip_filename]
    else:
        args.python_zip.append(zip_filename)
    args.module = mr_driver
    args.entry_point = 'main'
    args.program = mr_driver
    args.do_not_use_java_record_reader = False
    args.do_not_use_java_record_writer = False
    args.input_format = None
    args.output_format = None
    args.cache_file = None
    args.cache_archive = None
    args.upload_to_cache = None
    args.libjars = None
    args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
    args.local_fs = False
    args.conf = None
    args.disable_property_name_conversion = True
    args.job_conf = [('mapred.textoutputformat.separator', args.kv_separator)]
    args.avro_input = None
    args.avro_output = None
    # despicable hack...
    properties = dict(args.D or [])
    properties.update(dict(args.job_conf))
    output_format = properties.get('mapred.output.format.class',
                                   DEFAULT_OUTPUT_FORMAT)
    if output_format == DEFAULT_OUTPUT_FORMAT:
        if properties['mapred.textoutputformat.separator'] == '':
            pydoop_jar = pydoop.jar_path()
            if pydoop_jar is not None:
                args.output_format = NOSEP_OUTPUT_FORMAT
                args.libjars = [pydoop_jar]
            else:
                warnings.warn(("Can't find pydoop.jar, output will "
                               "probably be tab-separated"))
    self.args, self.unknown_args = args, unknown_args
    self.zip_filename = zip_filename
def get_hosts(self):
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
        blocksize = hdfs.fs.hdfs().default_block_size()
    else:
        blocksize = 4096
        kwargs['blocksize'] = blocksize
    N = 4
    content = "x" * blocksize * N
    path = self._make_random_file(content=content, **kwargs)
    start = 0
    for i in xrange(N):
        length = blocksize * i + 1
        hosts_per_block = self.fs.get_hosts(path, start, length)
        self.assertEqual(len(hosts_per_block), i+1)
def get_hosts(self):
    kwargs = {}
    if pydoop.hadoop_version_info().has_deprecated_bs():
        blocksize = hdfs.fs.hdfs().default_block_size()
    else:
        blocksize = 4096
        kwargs["blocksize"] = blocksize
    N = 4
    content = "x" * blocksize * N
    path = self._make_random_file(content=content, **kwargs)
    start = 0
    for i in xrange(N):
        length = blocksize * i + 1
        hosts_per_block = self.fs.get_hosts(path, start, length)
        self.assertEqual(len(hosts_per_block), i + 1)
def get_hosts(self):
    hd_info = pydoop.hadoop_version_info()
    kwargs = {}
    if hd_info.has_deprecated_bs() and not hd_info.is_cdh_v5():
        blocksize = hdfs.fs.hdfs().default_block_size()
    else:
        # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
        blocksize = 1048576
        kwargs['blocksize'] = blocksize
    N = 4
    content = "x" * blocksize * N
    path = self._make_random_file(content=content, **kwargs)
    start = 0
    for i in xrange(N):
        length = blocksize * i + 1
        hosts_per_block = self.fs.get_hosts(path, start, length)
        self.assertEqual(len(hosts_per_block), i + 1)
def get_hosts(self):
    hd_info = pydoop.hadoop_version_info()
    kwargs = {}
    if hd_info.has_deprecated_bs() and not hd_info.is_cdh_v5():
        blocksize = hdfs.fs.hdfs().default_block_size()
    else:
        # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
        blocksize = 1048576
        kwargs['blocksize'] = blocksize
    N = 4
    content = "x" * blocksize * N
    path = self._make_random_file(content=content, **kwargs)
    start = 0
    for i in xrange(N):
        length = blocksize * i + 1
        hosts_per_block = self.fs.get_hosts(path, start, length)
        self.assertEqual(len(hosts_per_block), i + 1)
def treegen(fs, root, depth, span):
    if isdir(fs, root) and depth > 0:
        for i in xrange(span):
            path = "%s/%d_%d" % (root, depth, i)
            kind = 'file' if i else 'directory'
            if kind == 'file':
                kwargs = {}
                if pydoop.hadoop_version_info().has_deprecated_bs():
                    bs = hdfs.fs.hdfs().default_block_size()
                else:
                    bs = random.sample(BS_RANGE, 1)[0]
                    kwargs['blocksize'] = bs
                sys.stderr.write(
                    "%s %s %d\n" % (kind[0].upper(), path, (bs/MB)))
                with fs.open_file(path, "w", **kwargs) as f:
                    f.write(path)
            else:
                sys.stderr.write("%s %s 0\n" % (kind[0].upper(), path))
                fs.create_directory(path)
                treegen(fs, path, depth-1, span)
def readline_block_boundary(self):

    def _write_prefix(f, size, bs):
        # Avoid memory problem with JVM
        chunk_size = min(bs, 12 * 1048576)
        written = 0
        while written < size:
            data = 'X' * min(chunk_size, size - written)
            written += f.write(data)

    hd_info = pydoop.hadoop_version_info()
    kwargs = {}
    if hd_info.has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
        bs = 1048576
        kwargs['blocksize'] = bs
    line = "012345678\n"
    offset = bs - (10 * len(line) + 5)
    path = self._make_random_path()
    with self.fs.open_file(path, flags="w", **kwargs) as f:
        bytes_written = lines_written = 0
        _write_prefix(f, offset, bs)
        bytes_written = offset
        while bytes_written < bs + 1:
            f.write(line)
            lines_written += 1
            bytes_written += len(line)
    with self.fs.open_file(path) as f:
        f.seek(offset)
        lines = []
        while 1:
            l = f.readline()
            if l == "":
                break
            lines.append(l)
    self.assertEqual(len(lines), lines_written)
    for i, l in enumerate(lines):
        self.assertEqual(l, line, "line %d: %r != %r" % (i, l, line))
def readline_block_boundary(self):

    def _write_prefix(f, size, bs):
        # Avoid memory problem with JVM
        chunk_size = min(bs, 12 * 1048576)
        written = 0
        while written < size:
            data = b'X' * min(chunk_size, size - written)
            written += f.write(data)

    hd_info = pydoop.hadoop_version_info()
    kwargs = {}
    if hd_info.has_deprecated_bs():
        bs = hdfs.fs.hdfs().default_block_size()
    else:
        # (dfs.namenode.fs-limits.min-block-size): 4096 < 1048576
        bs = 1048576
        kwargs['blocksize'] = bs
    line = b"012345678\n"
    offset = bs - (10 * len(line) + 5)
    path = self._make_random_path()
    with self.fs.open_file(path, mode="w", **kwargs) as f:
        bytes_written = lines_written = 0
        _write_prefix(f, offset, bs)
        bytes_written = offset
        while bytes_written < bs + 1:
            f.write(line)
            lines_written += 1
            bytes_written += len(line)
    with self.fs.open_file(path) as f:
        f.seek(offset)
        lines = []
        while 1:
            L = f.readline()
            if not L:
                break
            lines.append(L)
    self.assertEqual(len(lines), lines_written)
    for i, L in enumerate(lines):
        self.assertEqual(L, line, "line %d: %r != %r" % (i, L, line))
def __init__(self):
    hadoop_version_info = pydoop.hadoop_version_info()
    if hadoop_version_info.is_local():
        raise pydoop.LocalModeNotSupported()
    self.logger = logging.getLogger("PydoopSubmitter")
    self.properties = {
        CACHE_FILES: '',
        CACHE_ARCHIVES: '',
        'mapred.create.symlink': 'yes',  # backward compatibility
        COMPRESS_MAP_OUTPUT: 'true',
        'bl.libhdfs.opts': '-Xmx48m'
    }
    self.args = None
    self.requested_env = dict()
    self.remote_wd = None
    self.remote_module = None
    self.remote_module_bn = None
    self.remote_exe = None
    self.pipes_code = None
    self.files_to_upload = []
    self.unknown_args = None
def __init__(self):
    hadoop_version_info = pydoop.hadoop_version_info()
    if hadoop_version_info.is_local():
        raise pydoop.LocalModeNotSupported()
    self.logger = logging.getLogger("PydoopSubmitter")
    self.properties = {
        CACHE_FILES: '',
        CACHE_ARCHIVES: '',
        'mapred.create.symlink': 'yes',  # backward compatibility
        COMPRESS_MAP_OUTPUT: 'true',
        'bl.libhdfs.opts': '-Xmx48m'
    }
    self.args = None
    self.requested_env = dict()
    self.remote_wd = None
    self.remote_module = None
    self.remote_module_bn = None
    self.remote_exe = None
    self.pipes_code = None
    self.files_to_upload = []
    self.unknown_args = None
    self._use_mrv2 = None
def _get_hadoop_version_str():
    import pydoop
    h = pydoop.hadoop_version_info()
    return '.'.join(map(str, h.main))
)

from setuptools import setup, find_packages, Extension
from distutils.command.build import build
from distutils.command.clean import clean
from distutils.errors import DistutilsSetupError
from distutils import log

import pydoop
import pydoop.utils.jvm as jvm

JAVA_HOME = jvm.get_java_home()
JVM_LIB_PATH, JVM_LIB_NAME = jvm.get_jvm_lib_path_and_name(JAVA_HOME)
HADOOP_HOME = pydoop.hadoop_home()
HADOOP_VERSION_INFO = pydoop.hadoop_version_info()
EXTENSION_MODULES = []
VERSION_FN = "VERSION"
GIT_REV_FN = "GIT_REV"
EXTRA_COMPILE_ARGS = ["-Wno-write-strings"]  # http://bugs.python.org/issue6952

# properties file.  Since the source is in the root dir, filename = basename
PROP_FN = PROP_BN = pydoop.__propfile_basename__

CONSOLE_SCRIPTS = ['pydoop = pydoop.app.main:main']
if sys.version_info[0] == 3:
    CONSOLE_SCRIPTS.append('pydoop3 = pydoop.app.main:main')
else:
    CONSOLE_SCRIPTS.append('pydoop2 = pydoop.app.main:main')
def block_size(self):
    if not pydoop.hadoop_version_info().has_deprecated_bs():
        for bs_MB in xrange(100, 500, 50):
            bs = bs_MB * 2**20
            path = self._make_random_file(blocksize=bs)
            self.assertEqual(self.fs.get_path_info(path)["block_size"], bs)
from distutils.command.build_ext import build_ext
from distutils.command.build_py import build_py
from distutils.command.clean import clean
from distutils.errors import DistutilsSetupError
from distutils import log

import pydoop
import pydoop.hadoop_utils as hu

try:
    JAVA_HOME = os.environ["JAVA_HOME"]
except KeyError:
    raise RuntimeError("java home not found, try setting JAVA_HOME")
HADOOP_HOME = pydoop.hadoop_home(fallback=None)
HADOOP_VERSION_INFO = pydoop.hadoop_version_info()
BOOST_PYTHON = os.getenv("BOOST_PYTHON", "boost_python")

PIPES_SRC = ["src/%s.cpp" % n for n in (
    "pipes",
    "pipes_context",
    "pipes_test_support",
    "pipes_serial_utils",
    "exceptions",
    "pipes_input_split",
)]
HDFS_SRC = ["src/%s.cpp" % n for n in (
    "hdfs_fs",
    "hdfs_file",
    "hdfs_common",
)]
def block_size(self):
    if not pydoop.hadoop_version_info().has_deprecated_bs():
        for bs_MB in xrange(100, 500, 50):
            bs = bs_MB * 2 ** 20
            path = self._make_random_file(blocksize=bs)
            self.assertEqual(self.fs.get_path_info(path)["block_size"], bs)
class DummyUpLink(object):
    pass


example_input_splits = [
    (b'/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     b'\x08h(\x00\x00\x00\x00\x00\x08h\x05',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 550952, 550917),
    (b'/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     b'\x00\x00\x00\x00\x00\x00\x00\x00\x08h(',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 0, 550952),
    (b'1hdfs://localhost:9000/user/zag/in-dir/images_list\x00\x00\x00\x00\x00'
     b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$',
     'hdfs://localhost:9000/user/zag/in-dir/images_list', 0, 36)
]
if not pydoop.hadoop_version_info().has_variable_isplit_encoding():
    example_input_splits = [
        (b"\x00" + raw_split, fn, o, l)
        for (raw_split, fn, o, l) in example_input_splits
    ]


class taskcontext_tc(unittest.TestCase):

    def test_input_split(self):
        for s in example_input_splits:
            i = InputSplit(s[0])
            self.assertEqual(i.filename, s[1])
            self.assertEqual(i.offset, s[2])
            self.assertEqual(i.length, s[3])

    def test_get_input_split(self):
        ctx = TaskContext(DummyUpLink())
def run_pipes(executable, input_path, output_path, more_args=None,
              properties=None, force_pydoop_submitter=False,
              hadoop_conf_dir=None, logger=None):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    hadoop.pipes.java.recordreader and hadoop.pipes.java.recordwriter
    to 'true'.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing the argument force_pydoop_submitter=True.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault('hadoop.pipes.java.recordreader', 'true')
    properties.setdefault('hadoop.pipes.java.recordwriter', 'true')
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.cdh >= (4, 0, 0) and not ver.ext and hdfs.default_is_local():
                raise RuntimeError(
                    "mrv2 on local fs not supported yet")  # FIXME
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program", executable,
        "-input", input_path,
        "-output", output_path
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties, classpath=pydoop_jar,
                         logger=logger)
    else:
        return run_cmd("pipes", args, properties,
                       hadoop_conf_dir=hadoop_conf_dir, logger=logger)
import unittest

import pydoop
from pydoop.pipes import InputSplit


example_input_splits = [
    ('/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     '\x08h(\x00\x00\x00\x00\x00\x08h\x05',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 550952, 550917),
    ('/hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps\x00\x00\x00\x00\x00'
     '\x00\x00\x00\x00\x00\x00\x00\x00\x08h(',
     'hdfs://localhost:9000/user/zag/in-dir/FGCS-1.ps', 0, 550952),
    ('1hdfs://localhost:9000/user/zag/in-dir/images_list\x00\x00\x00\x00\x00'
     '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00$',
     'hdfs://localhost:9000/user/zag/in-dir/images_list', 0, 36)
]
if not pydoop.hadoop_version_info().has_variable_isplit_encoding():
    example_input_splits = [
        ("\x00" + raw_split, fn, o, l)
        for (raw_split, fn, o, l) in example_input_splits
    ]


class taskcontext_tc(unittest.TestCase):

    def test_input_split(self):
        for s in example_input_splits:
            i = InputSplit(s[0])
            self.assertEqual(i.filename, s[1])
            self.assertEqual(i.offset, s[2])
            self.assertEqual(i.length, s[3])


def suite():
    suite = unittest.TestSuite()