def run_pipes(executable, input_path, output_path, more_args=None,
              properties=None, force_pydoop_submitter=False,
              hadoop_conf_dir=None, logger=None, keep_streams=False):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    ``hadoop.pipes.java.recordreader`` and
    ``hadoop.pipes.java.recordwriter`` to ``"true"``.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing the argument force_pydoop_submitter=True.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault("hadoop.pipes.java.recordreader", "true")
    properties.setdefault("hadoop.pipes.java.recordwriter", "true")
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.is_cdh_mrv2() and hdfs.default_is_local():
                raise RuntimeError("mrv2 on local fs not supported yet")
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program", executable, "-input", input_path, "-output", output_path
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties, classpath=pydoop_jar,
                         logger=logger, keep_streams=keep_streams)
    else:
        return run_cmd("pipes", args, properties,
                       hadoop_conf_dir=hadoop_conf_dir, logger=logger,
                       keep_streams=keep_streams)
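# Hypothetical usage sketch, not part of the original sources: the paths,
# the job name and the "-reduces" count below are made-up examples.  It
# only illustrates how run_pipes is meant to be called; the return value
# is whatever run_cmd/run_class return for the underlying command.
def run_pipes_example():
    import logging
    logging.basicConfig()
    logger = logging.getLogger("pipes_demo")
    return run_pipes(
        "hdfs:///user/me/bin/wordcount",  # pipes executable (hypothetical)
        "hdfs:///user/me/input",          # input path (hypothetical)
        "hdfs:///user/me/output",         # output path, must not exist yet
        more_args=["-reduces", "4"],      # forwarded to "hadoop pipes"
        properties={"mapred.job.name": "wordcount-demo"},
        logger=logger,
    )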
def stat_on_local(self):
    wd_ = tempfile.mkdtemp(prefix='pydoop_', suffix=UNI_CHR)
    p_ = os.path.join(wd_, make_random_str())
    if hdfs.default_is_local():
        wd, p = wd_, p_
        host = "default"
    else:
        wd, p = ('file:%s' % _ for _ in (wd_, p_))
        host = ""
    fs = hdfs.hdfs(host, 0)
    with fs.open_file(p_, 'w') as fo:
        fo.write(make_random_str())
    info = fs.get_path_info(p_)
    fs.close()
    s = hdfs.path.stat(p)
    os_s = os.stat(p_)
    for n in dir(s):
        if n.startswith('st_'):
            try:
                exp_v = getattr(os_s, n)
            except AttributeError:
                try:
                    exp_v = info[self.NMAP[n]]
                except KeyError:
                    continue
            self.assertEqual(getattr(s, n), exp_v)
    self.__check_extra_args(s, info)
    self.__check_wrapper_funcs(p)
    hdfs.rmr(wd)
def without_user(self):
    p = 'foo/bar'
    abs_p = hdfs.path.abspath(p, user=None, local=False)
    if hdfs.default_is_local():
        self.assertEqual(abs_p, '%s%s' % (self.root, os.path.abspath(p)))
    else:
        self.assertEqual(abs_p, '%s/user/%s/%s' % (self.root, DEFAULT_USER, p))
def with_user(self):
    p = 'foo/bar'
    abs_p = hdfs.path.abspath(p, user="pydoop", local=False)
    if hdfs.default_is_local():
        self.assertEqual(abs_p, '%s%s' % (self.root, os.path.abspath(p)))
    else:
        self.assertEqual(abs_p, '%s/user/pydoop/%s' % (self.root, p))
def setUp(self):
    if hdfs.default_is_local():
        self.root = "file:"
    else:
        fs = hdfs.hdfs("default", 0)
        self.root = "hdfs://%s:%s" % (fs.host, fs.port)
        fs.close()
def good(self):
    cases = [
        ('hdfs://localhost:9000/', ('localhost', 9000, '/')),
        ('hdfs://localhost:9000/a/b', ('localhost', 9000, '/a/b')),
        ('hdfs://localhost/a/b', ('localhost', DEFAULT_PORT, '/a/b')),
        ('hdfs:///a/b', ('default', 0, '/a/b')),
        ('hdfs:/', ('default', 0, '/')),
        ('file:///a/b', ('', 0, '/a/b')),
        ('file:/a/b', ('', 0, '/a/b')),
        ('file:///a', ('', 0, '/a')),
        ('file:/a', ('', 0, '/a')),
        ('file://temp/foo.txt', ('', 0, 'temp/foo.txt')),
        ('file://temp', ('', 0, 'temp')),
    ]
    if hdfs.default_is_local():
        cases.extend([
            ('///a/b', ('', 0, '/a/b')),
            ('/a/b', ('', 0, '/a/b')),
            ('a/b', ('', 0, 'a/b')),
        ])
    else:
        cases.extend([
            ('///a/b', ('default', 0, '/a/b')),
            ('/a/b', ('default', 0, '/a/b')),
            ('a/b', ('default', 0, '/user/%s/a/b' % DEFAULT_USER)),
        ])
    for p, r in cases:
        self.assertEqual(hdfs.path.split(p), r)
    for p, r in cases[1:]:
        self.assertEqual(hdfs.path.split(p + UNI_CHR), uni_last(r))
def with_user(self):
    abs_p = hdfs.path.abspath(self.p, user="pydoop", local=False)
    if hdfs.default_is_local():
        self.assertEqual(abs_p, '%s%s' % (self.root, os.path.abspath(self.p)))
    else:
        self.assertEqual(abs_p, '%s/user/pydoop/%s' % (self.root, self.p))
def capacity(self):
    fs = hdfs.hdfs("", 0)
    self.assertRaises(RuntimeError, fs.capacity)
    fs.close()
    if not hdfs.default_is_local():
        fs = hdfs.hdfs("default", 0)
        cap = fs.capacity()
        self.assertGreaterEqual(cap, 0)
def expanduser(self):
    for pre in '~', '~%s' % DEFAULT_USER:
        for rest in '', '/d':
            p = '%s%s' % (pre, rest)
            if hdfs.default_is_local():
                self.assertEqual(hdfs.path.expanduser(p), os.path.expanduser(p))
            else:
                exp_res = '/user/%s%s' % (DEFAULT_USER, rest)
                self.assertEqual(hdfs.path.expanduser(p), exp_res)
def bad(self):
    cases = [
        'ftp://localhost:9000/',      # bad scheme
        'hdfs://localhost:spam/',     # port is not an int
        'hdfs://localhost:9000',      # path part is empty
        'hdfs://localhost:9000/a:b',  # colon outside netloc
    ]
    if not hdfs.default_is_local():
        cases.append('/localhost:9000/a/b')  # colon outside netloc
    for p in cases:
        self.assertRaises(ValueError, hdfs.path.split, p)
def good_with_user(self):
    if hdfs.default_is_local():
        cases = [('a/b', u, ('', 0, 'a/b'))
                 for u in (None, DEFAULT_USER, 'foo')]
    else:
        cases = [
            ('a/b', None, ('default', 0, '/user/%s/a/b' % DEFAULT_USER)),
            ('a/b', DEFAULT_USER,
             ('default', 0, '/user/%s/a/b' % DEFAULT_USER)),
            ('a/b', 'foo', ('default', 0, '/user/foo/a/b')),
        ]
    for p, u, r in cases:
        self.assertEqual(hdfs.path.split(p, u), r)
def get_hosts(self):
    if hdfs.default_is_local():
        return  # only run on HDFS
    hdfs.dump(self.data, self.hdfs_paths[0], mode="wb")
    fs = hdfs.hdfs("default", 0)
    hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertTrue(len(hs) > 0)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
def get_hosts(self):
    if hdfs.default_is_local():
        return  # only run on HDFS
    hdfs.dump(self.data, self.hdfs_paths[0])
    fs = hdfs.hdfs("default", 0)
    hs = fs.get_hosts(self.hdfs_paths[0], 0, 10)
    self.assertTrue(len(hs) > 0)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], -10, 10)
    self.assertRaises(ValueError, fs.get_hosts, self.hdfs_paths[0], 0, -10)
def setUp(self):
    self.hp_cases = [("default", 0)]
    self.u_cases = [None, CURRENT_USER]
    if not hdfs.default_is_local():
        self.hp_cases.append((u.HDFS_HOST, u.HDFS_PORT))
        self.u_cases.append("nobody")
        try:
            hdfs_ip = socket.gethostbyname(u.HDFS_HOST)
        except socket.gaierror:
            pass
        else:
            self.hp_cases.append((hdfs_ip, u.HDFS_PORT))
def bad(self):
    cases = [
        '',                           # not allowed in the Java API
        'hdfs:',                      # no scheme-specific part
        'hdfs://',                    # path part is empty
        'ftp://localhost:9000/',      # bad scheme
        'hdfs://localhost:spam/',     # port is not an int
        'hdfs://localhost:9000',      # path part is empty
        'hdfs://localhost:9000/a:b',  # colon outside netloc
        '//localhost:9000/a/b',       # null scheme
    ]
    if not hdfs.default_is_local():
        cases.append('/localhost:9000/a/b')  # colon outside netloc
    for p in cases:
        self.assertRaises(ValueError, hdfs.path.split, p)
def setUp(self):
    wd = tempfile.mkdtemp()
    wd_bn = os.path.basename(wd)
    self.local_wd = "file:%s" % wd
    fs = hdfs.hdfs("default", 0)
    fs.create_directory(wd_bn)
    self.hdfs_wd = fs.get_path_info(wd_bn)["name"]
    fs.close()
    basenames = ["test_path_%d" % i for i in xrange(2)]
    self.local_paths = ["%s/%s" % (self.local_wd, bn) for bn in basenames]
    self.hdfs_paths = ["%s/%s" % (self.hdfs_wd, bn) for bn in basenames]
    self.data = make_random_data(4 * BUFSIZE + BUFSIZE / 2)
    for path in self.local_paths:
        self.assertTrue(path.startswith("file:"))
    for path in self.hdfs_paths:
        if not hdfs.default_is_local():
            self.assertTrue(path.startswith("hdfs:"))
def stat(self):
    if hdfs.default_is_local():
        return
    bn = '%s%s' % (make_random_str(), UNI_CHR)
    fn = '/user/%s/%s' % (DEFAULT_USER, bn)
    fs = hdfs.hdfs("default", 0)
    p = "hdfs://%s:%s%s" % (fs.host, fs.port, fn)
    with fs.open_file(fn, 'w') as fo:
        fo.write(make_random_str())
    info = fs.get_path_info(fn)
    fs.close()
    s = hdfs.path.stat(p)
    for n1, n2 in self.NMAP.iteritems():
        attr = getattr(s, n1, None)
        self.assertFalse(attr is None)
        self.assertEqual(attr, info[n2])
    self.__check_extra_args(s, info)
    self.__check_wrapper_funcs(p)
    hdfs.rmr(p)
def suite():
    suite_ = unittest.TestSuite()
    suite_.addTest(TestConnection('connect'))
    suite_.addTest(TestConnection('cache'))
    tests = common_tests()
    if not hdfs.default_is_local():
        tests.extend([
            'capacity',
            'default_block_size',
            'used',
            'chown',
            'utime',
            'block_size',
            'replication',
            'set_replication',
            'readline_block_boundary',
            'get_hosts',
        ])
    for t in tests:
        suite_.addTest(TestHDFS(t))
    return suite_
def samefile_user(self):
    if not hdfs.default_is_local():
        self.assertTrue(hdfs.path.samefile('fn', '/user/u/fn', user='u'))
def get_wd_prefix(base="pydoop_"):
    if default_is_local():
        return os.path.join(tempfile.gettempdir(), base)
    else:
        return base
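# A minimal sketch, not from the original sources: make_wd is hypothetical,
# and it assumes hdfs.mkdir and make_random_str are available as in the
# tests above.  It shows how get_wd_prefix is typically combined with a
# random suffix to build a working directory that lands in the system temp
# dir when the default FS is local, or under /user/<user>/ on HDFS.
def make_wd():
    wd = get_wd_prefix() + make_random_str()
    hdfs.mkdir(wd)  # created on the local fs or on HDFS, per the default FS
    return wd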
def run_pipes(executable, input_path, output_path, more_args=None,
              properties=None, force_pydoop_submitter=False,
              hadoop_conf_dir=None, logger=None):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    ``hadoop.pipes.java.recordreader`` and
    ``hadoop.pipes.java.recordwriter`` to ``'true'``.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing the argument force_pydoop_submitter=True.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault('hadoop.pipes.java.recordreader', 'true')
    properties.setdefault('hadoop.pipes.java.recordwriter', 'true')
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.cdh >= (4, 0, 0) and not ver.ext and hdfs.default_is_local():
                raise RuntimeError(
                    "mrv2 on local fs not supported yet")  # FIXME
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program", executable, "-input", input_path, "-output", output_path
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties, classpath=pydoop_jar,
                         logger=logger)
    else:
        return run_cmd("pipes", args, properties,
                       hadoop_conf_dir=hadoop_conf_dir, logger=logger)