def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
    """Run the ``wc.py`` word-count job on ``orig_fn`` and verify its output.

    The local file is uploaded to a uniquely named path built from
    ``self.data_path``, a few HDFS sanity checks are made on it, the job is
    launched, and the resulting counts for ``'the'`` and ``'Alice'`` are
    checked.

    Args:
        orig_fn: Local input file name; it is also embedded in the HDFS
            file name.
        launcher: ``hadoopy.launch_frozen`` (default) or
            ``hadoopy.launch_local``.

    Raises:
        ValueError: If ``launcher`` is neither supported launcher (raised
            after the job has been launched).
    """
    # The timestamp keeps paths from different runs from colliding.
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    hadoopy.put(orig_fn, in_path)

    # We also do a few hdfs checks here
    listing = hadoopy.ls(in_path)
    self.assertEqual(len(listing), 1)
    self.assertEqual(listing, [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))

    # Don't let the file split, CDH3 has a bug and will try to split gz's
    launcher(in_path, out_path, 'wc.py',
             jobconfs=['mapred.min.split.size=100000000',
                       'mapreduce.task.userlog.limit.kb=1000'])

    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    else:
        raise ValueError('Launcher not recognized')

    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def tearDown(self):
    """Remove the test data path and confirm that it is gone.

    If the path exists it must look like a directory (``isdir`` and
    ``isempty`` both true) before it is recursively removed. Afterwards
    every existence-style probe on it must be false.
    """
    path = self.data_path
    if hadoopy.exists(path):
        self.assertTrue(hadoopy.isempty(path))  # directories are empty
        self.assertTrue(hadoopy.isdir(path))
        hadoopy.rmr(path)
    # Whether it was just removed or never existed, nothing may be left.
    for probe in (hadoopy.exists, hadoopy.isdir, hadoopy.isempty):
        self.assertFalse(probe(path))
def tearDown(self):
    """Clean up ``self.data_path`` after a test and verify the cleanup.

    An existing path is checked to be a directory, then deleted with
    ``hadoopy.rmr``. The method finishes by asserting that the path no
    longer exists, is not a directory, and is not reported as empty.
    """
    target = self.data_path
    present = hadoopy.exists(target)
    if present:
        # directories are empty
        self.assertTrue(hadoopy.isempty(target))
        self.assertTrue(hadoopy.isdir(target))
        hadoopy.rmr(target)

    self.assertFalse(hadoopy.exists(target))
    self.assertFalse(hadoopy.isdir(target))
    self.assertFalse(hadoopy.isempty(target))
def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
    """Run a word-count job on ``orig_fn`` and verify its output.

    The local file is uploaded (if not already present) to a uniquely named
    path built from ``self.data_path``, a few HDFS sanity checks are made,
    the job is launched, and the counts for ``'the'`` and ``'Alice'`` are
    checked.

    Args:
        orig_fn: Local input file name; it is also embedded in the HDFS
            file name.
        script_name: Job script to launch.
        launcher: ``hadoopy.launch_frozen`` (default),
            ``hadoopy.launch_local``, or the string ``'launch_frozen_cmd'``
            to launch through the script's own command line.
        **kw: Extra keyword arguments forwarded to a callable launcher.

    Raises:
        ValueError: If ``launcher`` is not one of the supported values.
    """
    # The timestamp keeps paths from different runs from colliding.
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)

    # We also do a few hdfs checks here
    # NOTE: ls(in_path) == [abspath(in_path)] is no longer true in CDH4, so
    # only the number of listed entries is checked.
    self.assertEqual(len(hadoopy.ls(in_path)), 1)
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))

    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        # Run once with list-style jobconfs (separate output path) and once
        # with dict-style jobconfs so both forms are exercised.
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=['mapred.min.split.size=100000000',
                           'mapreduce.task.userlog.limit.kb=1000'],
                 **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs={'mapred.min.split.size': '100000000',
                           'mapreduce.task.userlog.limit.kb': '1000'},
                 **kw)

    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        # Shell-style rendering, printed for debugging only.
        cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name, in_path, out_path)
        print(cmd)
        # No shell is involved, so pass an explicit argv: splitting cmd on
        # whitespace would hand the literal double quotes to the script as
        # part of each -jobconf value.
        argv = ['python', script_name, 'launch_frozen', in_path, out_path,
                '-jobconf', 'mapred.min.split.size=100000000',
                '-jobconf', 'mapreduce.task.userlog.limit.kb=1000']
        subprocess.call(argv)
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')

    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def test_readtb_writetb(self):
    """Write ten files with ``hadoopy.writetb`` and compare two readers.

    The working directory must not exist beforehand. After writing, the
    last file written must look like a non-empty file, the directory must
    look like a directory, and the module-level ``readtb`` must yield the
    same result as ``hadoopy.readtb`` when passed through ``self._readtb``.
    """
    work_dir = "%s/readtb_writetb/" % (self.data_path)
    # Nothing may exist at the working path before we write to it.
    for probe in (hadoopy.exists, hadoopy.isdir, hadoopy.isempty):
        self.assertFalse(probe(work_dir))

    for idx in range(10):
        tb_path = "%s/%.5d" % (work_dir, idx)
        print(tb_path)
        records = [("1", 1), (1.3, np.array([1, 2, 3])), (True, {"1": 3})]
        hadoopy.writetb(tb_path, records)

    # tb_path still refers to the last file written by the loop.
    self.assertFalse(hadoopy.isdir(tb_path))
    self.assertFalse(hadoopy.isempty(tb_path))
    self.assertTrue(hadoopy.isdir(work_dir))
    self.assertTrue(hadoopy.isempty(work_dir))  # isempty returns true on directories

    via_module_reader = self._readtb(readtb, work_dir)
    via_hadoopy_reader = self._readtb(hadoopy.readtb, work_dir)
    self.assertEqual(via_module_reader, via_hadoopy_reader)
def test_readtb_writetb(self):
    """Round-trip records through ``hadoopy.writetb`` and both readers.

    Ten files are written under a fresh ``readtb_writetb/`` directory
    below ``self.data_path``. The HDFS predicates are checked before and
    after, and the output of the module-level ``readtb`` is compared with
    that of ``hadoopy.readtb`` (both via ``self._readtb``).
    """
    base = '%s/readtb_writetb/' % (self.data_path)
    # The directory must be absent before the first write.
    self.assertFalse(hadoopy.exists(base))
    self.assertFalse(hadoopy.isdir(base))
    self.assertFalse(hadoopy.isempty(base))

    n_files = 10
    for file_no in range(n_files):
        out_fn = '%s/%.5d' % (base, file_no)
        print(out_fn)
        payload = [('1', 1), (1.3, np.array([1, 2, 3])), (True, {'1': 3})]
        hadoopy.writetb(out_fn, payload)

    # out_fn is the final file produced by the loop above.
    for file_probe in (hadoopy.isdir, hadoopy.isempty):
        self.assertFalse(file_probe(out_fn))
    # isempty returns true on directories
    for dir_probe in (hadoopy.isdir, hadoopy.isempty):
        self.assertTrue(dir_probe(base))

    self.assertEqual(self._readtb(readtb, base),
                     self._readtb(hadoopy.readtb, base))
def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
    """Run a word-count job on ``orig_fn`` and verify its output.

    The local file is uploaded (if not already present) to a uniquely named
    path built from ``self.data_path``, a few HDFS sanity checks are made,
    the job is launched, and the counts for ``"the"`` and ``"Alice"`` are
    checked.

    Args:
        orig_fn: Local input file name; it is also embedded in the HDFS
            file name.
        script_name: Job script to launch.
        launcher: ``hadoopy.launch_frozen`` (default),
            ``hadoopy.launch_local``, or the string ``"launch_frozen_cmd"``
            to launch through the script's own command line.
        **kw: Extra keyword arguments forwarded to a callable launcher.

    Raises:
        ValueError: If ``launcher`` is not one of the supported values.
    """
    # The timestamp keeps paths from different runs from colliding.
    fn = "out-%f-%s" % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + ".out"
    print(os.path.abspath("."))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)

    # We also do a few hdfs checks here
    listing = hadoopy.ls(in_path)
    self.assertEqual(len(listing), 1)
    self.assertEqual(listing, [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))

    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(
            in_path,
            out_path,
            script_name,
            jobconfs=["mapred.min.split.size=100000000", "mapreduce.task.userlog.limit.kb=1000"],
            **kw
        )

    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == "launch_frozen_cmd":
        # Shell-style rendering, printed for debugging only.
        cmd = (
            'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
            % (script_name, in_path, out_path)
        )
        print(cmd)
        # No shell is involved, so pass an explicit argv: splitting cmd on
        # whitespace would hand the literal double quotes to the script as
        # part of each -jobconf value.
        argv = [
            "python", script_name, "launch_frozen", in_path, out_path,
            "-jobconf", "mapred.min.split.size=100000000",
            "-jobconf", "mapreduce.task.userlog.limit.kb=1000",
        ]
        subprocess.call(argv)
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError("Launcher not recognized")

    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc["the"], 1664)
    self.assertEqual(wc["Alice"], 221)
def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
    """Upload ``orig_fn``, run the word-count job on it, and check the counts.

    The input is stored at a timestamped path built from ``self.data_path``
    (uploaded only if not already there). After some HDFS sanity checks the
    job is launched and the counts for ``'the'`` and ``'Alice'`` are
    verified.

    Args:
        orig_fn: Local input file name; it is also embedded in the HDFS
            file name.
        script_name: Job script to launch.
        launcher: ``hadoopy.launch_frozen`` (default),
            ``hadoopy.launch_local``, or the string ``'launch_frozen_cmd'``
            to launch through the script's own command line.
        **kw: Extra keyword arguments forwarded to a callable launcher.

    Raises:
        ValueError: If ``launcher`` is not one of the supported values.
    """
    # The timestamp keeps paths from different runs from colliding.
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)

    # We also do a few hdfs checks here
    # NOTE: ls(in_path) == [abspath(in_path)] is no longer true in CDH4, so
    # only the number of listed entries is checked.
    self.assertEqual(len(hadoopy.ls(in_path)), 1)
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))

    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        # Exercise both accepted jobconfs forms: a list of "key=value"
        # strings (written to a separate output path) and a dict.
        list_jobconfs = ['mapred.min.split.size=100000000',
                         'mapreduce.task.userlog.limit.kb=1000']
        dict_jobconfs = {'mapred.min.split.size': '100000000',
                         'mapreduce.task.userlog.limit.kb': '1000'}
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=list_jobconfs, **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs=dict_jobconfs, **kw)

    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        # Shell-style rendering, printed for debugging only.
        cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name, in_path, out_path)
        print(cmd)
        # No shell is involved, so pass an explicit argv: splitting cmd on
        # whitespace would hand the literal double quotes to the script as
        # part of each -jobconf value.
        argv = ['python', script_name, 'launch_frozen', in_path, out_path,
                '-jobconf', 'mapred.min.split.size=100000000',
                '-jobconf', 'mapreduce.task.userlog.limit.kb=1000']
        subprocess.call(argv)
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')

    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)