Beispiel #1
0
 def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
     """Upload ``orig_fn``, run the ``wc.py`` job on it and check the counts.

     The input is copied with ``hadoopy.put`` to a uniquely named path under
     ``self.data_path`` (the name embeds ``time.time()``), a handful of
     sanity checks are made on that path, the job is launched, and the
     output is read back with ``hadoopy.readtb``.

     Args:
         orig_fn: Local file handed to ``hadoopy.put``; also used as the
             suffix of the generated input/output names.
         launcher: ``hadoopy.launch_frozen`` or ``hadoopy.launch_local``.
             It is called as ``launcher(in_path, out_path, 'wc.py',
             jobconfs=[...])``.

     Raises:
         ValueError: If ``launcher`` is neither supported launcher. Note
             that the check happens after ``launcher`` has been called.
     """
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = in_path + '.out'
     print(os.path.abspath('.'))
     hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here.  assertEqual is used because the
     # assertEquals alias is deprecated and removed in Python 3.12.
     listing = hadoopy.ls(in_path)
     self.assertEqual(len(listing), 1)
     self.assertEqual(listing, [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     launcher(in_path, out_path, 'wc.py',
              jobconfs=['mapred.min.split.size=100000000',
                        'mapreduce.task.userlog.limit.kb=1000'])
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     else:
         raise ValueError('Launcher not recognized')
     # The job output is a sequence of (word, count) pairs
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Beispiel #2
0
 def tearDown(self):
     """Delete ``self.data_path`` with ``hadoopy.rmr`` and verify it is gone.

     If the path exists it must look like a directory before removal;
     afterwards ``exists``, ``isdir`` and ``isempty`` must all be false.
     """
     root = self.data_path
     if hadoopy.exists(root):
         # directories are empty (isempty is expected to be true for them)
         for check in (hadoopy.isempty, hadoopy.isdir):
             self.assertTrue(check(root))
         hadoopy.rmr(root)
     for check in (hadoopy.exists, hadoopy.isdir, hadoopy.isempty):
         self.assertFalse(check(root))
Beispiel #3
0
 def tearDown(self):
     """Clean up ``self.data_path`` and assert that nothing is left behind."""
     target = self.data_path
     if hadoopy.exists(target):
         # directories are empty
         reported_empty = hadoopy.isempty(target)
         self.assertTrue(reported_empty)
         reported_dir = hadoopy.isdir(target)
         self.assertTrue(reported_dir)
         hadoopy.rmr(target)
     # Once removed, every probe on the path must come back false
     still_exists = hadoopy.exists(target)
     self.assertFalse(still_exists)
     still_dir = hadoopy.isdir(target)
     self.assertFalse(still_dir)
     still_empty = hadoopy.isempty(target)
     self.assertFalse(still_empty)
Beispiel #4
0
 def _run_wc(self,
             orig_fn,
             script_name='wc.py',
             launcher=hadoopy.launch_frozen,
             **kw):
     """Upload ``orig_fn``, run a word-count job on it and check the counts.

     Args:
         orig_fn: Local file handed to ``hadoopy.put``; also used as the
             suffix of the generated input/output names under
             ``self.data_path``.
         script_name: Job script passed to the launcher (or run with
             ``python`` in the command-line mode).
         launcher: ``hadoopy.launch_frozen``, ``hadoopy.launch_local`` or
             the string ``'launch_frozen_cmd'``.  A callable launcher is
             run twice: once with ``jobconfs`` as a list (output written to
             ``out_path + '_list_jobconfs'``) and once with ``jobconfs`` as
             a dict (output written to ``out_path``).  The string form runs
             ``script_name`` as a subprocess instead.
         **kw: Extra keyword arguments forwarded to a callable launcher.

     Raises:
         ValueError: If ``launcher`` is not one of the supported values.
             For an unrecognised callable this is raised only after it has
             been invoked.
     """
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     jobconf_list = [
         'mapred.min.split.size=100000000',
         'mapreduce.task.userlog.limit.kb=1000',
     ]
     jobconf_dict = {
         'mapred.min.split.size': '100000000',
         'mapreduce.task.userlog.limit.kb': '1000',
     }
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here.  assertEqual is used because the
     # assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     # NOTE: hadoopy.ls(in_path) == [hadoopy.abspath(in_path)] is no longer
     # true in CDH4, so only the number of entries is checked.
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     if not isinstance(launcher, str):
         # Exercise both accepted jobconfs forms: list first, then dict
         launcher(in_path,
                  out_path + '_list_jobconfs',
                  script_name,
                  jobconfs=jobconf_list,
                  **kw)
         launcher(in_path,
                  out_path,
                  script_name,
                  jobconfs=jobconf_dict,
                  **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (
             script_name, in_path, out_path)
         print(cmd)
         # str.split() does not interpret shell quoting, so splitting the
         # printed command would hand the double quotes to the script as
         # literal characters.  Pass an explicit argument list instead.
         cmd_args = ['python', script_name, 'launch_frozen', in_path, out_path]
         for conf in jobconf_list:
             cmd_args += ['-jobconf', conf]
         subprocess.call(cmd_args)
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     # The job output is a sequence of (word, count) pairs
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Beispiel #5
0
 def test_readtb_writetb(self):
     """Write ten typedbytes files and compare two ``readtb`` implementations.

     The files are written with ``hadoopy.writetb`` into a fresh
     ``readtb_writetb/`` directory under ``self.data_path``; the directory is
     then read through ``self._readtb`` once with the bare ``readtb`` name in
     scope and once with ``hadoopy.readtb``, and both results must be equal.
     """
     working_path = "%s/readtb_writetb/" % (self.data_path)
     # Nothing may exist at the working path before the first write
     for probe in (hadoopy.exists, hadoopy.isdir, hadoopy.isempty):
         self.assertFalse(probe(working_path))
     for part in range(10):
         part_path = "%s/%.5d" % (working_path, part)
         print(part_path)
         # Mixed key/value types: str/int, float/ndarray, bool/dict
         records = [("1", 1), (1.3, np.array([1, 2, 3])), (True, {"1": 3})]
         hadoopy.writetb(part_path, records)
     # part_path still names the last file written by the loop
     self.assertFalse(hadoopy.isdir(part_path))
     self.assertFalse(hadoopy.isempty(part_path))
     self.assertTrue(hadoopy.isdir(working_path))
     # isempty returns true on directories
     self.assertTrue(hadoopy.isempty(working_path))
     via_readtb = self._readtb(readtb, working_path)
     via_hadoopy = self._readtb(hadoopy.readtb, working_path)
     self.assertEqual(via_readtb, via_hadoopy)
Beispiel #6
0
 def test_readtb_writetb(self):
     """Round-trip ten files through ``writetb`` and cross-check the readers.

     Each file receives the same three records.  Afterwards the result of
     ``self._readtb`` using the bare ``readtb`` name must equal its result
     using ``hadoopy.readtb`` on the same directory.
     """
     working_path = '%s/readtb_writetb/' % (self.data_path)
     path_exists = hadoopy.exists(working_path)
     self.assertFalse(path_exists)
     path_is_dir = hadoopy.isdir(working_path)
     self.assertFalse(path_is_dir)
     path_is_empty = hadoopy.isempty(working_path)
     self.assertFalse(path_is_empty)
     # File names are the zero-padded indices 00000 .. 00009
     part_paths = ['%s/%.5d' % (working_path, x) for x in range(10)]
     for part_path in part_paths:
         print(part_path)
         hadoopy.writetb(
             part_path,
             [('1', 1), (1.3, np.array([1, 2, 3])), (True, {'1': 3})])
     last_path = part_paths[-1]
     self.assertFalse(hadoopy.isdir(last_path))
     self.assertFalse(hadoopy.isempty(last_path))
     self.assertTrue(hadoopy.isdir(working_path))
     # isempty returns true on directories
     self.assertTrue(hadoopy.isempty(working_path))
     first_result = self._readtb(readtb, working_path)
     second_result = self._readtb(hadoopy.readtb, working_path)
     self.assertEqual(first_result, second_result)
Beispiel #7
0
 def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
     """Upload ``orig_fn``, run a word-count job on it and check the counts.

     Args:
         orig_fn: Local file handed to ``hadoopy.put``; also used as the
             suffix of the generated input/output names under
             ``self.data_path``.
         script_name: Job script passed to the launcher (or run with
             ``python`` in the command-line mode).
         launcher: ``hadoopy.launch_frozen``, ``hadoopy.launch_local`` or
             the string ``"launch_frozen_cmd"``.  A callable is invoked as
             ``launcher(in_path, out_path, script_name, jobconfs=[...],
             **kw)``; the string form runs ``script_name`` as a subprocess.
         **kw: Extra keyword arguments forwarded to a callable launcher.

     Raises:
         ValueError: If ``launcher`` is not one of the supported values.
             For an unrecognised callable this is raised only after it has
             been invoked.
     """
     fn = "out-%f-%s" % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + ".out"
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     jobconfs = ["mapred.min.split.size=100000000", "mapreduce.task.userlog.limit.kb=1000"]
     print(os.path.abspath("."))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here.  assertEqual is used because the
     # assertEquals alias is deprecated and removed in Python 3.12.
     listing = hadoopy.ls(in_path)
     self.assertEqual(len(listing), 1)
     self.assertEqual(listing, [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     if not isinstance(launcher, str):
         launcher(in_path, out_path, script_name, jobconfs=jobconfs, **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == "launch_frozen_cmd":
         cmd = (
             'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
             % (script_name, in_path, out_path)
         )
         print(cmd)
         # str.split() does not interpret shell quoting, so splitting the
         # printed command would hand the double quotes to the script as
         # literal characters.  Pass an explicit argument list instead.
         argv = ["python", script_name, "launch_frozen", in_path, out_path]
         for conf in jobconfs:
             argv.extend(["-jobconf", conf])
         subprocess.call(argv)
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError("Launcher not recognized")
     # The job output is a sequence of (word, count) pairs
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc["the"], 1664)
     self.assertEqual(wc["Alice"], 221)
Beispiel #8
0
 def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
     """Upload ``orig_fn``, run a word-count job on it and check the counts.

     Args:
         orig_fn: Local file handed to ``hadoopy.put``; also used as the
             suffix of the generated input/output names under
             ``self.data_path``.
         script_name: Job script passed to the launcher (or run with
             ``python`` in the command-line mode).
         launcher: ``hadoopy.launch_frozen``, ``hadoopy.launch_local`` or
             the string ``'launch_frozen_cmd'``.  A callable launcher is
             run twice: first with ``jobconfs`` as a list of ``key=value``
             strings (output at ``out_path + '_list_jobconfs'``), then with
             the same settings as a dict (output at ``out_path``).  The
             string form runs ``script_name`` as a subprocess instead.
         **kw: Extra keyword arguments forwarded to a callable launcher.

     Raises:
         ValueError: If ``launcher`` is not one of the supported values.
             For an unrecognised callable this is raised only after it has
             been invoked.
     """
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     # Don't let the file split, CDH3 has a bug and will try to split gz's
     jobconfs = ['mapred.min.split.size=100000000',
                 'mapreduce.task.userlog.limit.kb=1000']
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here.  assertEqual is used because the
     # assertEquals alias is deprecated and removed in Python 3.12.
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     # NOTE: hadoopy.ls(in_path) == [hadoopy.abspath(in_path)] is no longer
     # true in CDH4, so only the number of entries is checked.
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     if not isinstance(launcher, str):
         # Same settings in both accepted forms: list first, then dict
         launcher(in_path, out_path + '_list_jobconfs', script_name, jobconfs=jobconfs, **kw)
         launcher(in_path, out_path, script_name,
                  jobconfs=dict(conf.split('=', 1) for conf in jobconfs), **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name,
                                                                                                                                             in_path,
                                                                                                                                             out_path)
         print(cmd)
         # str.split() does not interpret shell quoting, so splitting the
         # printed command would hand the double quotes to the script as
         # literal characters.  Pass an explicit argument list instead.
         argv = ['python', script_name, 'launch_frozen', in_path, out_path]
         for conf in jobconfs:
             argv.extend(['-jobconf', conf])
         subprocess.call(argv)
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     # The job output is a sequence of (word, count) pairs
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)