def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
    """Run the word-count job on ``orig_fn`` and verify HDFS state and output.

    Args:
        orig_fn: Local filename to upload to HDFS and count words in.
        launcher: hadoopy launcher callable (launch_frozen or launch_local).

    Raises:
        ValueError: If ``launcher`` is not one of the recognized launchers.
    """
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual
    self.assertEqual(len(hadoopy.ls(in_path)), 1)
    self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    launcher(in_path, out_path, 'wc.py',
             jobconfs=['mapred.min.split.size=100000000',
                       'mapreduce.task.userlog.limit.kb=1000'])
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def tearDown(self):
    """Remove the HDFS working directory created by the test, if it exists."""
    if not hadoopy.exists(self.data_path):
        return
    # Before removal the path must look like a directory (directories are empty)
    self.assertTrue(hadoopy.isempty(self.data_path))
    self.assertTrue(hadoopy.isdir(self.data_path))
    hadoopy.rmr(self.data_path)
    # After rmr every view of the path must report it gone
    for predicate in (hadoopy.exists, hadoopy.isdir, hadoopy.isempty):
        self.assertFalse(predicate(self.data_path))
def tearDown(self):
    """Clean up the test's HDFS data directory."""
    path = self.data_path
    if hadoopy.exists(path):
        # Pre-removal sanity checks
        self.assertTrue(hadoopy.isempty(path))  # directories are empty
        self.assertTrue(hadoopy.isdir(path))
        hadoopy.rmr(path)
        # The removed path must no longer exist in any form
        self.assertFalse(hadoopy.exists(path))
        self.assertFalse(hadoopy.isdir(path))
        self.assertFalse(hadoopy.isempty(path))
def _run_wc(self, orig_fn, script_name='wc.py',
            launcher=hadoopy.launch_frozen, **kw):
    """Run the word-count job on ``orig_fn`` and verify HDFS state and output.

    When ``launcher`` is callable the job is launched twice — once with
    jobconfs as a list of 'key=value' strings and once as a dict — to cover
    both accepted forms.  The string 'launch_frozen_cmd' instead runs the
    job through the script's command-line interface.

    Args:
        orig_fn: Local filename to upload (upload skipped if already on HDFS).
        script_name: Hadoopy job script to launch.
        launcher: hadoopy launcher callable or the string 'launch_frozen_cmd'.
        **kw: Extra keyword arguments forwarded to the launcher.

    Raises:
        ValueError: If ``launcher`` is not recognized.
    """
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual
    self.assertEqual(len(hadoopy.ls(in_path)), 1)
    # NOTE: asserting ls(in_path) == [abspath(in_path)] is no longer true in CDH4
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=['mapred.min.split.size=100000000',
                           'mapreduce.task.userlog.limit.kb=1000'], **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs={'mapred.min.split.size': '100000000',
                           'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        cmd = ('python %s launch_frozen %s %s '
               '-jobconf "mapred.min.split.size=100000000" '
               '-jobconf "mapreduce.task.userlog.limit.kb=1000"'
               % (script_name, in_path, out_path))
        print(cmd)
        # NOTE(review): cmd.split() leaves the literal double quotes inside
        # the -jobconf argument values (no shell strips them here) — confirm
        # the script tolerates quoted values, or switch to shlex.split.
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def test_readtb_writetb(self):
    """Write typed-bytes parts to HDFS and verify both readers agree."""
    working_path = "%s/readtb_writetb/" % (self.data_path)
    # The working directory must not exist before the test runs
    for predicate in (hadoopy.exists, hadoopy.isdir, hadoopy.isempty):
        self.assertFalse(predicate(working_path))
    for part in range(10):
        part_path = "%s/%.5d" % (working_path, part)
        print(part_path)
        records = [("1", 1), (1.3, np.array([1, 2, 3])), (True, {"1": 3})]
        hadoopy.writetb(part_path, records)
        # Each written part is a plain, non-empty file
        self.assertFalse(hadoopy.isdir(part_path))
        self.assertFalse(hadoopy.isempty(part_path))
    self.assertTrue(hadoopy.isdir(working_path))
    # isempty returns true on directories
    self.assertTrue(hadoopy.isempty(working_path))
    self.assertEqual(self._readtb(readtb, working_path),
                     self._readtb(hadoopy.readtb, working_path))
def test_readtb_writetb(self):
    """Round-trip typed-bytes data through HDFS and compare both readers."""
    working_path = '%s/readtb_writetb/' % (self.data_path)
    # Path must be completely absent before we start
    self.assertFalse(hadoopy.exists(working_path))
    self.assertFalse(hadoopy.isdir(working_path))
    self.assertFalse(hadoopy.isempty(working_path))
    for idx in range(10):
        fn = '%s/%.5d' % (working_path, idx)
        print(fn)
        payload = [('1', 1), (1.3, np.array([1, 2, 3])), (True, {'1': 3})]
        hadoopy.writetb(fn, payload)
        # A freshly written part file is neither a directory nor empty
        self.assertFalse(hadoopy.isdir(fn))
        self.assertFalse(hadoopy.isempty(fn))
    self.assertTrue(hadoopy.isdir(working_path))
    # isempty returns true on directories
    self.assertTrue(hadoopy.isempty(working_path))
    expected = self._readtb(readtb, working_path)
    actual = self._readtb(hadoopy.readtb, working_path)
    self.assertEqual(expected, actual)
def _run_wc(self, orig_fn, script_name="wc.py",
            launcher=hadoopy.launch_frozen, **kw):
    """Run the word-count job on ``orig_fn`` and verify HDFS state and output.

    Args:
        orig_fn: Local filename to upload (upload skipped if already on HDFS).
        script_name: Hadoopy job script to launch.
        launcher: hadoopy launcher callable or the string 'launch_frozen_cmd'.
        **kw: Extra keyword arguments forwarded to the launcher.

    Raises:
        ValueError: If ``launcher`` is not recognized.
    """
    fn = "out-%f-%s" % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + ".out"
    print(os.path.abspath("."))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual
    self.assertEqual(len(hadoopy.ls(in_path)), 1)
    self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(
            in_path, out_path, script_name,
            jobconfs=["mapred.min.split.size=100000000",
                      "mapreduce.task.userlog.limit.kb=1000"],
            **kw
        )
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == "launch_frozen_cmd":
        cmd = (
            'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
            % (script_name, in_path, out_path)
        )
        print(cmd)
        # NOTE(review): cmd.split() keeps the literal double quotes inside the
        # -jobconf values (no shell removes them here) — verify intended.
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError("Launcher not recognized")
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc["the"], 1664)
    self.assertEqual(wc["Alice"], 221)
def get_total_size(path, _format="%b"):
    """Return the total size of an HDFS path in the units of ``_format``.

    For a directory, sums the sizes of its immediate entries; for a plain
    file, returns its own size.

    Args:
        path: HDFS path to measure.
        _format: hadoopy.stat format string (default "%b": size in bytes).

    Returns:
        int: Total size.
    """
    if hadoopy.isdir(path):
        # Avoid shadowing the builtin 'file'; sum each entry's stat value
        return sum(int(hadoopy.stat(entry, _format))
                   for entry in hadoopy.ls(path))
    # Cast for a consistent int return type: the directory branch sums ints,
    # while hadoopy.stat may return the raw (string) value
    return int(hadoopy.stat(path, _format))
def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw): fn = 'out-%f-%s' % (time.time(), orig_fn) in_path = self.data_path + fn out_path = self.data_path + fn + '.out' print(os.path.abspath('.')) if not hadoopy.exists(in_path): hadoopy.put(orig_fn, in_path) # We also do a few hdfs checks here self.assertEquals(len(hadoopy.ls(in_path)), 1) #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)]) # This is no longer true in CDH4 self.assertTrue(hadoopy.exists(in_path)) self.assertFalse(hadoopy.exists(out_path)) self.assertFalse(hadoopy.isdir(in_path)) self.assertFalse(hadoopy.isempty(in_path)) # Don't let the file split, CDH3 has a bug and will try to split gz's if not isinstance(launcher, str): launcher(in_path, out_path + '_list_jobconfs', script_name, jobconfs=['mapred.min.split.size=100000000', 'mapreduce.task.userlog.limit.kb=1000'], **kw) launcher(in_path, out_path, script_name, jobconfs={'mapred.min.split.size': '100000000', 'mapreduce.task.userlog.limit.kb': '1000'}, **kw) if launcher == hadoopy.launch_frozen: self.assertTrue(hadoopy.isdir(out_path)) self.assertTrue(hadoopy.isempty(out_path)) # Dirs are always empty elif launcher == hadoopy.launch_local: self.assertFalse(hadoopy.isdir(out_path)) self.assertFalse(hadoopy.isempty(out_path)) elif launcher == 'launch_frozen_cmd': cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name, in_path, out_path) print(cmd) subprocess.call(cmd.split()) self.assertTrue(hadoopy.isdir(out_path)) self.assertTrue(hadoopy.isempty(out_path)) # Dirs are always empty else: raise ValueError('Launcher not recognized') wc = dict(hadoopy.readtb(out_path)) self.assertEqual(wc['the'], 1664) self.assertEqual(wc['Alice'], 221)
def aggregateData(num_line, terminus1, terminus2, threshold, *dirs):
    """Aggregate a bus line's GPS data into outbound/return DataFrames.

    Each entry of ``dirs`` is either a directory under the line's HDFS GPS
    tree (its files are listed via 'hdfs dfs -du' and processed one by one)
    or a single existing HDFS file path.  Anything else aborts the program.

    Uses print() calls instead of Python-2 print statements for consistency
    with the rest of the file; single-argument print() behaves identically
    on Python 2 and 3.

    Args:
        num_line: Bus line number.
        terminus1, terminus2: Names of the line's two termini.
        threshold: Threshold forwarded to data_extraction.generateDailyData.
        *dirs: Directory names under the line's GPS dir, or HDFS file paths.

    Returns:
        (data_aller, data_retour): Two DataFrames restricted to the GPS
        columns (DATE, TIME, LINE, BUS_NUM, coordinates, SPEED).
    """
    print('aller:{}-{} \nretour:{}-{}'.format(terminus1, terminus2,
                                              terminus2, terminus1))
    ter_coor1, ter_coor2 = data_extraction.getTerminusCoor(
        num_line, terminus1, terminus2)
    source_gps_dir = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    data_aller = []
    data_retour = []
    for dir_name in dirs:
        print(dir_name)
        path = os.path.join(source_gps_dir, str(num_line), str(dir_name))
        if hadoopy.isdir(path):
            # List the directory via 'hdfs dfs -du'; the second column of
            # each output line is the file path
            cmd = 'hdfs dfs -du %s' % (path)
            p = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            frames_aller = []
            frames_retour = []
            for du_line in p.stdout.readlines():
                line_daily = extractInfo(du_line.split()[1])
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold,
                    terminus1, terminus2)
                frames_aller.append(line_daily_aller)
                frames_retour.append(line_daily_retour)
            data_aller.append(
                pd.concat(frames_aller).reset_index().drop('index', axis=1))
            data_retour.append(
                pd.concat(frames_retour).reset_index().drop('index', axis=1))
        else:
            if hadoopy.exists(dir_name):
                # A single data file given directly by path
                line_daily = extractInfo(dir_name)
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold,
                    terminus1, terminus2)
                data_aller.append(line_daily_aller)
                data_retour.append(line_daily_retour)
            else:
                print("there are paths in args which are not directories")
                sys.exit(1)
    data_aller = pd.concat(data_aller).reset_index().drop('index', axis=1)
    data_retour = pd.concat(data_retour).reset_index().drop('index', axis=1)
    cols = [
        'DATE', 'TIME', 'LINE', 'BUS_NUM', 'X_COORDINATE', 'Y_COORDINATE',
        'LONGITUDE', 'LATITUDE', 'SPEED'
    ]
    data_aller = data_aller[cols]
    data_retour = data_retour[cols]
    return data_aller, data_retour
def doSample(jarfile, inputs, output, k):
    """Sample each HDFS input dataset, then execute the given jar over them.

    For every path in inputs: if it does not already exist on HDFS, a
    directory is created for it and every non-directory file of the matching
    dataset under data_dir is sampled into it — pool sampling when k > 0,
    otherwise ratio-based common sampling.  Finally the jar is executed via
    codegen.executeJar and the job parameters are returned.

    NOTE(review): relies on module-level names not visible in this chunk —
    data_dir, ratio, poolSample, commonSample, codegen, job — confirm they
    are defined elsewhere in the file.

    Args:
        jarfile: Path of the jar to execute.
        inputs: List of HDFS input paths (sampled datasets land here).
        output: HDFS output path, passed straight through to the jar.
        k: Pool-sample size; values <= 0 select ratio sampling instead.

    Returns:
        Whatever job.getJobPara() returns (job parameters).
    """
    for item in inputs:
        # Dataset name = last path component (ignoring a trailing slash)
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item
        #tmp_dir = tmp_path + name + "/"
        # Skip inputs whose sampled directory already exists on HDFS
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        real_input = data_dir + name + "/"
        # Sample every plain file of the source dataset into 'item'
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    # Dead code kept as-is (string literal, no runtime effect)
    '''if not hadoopy.exists(output): hadoopy.mkdir(output) if hadoopy.isdir(output): output = output[:-1] if output[-1] == '/': output = output[:-1] name = output.split('/')[-1] tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    # hdfs.mkdir(tmp_output)
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    # Dead cleanup code kept as-is (string literal, no runtime effect)
    '''for item in tmp_inputs: os.system("hadoop fs -rmr " + item) os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para