Code example #1
File: test_with_hadoop.py Project: bjzu/hadoopy
 def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split; CDH3 has a bug that makes it try to split .gz files
     launcher(in_path, out_path, 'wc.py', jobconfs=['mapred.min.split.size=100000000',
                                                    'mapreduce.task.userlog.limit.kb=1000'])
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
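These `_run_wc` tests launch a word-count job, `wc.py`, over what is evidently a copy of Alice in Wonderland (note the asserted counts for 'the' and 'Alice'). The script itself does not appear on this page; a minimal sketch of it, assuming hadoopy's usual mapper/reducer conventions with `hadoopy.run` as the entry point, would be:

import hadoopy


def mapper(key, value):
    # value is one line of input text; emit (word, 1) per token
    for word in value.split():
        yield word, 1


def reducer(key, values):
    # sum the per-word counts emitted upstream
    yield key, sum(values)


if __name__ == '__main__':
    # the reducer doubles as a combiner to cut shuffle volume
    hadoopy.run(mapper, reducer, reducer)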
Code example #2
File: test_with_hadoop.py Project: gmazzola/hadoopy
 def tearDown(self):
     if hadoopy.exists(self.data_path):
         self.assertTrue(hadoopy.isempty(self.data_path))  # directories are empty
         self.assertTrue(hadoopy.isdir(self.data_path))
         hadoopy.rmr(self.data_path)
     self.assertFalse(hadoopy.exists(self.data_path))
     self.assertFalse(hadoopy.isdir(self.data_path))
     self.assertFalse(hadoopy.isempty(self.data_path))
Code example #3
File: test_with_hadoop.py Project: gmazzola/hadoopy
 def test_readtb_writetb(self):
     working_path = "%s/readtb_writetb/" % (self.data_path)
     self.assertFalse(hadoopy.exists(working_path))
     self.assertFalse(hadoopy.isdir(working_path))
     self.assertFalse(hadoopy.isempty(working_path))
     for x in range(10):
         fn = "%s/%.5d" % (working_path, x)
         print(fn)
         data = [("1", 1), (1.3, np.array([1, 2, 3])), (True, {"1": 3})]
         hadoopy.writetb(fn, data)
     self.assertFalse(hadoopy.isdir(fn))
     self.assertFalse(hadoopy.isempty(fn))
     self.assertTrue(hadoopy.isdir(working_path))
     self.assertTrue(hadoopy.isempty(working_path))  # isempty returns true on directories
     self.assertEqual(self._readtb(readtb, working_path), self._readtb(hadoopy.readtb, working_path))
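`writetb` stores key/value pairs as a TypedBytes SequenceFile, and `readtb` on the enclosing directory streams the pairs back from every part file, which is why the test writes ten files but reads the directory once. Outside the test harness, a minimal round-trip sketch (the HDFS path below is hypothetical) looks like:

import hadoopy
import numpy as np

# Hypothetical HDFS path; keys and values may be mixed Python types.
pairs = [('1', 1), (1.3, np.array([1, 2, 3])), (True, {'1': 3})]
hadoopy.writetb('readtb_writetb_demo/00000', pairs)
for key, value in hadoopy.readtb('readtb_writetb_demo'):
    print(key, value)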
Code example #4
File: test_with_hadoop.py Project: gmazzola/hadoopy
 def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
     fn = "out-%f-%s" % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + ".out"
     print(os.path.abspath("."))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split; CDH3 has a bug that makes it try to split .gz files
     if not isinstance(launcher, str):
         launcher(
             in_path,
             out_path,
             script_name,
             jobconfs=["mapred.min.split.size=100000000", "mapreduce.task.userlog.limit.kb=1000"],
             **kw
         )
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == "launch_frozen_cmd":
         # No quotes around the jobconf values: they contain no spaces, and
         # cmd.split() would otherwise pass literal quote characters through.
         cmd = (
             "python %s launch_frozen %s %s -jobconf mapred.min.split.size=100000000 -jobconf mapreduce.task.userlog.limit.kb=1000"
             % (script_name, in_path, out_path)
         )
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError("Launcher not recognized")
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc["the"], 1664)
     self.assertEqual(wc["Alice"], 221)
Code example #5
def get_total_size(path, _format="%b"):
    """Return the total size of path via hadoopy.stat; the default
    '%b' format reports sizes in bytes."""
    size = 0
    if hadoopy.isdir(path):
        # Directory: sum the sizes of the files it contains.
        for f in hadoopy.ls(path):
            size += int(hadoopy.stat(f, _format))
    else:
        # Cast here too, so both branches return an int.
        size = int(hadoopy.stat(path, _format))
    return size
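A hedged usage sketch of the helper above (the HDFS path is hypothetical); with the default '%b' format the result is a byte count:

# Hypothetical path; sums the byte sizes of every file beneath it.
total_bytes = get_total_size('/user/data/corpus')
print('total: %d bytes' % total_bytes)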
Code example #6
File: test_with_hadoop.py Project: Jeffliu/hadoopy
 def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
     fn = 'out-%f-%s' % (time.time(), orig_fn)
     in_path = self.data_path + fn
     out_path = self.data_path + fn + '.out'
     print(os.path.abspath('.'))
     if not hadoopy.exists(in_path):
         hadoopy.put(orig_fn, in_path)
     # We also do a few hdfs checks here
     self.assertEqual(len(hadoopy.ls(in_path)), 1)
     #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
     self.assertTrue(hadoopy.exists(in_path))
     self.assertFalse(hadoopy.exists(out_path))
     self.assertFalse(hadoopy.isdir(in_path))
     self.assertFalse(hadoopy.isempty(in_path))
     # Don't let the file split; CDH3 has a bug that makes it try to split .gz files
     if not isinstance(launcher, str):
         launcher(in_path, out_path + '_list_jobconfs', script_name,
                  jobconfs=['mapred.min.split.size=100000000',
                            'mapreduce.task.userlog.limit.kb=1000'], **kw)
         launcher(in_path, out_path, script_name,
                  jobconfs={'mapred.min.split.size': '100000000',
                            'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
     if launcher == hadoopy.launch_frozen:
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     elif launcher == hadoopy.launch_local:
         self.assertFalse(hadoopy.isdir(out_path))
         self.assertFalse(hadoopy.isempty(out_path))
     elif launcher == 'launch_frozen_cmd':
         # No quotes around the jobconf values: they contain no spaces, and
         # cmd.split() would otherwise pass literal quote characters through.
         cmd = ('python %s launch_frozen %s %s '
                '-jobconf mapred.min.split.size=100000000 '
                '-jobconf mapreduce.task.userlog.limit.kb=1000'
                % (script_name, in_path, out_path))
         print(cmd)
         subprocess.call(cmd.split())
         self.assertTrue(hadoopy.isdir(out_path))
         self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
     else:
         raise ValueError('Launcher not recognized')
     wc = dict(hadoopy.readtb(out_path))
     self.assertEqual(wc['the'], 1664)
     self.assertEqual(wc['Alice'], 221)
Code example #7
def aggregateData(num_line, terminus1, terminus2, threshold, *dirs):
    print('aller:{}-{} \nretour:{}-{}'.format(terminus1, terminus2,
                                              terminus2, terminus1))
    ter_coor1, ter_coor2 = data_extraction.getTerminusCoor(
        num_line, terminus1, terminus2)
    source_gps_dir = 'hdfs://BigDataPOC:8020/datalab/exp_b02/data/gps_data'
    data_aller = []
    data_retour = []
    for dir_name in dirs:
        print(dir_name)
        path = os.path.join(source_gps_dir, str(num_line), str(dir_name))
        if hadoopy.isdir(path):
            cmd = 'hdfs dfs -du %s' % (path)
            p = subprocess.Popen(cmd.split(),
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
            frames_aller = []
            frames_retour = []
            for file_name in p.stdout.readlines():
                # Each 'hdfs dfs -du' output line is '<size> <path>';
                # field 1 is the file path.
                line_daily = extractInfo(file_name.split()[1])
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold, terminus1,
                    terminus2)
                frames_aller.append(line_daily_aller)
                frames_retour.append(line_daily_retour)
            data_aller.append(
                pd.concat(frames_aller).reset_index().drop('index', axis=1))
            data_retour.append(
                pd.concat(frames_retour).reset_index().drop('index', axis=1))
        else:
            if hadoopy.exists(dir_name):
                line_daily = extractInfo(dir_name)
                line_daily_aller, line_daily_retour = data_extraction.generateDailyData(
                    line_daily, (ter_coor1, ter_coor2), threshold, terminus1,
                    terminus2)
                data_aller.append(line_daily_aller)
                data_retour.append(line_daily_retour)
            else:
                print "there are paths in args which are not directories"
                sys.exit(1)
    data_aller = pd.concat(data_aller).reset_index().drop('index', axis=1)
    data_retour = pd.concat(data_retour).reset_index().drop('index', axis=1)
    cols = [
        'DATE', 'TIME', 'LINE', 'BUS_NUM', 'X_COORDINATE', 'Y_COORDINATE',
        'LONGITUDE', 'LATITUDE', 'SPEED'
    ]
    data_aller = data_aller[cols]
    data_retour = data_retour[cols]
    return data_aller, data_retour
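The inner loop above depends on the layout of `hdfs dfs -du` output: it takes field 1 of each line as a file path, which assumes the classic two-column `<size> <path>` format. As a standalone sketch under that assumption (newer Hadoop releases add a disk-consumed column, so the last field is the safer choice for the path):

import subprocess


def list_hdfs_paths(path):
    # Run 'hdfs dfs -du' and keep only the path column of each line.
    # Paths are assumed to contain no whitespace; taking the last
    # field handles both two- and three-column du formats.
    p = subprocess.Popen(['hdfs', 'dfs', '-du', path],
                         stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    out = p.communicate()[0].decode('utf-8', 'replace')
    return [line.split()[-1] for line in out.splitlines() if line.strip()]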
Code example #8
File: sampler.py Project: yeyue910107/MR-Packer
def doSample(jarfile, inputs, output, k):
    for item in inputs:
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item 
        #tmp_dir = tmp_path + name + "/"
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        real_input = data_dir + name + "/"
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    '''if not hadoopy.exists(output):
        hadoopy.mkdir(output)
    if hadoopy.isdir(output):
        output = output[:-1]
    if output[-1] == '/':
        output = output[:-1]
    name = output.split('/')[-1]
    tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    #    hdfs.mkdir(tmp_output)
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    '''for item in tmp_inputs:
        os.system("hadoop fs -rmr " + item)
    os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para