def _run_wc(self, orig_fn, launcher=hadoopy.launch_frozen):
    """Run the word-count job on orig_fn and verify HDFS state and results.

    :param orig_fn: Local path of the input text file
    :param launcher: Hadoopy launcher (hadoopy.launch_frozen or hadoopy.launch_local)
    :raises ValueError: If launcher is not recognized
    """
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEqual(len(hadoopy.ls(in_path)), 1)  # assertEquals is a deprecated alias
    self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    launcher(in_path, out_path, 'wc.py',
             jobconfs=['mapred.min.split.size=100000000',
                       'mapreduce.task.userlog.limit.kb=1000'])
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def _run_hdfs(self, orig_fn):
    """Put orig_fn on HDFS, then verify readtb and ls both see it.

    :param orig_fn: Local path of the input text file
    """
    fn = "%f-%s" % (time.time(), orig_fn)
    file_path = "%s/%s" % (self.data_path, fn)
    hadoopy.put(orig_fn, file_path)
    # list(...) instead of an identity comprehension
    cat_output = list(hadoopy.readtb(file_path))
    line = (331, "Title: Alice's Adventures in Wonderland")
    self.assertTrue(line in cat_output)
    # The uploaded file should appear in the directory listing...
    ls_output = hadoopy.ls(self.data_path)
    self.assertTrue([x for x in ls_output if x.rsplit("/", 1)[-1] == fn])
    # ...and listing the file directly should return it
    ls_output = hadoopy.ls(file_path)
    self.assertTrue(ls_output[0].rsplit("/", 1)[-1] == fn)
def _run_hdfs(self, orig_fn):
    """Put orig_fn on HDFS, then verify readtb and ls both see it.

    :param orig_fn: Local path of the input text file
    """
    fn = '%f-%s' % (time.time(), orig_fn)
    file_path = '%s/%s' % (self.data_path, fn)
    hadoopy.put(orig_fn, file_path)
    # list(...) instead of an identity comprehension
    cat_output = list(hadoopy.readtb(file_path))
    line = (331, 'Title: Alice\'s Adventures in Wonderland')
    self.assertTrue(line in cat_output)
    # The uploaded file should appear in the directory listing...
    ls_output = hadoopy.ls(self.data_path)
    self.assertTrue([x for x in ls_output if x.rsplit('/', 1)[-1] == fn])
    # ...and listing the file directly should return it
    ls_output = hadoopy.ls(file_path)
    self.assertTrue(ls_output[0].rsplit('/', 1)[-1] == fn)
def generate_digests(pinput, digest_command):
    """Generate SHA-256 digests for every file under the input path.

    :param pinput: (string) the input path
    :param digest_command: (string) either "-cat" or "-text", selecting which
        hadoop fs subcommand streams the file contents
    :return: a dict with pairs <file, digest>
    :raises ValueError: if digest_command is not "-cat" or "-text"
    """
    # Fail fast on an unknown command; the original if/elif fell through and
    # hit a NameError (or silently reused stdout from the previous file).
    if digest_command not in ("-cat", "-text"):
        raise ValueError('digest_command must be "-cat" or "-text"')
    digests = {}
    try:
        files = hadoopy.ls(pinput)
    except IOError:
        # Listing failed (e.g., path not available): report nothing digested
        return digests
    hadoop_bin = get_hadoop_path() + "/bin/hadoop"
    for afile in files:
        # Stream the file contents through the chosen hadoop fs subcommand
        stdout, _ = subprocess.Popen([hadoop_bin, "fs", digest_command, afile],
                                     stdout=subprocess.PIPE).communicate()
        m = hashlib.sha256()
        m.update(stdout)
        digests[afile] = m.hexdigest()
    return digests
def run_video_frame_classification(train_dir):
    """Iteratively train a frame classifier and collect labels interactively.

    Each pass trains on the current labels under train_dir, runs the
    classifier over a random sample of 96 videos on HDFS, dumps the resulting
    frames locally, and launches the image_selector tool so the user can sort
    them into the pos/neg directories for the next pass.

    :param train_dir: Local directory with '0' (negative) and '1' (positive)
        subdirectories of labeled frames
    """
    try:
        neg_dir = train_dir + '/0'
        pos_dir = train_dir + '/1'
        while True:
            # Train using initial pos/neg
            c = vidfeat.SyntheticFrameFeature().train(vidfeat.load_label_frames(train_dir))
            # Predict on dataset
            hdfs_input = random.sample(hadoopy.ls('/user/brandyn/aladdin/mp4_devt/'), 96)
            start_time = '%f' % time.time()
            hdfs_output = '/user/brandyn/aladdin_results/video_grep/%s' % start_time
            picarus.vision.run_video_grep_frames(hdfs_input, hdfs_output, c)
            unsorted_dir = tempfile.mkdtemp()
            try:
                for _, y in hadoopy.readtb(hdfs_output):
                    # 'wb' + context manager: JPEG bytes are binary, and the
                    # original leaked the file handle (open(...).write(y))
                    with open('%s/%s.jpg' % (unsorted_dir, hashlib.sha1(y).hexdigest()), 'wb') as fp:
                        fp.write(y)
                # Present results to user and add to list
                try:
                    cmd = 'python -m interactive_learning.image_selector %s %s %s --port 8083' % (unsorted_dir, pos_dir, neg_dir)
                    print(cmd)
                    subprocess.call(cmd.split())
                except OSError:
                    pass
            finally:
                shutil.rmtree(unsorted_dir)
    finally:
        #shutil.rmtree(temp_root)
        pass
def ls(pinput):
    """List HDFS files under pinput, returning [] when the listing fails."""
    try:
        return hadoopy.ls(pinput)
    except IOError:
        # Path missing or unavailable: report an empty listing
        return []
def _run_wc(self, orig_fn, script_name="wc.py", launcher=hadoopy.launch_frozen, **kw):
    """Run the word-count job with the given launcher and verify the results.

    :param orig_fn: Local path of the input text file
    :param script_name: Hadoopy job script to launch
    :param launcher: Launcher callable (launch_frozen/launch_local) or the
        string "launch_frozen_cmd" to launch via the command line
    :param kw: Extra keyword arguments forwarded to the launcher
    :raises ValueError: If launcher is not recognized
    """
    fn = "out-%f-%s" % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + ".out"
    print(os.path.abspath("."))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEqual(len(hadoopy.ls(in_path)), 1)  # assertEquals is a deprecated alias
    self.assertEqual(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        launcher(
            in_path,
            out_path,
            script_name,
            jobconfs=["mapred.min.split.size=100000000",
                      "mapreduce.task.userlog.limit.kb=1000"],
            **kw
        )
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == "launch_frozen_cmd":
        cmd = (
            'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"'
            % (script_name, in_path, out_path)
        )
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError("Launcher not recognized")
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc["the"], 1664)
    self.assertEqual(wc["Alice"], 221)
def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
    """Run the word-count job twice (list- and dict-style jobconfs) and verify.

    :param orig_fn: Local path of the input text file
    :param script_name: Hadoopy job script to launch
    :param launcher: Launcher callable (launch_frozen/launch_local) or the
        string 'launch_frozen_cmd' to launch via the command line
    :param kw: Extra keyword arguments forwarded to the launcher
    :raises ValueError: If launcher is not recognized
    """
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEqual(len(hadoopy.ls(in_path)), 1)  # assertEquals is a deprecated alias
    #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        # Exercise both jobconf forms: list of 'k=v' strings and a dict
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=['mapred.min.split.size=100000000',
                           'mapreduce.task.userlog.limit.kb=1000'], **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs={'mapred.min.split.size': '100000000',
                           'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (
            script_name, in_path, out_path)
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def get_total_size(path, _format="%b"):
    """Return the total size of an HDFS path.

    :param path: HDFS file or directory path
    :param _format: stat format string passed to hadoopy.stat (default "%b")
    :return: (int) size of the file, or the sum over a directory's entries
    """
    if hadoopy.isdir(path):
        # Sum the stat value of every entry the listing returns
        return sum(int(hadoopy.stat(entry, _format)) for entry in hadoopy.ls(path))
    # Cast for a consistent int return type; the original returned the raw
    # stat value (not cast) in the single-file case, and shadowed the
    # builtin 'file' in its loop.
    return int(hadoopy.stat(path, _format))
def list_directory(self, path=None):
    """Yield entries under *path*, relative to the local root.

    :param path: Optional subdirectory to list
    :raises exceptions.FileNotFoundError: When nothing could be listed
    """
    prefix = '%s/' % path if path else ''
    local_path, hdfs_path = self._init_path(path)
    found_any = False
    try:
        for entry in hadoopy.ls(hdfs_path):
            found_any = True
            yield os.path.relpath(entry, self._local_path)
    except Exception:
        # Listing failure is treated the same as an empty listing below
        pass
    if not found_any:
        raise exceptions.FileNotFoundError('%s is not there' % path)
def abspath(path):
    """Return the absolute path to a file and canonicalize it

    Path is returned without a trailing slash and without redundant slashes.
    Caches the user's home directory.

    :param path: A string for the path.  This should not have any wildcards.
    :returns: Absolute path to the file
    :raises IOError: If unsuccessful
    """
    global _USER_HOME_DIR
    # FIXME(brandyn): User's home directory must exist
    if path[0] == '/':
        return os.path.abspath(path)
    if _USER_HOME_DIR is None:
        try:
            _USER_HOME_DIR = hadoopy.ls('.')[0].rsplit('/', 1)[0]
        except IOError as e:  # Py3-compatible except syntax (was 'except IOError, e')
            if not exists('.'):
                raise IOError("Home directory doesn't exist")
            raise e
    # NOTE(review): the original fell off the end and returned None for
    # relative paths; anchor them at the cached home directory instead.
    return os.path.abspath('%s/%s' % (_USER_HOME_DIR, path))
def hdfs_to_vidjs():
    """Export the latest video_keyframes run to local frame/JS/score files.

    Reads the most recent run under output/video_keyframes and writes:
    frames as vid_t/<hash>.jpg, video metadata as videojs/<hash>.js (JSON),
    and score plots as scores/<hash>.png.
    """
    runs = list(hadoopy.ls('output/video_keyframes'))
    run = runs[-1]
    print(run)  # print() is valid in Py2 and Py3 (was a Py2 print statement)
    for (kind, hash), v in hadoopy.readtb(run):
        if kind == 'frame':
            # Wrap the raw bytes in a file-like object for PIL
            s = StringIO.StringIO()
            s.write(v)
            s.seek(0)
            frame = Image.open(s)
            frame.save('vid_t/%s.jpg' % hash)
        if kind == 'video':
            with open('videojs/%s.js' % hash, 'w') as f:
                json.dump(v, f)
        if kind == 'scores':
            try:
                pylab.figure(1)
                pylab.clf()
                pylab.plot(v)
                pylab.savefig('scores/%s.png' % hash)
            except Exception as e:  # Py3-compatible except syntax (was 'except Exception, e')
                print(e)
def _run_wc(self, orig_fn, script_name='wc.py', launcher=hadoopy.launch_frozen, **kw):
    """Run the word-count job twice (list- and dict-style jobconfs) and verify.

    :param orig_fn: Local path of the input text file
    :param script_name: Hadoopy job script to launch
    :param launcher: Launcher callable (launch_frozen/launch_local) or the
        string 'launch_frozen_cmd' to launch via the command line
    :param kw: Extra keyword arguments forwarded to the launcher
    :raises ValueError: If launcher is not recognized
    """
    fn = 'out-%f-%s' % (time.time(), orig_fn)
    in_path = self.data_path + fn
    out_path = self.data_path + fn + '.out'
    print(os.path.abspath('.'))
    if not hadoopy.exists(in_path):
        hadoopy.put(orig_fn, in_path)
    # We also do a few hdfs checks here
    self.assertEqual(len(hadoopy.ls(in_path)), 1)  # assertEquals is a deprecated alias
    #self.assertEquals(hadoopy.ls(in_path), [hadoopy.abspath(in_path)])  # This is no longer true in CDH4
    self.assertTrue(hadoopy.exists(in_path))
    self.assertFalse(hadoopy.exists(out_path))
    self.assertFalse(hadoopy.isdir(in_path))
    self.assertFalse(hadoopy.isempty(in_path))
    # Don't let the file split, CDH3 has a bug and will try to split gz's
    if not isinstance(launcher, str):
        # Exercise both jobconf forms: list of 'k=v' strings and a dict
        launcher(in_path, out_path + '_list_jobconfs', script_name,
                 jobconfs=['mapred.min.split.size=100000000',
                           'mapreduce.task.userlog.limit.kb=1000'], **kw)
        launcher(in_path, out_path, script_name,
                 jobconfs={'mapred.min.split.size': '100000000',
                           'mapreduce.task.userlog.limit.kb': '1000'}, **kw)
    if launcher == hadoopy.launch_frozen:
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    elif launcher == hadoopy.launch_local:
        self.assertFalse(hadoopy.isdir(out_path))
        self.assertFalse(hadoopy.isempty(out_path))
    elif launcher == 'launch_frozen_cmd':
        cmd = 'python %s launch_frozen %s %s -jobconf "mapred.min.split.size=100000000" -jobconf "mapreduce.task.userlog.limit.kb=1000"' % (script_name, in_path, out_path)
        print(cmd)
        subprocess.call(cmd.split())
        self.assertTrue(hadoopy.isdir(out_path))
        self.assertTrue(hadoopy.isempty(out_path))  # Dirs are always empty
    else:
        raise ValueError('Launcher not recognized')
    wc = dict(hadoopy.readtb(out_path))
    self.assertEqual(wc['the'], 1664)
    self.assertEqual(wc['Alice'], 221)
def readtb(paths, ignore_logs=True):
    """Read typedbytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_' as they
    are log files.  This allows you to cat a directory that may be a variety
    of outputs from hadoop (e.g., _SUCCESS, _logs).  This works on directories
    and files.

    Args:
        paths: HDFS path (str) or paths (iterator)
        ignore_logs: If True, ignore all files whose name starts with an
            underscore.  Defaults to True.

    Returns:
        An iterator of key, value pairs.

    Raises:
        IOError: An error occurred listing the directory (e.g., not available).
    """
    hstreaming = hadoopy._runner._find_hstreaming()
    # Accept a single path string as well as any iterable of paths
    if isinstance(paths, str):
        paths = [paths]
    for root_path in paths:
        all_paths = hadoopy.ls(root_path)
        if ignore_logs:
            # Ignore any files that start with an underscore
            keep_file = lambda x: os.path.basename(x)[0] != "_"
            all_paths = filter(keep_file, all_paths)
        for cur_path in all_paths:
            # Stream the sequence file through 'hadoop jar ... dumptb' and
            # parse the typedbytes records off the read end of a pipe
            cmd = "hadoop jar %s dumptb %s" % (hstreaming, cur_path)
            read_fd, write_fd = os.pipe()
            write_fp = os.fdopen(write_fd, "w")
            p = hadoopy._hdfs._hadoop_fs_command(cmd, stdout=write_fp)
            # Close our copy of the write end so EOF propagates when the
            # subprocess finishes writing
            write_fp.close()
            with hadoopy.TypedBytesFile(read_fd=read_fd) as tb_fp:
                for kv in tb_fp:
                    yield kv
            p.wait()
def readtb(paths, ignore_logs=True):
    """Read typedbytes sequence files on HDFS (with optional compression).

    By default, ignores files whose names start with an underscore '_' as they
    are log files.  This allows you to cat a directory that may be a variety
    of outputs from hadoop (e.g., _SUCCESS, _logs).  This works on directories
    and files.

    Args:
        paths: HDFS path (str) or paths (iterator)
        ignore_logs: If True, ignore all files whose name starts with an
            underscore.  Defaults to True.

    Returns:
        An iterator of key, value pairs.

    Raises:
        IOError: An error occurred listing the directory (e.g., not available).
    """
    hstreaming = hadoopy._runner._find_hstreaming()
    # Accept a single path string as well as any iterable of paths
    if isinstance(paths, str):
        paths = [paths]
    for root_path in paths:
        all_paths = hadoopy.ls(root_path)
        if ignore_logs:
            # Ignore any files that start with an underscore
            keep_file = lambda x: os.path.basename(x)[0] != '_'
            all_paths = filter(keep_file, all_paths)
        for cur_path in all_paths:
            # Stream the sequence file through 'hadoop jar ... dumptb' and
            # parse the typedbytes records off the read end of a pipe
            cmd = 'hadoop jar %s dumptb %s' % (hstreaming, cur_path)
            read_fd, write_fd = os.pipe()
            write_fp = os.fdopen(write_fd, 'w')
            p = hadoopy._hdfs._hadoop_fs_command(cmd, stdout=write_fp)
            # Close our copy of the write end so EOF propagates when the
            # subprocess finishes writing
            write_fp.close()
            with hadoopy.TypedBytesFile(read_fd=read_fd) as tb_fp:
                for kv in tb_fp:
                    yield kv
            p.wait()
def doSample(jarfile, inputs, output, k):
    """Sample source data into each input path on HDFS, then run the jar.

    For each input path, the matching source directory under ``data_dir`` is
    sampled file-by-file into that path (pool sampling when k > 0, otherwise
    common sampling), after which the jar is executed over all inputs.

    :param jarfile: Path of the jar to execute
    :param inputs: List of HDFS input paths (sampling destinations)
    :param output: HDFS output path passed to the jar
    :param k: Pool-sample size; when <= 0, ratio-based common sampling is used
    :return: Job parameters from ``job.getJobPara()``

    NOTE(review): ``data_dir``, ``ratio``, ``poolSample``, ``commonSample``,
    ``codegen`` and ``job`` are names not defined in this function — they are
    presumably module-level; confirm they exist before calling.
    """
    for item in inputs:
        # Directory name of the input (trailing slash stripped if present)
        if item[-1] == "/":
            name = (item[:-1]).split('/')[-1]
        else:
            name = item.split('/')[-1]
        print "item", item
        #tmp_dir = tmp_path + name + "/"
        # Skip inputs that already exist (sampled on a previous run)
        if hadoopy.exists(item):
            continue
        hadoopy.mkdir(item)
        #tmp_inputs.append(tmp_dir)
        real_input = data_dir + name + "/"
        for f in hadoopy.ls(real_input):
            if not hadoopy.isdir(f):
                #ff = tmp_dir + f.split('/')[-1]
                if k > 0:
                    poolSample(f, item, k)
                else:
                    commonSample(f, item, ratio)
    # NOTE(review): the triple-quoted blocks below are dead code (string
    # expression statements), kept as-is.
    '''if not hadoopy.exists(output):
        hadoopy.mkdir(output)
    if hadoopy.isdir(output):
        output = output[:-1]
    if output[-1] == '/':
        output = output[:-1]
    name = output.split('/')[-1]
    tmp_output = tmp_path + name + "/"'''
    #if not hpath.exists(tmp_output):
    #    hdfs.mkdir(tmp_output)
    codegen.executeJar(jarfile, inputs, output)
    #jobid = job.getJobIDFromLog(tmp_log_dir)
    job_para = job.getJobPara()
    '''for item in tmp_inputs:
        os.system("hadoop fs -rmr " + item)
        os.system("hadoop fs -rmr " + tmp_output)'''
    return job_para
def _get_home_dir_old():
    """Derive the home directory by listing '.' and trimming the first
    entry's basename."""
    # NOTE(brandyn): Not compatible with CDH4's ls
    first_entry = hadoopy.ls('.')[0]
    home_dir = first_entry.rsplit('/', 1)[0]
    return home_dir
def test_ls(self):
    """The first entry listed under data_path ends with our file path."""
    listing = hadoopy.ls(self.data_path)
    first_entry = listing[0]
    self.assertTrue(first_entry.endswith(self.file_path))