def chown(self):
    new_user = '******'
    test_path = self.hdfs_paths[0]
    hdfs.dump(self.data, test_path, mode="wb")
    hdfs.chown(test_path, user=new_user)
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], new_user)
    prev_owner = path_info['owner']
    prev_grp = path_info['group']
    # owner and group should remain unchanged
    hdfs.chown(test_path, user='', group='')
    path_info = hdfs.lsl(test_path)[0]
    self.assertEqual(path_info['owner'], prev_owner)
    self.assertEqual(path_info['group'], prev_grp)
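The assertions above index into the stat dictionaries returned by hdfs.lsl, which carry fields such as 'owner' and 'group'. A minimal sketch of inspecting one such record, assuming Pydoop is installed and that /tmp/example.txt is a hypothetical existing HDFS file (the exact key set can vary between Pydoop versions, hence .get()):

import pydoop.hdfs as hdfs

# hdfs.lsl on a single file returns a one-element list of stat dicts,
# which is why the test above writes hdfs.lsl(test_path)[0].
info = hdfs.lsl('/tmp/example.txt')[0]
for key in ('name', 'kind', 'size', 'owner', 'group', 'last_mod'):
    print(key, '=', info.get(key))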
def initialize_break_points(cls, n_reducers, sampled_records, input_dir,
                            n_threads=2):
    file_infos = [
        i for i in hdfs.lsl(input_dir)
        if (i['kind'] == 'file'
            and os.path.basename(i['name']).startswith('part'))
    ]
    n_files = len(file_infos)
    total_size = sum(int(i['size']) for i in file_infos)
    n_records = total_size // RECORD_LENGTH
    assert n_records > sampled_records
    df = max(n_files // n_reducers, 1)
    paths = [i['name'] for i in it.islice(file_infos, 0, df * n_reducers, df)]
    break_points = cls.get_break_points(sampled_records // n_reducers,
                                        n_reducers, paths, n_threads)
    vals = list(zip(break_points, range(1, n_reducers)))
    selector = Selector(vals)
    bp_path = os.path.join(cls.TMP_DIR, cls.BREAK_POINTS_CACHE_FILE)
    with io.open(bp_path, "wb") as f:
        f.write(srl.private_encode(selector))
    return bp_path
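The stride arithmetic above picks one representative 'part' file per reducer. A self-contained illustration of just that step, with made-up file names:

import itertools as it

# With 10 part files and 4 reducers, df = max(10 // 4, 1) = 2, so
# islice(files, 0, df * 4, df) samples files 0, 2, 4 and 6 --
# one representative file per reducer.
files = ['part-%05d' % i for i in range(10)]
n_reducers = 4
df = max(len(files) // n_reducers, 1)
print(list(it.islice(files, 0, df * n_reducers, df)))
# ['part-00000', 'part-00002', 'part-00004', 'part-00006']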
def list_images(input_dir):
    ret = []
    p = re.compile(r".*\.jpe?g$", re.IGNORECASE)
    ls = [_['name'] for _ in hdfs.lsl(input_dir) if _['kind'] == 'directory']
    for d in ls:
        ret.extend([_ for _ in hdfs.ls(d) if p.match(_)])
    LOGGER.info("%d classes, %d total images", len(ls), len(ret))
    return ret
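The extension filter above accepts .jpg and .jpeg in any letter case. A quick standalone check of the pattern:

import re

p = re.compile(r".*\.jpe?g$", re.IGNORECASE)
for name in ('a.jpg', 'b.JPEG', 'c.png'):
    print(name, bool(p.match(name)))
# a.jpg True, b.JPEG True, c.png False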
def _get_samples_from_bcl_output(output_dir):
    def name_filter(n):
        bn = os.path.basename(n).lower()
        return not (bn.startswith('_')
                    or 'unknown' in bn
                    or 'undetermined' in bn)
    return [
        d['name'] for d in phdfs.lsl(output_dir)
        if d['kind'] == 'directory' and name_filter(d['name'])
    ]
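The name_filter above drops bookkeeping directories and undetermined-read output from the sample listing. A standalone check on made-up paths:

import os

def name_filter(n):
    bn = os.path.basename(n).lower()
    return not (bn.startswith('_') or 'unknown' in bn or 'undetermined' in bn)

print(name_filter('/out/SampleA'))          # True: a real sample directory
print(name_filter('/out/Undetermined_S0'))  # False: undetermined reads
print(name_filter('/out/_logs'))            # False: bookkeeping entry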
def list_images(input_dir):
    rval = []
    logging.info("scanning %s", input_dir)
    for entry in hdfs.lsl(input_dir):
        if all((entry["kind"] != "directory",
                not entry["name"].startswith("_"),
                entry["name"].endswith(".png"))):
            rval.append(entry["name"])
    logging.info("found %d images", len(rval))
    return rval
def __init__(self, file_prefix, loadexist=False, readonly=False):
    CustomStorage.__init__(self)
    if not loadexist:
        if hdfs.path.exists('{0}_0'.format(file_prefix)):
            file_prefix += '_0'
        while hdfs.path.exists('{0}_0'.format(file_prefix)):
            insert_index = file_prefix.rfind('_')
            file_prefix = '{0}_{1}'.format(
                file_prefix[:insert_index],
                int(file_prefix[insert_index + 1:]) + 1)
    self.file_prefix = file_prefix
    self.read_only = readonly
    self.clear()
    logger.info('init hdfs storage from hdfs file_prefix {0}'.format(
        self.file_prefix))
    try:
        total_start = timeit.default_timer()
        prefix_split = hdfs.path.splitpath(self.file_prefix)
        folder_path = prefix_split[0]
        real_prefix = prefix_split[1] + '_'
        if not hdfs.path.exists(folder_path):
            hdfs.mkdir(folder_path)
        files_info = hdfs.lsl(folder_path)
        # files_info = hdfs.lsl('{0}_*'.format(self.file_prefix))
        logger.debug('files_info:{0}'.format(files_info))
        sizecount = 0
        for file_info in files_info:
            start_time = timeit.default_timer()
            file_name = hdfs.path.splitpath(file_info['path'])[1]
            if file_name.startswith(real_prefix) and file_info['kind'] == 'file':
                logger.debug('file info: {0}'.format(file_info))
                page_id = file_name[len(real_prefix):]
                if not page_id.isdigit():
                    continue
                logger.debug('file {0} page id :{1}#'.format(
                    file_info['path'], page_id))
                logger.info('load {0}# page file {1}'.format(
                    page_id, file_info['path']))
                content = hdfs.load(file_info['path'], mode='r')
                # logger.debug('{0}# page content:{1}'.format(page_id, content))
                self.pagedict[int(page_id)] = content
                logger.debug('{0}# page load complete'.format(page_id))
                end_time = timeit.default_timer()
                eval(generate_timer_log_str.format(
                    'load {0} {1} byte'.format(
                        file_name, len(self.pagedict[int(page_id)])),
                    start_time, end_time))
                sizecount += len(self.pagedict[int(page_id)])
    except IOError:
        logger.debug(traceback.format_exc())
def upsert_a_folder(src_dir, hdfs_tgt_dir, filename, debug):
    src_fname = os.path.join(src_dir, filename)
    tgt_fname = os.path.join(hdfs_tgt_dir, filename)
    # get target file info
    tgt_dict = {}
    try:
        lsl = hdfs.lsl(hdfs_tgt_dir)
        for i in lsl:
            try:
                tgt_dict[os.path.basename(i["name"])] = i["last_mod"]
            except Exception:
                pass
    except Exception:
        pass
    print("hdfs tgt_dict=", tgt_dict)
    # get source info
    src_fs = glob.glob(src_fname)
    print("src_fs=", src_fs)
    for sf in src_fs:
        # get source file info
        try:
            src_ctime_int = int(os.path.getctime(sf))
        except Exception:
            src_ctime_int = None
        print("src_ctime_int=", src_ctime_int)
        src_bfname = os.path.basename(sf)
        tgt_fname = os.path.join(hdfs_tgt_dir, src_bfname)
        # put, or rm then put
        try:
            if src_bfname not in tgt_dict:
                # insert new one
                if debug == 'N':
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print("DEBUG: put", src_bfname, "to", hdfs_tgt_dir)
            elif src_ctime_int > tgt_dict[src_bfname]:
                if debug == 'N':
                    hdfs.rmr(tgt_fname)
                    hdfs.put(sf, hdfs_tgt_dir)
                else:
                    print("DEBUG: replace", tgt_fname, "by", sf)
            else:
                print(tgt_fname, "has a newer mdate than", sf, ":", src_ctime_int)
        except Exception as e:
            print("Error:", e)
def lsl(hdfs_path, recursive=False, project=None):
    """
    Returns all the pathnames in the supplied directory.

    Args:
        :hdfs_path: You can specify either a full hdfs pathname or a relative
            one (relative to project_name in HDFS).
        :recursive: If hdfs_path is a directory and recursive is True, the
            list contains one item for every file or directory in the tree
            rooted at hdfs_path.
        :project: If the supplied hdfs_path is a relative path, it will look
            for that file in this project's subdir in HDFS.

    Returns:
        A possibly-empty list of path names stored in the supplied path.
    """
    if project is None:
        project = project_name()
    hdfs_path = _expand_path(hdfs_path, project)
    return hdfs.lsl(hdfs_path, recursive=recursive)
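A minimal usage sketch for the wrapper above, assuming it runs inside a project whose HDFS subdir contains a hypothetical dataset directory named 'Resources'; the entries carry the same stat fields as plain hdfs.lsl output:

# Recursively list every file under the project-relative 'Resources' dir.
for entry in lsl('Resources', recursive=True):
    if entry['kind'] == 'file':
        print(entry['name'], entry['size'])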
def clean_directory(dir, spam_life=spam_ttl):
    # Accepts a directory name and deletes anything older than TTL in days
    file_list = []
    # check the existence of the directory
    if hdfs.path.exists(dir):
        # get a list of all files there
        file_list = hdfs.lsl(dir)
        # loop through the file list
        for listing in file_list:
            # compare the file's last access time to the spam lifetime
            # (86400 seconds in a day)
            if time.time() - listing['last_access'] > 86400 * spam_life:
                # if it's too old, delete it and log the deletion
                logger.info('Deleting ' + listing['name'])
                hdfs.rmr(listing['name'])
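The age test above compares seconds since last access against the TTL expressed in days. A worked example of just that arithmetic, with a hypothetical 7-day TTL:

import time

spam_life = 7  # hypothetical TTL of 7 days
last_access = time.time() - 8 * 86400  # a file last touched 8 days ago
# 8 days > 7-day TTL, so this file would be deleted.
print(time.time() - last_access > 86400 * spam_life)  # True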
def walk_remotely(remote_path):
    LOGGER.debug("Walking {0}".format(remote_path))
    inodes = hdfs.lsl(remote_path, recursive=True)
    return inodes
def assign_labels(top_dir):
    classes = [
        hdfs.path.basename(_["name"])
        for _ in hdfs.lsl(top_dir)
        if _["kind"] == "directory"
    ]
    return {c: i for i, c in enumerate(sorted(classes))}
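The function above sorts class names before numbering them, so the label mapping is stable across runs regardless of listing order. The mapping step on its own, with made-up class names:

classes = ['dog', 'cat', 'fish']
labels = {c: i for i, c in enumerate(sorted(classes))}
print(labels)  # {'cat': 0, 'dog': 1, 'fish': 2}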