def getmerge(self, path, local_destination, new_line=False):
    if new_line:
        cmd = load_hadoop_cmd() + ['fs', '-getmerge', '-nl', path, local_destination]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-getmerge', path, local_destination]
    self.call_check(cmd)
def remove(self, path, recursive=True, skip_trash=False):
    if recursive:
        cmd = load_hadoop_cmd() + ['fs', '-rmr']
    else:
        cmd = load_hadoop_cmd() + ['fs', '-rm']
    if skip_trash:
        cmd = cmd + ['-skipTrash']
    cmd = cmd + [path]
    call_check(cmd)
def chown(self, path, owner, group, recursive=False):
    if owner is None:
        owner = ''
    if group is None:
        group = ''
    ownership = "%s:%s" % (owner, group)
    if recursive:
        cmd = load_hadoop_cmd() + ['fs', '-chown', '-R', ownership, path]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-chown', ownership, path]
    call_check(cmd)
def chown(self, path, owner, group, recursive=False):
    if owner is None:
        owner = ''
    if group is None:
        group = ''
    ownership = "%s:%s" % (owner, group)
    if recursive:
        cmd = load_hadoop_cmd() + ['fs', '-chown', '-R', ownership, path]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-chown', ownership, path]
    self.call_check(cmd)
def remove(self, path, recursive=True, skip_trash=False):
    if recursive:
        cmd = load_hadoop_cmd() + ['fs', '-rmr']
    else:
        cmd = load_hadoop_cmd() + ['fs', '-rm']
    if skip_trash:
        cmd = cmd + ['-skipTrash']
    cmd = cmd + [path]
    self.call_check(cmd)
def listdir(self, path, ignore_directories=False, ignore_files=False,
            include_size=False, include_type=False, include_time=False,
            recursive=False):
    if not path:
        path = "."  # default to the current/home directory
    if recursive:
        cmd = load_hadoop_cmd() + ['fs'] + self.recursive_listdir_cmd + [path]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-ls', path]
    lines = self.call_check(cmd).split('\n')
    for line in lines:
        if not line:
            continue
        elif line.startswith('OpenJDK 64-Bit Server VM warning') or \
                line.startswith('It\'s highly recommended') or \
                line.startswith('Found'):
            continue  # "hadoop fs -ls" outputs "Found %d items" as its first line
        elif ignore_directories and line[0] == 'd':
            continue
        elif ignore_files and line[0] == '-':
            continue
        data = line.split(' ')
        file = data[-1]
        size = int(data[-4])
        line_type = line[0]
        extra_data = ()
        if include_size:
            extra_data += (size,)
        if include_type:
            extra_data += (line_type,)
        if include_time:
            time_str = '%sT%s' % (data[-3], data[-2])
            modification_time = datetime.datetime.strptime(time_str, '%Y-%m-%dT%H:%M')
            extra_data += (modification_time,)
        if len(extra_data) > 0:
            yield (file,) + extra_data
        else:
            yield file
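# Illustrative usage sketch, not part of the original code: `client` is assumed to be
# an instance of the CLI-backed HDFS client that defines listdir() above. With the
# include_size/include_type/include_time flags enabled, each yielded item is a tuple
# of (path, size, type, modification_time).
def _print_listing(client, path):
    for name, size, line_type, mtime in client.listdir(
            path, include_size=True, include_type=True, include_time=True):
        print("%s %10d %s %s" % (line_type, size, mtime.isoformat(), name))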
def __init__(self, path, data_extension=""):
    self.path = path
    self.tmppath = hdfs_config.tmppath(self.path)
    self.datapath = self.tmppath + ("/data%s" % data_extension)
    super(HdfsAtomicWriteDirPipe, self).__init__(
        load_hadoop_cmd() + ['fs', '-put', '-', self.datapath])
def __init__(self, path):
    self.path = path
    self.tmppath = hdfs_config.tmppath(self.path)
    parent_dir = os.path.dirname(self.tmppath)
    mkdir(parent_dir, parents=True, raise_if_exists=False)
    super(HdfsAtomicWritePipe, self).__init__(
        load_hadoop_cmd() + ['fs', '-put', '-', self.tmppath])
def rename(self, path, dest):
    parent_dir = os.path.dirname(dest)
    if parent_dir != '' and not self.exists(parent_dir):
        self.mkdir(parent_dir)
    if type(path) not in (list, tuple):
        path = [path]
    else:
        warnings.warn("Renaming multiple files at once is not atomic.")
    call_check(load_hadoop_cmd() + ['fs', '-mv'] + path + [dest])
def move(self, path, dest):
    parent_dir = os.path.dirname(dest)
    if parent_dir != '' and not self.exists(parent_dir):
        self.mkdir(parent_dir)
    if not isinstance(path, (list, tuple)):
        path = [path]
    else:
        warnings.warn("Renaming multiple files at once is not atomic.", stacklevel=2)
    self.call_check(load_hadoop_cmd() + ['fs', '-mv'] + path + [dest])
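# Illustrative usage sketch (assumption): `client` is a hypothetical instance of the
# client class defining move() above. move() accepts a single path or a list of paths
# and creates the destination's parent directory if it is missing; moving several
# paths at once is not atomic, as the warning above notes.
def _archive_outputs(client, paths, archive_dir):
    client.move(list(paths), archive_dir)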
def exists(self, path):
    cmd = load_hadoop_cmd() + ['fs', '-test', '-e', path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         close_fds=True)
    stdout, stderr = p.communicate()
    if p.returncode == 0:
        return True
    elif p.returncode == 1:
        return False
    else:
        raise hdfs_error.HDFSCliError(cmd, p.returncode, stdout, stderr)
def exists(self, path):
    cmd = load_hadoop_cmd() + ['fs', '-test', '-e', path]
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         close_fds=True)
    stdout, stderr = p.communicate()
    if p.returncode == 0:
        return True
    elif p.returncode == 1:
        return False
    else:
        raise HDFSCliError(cmd, p.returncode, stdout, stderr)
def mkdir(self, path):
    """
    No -p switch, so this will fail creating ancestors.
    """
    try:
        call_check(load_hadoop_cmd() + ['fs', '-mkdir', path])
    except HDFSCliError as ex:
        if "File exists" in ex.stderr:
            raise FileAlreadyExists(ex.stderr)
        else:
            raise
def mkdir(self, path):
    """
    No -p switch, so this will fail creating ancestors.
    """
    try:
        self.call_check(load_hadoop_cmd() + ['fs', '-mkdir', path])
    except hdfs_error.HDFSCliError as ex:
        if "File exists" in ex.stderr:
            raise FileAlreadyExists(ex.stderr)
        else:
            raise
def mkdir(self, path, parents=True, raise_if_exists=False):
    """
    No explicit -p switch; this version of Hadoop always creates parent directories.
    """
    try:
        self.call_check(load_hadoop_cmd() + ['fs', '-mkdir', path])
    except hdfs_error.HDFSCliError as ex:
        if "File exists" in ex.stderr:
            raise FileAlreadyExists(ex.stderr)
        else:
            raise
def count(self, path):
    cmd = load_hadoop_cmd() + ['fs', '-count', path]
    stdout = self.call_check(cmd)
    lines = stdout.split('\n')
    for line in stdout.split('\n'):
        if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or not line:
            lines.pop(lines.index(line))  # drop JVM warning lines
        else:
            # parse the count line itself rather than the whole stdout,
            # which may also contain the warning lines filtered above
            (dir_count, file_count, content_size, ppath) = line.split()
    results = {'content_size': content_size, 'dir_count': dir_count, 'file_count': file_count}
    return results
def count(self, path):
    cmd = load_hadoop_cmd() + ['fs', '-count', path]
    stdout = call_check(cmd)
    lines = stdout.split('\n')
    for line in stdout.split('\n'):
        if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or not line:
            lines.pop(lines.index(line))  # drop JVM warning lines
        else:
            # parse the count line itself rather than the whole stdout,
            # which may also contain the warning lines filtered above
            (dir_count, file_count, content_size, ppath) = line.split()
    results = {'content_size': content_size, 'dir_count': dir_count, 'file_count': file_count}
    return results
def listdir(self, path, ignore_directories=False, ignore_files=False,
            include_size=False, include_type=False, include_time=False,
            recursive=False):
    if not path:
        path = "."  # default to the current/home directory
    if recursive:
        cmd = load_hadoop_cmd() + ['fs'] + self.recursive_listdir_cmd + [path]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-ls', path]
    lines = call_check(cmd).split('\n')
    for line in lines:
        if not line:
            continue
        elif line.startswith('OpenJDK 64-Bit Server VM warning') or \
                line.startswith('It\'s highly recommended') or \
                line.startswith('Found'):
            continue  # "hadoop fs -ls" outputs "Found %d items" as its first line
        elif ignore_directories and line[0] == 'd':
            continue
        elif ignore_files and line[0] == '-':
            continue
        data = line.split(' ')
        file = data[-1]
        size = int(data[-4])
        line_type = line[0]
        extra_data = ()
        if include_size:
            extra_data += (size,)
        if include_type:
            extra_data += (line_type,)
        if include_time:
            time_str = '%sT%s' % (data[-3], data[-2])
            modification_time = datetime.datetime.strptime(time_str, '%Y-%m-%dT%H:%M')
            extra_data += (modification_time,)
        if len(extra_data) > 0:
            yield (file,) + extra_data
        else:
            yield file
def mkdir(self, path, parents=True, raise_if_exists=False):
    if parents and raise_if_exists:
        raise NotImplementedError("HdfsClient.mkdir can't raise with -p")
    try:
        cmd = (load_hadoop_cmd() + ['fs', '-mkdir'] +
               (['-p'] if parents else []) +
               [path])
        self.call_check(cmd)
    except hdfs_error.HDFSCliError as ex:
        if "File exists" in ex.stderr:
            if raise_if_exists:
                raise FileAlreadyExists(ex.stderr)
        else:
            raise
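# Illustrative usage sketch (assumption): `client` is a hypothetical instance of the
# client class defining the mkdir() above. With parents=True and raise_if_exists=False
# the call maps to `hadoop fs -mkdir -p`, so it is safe to invoke repeatedly for the
# same path.
def _ensure_dir(client, path):
    client.mkdir(path, parents=True, raise_if_exists=False)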
def mkdir(self, path, parents=True, raise_if_exists=False):
    if parents and raise_if_exists:
        raise NotImplementedError("HdfsClient.mkdir can't raise with -p")
    try:
        cmd = (load_hadoop_cmd() + ['fs', '-mkdir'] +
               (['-p'] if parents else []) +
               [path])
        call_check(cmd)
    except HDFSCliError as ex:
        if "File exists" in ex.stderr:
            if raise_if_exists:
                raise FileAlreadyExists(ex.stderr)
        else:
            raise
def count(self, path):
    cmd = load_hadoop_cmd() + ['fs', '-count', path]
    stdout = self.call_check(cmd)
    lines = stdout.split('\n')
    for line in stdout.split('\n'):
        if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or \
                line.endswith('using builtin-java classes where applicable') or not line:
            lines.pop(lines.index(line))  # ignoring native libraries warnings
        else:
            # parse the count line itself rather than the whole stdout,
            # which may also contain the warning lines filtered above
            (dir_count, file_count, content_size, ppath) = line.split()
    results = {
        'content_size': content_size,
        'dir_count': dir_count,
        'file_count': file_count,
    }
    return results
def count(self, path):
    cmd = load_hadoop_cmd() + ['fs', '-count', path]
    logger.debug('Running path count check: %s', subprocess.list2cmdline(cmd))
    stdout = self.call_check(cmd)
    lines = stdout.split('\n')
    results = {'content_size': 0, 'dir_count': 0, 'file_count': 0}
    for line in lines:
        if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or not line:
            continue
        else:
            (dir_count, file_count, content_size, ppath) = line.split()
            results['dir_count'] += int(dir_count)
            results['file_count'] += int(file_count)
            results['content_size'] += int(content_size)
    logger.debug('Path count check on %s: %s', path, results)
    return results
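# Illustrative usage sketch (assumption): `client` is a hypothetical instance of the
# client defining the accumulating count() directly above, which returns integer
# totals keyed by 'dir_count', 'file_count' and 'content_size'.
def _is_empty(client, path):
    counts = client.count(path)
    return counts['file_count'] == 0 and counts['content_size'] == 0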
def exists(self, path):
    """
    Use ``hadoop fs -stat`` to check file existence.
    """
    cmd = load_hadoop_cmd() + ['fs', '-stat', path]
    logger.debug('Running file existence check: %s', subprocess.list2cmdline(cmd))
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         close_fds=True, universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode == 0:
        return True
    else:
        not_found_pattern = "^.*No such file or directory$"
        not_found_re = re.compile(not_found_pattern)
        for line in stderr.split('\n'):
            if not_found_re.match(line):
                return False
        raise hdfs_error.HDFSCliError(cmd, p.returncode, stdout, stderr)
def exists(self, path):
    """
    Use ``hadoop fs -stat`` to check file existence.
    """
    cmd = load_hadoop_cmd() + ['fs', '-stat', path]
    logger.debug('Running file existence check: %s', u' '.join(cmd))
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         close_fds=True, universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode == 0:
        return True
    else:
        not_found_pattern = "^.*No such file or directory$"
        not_found_re = re.compile(not_found_pattern)
        for line in stderr.split('\n'):
            if not_found_re.match(line):
                return False
        raise HDFSCliError(cmd, p.returncode, stdout, stderr)
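# Illustrative usage sketch (assumption): `client` is a hypothetical client instance.
# The exists() above returns False only when stderr matches "No such file or
# directory" and re-raises on any other CLI failure, so a False result really means
# the path is absent rather than the command having failed.
def _require_path(client, path):
    if not client.exists(path):
        raise RuntimeError("Required HDFS path does not exist: %s" % path)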
def __init__(self, path):
    super(HdfsReadPipe, self).__init__(load_hadoop_cmd() + ['fs', '-cat', path])
def chmod(self, path, permissions, recursive=False):
    if recursive:
        cmd = load_hadoop_cmd() + ['fs', '-chmod', '-R', permissions, path]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-chmod', permissions, path]
    self.call_check(cmd)
def put(self, local_path, destination):
    call_check(load_hadoop_cmd() + ['fs', '-put', local_path, destination])
def put(self, local_path, destination):
    self.call_check(load_hadoop_cmd() + ['fs', '-put', local_path, destination])
def touchz(self, path):
    call_check(load_hadoop_cmd() + ['fs', '-touchz', path])
def get(self, path, local_destination):
    self.call_check(load_hadoop_cmd() + ['fs', '-get', path, local_destination])
def touchz(self, path):
    self.call_check(load_hadoop_cmd() + ['fs', '-touchz', path])
def getmerge(self, path, local_destination, new_line=False):
    if new_line:
        cmd = load_hadoop_cmd() + ['fs', '-getmerge', '-nl', path, local_destination]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-getmerge', path, local_destination]
    call_check(cmd)
def get(self, path, local_destination):
    call_check(load_hadoop_cmd() + ['fs', '-get', path, local_destination])
def copy(self, path, destination):
    self.call_check(load_hadoop_cmd() + ['fs', '-cp', path, destination])
def chmod(self, path, permissions, recursive=False):
    if recursive:
        cmd = load_hadoop_cmd() + ['fs', '-chmod', '-R', permissions, path]
    else:
        cmd = load_hadoop_cmd() + ['fs', '-chmod', permissions, path]
    call_check(cmd)
def copy(self, path, destination):
    call_check(load_hadoop_cmd() + ['fs', '-cp', path, destination])