Example #1
 def getmerge(self, path, local_destination, new_line=False):
     if new_line:
         cmd = load_hadoop_cmd() + [
             'fs', '-getmerge', '-nl', path, local_destination
         ]
     else:
         cmd = load_hadoop_cmd() + [
             'fs', '-getmerge', path, local_destination
         ]
     self.call_check(cmd)
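Every example on this page funnels its command through a call_check (or self.call_check) helper that is not shown here. As a rough sketch of what such a helper typically does, assuming it simply runs the hadoop CLI command, captures its output and raises on a non-zero exit status (the real luigi helper raises an HDFSCliError rather than a plain RuntimeError):

 import subprocess

 def call_check(command):
     # Hypothetical stand-in for the call_check helper used throughout these
     # examples: run the hadoop CLI command, capture its output, and fail
     # loudly when the command exits with a non-zero status.
     p = subprocess.Popen(command, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE, universal_newlines=True)
     stdout, stderr = p.communicate()
     if p.returncode != 0:
         raise RuntimeError("%r failed with exit code %d: %s"
                            % (command, p.returncode, stderr))
     return stdout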
Example #2
    def remove(self, path, recursive=True, skip_trash=False):
        if recursive:
            cmd = load_hadoop_cmd() + ['fs', '-rmr']
        else:
            cmd = load_hadoop_cmd() + ['fs', '-rm']

        if skip_trash:
            cmd = cmd + ['-skipTrash']

        cmd = cmd + [path]
        call_check(cmd)
Example #3
 def chown(self, path, owner, group, recursive=False):
     if owner is None:
         owner = ''
     if group is None:
         group = ''
     ownership = "%s:%s" % (owner, group)
     if recursive:
         cmd = load_hadoop_cmd() + ['fs', '-chown', '-R', ownership, path]
     else:
         cmd = load_hadoop_cmd() + ['fs', '-chown', ownership, path]
     call_check(cmd)
Example #4
 def chown(self, path, owner, group, recursive=False):
     if owner is None:
         owner = ''
     if group is None:
         group = ''
     ownership = "%s:%s" % (owner, group)
     if recursive:
         cmd = load_hadoop_cmd() + ['fs', '-chown', '-R', ownership, path]
     else:
         cmd = load_hadoop_cmd() + ['fs', '-chown', ownership, path]
     self.call_check(cmd)
Example #5
    def remove(self, path, recursive=True, skip_trash=False):
        if recursive:
            cmd = load_hadoop_cmd() + ['fs', '-rmr']
        else:
            cmd = load_hadoop_cmd() + ['fs', '-rm']

        if skip_trash:
            cmd = cmd + ['-skipTrash']

        cmd = cmd + [path]
        self.call_check(cmd)
Example #6
    def listdir(self,
                path,
                ignore_directories=False,
                ignore_files=False,
                include_size=False,
                include_type=False,
                include_time=False,
                recursive=False):
        if not path:
            path = "."  # default to current/home catalog

        if recursive:
            cmd = load_hadoop_cmd() + ['fs'] + self.recursive_listdir_cmd + [path]
        else:
            cmd = load_hadoop_cmd() + ['fs', '-ls', path]
        lines = self.call_check(cmd).split('\n')

        for line in lines:
            if not line:
                continue
            elif line.startswith('OpenJDK 64-Bit Server VM warning') or line.startswith('It\'s highly recommended') or line.startswith('Found'):
                continue  # "hadoop fs -ls" outputs "Found %d items" as its first line
            elif ignore_directories and line[0] == 'd':
                continue
            elif ignore_files and line[0] == '-':
                continue
            data = line.split(' ')

            file = data[-1]
            size = int(data[-4])
            line_type = line[0]
            extra_data = ()

            if include_size:
                extra_data += (size, )
            if include_type:
                extra_data += (line_type, )
            if include_time:
                time_str = '%sT%s' % (data[-3], data[-2])
                modification_time = datetime.datetime.strptime(
                    time_str, '%Y-%m-%dT%H:%M')
                extra_data += (modification_time, )

            if len(extra_data) > 0:
                yield (file, ) + extra_data
            else:
                yield file
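The listdir() implementations above index fields from the end of each output line because splitting on single spaces leaves empty strings wherever the columns are padded. A small illustration with a made-up line (the exact column layout of "hadoop fs -ls" output is an assumption here):

 sample = "-rw-r--r--   3 alice supergroup       1024 2024-01-15 12:34 /data/part-00000"
 data = sample.split(' ')      # padding spaces become empty strings in the list
 print(data[-1])               # path:      /data/part-00000
 print(int(data[-4]))          # size:      1024
 print(data[-3], data[-2])     # date/time: 2024-01-15 12:34
 print(sample[0])              # 'd' for a directory, '-' for a plain file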
Example #7
 def __init__(self, path, data_extension=""):
     self.path = path
     self.tmppath = hdfs_config.tmppath(self.path)
     self.datapath = self.tmppath + ("/data%s" % data_extension)
     super(HdfsAtomicWriteDirPipe,
           self).__init__(load_hadoop_cmd() +
                          ['fs', '-put', '-', self.datapath])
Example #8
 def __init__(self, path):
     self.path = path
     self.tmppath = hdfs_config.tmppath(self.path)
     parent_dir = os.path.dirname(self.tmppath)
     mkdir(parent_dir, parents=True, raise_if_exists=False)
     super(HdfsAtomicWritePipe,
           self).__init__(load_hadoop_cmd() +
                          ['fs', '-put', '-', self.tmppath])
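Examples #7 and #8 never write to the final path directly: data is streamed into a temporary path derived from the target via hdfs_config.tmppath, and the file only becomes visible once that temporary path is later renamed into place. A minimal standalone sketch of the same write-then-rename idea on a local filesystem (the function and paths are illustrative only, not part of luigi):

 import os

 def atomic_write(path, data):
     # Write to a temporary sibling first, then rename it into place; the
     # rename is atomic, so readers never observe a half-written file.
     tmppath = path + '.tmp'
     with open(tmppath, 'w') as f:
         f.write(data)
     os.replace(tmppath, path)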
Example #9
 def rename(self, path, dest):
     parent_dir = os.path.dirname(dest)
     if parent_dir != '' and not self.exists(parent_dir):
         self.mkdir(parent_dir)
     if type(path) not in (list, tuple):
         path = [path]
     else:
         warnings.warn("Renaming multiple files at once is not atomic.")
     call_check(load_hadoop_cmd() + ['fs', '-mv'] + path + [dest])
Example #10
 def rename(self, path, dest):
     parent_dir = os.path.dirname(dest)
     if parent_dir != '' and not self.exists(parent_dir):
         self.mkdir(parent_dir)
     if type(path) not in (list, tuple):
         path = [path]
     else:
         warnings.warn("Renaming multiple files at once is not atomic.")
     call_check(load_hadoop_cmd() + ['fs', '-mv'] + path + [dest])
Example #11
 def move(self, path, dest):
     parent_dir = os.path.dirname(dest)
     if parent_dir != '' and not self.exists(parent_dir):
         self.mkdir(parent_dir)
     if not isinstance(path, (list, tuple)):
         path = [path]
     else:
         warnings.warn("Renaming multiple files at once is not atomic.", stacklevel=2)
     self.call_check(load_hadoop_cmd() + ['fs', '-mv'] + path + [dest])
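rename() and move() accept either a single path or a list of source paths; when a list is passed, everything is moved under dest with one "hadoop fs -mv" invocation, and the warning flags that this multi-source case is not atomic. A small illustration of how the command list is assembled (the ['hadoop'] prefix stands in for whatever load_hadoop_cmd() returns):

 path = ['/data/a', '/data/b']
 dest = '/archive'
 cmd = ['hadoop'] + ['fs', '-mv'] + path + [dest]
 print(cmd)   # ['hadoop', 'fs', '-mv', '/data/a', '/data/b', '/archive']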
Example #12
 def exists(self, path):
     cmd = load_hadoop_cmd() + ['fs', '-test', '-e', path]
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
     stdout, stderr = p.communicate()
     if p.returncode == 0:
         return True
     elif p.returncode == 1:
         return False
     else:
         raise hdfs_error.HDFSCliError(cmd, p.returncode, stdout, stderr)
Example #13
 def exists(self, path):
     cmd = load_hadoop_cmd() + ['fs', '-test', '-e', path]
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
     stdout, stderr = p.communicate()
     if p.returncode == 0:
         return True
     elif p.returncode == 1:
         return False
     else:
         raise HDFSCliError(cmd, p.returncode, stdout, stderr)
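Both exists() variants above rely on the exit-status contract of "hadoop fs -test -e": 0 means the path exists, 1 means it does not, and anything else is treated as a real error. A standalone sketch of the same check, assuming only that a hadoop binary is on the PATH:

 import subprocess

 def hdfs_path_exists(path):
     # 'hadoop fs -test -e' exits 0 if the path exists and 1 if it does not;
     # any other exit code is treated as a genuine failure.
     result = subprocess.run(['hadoop', 'fs', '-test', '-e', path],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     if result.returncode == 0:
         return True
     if result.returncode == 1:
         return False
     raise RuntimeError("hadoop fs -test failed with code %d" % result.returncode)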
Example #14
 def mkdir(self, path):
     """
     No -p switch, so this will fail creating ancestors.
     """
     try:
         call_check(load_hadoop_cmd() + ['fs', '-mkdir', path])
     except HDFSCliError as ex:
         if "File exists" in ex.stderr:
             raise FileAlreadyExists(ex.stderr)
         else:
             raise
Example #15
 def mkdir(self, path):
     """
     No -p switch, so this will fail creating ancestors.
     """
     try:
         self.call_check(load_hadoop_cmd() + ['fs', '-mkdir', path])
     except hdfs_error.HDFSCliError as ex:
         if "File exists" in ex.stderr:
             raise FileAlreadyExists(ex.stderr)
         else:
             raise
Example #16
 def mkdir(self, path, parents=True, raise_if_exists=False):
     """
     No explicit -p switch; this version of Hadoop always creates parent directories.
     """
     try:
         self.call_check(load_hadoop_cmd() + ['fs', '-mkdir', path])
     except hdfs_error.HDFSCliError as ex:
         if "File exists" in ex.stderr:
             raise FileAlreadyExists(ex.stderr)
         else:
             raise
Example #17
 def count(self, path):
     cmd = load_hadoop_cmd() + ['fs', '-count', path]
     stdout = self.call_check(cmd)
     lines = stdout.split('\n')
     for line in stdout.split('\n'):
         if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or not line:
             lines.pop(lines.index(line))
         else:
             (dir_count, file_count, content_size, ppath) = line.split()
     results = {'content_size': content_size, 'dir_count': dir_count, 'file_count': file_count}
     return results
Example #18
 def count(self, path):
     cmd = load_hadoop_cmd() + ['fs', '-count', path]
     stdout = call_check(cmd)
     lines = stdout.split('\n')
     for line in stdout.split('\n'):
         if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or not line:
             lines.pop(lines.index(line))
         else:
             (dir_count, file_count, content_size, ppath) = line.split()
     results = {'content_size': content_size, 'dir_count': dir_count, 'file_count': file_count}
     return results
Example #19
    def listdir(self, path, ignore_directories=False, ignore_files=False,
                include_size=False, include_type=False, include_time=False, recursive=False):
        if not path:
            path = "."  # default to current/home catalog

        if recursive:
            cmd = load_hadoop_cmd() + ['fs'] + self.recursive_listdir_cmd + [path]
        else:
            cmd = load_hadoop_cmd() + ['fs', '-ls', path]
        lines = call_check(cmd).split('\n')

        for line in lines:
            if not line:
                continue
            elif line.startswith('OpenJDK 64-Bit Server VM warning') or line.startswith('It\'s highly recommended') or line.startswith('Found'):
                continue  # "hadoop fs -ls" outputs "Found %d items" as its first line
            elif ignore_directories and line[0] == 'd':
                continue
            elif ignore_files and line[0] == '-':
                continue
            data = line.split(' ')

            file = data[-1]
            size = int(data[-4])
            line_type = line[0]
            extra_data = ()

            if include_size:
                extra_data += (size,)
            if include_type:
                extra_data += (line_type,)
            if include_time:
                time_str = '%sT%s' % (data[-3], data[-2])
                modification_time = datetime.datetime.strptime(time_str,
                                                               '%Y-%m-%dT%H:%M')
                extra_data += (modification_time,)

            if len(extra_data) > 0:
                yield (file,) + extra_data
            else:
                yield file
Example #20
 def mkdir(self, path, parents=True, raise_if_exists=False):
     if parents and raise_if_exists:
         raise NotImplementedError("HdfsClient.mkdir can't raise with -p")
     try:
         cmd = (load_hadoop_cmd() + ['fs', '-mkdir'] +
                (['-p'] if parents else []) + [path])
         self.call_check(cmd)
     except hdfs_error.HDFSCliError as ex:
         if "File exists" in ex.stderr:
             if raise_if_exists:
                 raise FileAlreadyExists(ex.stderr)
         else:
             raise
Example #21
 def mkdir(self, path, parents=True, raise_if_exists=False):
     if (parents and raise_if_exists):
         raise NotImplementedError("HdfsClient.mkdir can't raise with -p")
     try:
         cmd = (load_hadoop_cmd() + ['fs', '-mkdir'] +
                (['-p'] if parents else []) +
                [path])
         call_check(cmd)
     except HDFSCliError as ex:
         if "File exists" in ex.stderr:
             if raise_if_exists:
                 raise FileAlreadyExists(ex.stderr)
         else:
             raise
Example #22
 def count(self, path):
     cmd = load_hadoop_cmd() + ['fs', '-count', path]
     stdout = self.call_check(cmd)
     lines = stdout.split('\n')
     for line in stdout.split('\n'):
         if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or \
                 line.endswith('using builtin-java classes where applicable') or not line:
             lines.pop(
                 lines.index(line))  # ignoring native libraries warnings
         else:
             (dir_count, file_count, content_size, ppath) = line.split()
     results = {
         'content_size': content_size,
         'dir_count': dir_count,
         'file_count': file_count
     }
     return results
Example #23
    def count(self, path):
        cmd = load_hadoop_cmd() + ['fs', '-count', path]
        logger.debug('Running path count check: %s', subprocess.list2cmdline(cmd))
        stdout = self.call_check(cmd)
        lines = stdout.split('\n')
        results = {'content_size': 0, 'dir_count': 0, 'file_count': 0}
        for line in lines:
            if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or not line:
                continue
            else:
                (dir_count, file_count, content_size, ppath) = line.split()
                results['dir_count'] += int(dir_count)
                results['file_count'] += int(file_count)
                results['content_size'] += int(content_size)

        logger.debug('Path count check on %s: %s', path, results)
        return results
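All of the count() variants unpack four whitespace-separated columns per output line; "hadoop fs -count" prints DIR_COUNT, FILE_COUNT, CONTENT_SIZE and PATHNAME in that order, which is what the unpacking above assumes. A tiny illustration with a made-up line:

 sample = "           4           42        1048576 /user/alice/data"
 dir_count, file_count, content_size, ppath = sample.split()
 print({'dir_count': int(dir_count),
        'file_count': int(file_count),
        'content_size': int(content_size),
        'path': ppath})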
Example #24
    def exists(self, path):
        """
        Use ``hadoop fs -stat`` to check file existence.
        """

        cmd = load_hadoop_cmd() + ['fs', '-stat', path]
        logger.debug('Running file existence check: %s', subprocess.list2cmdline(cmd))
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, universal_newlines=True)
        stdout, stderr = p.communicate()
        if p.returncode == 0:
            return True
        else:
            not_found_pattern = "^.*No such file or directory$"
            not_found_re = re.compile(not_found_pattern)
            for line in stderr.split('\n'):
                if not_found_re.match(line):
                    return False
            raise hdfs_error.HDFSCliError(cmd, p.returncode, stdout, stderr)
Example #25
    def exists(self, path):
        """
        Use ``hadoop fs -stat`` to check file existence.
        """

        cmd = load_hadoop_cmd() + ['fs', '-stat', path]
        logger.debug('Running file existence check: %s', u' '.join(cmd))
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, universal_newlines=True)
        stdout, stderr = p.communicate()
        if p.returncode == 0:
            return True
        else:
            not_found_pattern = "^.*No such file or directory$"
            not_found_re = re.compile(not_found_pattern)
            for line in stderr.split('\n'):
                if not_found_re.match(line):
                    return False
            raise HDFSCliError(cmd, p.returncode, stdout, stderr)
Example #26
 def __init__(self, path):
     super(HdfsReadPipe,
           self).__init__(load_hadoop_cmd() + ['fs', '-cat', path])
Example #27
File: format.py Project: 01-/luigi
 def __init__(self, path, data_extension=""):
     self.path = path
     self.tmppath = hdfs_config.tmppath(self.path)
     self.datapath = self.tmppath + ("/data%s" % data_extension)
     super(HdfsAtomicWriteDirPipe, self).__init__(load_hadoop_cmd() + ['fs', '-put', '-', self.datapath])
Example #28
File: format.py Project: 01-/luigi
 def __init__(self, path):
     self.path = path
     self.tmppath = hdfs_config.tmppath(self.path)
     parent_dir = os.path.dirname(self.tmppath)
     mkdir(parent_dir, parents=True, raise_if_exists=False)
     super(HdfsAtomicWritePipe, self).__init__(load_hadoop_cmd() + ['fs', '-put', '-', self.tmppath])
Example #29
File: format.py Project: 01-/luigi
 def __init__(self, path):
     super(HdfsReadPipe, self).__init__(load_hadoop_cmd() + ['fs', '-cat', path])
Example #30
 def chmod(self, path, permissions, recursive=False):
     if recursive:
         cmd = load_hadoop_cmd() + ['fs', '-chmod', '-R', permissions, path]
     else:
         cmd = load_hadoop_cmd() + ['fs', '-chmod', permissions, path]
     self.call_check(cmd)
Example #31
 def put(self, local_path, destination):
     call_check(load_hadoop_cmd() + ['fs', '-put', local_path, destination])
Example #32
 def put(self, local_path, destination):
     self.call_check(load_hadoop_cmd() +
                     ['fs', '-put', local_path, destination])
Example #33
 def touchz(self, path):
     call_check(load_hadoop_cmd() + ['fs', '-touchz', path])
Example #34
 def get(self, path, local_destination):
     self.call_check(load_hadoop_cmd() +
                     ['fs', '-get', path, local_destination])
Example #35
 def touchz(self, path):
     self.call_check(load_hadoop_cmd() + ['fs', '-touchz', path])
Example #36
 def getmerge(self, path, local_destination, new_line=False):
     if new_line:
         cmd = load_hadoop_cmd() + ['fs', '-getmerge', '-nl', path, local_destination]
     else:
         cmd = load_hadoop_cmd() + ['fs', '-getmerge', path, local_destination]
     call_check(cmd)
Example #37
 def get(self, path, local_destination):
     call_check(load_hadoop_cmd() + ['fs', '-get', path, local_destination])
Example #38
 def copy(self, path, destination):
     self.call_check(load_hadoop_cmd() + ['fs', '-cp', path, destination])
Example #39
 def chmod(self, path, permissions, recursive=False):
     if recursive:
         cmd = load_hadoop_cmd() + ['fs', '-chmod', '-R', permissions, path]
     else:
         cmd = load_hadoop_cmd() + ['fs', '-chmod', permissions, path]
     call_check(cmd)
Example #40
 def copy(self, path, destination):
     call_check(load_hadoop_cmd() + ['fs', '-cp', path, destination])