def hadoop_fs_ls(stdout, stderr, environ, *args):
    """Implements hadoop fs -ls."""
    hdfs_path_globs = args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []
        max_size = 0

        if not real_paths:
            print(('ls: Cannot access %s: No such file or directory.' %
                   hdfs_path_glob), file=stderr)
            failed = True
        else:
            for real_path in real_paths:
                paths.append((real_path, scheme, netloc, 0))

        for path in paths:
            print(_hadoop_ls_line(*path + (max_size, environ)), file=stdout)

    if failed:
        return -1
    else:
        return 0
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    try:
        stdout = self.invoke_hadoop(
            ['fs', '-lsr', path_glob],
            return_stdout=True,
            ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    path_index = None
    for line in StringIO(stdout):
        fields = line.rstrip('\r\n').split()

        # Throw out directories
        if fields[0].startswith('d'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar  # HDFS
        # -rwxrwxrwx 1 3276 2010-01-13 14:00 /foo/bar  # S3
        if not path_index:
            for index, field in enumerate(fields):
                if len(field) == 5 and field[2] == ':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string '%s'" % line)

        path = ' '.join(fields[path_index:])
        yield hdfs_prefix + path
def hadoop_fs_ls(stdout, stderr, environ, *args):
    """Implements hadoop fs -ls."""
    hdfs_path_globs = args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []
        max_size = 0

        if not real_paths:
            print >> stderr, (
                'ls: Cannot access %s: No such file or directory.' %
                hdfs_path_glob)
            failed = True
        else:
            for real_path in real_paths:
                paths.append((real_path, scheme, netloc, 0))

        for path in paths:
            print >> stdout, _hadoop_ls_line(*path + (max_size, environ))

    if failed:
        return -1
    else:
        return 0
def ls(self, path_glob):
    if not is_uri(path_glob):
        for path in super(HadoopJobRunner, self).ls(path_glob):
            yield path
        return

    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    stdout = self._invoke_hadoop(['fs', '-lsr', path_glob],
                                 return_stdout=True,
                                 ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

    for line in StringIO(stdout):
        fields = line.rstrip('\r\n').split()
        # expect lines like:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
        if len(fields) < 8:
            raise Exception('unexpected ls line from hadoop: %r' % line)
        # ignore directories
        if fields[0].startswith('d'):
            continue
        # not sure if you can have spaces in filenames; just to be safe
        path = ' '.join(fields[7:])
        yield hdfs_prefix + path
def ls(self, path_glob):
    if not is_uri(path_glob):
        for path in super(HadoopJobRunner, self).ls(path_glob):
            yield path
        return

    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    stdout = self._invoke_hadoop(
        ['fs', '-lsr', path_glob],
        return_stdout=True,
        ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

    for line in StringIO(stdout):
        fields = line.rstrip('\r\n').split()
        # expect lines like:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
        if len(fields) < 8:
            raise Exception('unexpected ls line from hadoop: %r' % line)
        # ignore directories
        if fields[0].startswith('d'):
            continue
        # not sure if you can have spaces in filenames; just to be safe
        path = ' '.join(fields[7:])
        yield hdfs_prefix + path
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx 1 3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_string(line.split(b' ', path_index)[-1])

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
def _ls_detailed(self, path_glob):
    """Recursively list files on GCS and include some metadata about them:

    - object name
    - size
    - md5 hash
    - _uri

    *path_glob* can include ``?`` to match single characters or
    ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
    ``/``.
    """
    scheme = urlparse(path_glob).scheme

    bucket_name, base_name = _path_glob_to_parsed_gcs_uri(path_glob)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    list_request = self.api_client.objects().list(
        bucket=bucket_name, prefix=base_name, fields=_LS_FIELDS_TO_RETURN)

    uri_prefix = '%s://%s' % (scheme, bucket_name)
    while list_request:
        try:
            resp = list_request.execute()
        except google_errors.HttpError as e:
            if e.resp.status == 404:
                return
            raise

        resp_items = resp.get('items') or []
        for item in resp_items:
            # We generate the item URI by adding the "gs://" prefix
            uri = "%s/%s" % (uri_prefix, item['name'])

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            # filter out folders
            if uri.endswith('/'):
                continue

            item['_uri'] = uri
            item['bucket'] = bucket_name
            item['size'] = int(item['size'])
            yield item

        list_request = self.api_client.objects().list_next(
            list_request, resp)
def hadoop_fs_lsr(stdout, stderr, environ, *args): """Implements hadoop fs -lsr.""" hdfs_path_globs = args or [''] def ls_line(real_path, scheme, netloc): hdfs_path = real_path_to_hdfs_path(real_path, environ) # we could actually implement ls here, but mrjob only cares about # the path if os.path.isdir(real_path): file_type = 'd' else: file_type = '-' if scheme in ('s3', 's3n'): # no user and group on S3 (see Pull Request #573) user_and_group = '' else: user_and_group = 'dave supergroup' # newer Hadoop returns fully qualified URIs (see Pull Request #577) if scheme and environ.get('MOCK_HADOOP_LS_RETURNS_FULL_URIS'): hdfs_path = '%s://%s%s' % (scheme, netloc, hdfs_path) return ( '%srwxrwxrwx - %s 18321 2010-10-01 15:16 %s' % (file_type, user_and_group, hdfs_path)) failed = False for hdfs_path_glob in hdfs_path_globs: parsed = urlparse(hdfs_path_glob) scheme = parsed.scheme netloc = parsed.netloc real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ) real_paths = glob.glob(real_path_glob) if not real_paths: print >> stderr, ( 'lsr: Cannot access %s: No such file or directory.' % hdfs_path_glob) failed = True else: for real_path in real_paths: if os.path.isdir(real_path): for dirpath, dirnames, filenames in os.walk(real_path): print >> stdout, ls_line(dirpath, scheme, netloc) for filename in filenames: path = os.path.join(dirpath, filename) print >> stdout, ls_line(path, scheme, netloc) else: print >> stdout, ls_line(real_path, scheme, netloc) if failed: return -1 else: return 0
def hdfs_path_to_real_path(hdfs_path, environ):
    components = urlparse(hdfs_path)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith('/'):
        path = '/user/%s/%s' % (environ['USER'], path)

    return os.path.join(environ['MOCK_HDFS_ROOT'], path.lstrip('/'))
def hdfs_path_to_real_path(hdfs_path, environ): components = urlparse(hdfs_path) scheme = components.scheme path = components.path if not scheme and not path.startswith("/"): path = "/user/%s/%s" % (environ["USER"], path) return os.path.join(environ["MOCK_HDFS_ROOT"], path.lstrip("/"))
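# A quick usage sketch of hdfs_path_to_real_path() above. The environ values
# and paths are made up for illustration; they are not taken from the tests.
environ = {'USER': 'dave', 'MOCK_HDFS_ROOT': '/tmp/mock_hdfs_root'}

# relative paths land under the user's home directory inside the mock root
print(hdfs_path_to_real_path('data/input.txt', environ))
# /tmp/mock_hdfs_root/user/dave/data/input.txt

# absolute and fully qualified paths are rooted directly at MOCK_HDFS_ROOT
print(hdfs_path_to_real_path('hdfs://namenode:8020/data/input.txt', environ))
# /tmp/mock_hdfs_root/data/input.txt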
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx 1 3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_unicode(line.split(b' ', path_index)[-1])

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
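# Standalone check of the time-field heuristic used in the parsers above:
# the path is everything after the 'HH:MM' column, even if it contains
# spaces. The sample line below is made up, not real Hadoop output.
line = b'-rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar baz'

fields = line.split(b' ')
path_index = None
for index, field in enumerate(fields):
    if len(field) == 5 and field[2:3] == b':':
        path_index = index + 1

print(line.split(b' ', path_index)[-1])
# b'/foo/bar baz'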
def hadoop_fs_lsr(stdout, stderr, environ, *args): """Implements hadoop fs -lsr.""" hdfs_path_globs = args or [''] def ls_line(real_path, scheme, netloc): hdfs_path = real_path_to_hdfs_path(real_path, environ) # we could actually implement ls here, but mrjob only cares about # the path if os.path.isdir(real_path): file_type = 'd' else: file_type = '-' if scheme in ('s3', 's3n'): # no user and group on S3 (see Pull Request #573) user_and_group = '' else: user_and_group = 'dave supergroup' # newer Hadoop returns fully qualified URIs (see Pull Request #577) if scheme and environ.get('MOCK_HADOOP_LS_RETURNS_FULL_URIS'): hdfs_path = '%s://%s%s' % (scheme, netloc, hdfs_path) return ('%srwxrwxrwx - %s 18321 2010-10-01 15:16 %s' % (file_type, user_and_group, hdfs_path)) failed = False for hdfs_path_glob in hdfs_path_globs: parsed = urlparse(hdfs_path_glob) scheme = parsed.scheme netloc = parsed.netloc real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ) real_paths = glob.glob(real_path_glob) if not real_paths: print >> stderr, ( 'lsr: Cannot access %s: No such file or directory.' % hdfs_path_glob) failed = True else: for real_path in real_paths: if os.path.isdir(real_path): for dirpath, dirnames, filenames in os.walk(real_path): print >> stdout, ls_line(dirpath, scheme, netloc) for filename in filenames: path = os.path.join(dirpath, filename) print >> stdout, ls_line(path, scheme, netloc) else: print >> stdout, ls_line(real_path, scheme, netloc) if failed: return -1 else: return 0
def hdfs_uri_to_real_path(hdfs_uri, environ):
    """Map an HDFS URI to a path on the filesystem."""
    components = urlparse(hdfs_uri)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith('/'):
        path = '/user/%s/%s' % (environ['USER'], path)

    return os.path.join(get_mock_hdfs_root(environ=environ), path.lstrip('/'))
def copy_from_local(self, path, local_file):
    # Ensure that local_file has a file:/// at the beginning...
    local_file = urlparse(local_file)
    assert local_file.scheme in ('', 'test'), "local_file must be local"
    assert os.path.exists(local_file.path), "local_file must exist"
    local_file = urlunparse(['file'] + list(local_file[1:]))

    try:
        self.invoke_hadoop(['fs', '-put', local_file, path])
    except CalledProcessError as e:
        raise OSError("Could not create file: %s" % e)
def _hadoop_fs_ls(cmd_name, stdout, stderr, environ, path_args, recursive): """Helper for hadoop_fs_ls() and hadoop_fs_lsr().""" hdfs_uri_globs = path_args or [''] failed = False for hdfs_uri_glob in hdfs_uri_globs: parsed = urlparse(hdfs_uri_glob) scheme = parsed.scheme netloc = parsed.netloc real_path_glob = hdfs_uri_to_real_path(hdfs_uri_glob, environ) real_paths = glob.glob(real_path_glob) paths = [] if not real_paths: print('%s: Cannot access %s: No such file or directory.' % (cmd_name, hdfs_uri_glob), file=stderr) failed = True else: for real_path in real_paths: if os.path.isdir(real_path): if recursive: for dirpath, dirnames, filenames in os.walk(real_path): paths.append((dirpath, scheme, netloc, 0)) for filename in filenames: path = os.path.join(dirpath, filename) size = os.path.getsize(path) paths.append((path, scheme, netloc, size)) else: for filename in os.listdir(real_path): path = os.path.join(real_path, filename) if os.path.isdir(path): size = 0 else: size = os.path.getsize(path) paths.append((path, scheme, netloc, size)) else: size = os.path.getsize(real_path) paths.append((real_path, scheme, netloc, size)) if paths: print('Found %d items' % len(paths), file=stdout) max_size = max(size for _, __, ___, size in paths) for path in paths: print(_hadoop_ls_line(*path + (max_size, environ)), file=stdout) if failed: return -1 else: return 0
def _hadoop_fs_ls(cmd_name, stdout, stderr, environ, path_args, recursive): """Helper for hadoop_fs_ls() and hadoop_fs_lsr().""" hdfs_path_globs = path_args or [''] failed = False for hdfs_path_glob in hdfs_path_globs: parsed = urlparse(hdfs_path_glob) scheme = parsed.scheme netloc = parsed.netloc real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ) real_paths = glob.glob(real_path_glob) paths = [] if not real_paths: print('%s: Cannot access %s: No such file or directory.' % (cmd_name, hdfs_path_glob), file=stderr) failed = True else: for real_path in real_paths: if os.path.isdir(real_path): if recursive: for dirpath, dirnames, filenames in os.walk(real_path): paths.append((dirpath, scheme, netloc, 0)) for filename in filenames: path = os.path.join(dirpath, filename) size = os.path.getsize(path) paths.append((path, scheme, netloc, size)) else: for filename in os.listdir(real_path): path = os.path.join(real_path, filename) if os.path.isdir(path): size = 0 else: size = os.path.getsize(path) paths.append((path, scheme, netloc, size)) else: size = os.path.getsize(real_path) paths.append((real_path, scheme, netloc, size)) if paths: print('Found %d items' % len(paths), file=stdout) max_size = max(size for _, __, ___, size in paths) for path in paths: print(_hadoop_ls_line(*path + (max_size, environ)), file=stdout) if failed: return -1 else: return 0
def parse_gcs_uri(uri):
    """Parse a GCS URI into (bucket, key)

    >>> parse_gcs_uri("gs://walrus/tmp/")
    ('walrus', 'tmp/')

    If ``uri`` is not a GCS URI, raise a ValueError
    """
    components = urlparse(uri)

    if components.scheme != "gs" or '/' not in components.path:
        raise ValueError('Invalid GCS URI: %s' % uri)

    return components.netloc, components.path[1:]
def join(self, path, *paths):
    """Join *paths* onto *path* (which may be a URI)"""
    all_paths = (path,) + paths

    # if there's a URI, we only care about it and what follows
    for i in range(len(all_paths), 0, -1):
        if is_uri(all_paths[i - 1]):
            scheme, netloc, uri_path = urlparse(all_paths[i - 1])[:3]
            return '%s://%s%s' % (
                scheme, netloc, posixpath.join(
                    uri_path or '/', *all_paths[i:]))
    else:
        return os.path.join(*all_paths)
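# Hypothetical usage of join() above; `fs` stands in for any filesystem
# object exposing this method, and the URIs are made up. Expected results
# assume a POSIX os.path.
fs.join('s3://walrus/logs', 'part-00000')
# -> 's3://walrus/logs/part-00000'

# only the last URI and what follows it matter
fs.join('/scratch', 's3://walrus/logs', 'part-00000')
# -> 's3://walrus/logs/part-00000'

# with no URI anywhere, it falls back to os.path.join()
fs.join('/scratch', 'logs', 'part-00000')
# -> '/scratch/logs/part-00000'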
def ls(self, path_glob): """Recursively list files on S3. *path_glob* can include ``?`` to match single characters or ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match ``/``. .. versionchanged:: 0.5.0 You no longer need a trailing slash to list "directories" on S3; both ``ls('s3://b/dir')`` and `ls('s3://b/dir/')` will list all keys starting with ``dir/``. """ # clean up the base uri to ensure we have an equal uri to boto (s3://) # just in case we get passed s3n:// scheme = urlparse(path_glob).scheme # support globs glob_match = GLOB_RE.match(path_glob) # we're going to search for all keys starting with base_uri if glob_match: # cut it off at first wildcard base_uri = glob_match.group(1) else: base_uri = path_glob bucket_name, base_name = parse_s3_uri(base_uri) # allow subdirectories of the path/glob if path_glob and not path_glob.endswith('/'): dir_glob = path_glob + '/*' else: dir_glob = path_glob + '*' bucket = self.get_bucket(bucket_name) for key in bucket.list(base_name): uri = "%s://%s/%s" % (scheme, bucket_name, key.name) # enforce globbing if not (fnmatch.fnmatchcase(uri, path_glob) or fnmatch.fnmatchcase(uri, dir_glob)): continue yield uri
def hadoop_fs_put(stdout, stderr, environ, *args):
    """Implements hadoop fs -put"""
    if len(args) < 2:
        stderr.write('Usage: java FsShell [-put <localsrc> ... <dst>]')
        return -1

    srcs = args[:-1]
    dst = args[-1]

    real_dst = hdfs_path_to_real_path(dst, environ)
    dst_dir = os.path.isdir(real_dst)
    real_dir = os.path.dirname(real_dst)
    # dst could be a dir or a filename; we don't know
    if not dst_dir and not os.path.isdir(real_dir):
        os.makedirs(real_dir)

    skipped = False

    for src in srcs:
        # If the destination is a directory then we put the source into it
        # under its basename. If the destination is a file or does not exist
        # then this is where we wish to write to.
        target = os.path.join(real_dst, os.path.basename(src)) \
            if dst_dir else real_dst

        if os.path.exists(target):
            if os.path.isdir(src):
                stderr.write("Target %s is a directory" %
                             real_path_to_hdfs_path(target, environ))
            else:
                stderr.write("Target %s already exists" %
                             real_path_to_hdfs_path(target, environ))
            skipped = True
            continue

        src_url = urlparse(src)
        if src_url.scheme in ('file', ''):
            src = src_url.path
        else:
            raise ValueError("hadoop fs -put mock supports only empty or "
                             "'file' schemes for input: %s" % src)

        shutil.copy(src, real_dst)

    return 255 if skipped else 0
def test_urlparse(self):
    assert_equal(urlparse('http://www.yelp.com/lil_brudder'),
                 ('http', 'www.yelp.com', '/lil_brudder', '', '', ''))
    assert_equal(urlparse('cant://touch/this'),
                 ('cant', 'touch', '/this', '', '', ''))
    assert_equal(urlparse('s3://bucket/path'),
                 ('s3', 'bucket', '/path', '', '', ''))
    assert_equal(urlparse('s3://bucket/path#customname'),
                 ('s3', 'bucket', '/path#customname', '', '', ''))
    assert_equal(urlparse('s3://bucket'),
                 ('s3', 'bucket', '', '', '', ''))
    assert_equal(urlparse('s3://bucket/'),
                 ('s3', 'bucket', '/', '', '', ''))
def test_urlparse(self):
    self.assertEqual(urlparse('http://www.yelp.com/lil_brudder'),
                     ('http', 'www.yelp.com', '/lil_brudder', '', '', ''))
    self.assertEqual(urlparse('cant://touch/this'),
                     ('cant', 'touch', '/this', '', '', ''))
    self.assertEqual(urlparse('s3://bucket/path'),
                     ('s3', 'bucket', '/path', '', '', ''))
    self.assertEqual(urlparse('s3://bucket/path#customname'),
                     ('s3', 'bucket', '/path', '', '', 'customname'))
    self.assertEqual(urlparse('s3://bucket'),
                     ('s3', 'bucket', '', '', '', ''))
    self.assertEqual(urlparse('s3://bucket/'),
                     ('s3', 'bucket', '/', '', '', ''))
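# The two tests above expect different handling of '#' in S3 paths. With the
# standard library's urlparse, that difference corresponds to the
# allow_fragments flag; this is only an illustration, not the project's own
# urlparse wrapper.
from urllib.parse import urlparse

# fragment split out, as the second test expects
urlparse('s3://bucket/path#customname') == \
    ('s3', 'bucket', '/path', '', '', 'customname')   # True

# '#' kept as part of the key, as the first test expects
urlparse('s3://bucket/path#customname', allow_fragments=False) == \
    ('s3', 'bucket', '/path#customname', '', '', '')  # True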
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    try:
        stdout = self.invoke_hadoop(
            ['fs', '-lsr', path_glob],
            return_stdout=True,
            ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in StringIO(stdout):
        line = line.rstrip('\r\n')

        fields = line.split(' ')

        # Throw out directories
        if fields[0].startswith('d'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx 1 3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            if len(field) == 5 and field[2] == ':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string '%s'" % line)

        path = line.split(' ', path_index)[-1]

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
def _ls(self, path_glob):
    """Helper method for :py:meth:`ls`; yields tuples of
    ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
    """
    # clean up the base uri to ensure we pass boto3 an s3:// URI
    # (not s3n://)
    scheme = urlparse(path_glob).scheme

    # support globs
    glob_match = GLOB_RE.match(path_glob)

    # we're going to search for all keys starting with base_uri
    if glob_match:
        # cut it off at first wildcard
        base_uri = glob_match.group(1)
    else:
        base_uri = path_glob

    bucket_name, base_name = parse_s3_uri(base_uri)

    # allow subdirectories of the path/glob
    if path_glob and not path_glob.endswith('/'):
        dir_glob = path_glob + '/*'
    else:
        dir_glob = path_glob + '*'

    try:
        bucket = self.get_bucket(bucket_name)
    except botocore.exceptions.ClientError as ex:
        if _client_error_status(ex) == 404:  # treat nonexistent as empty
            return
        raise

    for key in bucket.objects.filter(Prefix=base_name):
        uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

        # enforce globbing
        if not (fnmatch.fnmatchcase(uri, path_glob) or
                fnmatch.fnmatchcase(uri, dir_glob)):
            continue

        yield uri, key
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    try:
        stdout = self.invoke_hadoop(['fs', '-lsr', path_glob],
                                    return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in StringIO(stdout):
        line = line.rstrip('\r\n')

        fields = line.split(' ')

        # Throw out directories
        if fields[0].startswith('d'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r-- 3 dave users 3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx 1 3276 2010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            if len(field) == 5 and field[2] == ':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string '%s'" % line)

        path = line.split(' ', path_index)[-1]

        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
def hadoop_fs_lsr(stdout, stderr, environ, *args): """Implements hadoop fs -lsr.""" hdfs_path_globs = args or [''] failed = False for hdfs_path_glob in hdfs_path_globs: parsed = urlparse(hdfs_path_glob) scheme = parsed.scheme netloc = parsed.netloc real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ) real_paths = glob.glob(real_path_glob) paths = [] max_size = 0 if not real_paths: print >> stderr, ( 'lsr: Cannot access %s: No such file or directory.' % hdfs_path_glob) failed = True else: for real_path in real_paths: if os.path.isdir(real_path): for dirpath, dirnames, filenames in os.walk(real_path): paths.append((dirpath, scheme, netloc, 0)) for filename in filenames: path = os.path.join(dirpath, filename) size = os.path.getsize(path) max_size = size if size > max_size else max_size paths.append((path, scheme, netloc, size)) else: paths.append((real_path, scheme, netloc, 0)) for path in paths: print >> stdout, _hadoop_ls_line(*path + (max_size, environ)) if failed: return -1 else: return 0
def hadoop_fs_lsr(stdout, stderr, environ, *args): """Implements hadoop fs -lsr.""" hdfs_path_globs = args or [''] failed = False for hdfs_path_glob in hdfs_path_globs: parsed = urlparse(hdfs_path_glob) scheme = parsed.scheme netloc = parsed.netloc real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ) real_paths = glob.glob(real_path_glob) paths = [] max_size = 0 if not real_paths: print(('lsr: Cannot access %s: No such file or directory.' % hdfs_path_glob), file=stderr) failed = True else: for real_path in real_paths: if os.path.isdir(real_path): for dirpath, dirnames, filenames in os.walk(real_path): paths.append((dirpath, scheme, netloc, 0)) for filename in filenames: path = os.path.join(dirpath, filename) size = os.path.getsize(path) max_size = size if size > max_size else max_size paths.append((path, scheme, netloc, size)) else: paths.append((real_path, scheme, netloc, 0)) for path in paths: print(_hadoop_ls_line(*path + (max_size, environ)), file=stdout) if failed: return -1 else: return 0
def ls(self, path_glob): """Recursively list files on S3. This doesn't list "directories" unless there's actually a corresponding key ending with a '/' (which is weird and confusing; don't make S3 keys ending in '/') To list a directory, path_glob must end with a trailing slash (foo and foo/ are different on S3) """ # clean up the base uri to ensure we have an equal uri to boto (s3://) # just in case we get passed s3n:// scheme = urlparse(path_glob).scheme # support globs glob_match = GLOB_RE.match(path_glob) # if it's a "file" (doesn't end with /), just check if it exists if not glob_match and not path_glob.endswith('/'): uri = path_glob if self.get_s3_key(uri): yield uri return # we're going to search for all keys starting with base_uri if glob_match: # cut it off at first wildcard base_uri = glob_match.group(1) else: base_uri = path_glob for uri in self._s3_ls(base_uri): uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri)) # enforce globbing if glob_match and not fnmatch.fnmatchcase(uri, path_glob): continue yield uri
def ls(self, path_glob): """Recursively list files on S3. This doesn't list "directories" unless there's actually a corresponding key ending with a '/' (which is weird and confusing; don't make S3 keys ending in '/') To list a directory, path_glob must end with a trailing slash (foo and foo/ are different on S3) """ # clean up the base uri to ensure we have an equal uri to boto (s3://) # just incase we get passed s3n:// scheme = urlparse(path_glob).scheme # support globs glob_match = GLOB_RE.match(path_glob) # if it's a "file" (doesn't end with /), just check if it exists if not glob_match and not path_glob.endswith('/'): uri = path_glob if self.get_s3_key(uri): yield uri return # we're going to search for all keys starting with base_uri if glob_match: # cut it off at first wildcard base_uri = glob_match.group(1) else: base_uri = path_glob for uri in self._s3_ls(base_uri): uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri)) # enforce globbing if glob_match and not fnmatch.fnmatchcase(uri, path_glob): continue yield uri
def find_hdfs_files(hdfs_path_globs, environ):
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        if not real_paths:
            yield hdfs_path_glob, True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    yield (real_path, scheme, netloc, 0), None
                    for dirpath, dirnames, filenames in os.walk(real_path):
                        for dirname in dirnames:
                            yield (os.path.join(dirpath, dirname),
                                   scheme, netloc, 0), None
                        for filename in filenames:
                            path = os.path.join(dirpath, filename)
                            size = os.path.getsize(path)
                            yield (path, scheme, netloc, size), None
                else:
                    yield (real_path, scheme, netloc, 0), None