Exemple #1
0
def hadoop_fs_ls(stdout, stderr, environ, *args):
    """Implements hadoop fs -ls."""
    hdfs_path_globs = args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []
        max_size = 0

        if not real_paths:
            print(('ls: Cannot access %s: No such file or directory.' %
                   hdfs_path_glob),
                  file=stderr)
            failed = True
        else:
            for real_path in real_paths:
                paths.append((real_path, scheme, netloc, 0))

        for path in paths:
            print(_hadoop_ls_line(*path + (max_size, environ)), file=stdout)

    if failed:
        return -1
    else:
        return 0
Exemple #2
0
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        try:
            stdout = self.invoke_hadoop(
                ['fs', '-lsr', path_glob],
                return_stdout=True,
                ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        path_index = None
        for line in StringIO(stdout):
            fields = line.rstrip('\r\n').split()

            # Throw out directories
            if fields[0].startswith('d'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar # HDFS
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar # S3
            if not path_index:
                for index, field in enumerate(fields):
                    if len(field) == 5 and field[2] == ':':
                        path_index = (index + 1)
                if not path_index:
                    raise IOError("Could not locate path in string '%s'" % line)

            path = ' '.join(fields[path_index:])
            yield hdfs_prefix + path
Exemple #3
0
def hadoop_fs_ls(stdout, stderr, environ, *args):
    """Implements hadoop fs -ls."""
    hdfs_path_globs = args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []
        max_size = 0

        if not real_paths:
            print >> stderr, (
                'ls: Cannot access %s: No such file or directory.' %
                hdfs_path_glob)
            failed = True
        else:
            for real_path in real_paths:
                paths.append((real_path, scheme, netloc, 0))

        for path in paths:
            print >> stdout, _hadoop_ls_line(*path + (max_size, environ))

    if failed:
        return -1
    else:
        return 0
Exemple #4
0
    def ls(self, path_glob):
        if not is_uri(path_glob):
            for path in super(HadoopJobRunner, self).ls(path_glob):
                yield path
            return

        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        stdout = self._invoke_hadoop(['fs', '-lsr', path_glob],
                                     return_stdout=True,
                                     ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

        for line in StringIO(stdout):
            fields = line.rstrip('\r\n').split()
            # expect lines like:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            if len(fields) < 8:
                raise Exception('unexpected ls line from hadoop: %r' % line)
            # ignore directories
            if fields[0].startswith('d'):
                continue
            # not sure if you can have spaces in filenames; just to be safe
            path = ' '.join(fields[7:])
            yield hdfs_prefix + path
Exemple #5
0
    def ls(self, path_glob):
        if not is_uri(path_glob):
            for path in super(HadoopJobRunner, self).ls(path_glob):
                yield path
            return

        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        stdout = self._invoke_hadoop(
            ['fs', '-lsr', path_glob],
            return_stdout=True,
            ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

        for line in StringIO(stdout):
            fields = line.rstrip('\r\n').split()
            # expect lines like:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            if len(fields) < 8:
                raise Exception('unexpected ls line from hadoop: %r' % line)
            # ignore directories
            if fields[0].startswith('d'):
                continue
            # not sure if you can have spaces in filenames; just to be safe
            path = ' '.join(fields[7:])
            yield hdfs_prefix + path
Exemple #6
0
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args,
                                        return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for time field, and pick one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_string(line.split(b' ', path_index)[-1])
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
Exemple #7
0
    def _ls_detailed(self, path_glob):
        """Recursively list files on GCS and includes some metadata about them:
        - object name
        - size
        - md5 hash
        - _uri

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.
        """

        scheme = urlparse(path_glob).scheme

        bucket_name, base_name = _path_glob_to_parsed_gcs_uri(path_glob)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        list_request = self.api_client.objects().list(
            bucket=bucket_name, prefix=base_name, fields=_LS_FIELDS_TO_RETURN)

        uri_prefix = '%s://%s' % (scheme, bucket_name)
        while list_request:
            try:
                resp = list_request.execute()
            except google_errors.HttpError as e:
                if e.resp.status == 404:
                    return

                raise

            resp_items = resp.get('items') or []
            for item in resp_items:
                # We generate the item URI by adding the "gs://" prefix
                uri = "%s/%s" % (uri_prefix, item['name'])

                # enforce globbing
                if not (fnmatch.fnmatchcase(uri, path_glob) or
                        fnmatch.fnmatchcase(uri, dir_glob)):
                    continue

                # filter out folders
                if uri.endswith('/'):
                    continue

                item['_uri'] = uri
                item['bucket'] = bucket_name
                item['size'] = int(item['size'])
                yield item

            list_request = self.api_client.objects().list_next(
                list_request, resp)
Exemple #8
0
    def _ls_detailed(self, path_glob):
        """Recursively list files on GCS and includes some metadata about them:
        - object name
        - size
        - md5 hash
        - _uri

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.
        """

        scheme = urlparse(path_glob).scheme

        bucket_name, base_name = _path_glob_to_parsed_gcs_uri(path_glob)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        list_request = self.api_client.objects().list(
            bucket=bucket_name, prefix=base_name, fields=_LS_FIELDS_TO_RETURN)

        uri_prefix = '%s://%s' % (scheme, bucket_name)
        while list_request:
            try:
                resp = list_request.execute()
            except google_errors.HttpError as e:
                if e.resp.status == 404:
                    return

                raise

            resp_items = resp.get('items') or []
            for item in resp_items:
                # We generate the item URI by adding the "gs://" prefix
                uri = "%s/%s" % (uri_prefix, item['name'])

                # enforce globbing
                if not (fnmatch.fnmatchcase(uri, path_glob) or
                        fnmatch.fnmatchcase(uri, dir_glob)):
                    continue

                # filter out folders
                if uri.endswith('/'):
                    continue

                item['_uri'] = uri
                item['bucket'] = bucket_name
                item['size'] = int(item['size'])
                yield item

            list_request = self.api_client.objects().list_next(
                list_request, resp)
Exemple #9
0
def hadoop_fs_lsr(stdout, stderr, environ, *args):
    """Implements hadoop fs -lsr."""
    hdfs_path_globs = args or ['']

    def ls_line(real_path, scheme, netloc):
        hdfs_path = real_path_to_hdfs_path(real_path, environ)

        # we could actually implement ls here, but mrjob only cares about
        # the path
        if os.path.isdir(real_path):
            file_type = 'd'
        else:
            file_type = '-'

        if scheme in ('s3', 's3n'):
            # no user and group on S3 (see Pull Request #573)
            user_and_group = ''
        else:
            user_and_group = 'dave supergroup'

        # newer Hadoop returns fully qualified URIs (see Pull Request #577)
        if scheme and environ.get('MOCK_HADOOP_LS_RETURNS_FULL_URIS'):
            hdfs_path = '%s://%s%s' % (scheme, netloc, hdfs_path)

        return (
            '%srwxrwxrwx - %s      18321 2010-10-01 15:16 %s' %
            (file_type, user_and_group, hdfs_path))

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)
        if not real_paths:
            print >> stderr, (
                'lsr: Cannot access %s: No such file or directory.' %
                hdfs_path_glob)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    for dirpath, dirnames, filenames in os.walk(real_path):
                        print >> stdout, ls_line(dirpath, scheme, netloc)
                        for filename in filenames:
                            path = os.path.join(dirpath, filename)
                            print >> stdout, ls_line(path, scheme, netloc)
                else:
                    print >> stdout, ls_line(real_path, scheme, netloc)

    if failed:
        return -1
    else:
        return 0
Exemple #10
0
def hdfs_path_to_real_path(hdfs_path, environ):
    components = urlparse(hdfs_path)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith('/'):
        path = '/user/%s/%s' % (environ['USER'], path)

    return os.path.join(environ['MOCK_HDFS_ROOT'], path.lstrip('/'))
Exemple #11
0
def hdfs_path_to_real_path(hdfs_path, environ):
    components = urlparse(hdfs_path)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith("/"):
        path = "/user/%s/%s" % (environ["USER"], path)

    return os.path.join(environ["MOCK_HDFS_ROOT"], path.lstrip("/"))
Exemple #12
0
def hdfs_path_to_real_path(hdfs_path, environ):
    components = urlparse(hdfs_path)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith('/'):
        path = '/user/%s/%s' % (environ['USER'], path)

    return os.path.join(environ['MOCK_HDFS_ROOT'], path.lstrip('/'))
Exemple #13
0
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args, return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for time field, and pick one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_unicode(line.split(b' ', path_index)[-1])
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
Exemple #14
0
def hadoop_fs_lsr(stdout, stderr, environ, *args):
    """Implements hadoop fs -lsr."""
    hdfs_path_globs = args or ['']

    def ls_line(real_path, scheme, netloc):
        hdfs_path = real_path_to_hdfs_path(real_path, environ)

        # we could actually implement ls here, but mrjob only cares about
        # the path
        if os.path.isdir(real_path):
            file_type = 'd'
        else:
            file_type = '-'

        if scheme in ('s3', 's3n'):
            # no user and group on S3 (see Pull Request #573)
            user_and_group = ''
        else:
            user_and_group = 'dave supergroup'

        # newer Hadoop returns fully qualified URIs (see Pull Request #577)
        if scheme and environ.get('MOCK_HADOOP_LS_RETURNS_FULL_URIS'):
            hdfs_path = '%s://%s%s' % (scheme, netloc, hdfs_path)

        return ('%srwxrwxrwx - %s      18321 2010-10-01 15:16 %s' %
                (file_type, user_and_group, hdfs_path))

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)
        if not real_paths:
            print >> stderr, (
                'lsr: Cannot access %s: No such file or directory.' %
                hdfs_path_glob)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    for dirpath, dirnames, filenames in os.walk(real_path):
                        print >> stdout, ls_line(dirpath, scheme, netloc)
                        for filename in filenames:
                            path = os.path.join(dirpath, filename)
                            print >> stdout, ls_line(path, scheme, netloc)
                else:
                    print >> stdout, ls_line(real_path, scheme, netloc)

    if failed:
        return -1
    else:
        return 0
Exemple #15
0
def hdfs_uri_to_real_path(hdfs_uri, environ):
    """Map an HDFS URI to a path on the filesystem."""
    components = urlparse(hdfs_uri)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith('/'):
        path = '/user/%s/%s' % (environ['USER'], path)

    return os.path.join(get_mock_hdfs_root(environ=environ), path.lstrip('/'))
Exemple #16
0
    def copy_from_local(self, path, local_file):
        # Ensure that local_file has a file:/// at the beginning...
        local_file = urlparse(local_file)
        assert local_file.scheme in ('', 'test'), "local_file must be local"
        assert os.path.exists(local_file.path), "local_file must exist"
        local_file = urlunparse(['file'] + list(local_file[1:]))

        try:
            self.invoke_hadoop(['fs', '-put', local_file, path])
        except CalledProcessError as e:
            raise OSError("Could not create file: %s" % e)
Exemple #17
0
def hdfs_uri_to_real_path(hdfs_uri, environ):
    """Map an HDFS URI to a path on the filesystem."""
    components = urlparse(hdfs_uri)

    scheme = components.scheme
    path = components.path

    if not scheme and not path.startswith('/'):
        path = '/user/%s/%s' % (environ['USER'], path)

    return os.path.join(get_mock_hdfs_root(environ=environ), path.lstrip('/'))
Exemple #18
0
def _hadoop_fs_ls(cmd_name, stdout, stderr, environ, path_args, recursive):
    """Helper for hadoop_fs_ls() and hadoop_fs_lsr()."""
    hdfs_uri_globs = path_args or ['']

    failed = False
    for hdfs_uri_glob in hdfs_uri_globs:
        parsed = urlparse(hdfs_uri_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_uri_to_real_path(hdfs_uri_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []

        if not real_paths:
            print('%s: Cannot access %s: No such file or directory.' %
                  (cmd_name, hdfs_uri_glob),
                  file=stderr)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    if recursive:
                        for dirpath, dirnames, filenames in os.walk(real_path):
                            paths.append((dirpath, scheme, netloc, 0))
                            for filename in filenames:
                                path = os.path.join(dirpath, filename)
                                size = os.path.getsize(path)
                                paths.append((path, scheme, netloc, size))
                    else:
                        for filename in os.listdir(real_path):
                            path = os.path.join(real_path, filename)
                            if os.path.isdir(path):
                                size = 0
                            else:
                                size = os.path.getsize(path)
                            paths.append((path, scheme, netloc, size))
                else:
                    size = os.path.getsize(real_path)
                    paths.append((real_path, scheme, netloc, size))

        if paths:
            print('Found %d items' % len(paths), file=stdout)
            max_size = max(size for _, __, ___, size in paths)
            for path in paths:
                print(_hadoop_ls_line(*path + (max_size, environ)),
                      file=stdout)

    if failed:
        return -1
    else:
        return 0
Exemple #19
0
def _hadoop_fs_ls(cmd_name, stdout, stderr, environ, path_args, recursive):
    """Helper for hadoop_fs_ls() and hadoop_fs_lsr()."""
    hdfs_path_globs = path_args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []

        if not real_paths:
            print('%s: Cannot access %s: No such file or directory.' %
                  (cmd_name, hdfs_path_glob), file=stderr)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    if recursive:
                        for dirpath, dirnames, filenames in os.walk(real_path):
                            paths.append((dirpath, scheme, netloc, 0))
                            for filename in filenames:
                                path = os.path.join(dirpath, filename)
                                size = os.path.getsize(path)
                                paths.append((path, scheme, netloc, size))
                    else:
                        for filename in os.listdir(real_path):
                            path = os.path.join(real_path, filename)
                            if os.path.isdir(path):
                                size = 0
                            else:
                                size = os.path.getsize(path)
                            paths.append((path, scheme, netloc, size))
                else:
                    size = os.path.getsize(real_path)
                    paths.append((real_path, scheme, netloc, size))

        if paths:
            print('Found %d items' % len(paths), file=stdout)
            max_size = max(size for _, __, ___, size in paths)
            for path in paths:
                print(_hadoop_ls_line(*path + (max_size, environ)),
                      file=stdout)

    if failed:
        return -1
    else:
        return 0
Exemple #20
0
def parse_gcs_uri(uri):
    """Parse a GCS URI into (bucket, key)

    >>> parse_gcs_uri("gs://walrus/tmp/")
    ('walrus', 'tmp/')

    If ``uri`` is not a GCS URI, raise a ValueError
    """
    components = urlparse(uri)
    if components.scheme != "gs" or '/' not in components.path:
        raise ValueError('Invalid GCS URI: %s' % uri)

    return components.netloc, components.path[1:]
Exemple #21
0
def parse_gcs_uri(uri):
    """Parse a GCS URI into (bucket, key)

    >>> parse_gcs_uri("gs://walrus/tmp/")
    ('walrus', 'tmp/')

    If ``uri`` is not a GCS URI, raise a ValueError
    """
    components = urlparse(uri)
    if components.scheme != "gs" or '/' not in components.path:
        raise ValueError('Invalid GCS URI: %s' % uri)

    return components.netloc, components.path[1:]
Exemple #22
0
    def join(self, path, *paths):
        """Join *paths* onto *path* (which may be a URI)"""
        all_paths = (path,) + paths

        # if there's a URI, we only care about it and what follows
        for i in range(len(all_paths), 0, -1):
            if is_uri(all_paths[i - 1]):
                scheme, netloc, uri_path = urlparse(all_paths[i - 1])[:3]
                return '%s://%s%s' % (
                    scheme, netloc, posixpath.join(
                        uri_path or '/', *all_paths[i:]))
        else:
            return os.path.join(*all_paths)
Exemple #23
0
    def ls(self, path_glob):
        """Recursively list files on S3.

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.

        .. versionchanged:: 0.5.0

            You no longer need a trailing slash to list "directories" on S3;
            both ``ls('s3://b/dir')`` and `ls('s3://b/dir/')` will list
            all keys starting with ``dir/``.
        """

        # clean up the  base uri to ensure we have an equal uri to boto (s3://)
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        bucket = self.get_bucket(bucket_name)
        for key in bucket.list(base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob)
                    or fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri
Exemple #24
0
def hadoop_fs_put(stdout, stderr, environ, *args):
    """Implements hadoop fs -put"""
    if len(args) < 2:
        stderr.write('Usage: java FsShell [-put <localsrc> ... <dst>]')
        return -1

    srcs = args[:-1]
    dst = args[-1]

    real_dst = hdfs_path_to_real_path(dst, environ)
    dst_dir = os.path.isdir(real_dst)
    real_dir = os.path.dirname(real_dst)

    # dst could be a dir or a filename; we don't know
    if not dst_dir and not os.path.isdir(real_dir):
        os.makedirs(real_dir)

    skipped = False

    for src in srcs:
        # If the destination is a directory then we put the source into it
        # under its basename. If the destination is a file or does not exist
        # then this is where we wish to write to.
        target = os.path.join(real_dst, os.path.basename(src)) \
            if dst_dir else real_dst

        if os.path.exists(target):
            if os.path.isdir(src):
                stderr.write("Target %s is a directory" %
                             real_path_to_hdfs_path(target, environ))
            else:
                stderr.write("Target %s already exists" %
                             real_path_to_hdfs_path(target, environ))
            skipped = True
            continue

        src_url = urlparse(src)
        if src_url.scheme in ('file', ''):
            src = src_url.path
        else:
            raise ValueError("hadoop fs -put mock supports only empty or "
                             "'file' schemes for input: %s" % src)

        shutil.copy(src, real_dst)

    return 255 if skipped else 0
Exemple #25
0
    def ls(self, path_glob):
        """Recursively list files on S3.

        *path_glob* can include ``?`` to match single characters or
        ``*`` to match 0 or more characters. Both ``?`` and ``*`` can match
        ``/``.

        .. versionchanged:: 0.5.0

            You no longer need a trailing slash to list "directories" on S3;
            both ``ls('s3://b/dir')`` and `ls('s3://b/dir/')` will list
            all keys starting with ``dir/``.
        """

        # clean up the  base uri to ensure we have an equal uri to boto (s3://)
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        bucket = self.get_bucket(bucket_name)
        for key in bucket.list(base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.name)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri
Exemple #26
0
 def test_urlparse(self):
     assert_equal(urlparse('http://www.yelp.com/lil_brudder'),
                  ('http', 'www.yelp.com', '/lil_brudder', '', '', ''))
     assert_equal(urlparse('cant://touch/this'),
                  ('cant', 'touch', '/this', '', '', ''))
     assert_equal(urlparse('s3://bucket/path'),
                  ('s3', 'bucket', '/path', '', '', ''))
     assert_equal(urlparse('s3://bucket/path#customname'),
                  ('s3', 'bucket', '/path#customname', '', '', ''))
     assert_equal(urlparse('s3://bucket'), ('s3', 'bucket', '', '', '', ''))
     assert_equal(urlparse('s3://bucket/'),
                  ('s3', 'bucket', '/', '', '', ''))
Exemple #27
0
 def test_urlparse(self):
     self.assertEqual(urlparse('http://www.yelp.com/lil_brudder'),
                      ('http', 'www.yelp.com', '/lil_brudder', '', '', ''))
     self.assertEqual(urlparse('cant://touch/this'),
                      ('cant', 'touch', '/this', '', '', ''))
     self.assertEqual(urlparse('s3://bucket/path'),
                      ('s3', 'bucket', '/path', '', '', ''))
     self.assertEqual(urlparse('s3://bucket/path#customname'),
                      ('s3', 'bucket', '/path', '', '', 'customname'))
     self.assertEqual(urlparse('s3://bucket'),
                      ('s3', 'bucket', '', '', '', ''))
     self.assertEqual(urlparse('s3://bucket/'),
                      ('s3', 'bucket', '/', '', '', ''))
Exemple #28
0
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        try:
            stdout = self.invoke_hadoop(
                ['fs', '-lsr', path_glob],
                return_stdout=True,
                ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in StringIO(stdout):
            line = line.rstrip('\r\n')
            fields = line.split(' ')

            # Throw out directories
            if fields[0].startswith('d'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                if len(field) == 5 and field[2] == ':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string '%s'" % line)

            path = line.split(' ', path_index)[-1]
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
Exemple #29
0
    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
        """
        # clean up the  base uri to ensure we have pass boto3 an s3:// URI
        # (not s3n://)
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) == 404:  # treat nonexistent as empty
                return
            raise

        for key in bucket.objects.filter(Prefix=base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob) or
                    fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, key
Exemple #30
0
    def _ls(self, path_glob):
        """Helper method for :py:meth:`ls`; yields tuples of
        ``(uri, key)`` where *key* is the corresponding boto3 s3.ObjectSummary.
        """
        # clean up the  base uri to ensure we have pass boto3 an s3:// URI
        # (not s3n://)
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        bucket_name, base_name = parse_s3_uri(base_uri)

        # allow subdirectories of the path/glob
        if path_glob and not path_glob.endswith('/'):
            dir_glob = path_glob + '/*'
        else:
            dir_glob = path_glob + '*'

        try:
            bucket = self.get_bucket(bucket_name)
        except botocore.exceptions.ClientError as ex:
            if _client_error_status(ex) == 404:  # treat nonexistent as empty
                return
            raise

        for key in bucket.objects.filter(Prefix=base_name):
            uri = "%s://%s/%s" % (scheme, bucket_name, key.key)

            # enforce globbing
            if not (fnmatch.fnmatchcase(uri, path_glob)
                    or fnmatch.fnmatchcase(uri, dir_glob)):
                continue

            yield uri, key
Exemple #31
0
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        try:
            stdout = self.invoke_hadoop(['fs', '-lsr', path_glob],
                                        return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in StringIO(stdout):
            line = line.rstrip('\r\n')
            fields = line.split(' ')

            # Throw out directories
            if fields[0].startswith('d'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                if len(field) == 5 and field[2] == ':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string '%s'" % line)

            path = line.split(' ', path_index)[-1]
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
Exemple #32
0
def hadoop_fs_lsr(stdout, stderr, environ, *args):
    """Implements hadoop fs -lsr."""
    hdfs_path_globs = args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []
        max_size = 0

        if not real_paths:
            print >> stderr, (
                'lsr: Cannot access %s: No such file or directory.' %
                hdfs_path_glob)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    for dirpath, dirnames, filenames in os.walk(real_path):
                        paths.append((dirpath, scheme, netloc, 0))
                        for filename in filenames:
                            path = os.path.join(dirpath, filename)
                            size = os.path.getsize(path)
                            max_size = size if size > max_size else max_size
                            paths.append((path, scheme, netloc, size))
                else:
                    paths.append((real_path, scheme, netloc, 0))

        for path in paths:
            print >> stdout, _hadoop_ls_line(*path + (max_size, environ))

    if failed:
        return -1
    else:
        return 0
Exemple #33
0
def hadoop_fs_lsr(stdout, stderr, environ, *args):
    """Implements hadoop fs -lsr."""
    hdfs_path_globs = args or ['']

    failed = False
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        paths = []
        max_size = 0

        if not real_paths:
            print(('lsr: Cannot access %s: No such file or directory.' %
                   hdfs_path_glob),
                  file=stderr)
            failed = True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    for dirpath, dirnames, filenames in os.walk(real_path):
                        paths.append((dirpath, scheme, netloc, 0))
                        for filename in filenames:
                            path = os.path.join(dirpath, filename)
                            size = os.path.getsize(path)
                            max_size = size if size > max_size else max_size
                            paths.append((path, scheme, netloc, size))
                else:
                    paths.append((real_path, scheme, netloc, 0))

        for path in paths:
            print(_hadoop_ls_line(*path + (max_size, environ)), file=stdout)

    if failed:
        return -1
    else:
        return 0
Exemple #34
0
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """

        # clean up the  base uri to ensure we have an equal uri to boto (s3://)
        # just in case we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # if it's a "file" (doesn't end with /), just check if it exists
        if not glob_match and not path_glob.endswith('/'):
            uri = path_glob
            if self.get_s3_key(uri):
                yield uri
            return

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        for uri in self._s3_ls(base_uri):
            uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            yield uri
Exemple #35
0
    def ls(self, path_glob):
        """Recursively list files on S3.

        This doesn't list "directories" unless there's actually a
        corresponding key ending with a '/' (which is weird and confusing;
        don't make S3 keys ending in '/')

        To list a directory, path_glob must end with a trailing
        slash (foo and foo/ are different on S3)
        """

        # clean up the  base uri to ensure we have an equal uri to boto (s3://)
        # just incase we get passed s3n://
        scheme = urlparse(path_glob).scheme

        # support globs
        glob_match = GLOB_RE.match(path_glob)

        # if it's a "file" (doesn't end with /), just check if it exists
        if not glob_match and not path_glob.endswith('/'):
            uri = path_glob
            if self.get_s3_key(uri):
                yield uri
            return

        # we're going to search for all keys starting with base_uri
        if glob_match:
            # cut it off at first wildcard
            base_uri = glob_match.group(1)
        else:
            base_uri = path_glob

        for uri in self._s3_ls(base_uri):
            uri = "%s://%s/%s" % ((scheme,) + parse_s3_uri(uri))

            # enforce globbing
            if glob_match and not fnmatch.fnmatchcase(uri, path_glob):
                continue

            yield uri
Exemple #36
0
def find_hdfs_files(hdfs_path_globs, environ):
    for hdfs_path_glob in hdfs_path_globs:
        parsed = urlparse(hdfs_path_glob)
        scheme = parsed.scheme
        netloc = parsed.netloc

        real_path_glob = hdfs_path_to_real_path(hdfs_path_glob, environ)
        real_paths = glob.glob(real_path_glob)

        if not real_paths:
            yield hdfs_path_glob, True
        else:
            for real_path in real_paths:
                if os.path.isdir(real_path):
                    yield (real_path, scheme, netloc, 0), None
                    for dirpath, dirnames, filenames in os.walk(real_path):
                        for dirname in dirnames:
                            yield (os.path.join(dirpath, dirname), scheme, netloc, 0), None
                        for filename in filenames:
                            path = os.path.join(dirpath, filename)
                            size = os.path.getsize(path)
                            yield (path, scheme, netloc, size), None
                else:
                    yield (real_path, scheme, netloc, 0), None