Example #1
    def test_uri_parsing(self):
        self.assertEqual(is_uri('notauri!'), False)
        self.assertEqual(is_uri('they://did/the/monster/mash'), True)
        self.assertEqual(is_s3_uri('s3://a/uri'), True)
        self.assertEqual(is_s3_uri('s3n://a/uri'), True)
        self.assertEqual(is_s3_uri('hdfs://a/uri'), False)
        self.assertEqual(parse_s3_uri('s3://bucket/loc'), ('bucket', 'loc'))
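The behavior these assertions pin down is easy to sketch. Below is a minimal parse_s3_uri() built on Python 3's urllib.parse; the real mrjob implementation may differ in details such as the exact error raised:

    from urllib.parse import urlparse

    def parse_s3_uri(uri):
        """Split an S3 URI into (bucket, key)."""
        components = urlparse(uri)
        if components.scheme not in ('s3', 's3n') or not components.netloc:
            raise ValueError('not an S3 URI: %r' % uri)
        return components.netloc, components.path.lstrip('/')

    parse_s3_uri('s3://bucket/loc')  # -> ('bucket', 'loc')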
Example #2
    def join(self, dirname, filename):
        """Join *filename* onto *dirname* (which may be a URI)"""
        if is_uri(filename):
            return filename
        elif is_uri(dirname):
            return posixpath.join(dirname, filename)
        else:
            return os.path.join(dirname, filename)
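As a quick illustration, the three branches give (values follow directly from the code above; fs stands for any filesystem object exposing this join()):

    fs.join('/tmp', 'fish')              # -> '/tmp/fish' (os.path.join)
    fs.join('s3://walrus/tmp', 'fish')   # -> 's3://walrus/tmp/fish'
    fs.join('/tmp', 's3://walrus/fish')  # -> 's3://walrus/fish' (URI wins)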
Example #3
    def _create_input_manifest_if_needed(self):
        """Create a file with a list of URIs of input files."""
        if self._input_manifest_path or not self._uses_input_manifest():
            return

        uris = []

        log.info('finding input files to add to manifest...')

        for path in self._get_input_paths():
            log.debug('  in %s' % path)
            if is_uri(path):
                # URIs might be globs
                for uri in self.fs.ls(path):
                    uris.append(uri)
            else:
                # local paths are expected to be single files
                # (shell would resolve globs)
                if self._upload_mgr:
                    uris.append(self._upload_mgr.uri(path))
                else:
                    # just make sure job can find files from its working dir
                    uris.append(os.path.abspath(path))

        log.info('found %d input files' % len(uris))

        path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt')
        self._write_script(uris, path, 'input manifest')

        self._input_manifest_path = path
        if self._upload_mgr:
            self._upload_mgr.add(self._input_manifest_path)
Example #4
    def test_default(self):
        runner = SparkMRJobRunner()

        self.assertFalse(is_uri(runner._spark_tmp_dir))
        self.assertIsNone(runner._upload_mgr)

        self.assertEqual(runner._spark_tmp_dir[-6:], '-spark')
Example #5
    def ls(self, path_glob):
        if not is_uri(path_glob):
            for path in super(HadoopJobRunner, self).ls(path_glob):
                yield path
            return

        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        stdout = self._invoke_hadoop(
            ['fs', '-lsr', path_glob],
            return_stdout=True,
            ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

        for line in StringIO(stdout):
            fields = line.rstrip('\r\n').split()
            # expect lines like:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            if len(fields) < 8:
                raise Exception('unexpected ls line from hadoop: %r' % line)
            # ignore directories
            if fields[0].startswith('d'):
                continue
            # not sure if you can have spaces in filenames; just to be safe
            path = ' '.join(fields[7:])
            yield hdfs_prefix + path
Example #6
    def test_spark_master_mesos(self):
        runner = SparkMRJobRunner(spark_master='mesos://host:12345')

        self.assertTrue(is_uri(runner._spark_tmp_dir))
        self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')

        self.assertIsNotNone(runner._upload_mgr)
Example #7
    def _cat_file(self, filename):
        if is_uri(filename):
            # stream from HDFS
            cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
            log.debug('> %s' % cmd_line(cat_args))

            cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

            def stream():
                for line in cat_proc.stdout:
                    yield line

                # there shouldn't be any stderr
                for line in cat_proc.stderr:
                    log.error('STDERR: ' + line)

                returncode = cat_proc.wait()

                if returncode != 0:
                    raise CalledProcessError(returncode, cat_args)

            return read_file(filename, stream())
        else:
            # read from local filesystem
            return super(HadoopJobRunner, self)._cat_file(filename)
Example #8
    def test_spark_master_yarn(self):
        runner = SparkMRJobRunner(spark_master='yarn')

        self.assertTrue(is_uri(runner._spark_tmp_dir))
        self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')

        self.assertIsNotNone(runner._upload_mgr)
Example #9
    def _setup_input(self):
        """Copy local input files (if any) to a special directory on HDFS.

        Set self._hdfs_input_files
        """
        # winnow out HDFS files from local ones
        self._hdfs_input_files = []
        local_input_files = []

        for path in self._input_paths:
            if is_uri(path):
                # Don't even bother running the job if the input isn't there.
                # self.ls() is a generator, so test emptiness with any()
                if not any(self.ls(path)):
                    raise AssertionError(
                        'Input path %s does not exist!' % (path,))
                self._hdfs_input_files.append(path)
            else:
                local_input_files.append(path)

        # copy local files into an input directory, with names like
        # 00000-actual_name.ext
        if local_input_files:
            hdfs_input_dir = posixpath.join(self._hdfs_tmp_dir, 'input')
            log.info('Uploading input to %s' % hdfs_input_dir)
            self._mkdir_on_hdfs(hdfs_input_dir)

            for i, path in enumerate(local_input_files):
                if path == '-':
                    path = self._dump_stdin_to_local_file()

                target = '%s/%05i-%s' % (
                    hdfs_input_dir, i, os.path.basename(path))
                self._upload_to_hdfs(path, target)

            self._hdfs_input_files.append(hdfs_input_dir)
Example #10
def _fully_qualify_subnetwork_uri(uri, project_id, region):
    if '/' not in uri:  # just a name
        uri = 'projects/%s/%s/subnetworks/%s' % (project_id, region, uri)

    if not is_uri(uri):
        uri = 'https://www.googleapis.com/compute/v1/' + uri

    return uri
Example #11
def fully_qualify_hdfs_path(path):
    """If path isn't an ``hdfs://`` URL, turn it into one."""
    if is_uri(path):
        return path
    elif path.startswith('/'):
        return 'hdfs://' + path
    else:
        return 'hdfs:///user/%s/%s' % (getpass.getuser(), path)
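A quick illustration of the three cases (<user> stands for whatever getpass.getuser() returns):

    fully_qualify_hdfs_path('hdfs://namenode/data')  # a URI; unchanged
    fully_qualify_hdfs_path('/data/input')  # -> 'hdfs:///data/input'
    fully_qualify_hdfs_path('input')        # -> 'hdfs:///user/<user>/input'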
Example #12
def _from_file_uri(path_or_uri):
    if is_uri(path_or_uri):
        if path_or_uri.startswith('file:///'):
            return path_or_uri[7:]  # keep last /
        else:
            raise ValueError('Not a file:/// URI')
    else:
        return path_or_uri
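Slicing off the first seven characters of a file:/// URI keeps the third slash, so the result is an absolute local path:

    _from_file_uri('file:///tmp/data.txt')  # -> '/tmp/data.txt'
    _from_file_uri('/tmp/data.txt')         # -> '/tmp/data.txt' (not a URI)
    _from_file_uri('s3://bucket/key')       # raises ValueError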
Example #13
    def test_copy_files_with_rename_to_remote_wd_mirror(self):
        self.add_mock_s3_data({'walrus': {'fish': b'salmon',
                                          'fowl': b'goose'}})

        foe_path = self.makefile('foe', b'giant')

        run_spark_submit = self.start(
            patch('mrjob.bin.MRJobBinRunner._run_spark_submit',
                  return_value=0))

        job = MRSparkOSWalk(['-r', 'spark',
                             '--spark-master', 'mesos://host:9999',
                             '--spark-tmp-dir', 's3://walrus/tmp',
                             '--file', 's3://walrus/fish#ghoti',
                             '--file', 's3://walrus/fowl',
                             '--file', foe_path])
        job.sandbox()

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror
            wd_mirror = runner._wd_mirror()
            fs = runner.fs

            self.assertIsNotNone(wd_mirror)
            self.assertTrue(is_uri(wd_mirror))

            self.assertTrue(fs.exists(wd_mirror))
            # uploaded for rename
            self.assertTrue(fs.exists(fs.join(wd_mirror, 'ghoti')))
            # wrong name
            self.assertFalse(fs.exists(fs.join(wd_mirror, 'fish')))
            # no need to upload, already visible
            self.assertFalse(fs.exists(fs.join(wd_mirror, 'fowl')))
            # need to upload from local to remote
            self.assertTrue(fs.exists(fs.join(wd_mirror, 'foe')))

            run_spark_submit.assert_called_once_with(
                ANY, ANY, record_callback=ANY)

            spark_submit_args = run_spark_submit.call_args[0][0]
            self.assertIn('--files', spark_submit_args)
            files_arg = spark_submit_args[
                spark_submit_args.index('--files') + 1]

            self.assertEqual(
                files_arg, ','.join([
                    fs.join(wd_mirror, 'foe'),
                    's3://walrus/fowl',
                    fs.join(wd_mirror, 'ghoti'),
                    fs.join(wd_mirror, 'mr_spark_os_walk.py'),
                ]))
Example #14
    def ls(self, path_glob):
        components = urlparse(path_glob)
        hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

        version = self.get_hadoop_version()

        # use ls -R on Hadoop 2 (see #1152)
        if uses_yarn(version):
            args = ['fs', '-ls', '-R', path_glob]
        else:
            args = ['fs', '-lsr', path_glob]

        try:
            stdout = self.invoke_hadoop(args,
                                        return_stdout=True,
                                        ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not ls %s" % path_glob)

        for line in BytesIO(stdout):
            line = line.rstrip(b'\r\n')

            # ignore total item count
            if line.startswith(b'Found '):
                continue

            fields = line.split(b' ')

            # Throw out directories
            if fields[0].startswith(b'd'):
                continue

            # Try to figure out which part of the line is the path
            # Expected lines:
            #
            # HDFS:
            # -rw-r--r--   3 dave users       3276 2010-01-13 14:00 /foo/bar
            #
            # S3:
            # -rwxrwxrwx   1          3276 010-01-13 14:00 /foo/bar
            path_index = None
            for index, field in enumerate(fields):
                # look for time field, and pick one after that
                # (can't use field[2] because that's an int in Python 3)
                if len(field) == 5 and field[2:3] == b':':
                    path_index = (index + 1)
            if not path_index:
                raise IOError("Could not locate path in string %r" % line)

            path = to_string(line.split(b' ', path_index)[-1])
            # handle fully qualified URIs from newer versions of Hadoop ls
            # (see Pull Request #577)
            if is_uri(path):
                yield path
            else:
                yield hdfs_prefix + path
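The time-field heuristic is the subtle part: the fields are bytes, and field[2] would be an int in Python 3, which is why the code compares the one-byte slice field[2:3] instead. A standalone rerun of the same parsing on a hypothetical sample line:

    line = b'-rw-r--r--   3 dave users   3276 2010-01-13 14:00 /foo/my file'
    fields = line.split(b' ')
    path_index = None
    for index, field in enumerate(fields):
        if len(field) == 5 and field[2:3] == b':':  # the '14:00' field
            path_index = index + 1
    # splitting at most path_index times preserves spaces in the filename
    line.split(b' ', path_index)[-1]  # -> b'/foo/my file'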
Example #15
    def uri(self, path):
        """Get the URI for the given path. If *path* is a URI, just return it.
        """
        if is_uri(path):
            return path

        if path in self._path_to_name:
            return posixpath.join(self.prefix, self._path_to_name[path])
        else:
            raise ValueError('%r is not a URI or a known local file' % (path,))
Example #16
    def uri(self, path):
        """Get the URI for the given path. If *path* is a URI, just return it.
        """
        if (not os.path.exists(path)) and is_uri(path):
            return path

        if path in self._path_to_name:
            return posixpath.join(self.prefix, self._path_to_name[path])
        else:
            raise ValueError('%r is not a URI or a known local file' % (path,))
Example #17
    def path_exists(self, path_glob):
        """Does the given path exist?

        If dest is a directory (ends with a "/"), we check if there are
        any files starting with that path.
        """
        if not is_uri(path_glob):
            return super(HadoopJobRunner, self).path_exists(path_glob)

        return bool(self._invoke_hadoop(['fs', '-test', '-e', path_glob],
                                        ok_returncodes=(0, 1)))
Example #18
def _endpoint_url(host_or_uri):
    """If *host_or_uri* is non-empty and isn't a URI, prepend ``'https://'``.

    Otherwise, pass through as-is.
    """
    if not host_or_uri:
        return host_or_uri
    elif is_uri(host_or_uri):
        return host_or_uri
    else:
        return 'https://' + host_or_uri
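Behavior at a glance (the host below is just an example value):

    _endpoint_url('')                       # -> '' (falsy; passed through)
    _endpoint_url('s3.amazonaws.com')       # -> 'https://s3.amazonaws.com'
    _endpoint_url('http://localhost:4567')  # already a URI; unchanged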
Example #19
    def join(self, path, *paths):
        """Join *paths* onto *path* (which may be a URI)"""
        all_paths = (path,) + paths

        # if there's a URI, we only care about it and what follows
        for i in range(len(all_paths), 0, -1):
            if is_uri(all_paths[i - 1]):
                scheme, netloc, uri_path = urlparse(all_paths[i - 1])[:3]
                return '%s://%s%s' % (
                    scheme, netloc, posixpath.join(
                        uri_path or '/', *all_paths[i:]))
        else:
            return os.path.join(*all_paths)
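Because the scan runs from right to left, the last URI among the arguments wins and everything before it is discarded:

    fs.join('s3://walrus/tmp', 'a', 'b')      # -> 's3://walrus/tmp/a/b'
    fs.join('/local', 's3://walrus', 'fish')  # -> 's3://walrus/fish'
    fs.join('a', 'b', 'c')                    # -> 'a/b/c' (no URI at all)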
Example #20
    def add(self, path):
        """Add a path. If *path* hasn't been added before, assign it a name.
                       If *path* is a URI don't add it; just return the URI.

        :return: the URI assigned to the path"""
        if is_uri(path):
            return path

        if path not in self._path_to_name:
            name = name_uniquely(path, names_taken=self._names_taken)
            self._names_taken.add(name)
            self._path_to_name[path] = name

        return self.uri(path)
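Together with the uri() method shown earlier, add() maps each local file to a stable URI under the manager's prefix. A sketch of typical use, assuming name_uniquely() keeps the basename when there is no collision (the prefix and paths are illustrative):

    mgr = UploadDirManager('hdfs:///files/')
    mgr.add('foo/bar.py')        # -> 'hdfs:///files/bar.py'
    mgr.add('s3://walrus/fish')  # a URI: returned unchanged, not tracked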
Example #21
    def du(self, path_glob):
        """Get the size of a file, or None if it's not a file or doesn't
        exist."""
        if not is_uri(path_glob):
            return super(HadoopJobRunner, self).du(path_glob)

        stdout = self._invoke_hadoop(['fs', '-du', path_glob],
                                     return_stdout=True)

        try:
            return int(stdout.split()[1])
        except (ValueError, TypeError, IndexError):
            raise Exception('Unexpected output from hadoop fs -du: %r' %
                            stdout)
Example #22
    def add(self, path):
        """Add a path. If *path* hasn't been added before, assign it a name.
                       If *path* is a URI don't add it; just return the URI.

        :return: the URI assigned to the path"""
        if (not os.path.exists(path)) and is_uri(path):
            return path

        if path not in self._path_to_name:
            name = name_uniquely(path, names_taken=self._names_taken)
            self._names_taken.add(name)
            self._path_to_name[path] = name

        return self.uri(path)
Example #23
def fully_qualify_hadoop_path(path):
    """If we're on MapR, we should get an alternative to hdfs://. CDH4 will fail"""
    process = Popen(HADOOP_FETCH_URI_SCHEME, shell=True, stdout=PIPE, stderr=STDOUT)
    uri_scheme = process.communicate()[0]
    if process.returncode != 0:
       uri_scheme='hdfs://'
    else:
       uri_scheme = HADOOP_FETCH_URI_CLEANUP.sub('//', uri_scheme)
    if is_uri(path):
        return path
    elif path.startswith('/'):
        return uri_scheme + path
    else:
        return '%s/user/%s/%s' % (uri_scheme, getpass.getuser(), path)
Example #24
    def setUp(self):
        super(CompositeFilesystemTestCase, self).setUp()

        self.log = self.start(patch('mrjob.fs.composite.log'))

        self.hadoop_fs = Mock(spec=Filesystem)
        self.hadoop_fs.get_hadoop_version = Mock()
        self.hadoop_fs.can_handle_path.side_effect = is_uri

        self.local_fs = Mock(spec=Filesystem)
        self.local_fs.can_handle_path.side_effect = lambda p: not is_uri(p)

        self.s3_fs = Mock(spec=Filesystem)
        self.s3_fs.create_bucket = Mock()
        self.s3_fs.can_handle_path.side_effect = is_s3_uri
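With can_handle_path() wired up this way, routing is driven entirely by is_uri()/is_s3_uri(). A hypothetical dispatch helper in the same spirit (CompositeFilesystem's real selection logic may differ):

    def _fs_for(filesystems, path):
        """Return the first filesystem that can handle *path*."""
        for fs in filesystems:
            if fs.can_handle_path(path):
                return fs
        raise IOError('no filesystem can handle %s' % path)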
Example #25
    def test_is_uri(self):
        self.assertEqual(is_uri('notauri!'), False)
        self.assertEqual(is_uri('they://did/the/monster/mash'), True)
        self.assertEqual(is_uri(r'C:\some\windows\path'), False)
        # test #1455
        self.assertEqual(is_uri('2016-10-11T06:29:17'), False)
        # sorry, we only care about file URIs
        self.assertEqual(is_uri('mailto:[email protected]'), False)
        # urlparse has to accept it
        self.assertEqual(is_uri('://'), False)
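Taken together, these cases say a URI must have a scheme followed by '://', which rules out Windows paths, timestamps, and mailto: links. A minimal sketch consistent with every assertion above (the real mrjob implementation may differ):

    import re

    _URI_RE = re.compile(r'^[A-Za-z][A-Za-z0-9.+-]*://')

    def is_uri(path):
        """Return True if *path* looks like scheme://..."""
        return bool(_URI_RE.match(path))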
Example #26
    def rm(self, path_glob):
        if not is_uri(path_glob):
            return super(HadoopJobRunner, self).rm(path_glob)

        if self.path_exists(path_glob):
            # hadoop fs -rmr will print something like:
            # Moved to trash: hdfs://hdnamenode:54310/user/dave/asdf
            # to STDOUT, which we don't care about.
            #
            # if we ask to delete a path that doesn't exist, it prints
            # to STDERR something like:
            # rmr: <path>
            # which we can safely ignore
            self._invoke_hadoop(
                ['fs', '-rmr', path_glob],
                return_stdout=True, ok_stderr=[HADOOP_RMR_NO_SUCH_FILE])
Example #27
    def rm(self, path_glob):
        if not is_uri(path_glob):
            return super(HadoopFilesystem, self).rm(path_glob)

        version = self.get_hadoop_version()
        if uses_yarn(version):
            args = ['fs', '-rm', '-R', '-f', '-skipTrash', path_glob]
        else:
            args = ['fs', '-rmr', '-skipTrash', path_glob]

        try:
            self.invoke_hadoop(args,
                               return_stdout=True,
                               ok_stderr=[_HADOOP_RM_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not rm %s" % path_glob)
Example #28
    def _dir_archive_path(self, dir_path):
        """Assign a path for the archive of *dir_path* but don't
        actually create anything."""
        if dir_path not in self._dir_to_archive_path:
            # we can check local paths now
            if not (is_uri(dir_path) or os.path.isdir(dir_path)):
                raise OSError('%s is not a directory!' % dir_path)

            name = name_uniquely(
                dir_path, names_taken=self._dir_archive_names_taken)
            self._dir_archive_names_taken.add(name)

            self._dir_to_archive_path[dir_path] = os.path.join(
                self._get_local_tmp_dir(), 'archives', name + '.tar.gz')

        return self._dir_to_archive_path[dir_path]
Example #29
    def _check_input_exists(self):
        """Make sure all input exists before continuing with our job.
        """
        if not self._opts['check_input_paths']:
            return

        for path in self._input_paths:
            if path == '-':
                continue  # STDIN always exists

            if is_uri(path) and not is_gcs_uri(path):
                continue  # can't check non-GCS URIs, hope for the best

            if not self.fs.exists(path):
                raise AssertionError(
                    'Input path %s does not exist!' % (path,))
Example #30
    def __init__(self, **kwargs):
        super(SparkMRJobRunner, self).__init__(**kwargs)

        self._spark_tmp_dir = self._pick_spark_tmp_dir()

        # where local files are uploaded into Spark
        if is_uri(self._spark_tmp_dir):
            spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
            self._upload_mgr = UploadDirManager(spark_files_dir)

        # where to put job output (if not set explicitly)
        if not self._output_dir:
            self._output_dir = posixpath.join(self._spark_tmp_dir, 'output')

        # keep track of where the spark-submit binary is
        self._spark_submit_bin = self._opts['spark_submit_bin']
Example #31
    def _create_dir_archive(self, dir_path):
        """Helper for :py:meth:`archive_dir`"""
        if not self.fs.exists(dir_path):
            raise OSError('%s does not exist' % dir_path)

        tar_gz_path = self._dir_archive_path(dir_path)

        if tar_gz_path in self._dir_archives_created:
            return  # already created

        if not os.path.isdir(os.path.dirname(tar_gz_path)):
            os.makedirs(os.path.dirname(tar_gz_path))

        # for remote files
        tmp_download_path = os.path.join(self._get_local_tmp_dir(),
                                         'tmp-download')

        log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))

        with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
            for path in self.fs.ls(dir_path):
                # fs.ls() only lists files
                if path == dir_path:
                    raise OSError('%s is a file, not a directory!' % dir_path)

                # TODO: do we need this?
                if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                    raise OSError('attempted to archive %s into itself!' %
                                  tar_gz_path)

                if is_uri(path):
                    path_in_tar_gz = path[len(dir_path):].lstrip('/')

                    log.info('  downloading %s -> %s' %
                             (path, tmp_download_path))
                    with open(tmp_download_path, 'wb') as f:
                        for chunk in self.fs.cat(path):
                            f.write(chunk)
                    local_path = tmp_download_path
                else:
                    path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                    local_path = path

                log.debug('  adding %s to %s' % (path, tar_gz_path))
                tar_gz.add(local_path, path_in_tar_gz, recursive=False)

        self._dir_archives_created.add(tar_gz_path)
Example #32
    def add(self, path):
        """Add a path. If *path* hasn't been added before, assign it a name.
                       If *path* is a URI don't add it; just return the URI.

        :return: the URI assigned to the path"""
        if is_uri(path):
            return path

        if path not in self._path_to_name:
            # use unhide so that input files won't be hidden from Hadoop,
            # see #1200
            name = name_uniquely(
                path, names_taken=self._names_taken, unhide=True)
            self._names_taken.add(name)
            self._path_to_name[path] = name

        return self.uri(path)
Example #33
    def _upload_non_input_files(self):
        """Copy files to HDFS, and set the 'hdfs_uri' field for each file.
        """
        self._pick_hdfs_uris_for_files()

        hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
        self._mkdir_on_hdfs(hdfs_files_dir)
        log.info('Copying non-input files into %s' % hdfs_files_dir)

        for file_dict in self._files:
            path = file_dict['path']

            # don't bother with files already in HDFS
            if is_uri(path):
                continue

            self._upload_to_hdfs(path, file_dict['hdfs_uri'])
Example #34
    def rm(self, path_glob):
        if not is_uri(path_glob):
            return super(HadoopFilesystem, self).rm(path_glob)

        # hadoop fs -rmr will print something like:
        # Moved to trash: hdfs://hdnamenode:54310/user/dave/asdf
        # to STDOUT, which we don't care about.
        #
        # if we ask to delete a path that doesn't exist, it prints
        # to STDERR something like:
        # rmr: <path>
        # which we can safely ignore
        try:
            self.invoke_hadoop(
                ['fs', '-rmr', path_glob],
                return_stdout=True, ok_stderr=[HADOOP_RMR_NO_SUCH_FILE])
        except CalledProcessError:
            raise IOError("Could not rm %s" % path_glob)
Example #35
    def test_copy_files_with_rename_to_local_wd_mirror(self):
        # see test_upload_files_with_rename() in test_local for comparison

        fish_path = self.makefile('fish', b'salmon')
        fowl_path = self.makefile('fowl', b'goose')

        # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
        # doesn't have a working directory
        job = MRSparkOSWalk(['-r', 'spark',
                             '--spark-master', _LOCAL_CLUSTER_MASTER,
                             '--file', fish_path + '#ghoti',
                             '--file', fowl_path])
        job.sandbox()

        file_sizes = {}

        with job.make_runner() as runner:
            runner.run()

            # check working dir mirror
            wd_mirror = runner._wd_mirror()
            self.assertIsNotNone(wd_mirror)
            self.assertFalse(is_uri(wd_mirror))

            self.assertTrue(exists(wd_mirror))
            # only files which needed to be renamed should be in wd_mirror
            self.assertTrue(exists(join(wd_mirror, 'ghoti')))
            self.assertFalse(exists(join(wd_mirror, 'fish')))
            self.assertFalse(exists(join(wd_mirror, 'fowl')))

            for line in to_lines(runner.cat_output()):
                path, size = safeeval(line)
                file_sizes[path] = size

        # check that files were uploaded to working dir
        self.assertIn('fowl', file_sizes)
        self.assertEqual(file_sizes['fowl'], 5)

        self.assertIn('ghoti', file_sizes)
        self.assertEqual(file_sizes['ghoti'], 6)

        # fish was uploaded as "ghoti"
        self.assertNotIn('fish', file_sizes)