def test_uri_parsing(self):
    self.assertEqual(is_uri('notauri!'), False)
    self.assertEqual(is_uri('they://did/the/monster/mash'), True)
    self.assertEqual(is_s3_uri('s3://a/uri'), True)
    self.assertEqual(is_s3_uri('s3n://a/uri'), True)
    self.assertEqual(is_s3_uri('hdfs://a/uri'), False)
    self.assertEqual(parse_s3_uri('s3://bucket/loc'), ('bucket', 'loc'))
def join(self, dirname, filename):
    """Join *filename* onto *dirname* (which may be a URI)"""
    if is_uri(filename):
        return filename
    elif is_uri(dirname):
        return posixpath.join(dirname, filename)
    else:
        return os.path.join(dirname, filename)
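# A minimal usage sketch for join() above (illustrative values only; the
# filesystem object name `fs` is assumed, and the local result shown is
# for a POSIX os.path):
#
#   fs.join('s3://bucket/data', 'part-00000')  # -> 's3://bucket/data/part-00000'
#   fs.join('/tmp/data', 'part-00000')         # -> '/tmp/data/part-00000'
#   fs.join('/tmp/data', 's3://bucket/x')      # -> 's3://bucket/x' (a URI filename wins)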
def _create_input_manifest_if_needed(self):
    """Create a file with a list of URIs of input files."""
    if self._input_manifest_path or not self._uses_input_manifest():
        return

    uris = []

    log.info('finding input files to add to manifest...')

    for path in self._get_input_paths():
        log.debug('  in %s' % path)
        if is_uri(path):
            # URIs might be globs
            for uri in self.fs.ls(path):
                uris.append(uri)
        else:
            # local paths are expected to be single files
            # (shell would resolve globs)
            if self._upload_mgr:
                uris.append(self._upload_mgr.uri(path))
            else:
                # just make sure job can find files from its working dir
                uris.append(os.path.abspath(path))

    log.info('found %d input files' % len(uris))

    path = os.path.join(self._get_local_tmp_dir(), 'input-manifest.txt')
    self._write_script(uris, path, 'input manifest')

    self._input_manifest_path = path
    if self._upload_mgr:
        self._upload_mgr.add(self._input_manifest_path)
def test_default(self):
    runner = SparkMRJobRunner()

    self.assertFalse(is_uri(runner._spark_tmp_dir))
    self.assertIsNone(runner._upload_mgr)

    self.assertEqual(runner._spark_tmp_dir[-6:], '-spark')
def ls(self, path_glob):
    if not is_uri(path_glob):
        for path in super(HadoopJobRunner, self).ls(path_glob):
            yield path
        return

    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    stdout = self._invoke_hadoop(
        ['fs', '-lsr', path_glob],
        return_stdout=True,
        ok_stderr=[HADOOP_LSR_NO_SUCH_FILE])

    for line in StringIO(stdout):
        fields = line.rstrip('\r\n').split()

        # expect lines like:
        # -rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar
        if len(fields) < 8:
            raise Exception('unexpected ls line from hadoop: %r' % line)

        # ignore directories
        if fields[0].startswith('d'):
            continue

        # not sure if you can have spaces in filenames; just to be safe
        path = ' '.join(fields[7:])

        yield hdfs_prefix + path
def test_spark_master_mesos(self):
    runner = SparkMRJobRunner(spark_master='mesos://host:12345')

    self.assertTrue(is_uri(runner._spark_tmp_dir))
    self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')

    self.assertIsNotNone(runner._upload_mgr)
def _cat_file(self, filename):
    if is_uri(filename):
        # stream from HDFS
        cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            for line in cat_proc.stdout:
                yield line

            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            returncode = cat_proc.wait()

            if returncode != 0:
                raise CalledProcessError(returncode, cat_args)

        return read_file(filename, stream())
    else:
        # read from local filesystem
        return super(HadoopJobRunner, self)._cat_file(filename)
def test_spark_master_yarn(self):
    runner = SparkMRJobRunner(spark_master='yarn')

    self.assertTrue(is_uri(runner._spark_tmp_dir))
    self.assertEqual(runner._spark_tmp_dir[:8], 'hdfs:///')

    self.assertIsNotNone(runner._upload_mgr)
def _setup_input(self):
    """Copy local input files (if any) to a special directory on HDFS.

    Set self._hdfs_input_files
    """
    # winnow out HDFS files from local ones
    self._hdfs_input_files = []
    local_input_files = []

    for path in self._input_paths:
        if is_uri(path):
            # Don't even bother running the job if the input isn't there.
            if not self.ls(path):
                raise AssertionError(
                    'Input path %s does not exist!' % (path,))
            self._hdfs_input_files.append(path)
        else:
            local_input_files.append(path)

    # copy local files into an input directory, with names like
    # 00000-actual_name.ext
    if local_input_files:
        hdfs_input_dir = posixpath.join(self._hdfs_tmp_dir, 'input')
        log.info('Uploading input to %s' % hdfs_input_dir)
        self._mkdir_on_hdfs(hdfs_input_dir)

        for i, path in enumerate(local_input_files):
            if path == '-':
                path = self._dump_stdin_to_local_file()

            target = '%s/%05i-%s' % (
                hdfs_input_dir, i, os.path.basename(path))
            self._upload_to_hdfs(path, target)

        self._hdfs_input_files.append(hdfs_input_dir)
def _fully_qualify_subnetwork_uri(uri, project_id, region):
    if '/' not in uri:  # just a name
        uri = 'projects/%s/%s/subnetworks/%s' % (project_id, region, uri)

    if not is_uri(uri):
        uri = 'https://www.googleapis.com/compute/v1/' + uri

    return uri
def fully_qualify_hdfs_path(path):
    """If path isn't an ``hdfs://`` URL, turn it into one."""
    if is_uri(path):
        return path
    elif path.startswith('/'):
        return 'hdfs://' + path
    else:
        return 'hdfs:///user/%s/%s' % (getpass.getuser(), path)
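# A minimal usage sketch for fully_qualify_hdfs_path() above. The outputs
# follow directly from the three branches; the user name is whatever
# getpass.getuser() returns, shown here as 'dave':
#
#   fully_qualify_hdfs_path('s3://bucket/key')  # -> 's3://bucket/key' (already a URI)
#   fully_qualify_hdfs_path('/data/input')      # -> 'hdfs:///data/input'
#   fully_qualify_hdfs_path('input.txt')        # -> 'hdfs:///user/dave/input.txt'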
def _from_file_uri(path_or_uri):
    if is_uri(path_or_uri):
        if path_or_uri.startswith('file:///'):
            return path_or_uri[7:]  # keep last /
        else:
            raise ValueError('Not a file:/// URI')
    else:
        return path_or_uri
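# Illustration of _from_file_uri() above (hypothetical inputs):
#
#   _from_file_uri('file:///tmp/data.txt')  # -> '/tmp/data.txt'
#   _from_file_uri('/tmp/data.txt')         # -> '/tmp/data.txt' (not a URI, passed through)
#   _from_file_uri('hdfs:///foo')           # raises ValueError (URI, but not file:///)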
def test_copy_files_with_rename_to_remote_wd_mirror(self):
    self.add_mock_s3_data({'walrus': {'fish': b'salmon', 'fowl': b'goose'}})

    foe_path = self.makefile('foe', b'giant')

    run_spark_submit = self.start(patch(
        'mrjob.bin.MRJobBinRunner._run_spark_submit',
        return_value=0))

    job = MRSparkOSWalk(['-r', 'spark',
                         '--spark-master', 'mesos://host:9999',
                         '--spark-tmp-dir', 's3://walrus/tmp',
                         '--file', 's3://walrus/fish#ghoti',
                         '--file', 's3://walrus/fowl',
                         '--file', foe_path])
    job.sandbox()

    with job.make_runner() as runner:
        runner.run()

        # check working dir mirror
        wd_mirror = runner._wd_mirror()
        fs = runner.fs

        self.assertIsNotNone(wd_mirror)
        self.assertTrue(is_uri(wd_mirror))
        self.assertTrue(fs.exists(wd_mirror))
        # uploaded for rename
        self.assertTrue(fs.exists(fs.join(wd_mirror, 'ghoti')))
        # wrong name
        self.assertFalse(fs.exists(fs.join(wd_mirror, 'fish')))
        # no need to upload, already visible
        self.assertFalse(fs.exists(fs.join(wd_mirror, 'fowl')))
        # need to upload from local to remote
        self.assertTrue(fs.exists(fs.join(wd_mirror, 'foe')))

        run_spark_submit.assert_called_once_with(
            ANY, ANY, record_callback=ANY)

        spark_submit_args = run_spark_submit.call_args[0][0]
        self.assertIn('--files', spark_submit_args)
        files_arg = spark_submit_args[
            spark_submit_args.index('--files') + 1]

        self.assertEqual(
            files_arg, ','.join([
                fs.join(wd_mirror, 'foe'),
                's3://walrus/fowl',
                fs.join(wd_mirror, 'ghoti'),
                fs.join(wd_mirror, 'mr_spark_os_walk.py'),
            ]))
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1   3276 010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_string(line.split(b' ', path_index)[-1])
        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
def uri(self, path):
    """Get the URI for the given path. If *path* is a URI, just return it.
    """
    if is_uri(path):
        return path

    if path in self._path_to_name:
        return posixpath.join(self.prefix, self._path_to_name[path])
    else:
        raise ValueError('%r is not a URI or a known local file' % (path,))
def uri(self, path):
    """Get the URI for the given path. If *path* is a URI, just return it.
    """
    if (not os.path.exists(path)) and is_uri(path):
        return path

    if path in self._path_to_name:
        return posixpath.join(self.prefix, self._path_to_name[path])
    else:
        raise ValueError('%r is not a URI or a known local file' % (path,))
def ls(self, path_glob):
    components = urlparse(path_glob)
    hdfs_prefix = '%s://%s' % (components.scheme, components.netloc)

    version = self.get_hadoop_version()

    # use ls -R on Hadoop 2 (see #1152)
    if uses_yarn(version):
        args = ['fs', '-ls', '-R', path_glob]
    else:
        args = ['fs', '-lsr', path_glob]

    try:
        stdout = self.invoke_hadoop(args, return_stdout=True,
                                    ok_stderr=[_HADOOP_LS_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not ls %s" % path_glob)

    for line in BytesIO(stdout):
        line = line.rstrip(b'\r\n')

        # ignore total item count
        if line.startswith(b'Found '):
            continue

        fields = line.split(b' ')

        # Throw out directories
        if fields[0].startswith(b'd'):
            continue

        # Try to figure out which part of the line is the path
        # Expected lines:
        #
        # HDFS:
        # -rw-r--r--   3 dave users  3276 2010-01-13 14:00 /foo/bar
        #
        # S3:
        # -rwxrwxrwx   1   3276 010-01-13 14:00 /foo/bar
        path_index = None
        for index, field in enumerate(fields):
            # look for time field, and pick one after that
            # (can't use field[2] because that's an int in Python 3)
            if len(field) == 5 and field[2:3] == b':':
                path_index = (index + 1)

        if not path_index:
            raise IOError("Could not locate path in string %r" % line)

        path = to_unicode(line.split(b' ', path_index)[-1])
        # handle fully qualified URIs from newer versions of Hadoop ls
        # (see Pull Request #577)
        if is_uri(path):
            yield path
        else:
            yield hdfs_prefix + path
def path_exists(self, path_glob):
    """Does the given path exist?

    If dest is a directory (ends with a "/"), we check if there are
    any files starting with that path.
    """
    if not is_uri(path_glob):
        return super(HadoopJobRunner, self).path_exists(path_glob)

    return bool(self._invoke_hadoop(['fs', '-test', '-e', path_glob],
                                    ok_returncodes=(0, 1)))
def _endpoint_url(host_or_uri):
    """If *host_or_uri* is non-empty and isn't a URI, prepend
    ``'https://'``.

    Otherwise, pass through as-is.
    """
    if not host_or_uri:
        return host_or_uri
    elif is_uri(host_or_uri):
        return host_or_uri
    else:
        return 'https://' + host_or_uri
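# Illustration of _endpoint_url() above (hypothetical endpoint values):
#
#   _endpoint_url('')                       # -> '' (empty passes through)
#   _endpoint_url('s3.amazonaws.com')       # -> 'https://s3.amazonaws.com'
#   _endpoint_url('http://localhost:4567')  # -> 'http://localhost:4567' (already a URI)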
def join(self, path, *paths):
    """Join *paths* onto *path* (which may be a URI)"""
    all_paths = (path,) + paths

    # if there's a URI, we only care about it and what follows
    for i in range(len(all_paths), 0, -1):
        if is_uri(all_paths[i - 1]):
            scheme, netloc, uri_path = urlparse(all_paths[i - 1])[:3]
            return '%s://%s%s' % (
                scheme, netloc, posixpath.join(
                    uri_path or '/', *all_paths[i:]))
    else:
        return os.path.join(*all_paths)
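# Sketch of the variadic join() above (the `fs` object and values are
# hypothetical). The rightmost URI wins; everything before it is ignored:
#
#   fs.join('hdfs:///tmp', 'step-output', 'part-00000')
#   # -> 'hdfs:///tmp/step-output/part-00000'
#
#   fs.join('/tmp', 's3://bucket/logs', 'part-00000')
#   # -> 's3://bucket/logs/part-00000' (later URI takes precedence)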
def add(self, path):
    """Add a path. If *path* hasn't been added before, assign it a name.
    If *path* is a URI don't add it; just return the URI.

    :return: the URI assigned to the path"""
    if is_uri(path):
        return path

    if path not in self._path_to_name:
        name = name_uniquely(path, names_taken=self._names_taken)
        self._names_taken.add(name)
        self._path_to_name[path] = name

    return self.uri(path)
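# Sketch of how add() and uri() above work together. The UploadDirManager
# class name and its prefix argument come from the other snippets here;
# the file names are hypothetical, and name_uniquely() is assumed to keep
# the basename when there is no collision:
#
#   mgr = UploadDirManager('hdfs:///tmp/mrjob/files/')
#   mgr.add('/home/dave/job.py')       # -> 'hdfs:///tmp/mrjob/files/job.py'
#   mgr.add('s3://bucket/already.py')  # URIs pass through unchanged
#   mgr.uri('/home/dave/job.py')       # -> same URI that add() returned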
def du(self, path_glob):
    """Get the size of a file, or None if it's not a file or doesn't
    exist."""
    if not is_uri(path_glob):
        return super(HadoopJobRunner, self).du(path_glob)

    stdout = self._invoke_hadoop(['fs', '-du', path_glob],
                                 return_stdout=True)

    try:
        return int(stdout.split()[1])
    except (ValueError, TypeError, IndexError):
        raise Exception(
            'Unexpected output from hadoop fs -du: %r' % stdout)
def add(self, path):
    """Add a path. If *path* hasn't been added before, assign it a name.
    If *path* is a URI don't add it; just return the URI.

    :return: the URI assigned to the path"""
    if (not os.path.exists(path)) and is_uri(path):
        return path

    if path not in self._path_to_name:
        name = name_uniquely(path, names_taken=self._names_taken)
        self._names_taken.add(name)
        self._path_to_name[path] = name

    return self.uri(path)
def fully_qualify_hadoop_path(path):
    """If we're on MapR, we should get an alternative to hdfs://.
    CDH4 will fail"""
    process = Popen(HADOOP_FETCH_URI_SCHEME, shell=True,
                    stdout=PIPE, stderr=STDOUT)
    uri_scheme = process.communicate()[0]

    if process.returncode != 0:
        uri_scheme = 'hdfs://'
    else:
        uri_scheme = HADOOP_FETCH_URI_CLEANUP.sub('//', uri_scheme)

    if is_uri(path):
        return path
    elif path.startswith('/'):
        return uri_scheme + path
    else:
        return '%s/user/%s/%s' % (uri_scheme, getpass.getuser(), path)
def setUp(self):
    super(CompositeFilesystemTestCase, self).setUp()

    self.log = self.start(patch('mrjob.fs.composite.log'))

    self.hadoop_fs = Mock(spec=Filesystem)
    self.hadoop_fs.get_hadoop_version = Mock()
    self.hadoop_fs.can_handle_path.side_effect = is_uri

    self.local_fs = Mock(spec=Filesystem)
    self.local_fs.can_handle_path.side_effect = lambda p: not is_uri(p)

    self.s3_fs = Mock(spec=Filesystem)
    self.s3_fs.create_bucket = Mock()
    self.s3_fs.can_handle_path.side_effect = is_s3_uri
def test_is_uri(self):
    self.assertEqual(is_uri('notauri!'), False)
    self.assertEqual(is_uri('they://did/the/monster/mash'), True)
    self.assertEqual(is_uri(r'C:\some\windows\path'), False)

    # test #1455
    self.assertEqual(is_uri('2016-10-11T06:29:17'), False)

    # sorry, we only care about file URIs
    self.assertEqual(is_uri('mailto:[email protected]'), False)

    # urlparse has to accept it
    self.assertEqual(is_uri('://'), False)
def rm(self, path_glob):
    if not is_uri(path_glob):
        super(HadoopJobRunner, self).rm(path_glob)

    if self.path_exists(path_glob):
        # hadoop fs -rmr will print something like:
        # Moved to trash: hdfs://hdnamenode:54310/user/dave/asdf
        # to STDOUT, which we don't care about.
        #
        # if we ask to delete a path that doesn't exist, it prints
        # to STDERR something like:
        # rmr: <path>
        # which we can safely ignore
        self._invoke_hadoop(
            ['fs', '-rmr', path_glob],
            return_stdout=True, ok_stderr=[HADOOP_RMR_NO_SUCH_FILE])
def rm(self, path_glob):
    if not is_uri(path_glob):
        return super(HadoopFilesystem, self).rm(path_glob)

    version = self.get_hadoop_version()
    if uses_yarn(version):
        args = ['fs', '-rm', '-R', '-f', '-skipTrash', path_glob]
    else:
        args = ['fs', '-rmr', '-skipTrash', path_glob]

    try:
        self.invoke_hadoop(args, return_stdout=True,
                           ok_stderr=[_HADOOP_RM_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not rm %s" % path_glob)
def _dir_archive_path(self, dir_path):
    """Assign a path for the archive of *dir_path* but don't
    actually create anything."""
    if dir_path not in self._dir_to_archive_path:
        # we can check local paths now
        if not (is_uri(dir_path) or os.path.isdir(dir_path)):
            raise OSError('%s is not a directory!' % dir_path)

        name = name_uniquely(
            dir_path, names_taken=self._dir_archive_names_taken)
        self._dir_archive_names_taken.add(name)

        self._dir_to_archive_path[dir_path] = os.path.join(
            self._get_local_tmp_dir(), 'archives', name + '.tar.gz')

    return self._dir_to_archive_path[dir_path]
def _check_input_exists(self):
    """Make sure all input exists before continuing with our job."""
    if not self._opts['check_input_paths']:
        return

    for path in self._input_paths:
        if path == '-':
            continue  # STDIN always exists

        if is_uri(path) and not is_gcs_uri(path):
            continue  # can't check non-GCS URIs, hope for the best

        if not self.fs.exists(path):
            raise AssertionError(
                'Input path %s does not exist!' % (path,))
def __init__(self, **kwargs):
    super(SparkMRJobRunner, self).__init__(**kwargs)

    self._spark_tmp_dir = self._pick_spark_tmp_dir()

    # where local files are uploaded into Spark
    if is_uri(self._spark_tmp_dir):
        spark_files_dir = posixpath.join(self._spark_tmp_dir, 'files', '')
        self._upload_mgr = UploadDirManager(spark_files_dir)

    # where to put job output (if not set explicitly)
    if not self._output_dir:
        self._output_dir = posixpath.join(self._spark_tmp_dir, 'output')

    # keep track of where the spark-submit binary is
    self._spark_submit_bin = self._opts['spark_submit_bin']
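# Side note on the posixpath.join(..., 'files', '') idiom above: the
# trailing empty component forces a trailing slash, which is what
# UploadDirManager expects as a prefix. With a hypothetical tmp dir:
#
#   posixpath.join('hdfs:///tmp/mrjob-spark', 'files', '')
#   # -> 'hdfs:///tmp/mrjob-spark/files/'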
def _create_dir_archive(self, dir_path):
    """Helper for :py:meth:`archive_dir`"""
    if not self.fs.exists(dir_path):
        raise OSError('%s does not exist' % dir_path)

    tar_gz_path = self._dir_archive_path(dir_path)

    if tar_gz_path in self._dir_archives_created:
        return  # already created

    if not os.path.isdir(os.path.dirname(tar_gz_path)):
        os.makedirs(os.path.dirname(tar_gz_path))

    # for remote files
    tmp_download_path = os.path.join(
        self._get_local_tmp_dir(), 'tmp-download')

    log.info('Archiving %s -> %s' % (dir_path, tar_gz_path))

    with tarfile.open(tar_gz_path, mode='w:gz') as tar_gz:
        for path in self.fs.ls(dir_path):
            # fs.ls() only lists files
            if path == dir_path:
                raise OSError('%s is a file, not a directory!' % dir_path)

            # TODO: do we need this?
            if os.path.realpath(path) == os.path.realpath(tar_gz_path):
                raise OSError(
                    'attempted to archive %s into itself!' % tar_gz_path)

            if is_uri(path):
                path_in_tar_gz = path[len(dir_path):].lstrip('/')

                log.info('  downloading %s -> %s' % (
                    path, tmp_download_path))
                with open(tmp_download_path, 'wb') as f:
                    for chunk in self.fs.cat(path):
                        f.write(chunk)
                local_path = tmp_download_path
            else:
                path_in_tar_gz = path[len(dir_path):].lstrip(os.sep)
                local_path = path

            log.debug('  adding %s to %s' % (path, tar_gz_path))
            tar_gz.add(local_path, path_in_tar_gz, recursive=False)

    self._dir_archives_created.add(tar_gz_path)
def add(self, path):
    """Add a path. If *path* hasn't been added before, assign it a name.
    If *path* is a URI don't add it; just return the URI.

    :return: the URI assigned to the path"""
    if is_uri(path):
        return path

    if path not in self._path_to_name:
        # use unhide so that input files won't be hidden from Hadoop,
        # see #1200
        name = name_uniquely(
            path, names_taken=self._names_taken, unhide=True)
        self._names_taken.add(name)
        self._path_to_name[path] = name

    return self.uri(path)
def _upload_non_input_files(self):
    """Copy files to HDFS, and set the 'hdfs_uri' field for each file."""
    self._pick_hdfs_uris_for_files()

    hdfs_files_dir = posixpath.join(self._hdfs_tmp_dir, 'files', '')
    self._mkdir_on_hdfs(hdfs_files_dir)

    log.info('Copying non-input files into %s' % hdfs_files_dir)

    for file_dict in self._files:
        path = file_dict['path']

        # don't bother with files already in HDFS
        if is_uri(path):
            continue

        self._upload_to_hdfs(path, file_dict['hdfs_uri'])
def rm(self, path_glob):
    if not is_uri(path_glob):
        return super(HadoopFilesystem, self).rm(path_glob)

    # hadoop fs -rmr will print something like:
    # Moved to trash: hdfs://hdnamenode:54310/user/dave/asdf
    # to STDOUT, which we don't care about.
    #
    # if we ask to delete a path that doesn't exist, it prints
    # to STDERR something like:
    # rmr: <path>
    # which we can safely ignore
    try:
        self.invoke_hadoop(
            ['fs', '-rmr', path_glob],
            return_stdout=True, ok_stderr=[HADOOP_RMR_NO_SUCH_FILE])
    except CalledProcessError:
        raise IOError("Could not rm %s" % path_glob)
def test_copy_files_with_rename_to_local_wd_mirror(self):
    # see test_upload_files_with_rename() in test_local for comparison

    fish_path = self.makefile('fish', b'salmon')
    fowl_path = self.makefile('fowl', b'goose')

    # use _LOCAL_CLUSTER_MASTER because the default master (local[*])
    # doesn't have a working directory
    job = MRSparkOSWalk(['-r', 'spark',
                         '--spark-master', _LOCAL_CLUSTER_MASTER,
                         '--file', fish_path + '#ghoti',
                         '--file', fowl_path])
    job.sandbox()

    file_sizes = {}

    with job.make_runner() as runner:
        runner.run()

        # check working dir mirror
        wd_mirror = runner._wd_mirror()
        self.assertIsNotNone(wd_mirror)
        self.assertFalse(is_uri(wd_mirror))
        self.assertTrue(exists(wd_mirror))
        # only files which needed to be renamed should be in wd_mirror
        self.assertTrue(exists(join(wd_mirror, 'ghoti')))
        self.assertFalse(exists(join(wd_mirror, 'fish')))
        self.assertFalse(exists(join(wd_mirror, 'fowl')))

        for line in to_lines(runner.cat_output()):
            path, size = safeeval(line)
            file_sizes[path] = size

    # check that files were uploaded to working dir
    self.assertIn('fowl', file_sizes)
    self.assertEqual(file_sizes['fowl'], 5)

    self.assertIn('ghoti', file_sizes)
    self.assertEqual(file_sizes['ghoti'], 6)

    # fish was uploaded as "ghoti"
    self.assertNotIn('fish', file_sizes)