Beispiel #1
0
 def _input_paths_for_step(self, step_num):
     """Return the paths that step *step_num* should read as input.

     For the first step (``step_num == 0``), every glob returned by
     ``self._get_input_paths()`` is expanded with ``self.fs.ls()`` and
     each match is passed through ``_from_file_uri()``, since a match
     could be a ``file://`` URI. The result is a list.

     For any later step, the input is whatever ``self.fs.ls()`` returns
     for ``part-*`` inside the previous step's output directory; that
     result is returned as-is.
     """
     if step_num == 0:
         first_step_paths = []
         for glob_pattern in self._get_input_paths():
             for match in self.fs.ls(glob_pattern):
                 # *match* could be a file:// URI
                 first_step_paths.append(_from_file_uri(match))
         return first_step_paths

     # later steps read the part files written by the step before them
     prev_output_dir = self._output_dir_for_step(step_num - 1)
     return self.fs.ls(join(prev_output_dir, 'part-*'))
Beispiel #2
0
    def _create_dist_cache_dir(self, step_num):
        """Populate the simulated Distributed Cache directory for a step.

        Working directory files are copied, and archives unpacked, into a
        shared directory, imitating the way Hadoop's Distributed Cache
        works on nodes.

        The directory is ``self._dist_cache_dir(step_num)``. Each entry's
        destination comes from ``self._path_in_dist_cache_dir()``, and
        every destination is passed to ``_chmod_u_rx()`` afterwards
        (recursively for unpacked archives).
        """
        shared_dir = self._dist_cache_dir(step_num)

        log.debug('creating simulated Distributed Cache dir: %s' % shared_dir)
        self.fs.mkdir(shared_dir)

        # plain files: copy each one into the cache dir under its name
        files_by_name = self._working_dir_mgr.name_to_path('file')
        for name, uri in files_by_name.items():
            src = _from_file_uri(uri)  # might start with file://
            target = self._path_in_dist_cache_dir(name, step_num)
            log.debug('copying %s -> %s' % (src, target))
            shutil.copy(src, target)
            _chmod_u_rx(target)

        # archives: unpack each one into the cache dir under its name
        archives_by_name = self._working_dir_mgr.name_to_path('archive')
        for name, uri in archives_by_name.items():
            src = _from_file_uri(uri)  # might start with file://
            target = self._path_in_dist_cache_dir(name, step_num)

            log.debug('unarchiving %s -> %s' % (src, target))
            unarchive(src, target)
            _chmod_u_rx(target, recursive=True)