def spark_mapper(current_range):
    """
    Gets the paths to the file(s) in the current executor, then
    declares the headers found.

    Args:
        current_range (tuple): A pair that contains the starting and
            ending values of the current range.

    Returns:
        The result of the mapper function executed on the current
        range, after all headers and shared libraries needed for the
        analysis have been declared on the executor.
    """
    # `headers`, `shared_libraries` and `mapper` are captured from the
    # enclosing scope where this closure is defined.

    # Get and declare headers on each worker
    headers_on_executor = [
        pyspark.SparkFiles.get(ntpath.basename(filepath))
        for filepath in headers
    ]
    Utils.declare_headers(headers_on_executor)

    # Get and declare shared libraries on each worker
    shared_libs_on_ex = [
        pyspark.SparkFiles.get(ntpath.basename(filepath))
        for filepath in shared_libraries
    ]
    Utils.declare_shared_libraries(shared_libs_on_ex)

    return mapper(current_range)
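# A minimal sketch of how the headers and libraries reach the Spark
# executors in the first place, so that pyspark.SparkFiles.get can
# resolve them by basename inside spark_mapper above. It assumes an
# application-created SparkContext; the header path is a hypothetical
# placeholder, not a name taken from the code above.
import ntpath

import pyspark

if __name__ == "__main__":
    sc = pyspark.SparkContext.getOrCreate()
    # Ship a local file to every executor
    sc.addFile("/path/to/mylib/analysis_header.hxx")
    # Only the basename is needed to recover the local copy on a worker
    local_copy = pyspark.SparkFiles.get(
        ntpath.basename("/path/to/mylib/analysis_header.hxx"))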
def distribute_shared_libraries(self, shared_libraries_paths):
    """
    Includes the C++ shared libraries to be declared before execution.
    If any pcm files are present in the same folder as the shared
    libraries, the function will try to retrieve and distribute them
    as well.

    Args:
        shared_libraries_paths (str, iter): A string or an iterable
            (such as a list, set...) containing the paths to all
            necessary C++ shared libraries as strings. This function
            accepts both paths to the libraries themselves and paths
            to directories containing the libraries.
    """
    libraries_to_distribute = set()
    pcm_to_distribute = set()

    if isinstance(shared_libraries_paths, str):
        pcm_to_distribute, libraries_to_distribute = (
            Utils.check_pcm_in_library_path(shared_libraries_paths))
    else:
        for path_string in shared_libraries_paths:
            pcm, libraries = Utils.check_pcm_in_library_path(path_string)
            libraries_to_distribute.update(libraries)
            pcm_to_distribute.update(pcm)

    # Distribute shared libraries and pcm files to the workers
    self.distribute_unique_paths(libraries_to_distribute)
    self.distribute_unique_paths(pcm_to_distribute)

    # Include shared libraries locally
    Utils.declare_shared_libraries(libraries_to_distribute)

    # Finally, add everything to the set of shared libraries
    self.shared_libraries.update(libraries_to_distribute)
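# A minimal, self-contained sketch of the lookup that
# Utils.check_pcm_in_library_path is expected to perform, inferred only
# from the docstring above: collect the shared libraries for a given
# path (a single library or a directory of libraries) together with
# any .pcm files found in the same folder. This is an assumption about
# the helper's behaviour, not its actual implementation.
import glob
import os


def check_pcm_in_library_path_sketch(path_string):
    """Return (pcm_files, libraries) found for the given path."""
    if os.path.isdir(path_string):
        folder = path_string
        libraries = set(glob.glob(os.path.join(folder, "*.so")))
    else:
        folder = os.path.dirname(path_string)
        libraries = {path_string}
    pcm_files = set(glob.glob(os.path.join(folder, "*.pcm")))
    return pcm_files, libraries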
def dask_mapper(current_range):
    """
    Gets the paths to the file(s) in the current executor, then
    declares the headers found.

    Args:
        current_range (tuple): The current range of the dataset being
            processed on the executor.

    Returns:
        The result of the mapper function executed on the current
        range, after all headers and shared libraries needed for the
        analysis have been declared on the executor.
    """
    # Retrieve the current worker's local directory, where distributed
    # files are stored; `headers`, `shared_libraries` and `mapper` are
    # captured from the enclosing scope where this closure is defined.
    localdir = get_worker().local_directory

    # Get and declare headers on each worker
    headers_on_executor = [
        os.path.join(localdir, os.path.basename(filepath))
        for filepath in headers
    ]
    Utils.declare_headers(headers_on_executor)

    # Get and declare shared libraries on each worker
    shared_libs_on_ex = [
        os.path.join(localdir, os.path.basename(filepath))
        for filepath in shared_libraries
    ]
    Utils.declare_shared_libraries(shared_libs_on_ex)

    return mapper(current_range)
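# A minimal sketch of the Dask side of the distribution, assuming the
# files are shipped with distributed.Client.upload_file (one plausible
# mechanism; the surrounding code may use another). upload_file copies
# a local file into each worker's local directory, which is why
# dask_mapper above can rebuild its path from
# get_worker().local_directory plus the file's basename. The header
# path is a hypothetical placeholder.
import os

from dask.distributed import Client, get_worker


def locate_uploaded_header():
    # Runs on a worker: resolve the uploaded file's local path
    return os.path.join(get_worker().local_directory,
                        os.path.basename("/path/to/mylib/analysis_header.hxx"))


if __name__ == "__main__":
    client = Client()  # start or connect to a local cluster
    client.upload_file("/path/to/mylib/analysis_header.hxx")
    print(client.submit(locate_uploaded_header).result())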