import contextlib
import os
import shutil
import subprocess
import sys
import tempfile

import pytest

from lofn.api import DockerPipe  # import path assumed; adjust to lofn's layout


def test_binary_map_rdd(spark_setup):
    """
    Test lofn binary map is handling binary as expected.

    We want to return a directory; the user side presumably reads it in
    using sc.binaryFiles, and the output should be partitions of tuples
    containing (path, contents).
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
            rdd=rdd,
            image_name='ubuntu:xenial',
            command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as f:
        binary_dir = f
        mappedRDD = sc.binaryFiles(binary_dir)
        results = mappedRDD.collect()[0]
        assert results[0].split(':')[0] == 'file' and not results[1].isalnum()

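# A minimal sketch of the (path, contents) shape asserted above, assuming
# only the spark_setup fixture; _assert_binary_files_shape is a hypothetical
# helper for illustration, not part of lofn.
def _assert_binary_files_shape(sc, binary_dir):
    for path, contents in sc.binaryFiles(binary_dir).collect():
        assert path.split(':')[0] == 'file'              # keys are file: URIs
        assert isinstance(contents, (bytes, bytearray))  # whole-file payload
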
def test_binary_map_context_manager(spark_setup):
    """
    Test lofn binary map context manager.

    We want a context manager because spark's rdd binary file function takes
    a directory of files and then reads them in, so we want to keep the temp
    files around until they are read, then destroy the temporary directory.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
            rdd=rdd,
            image_name='ubuntu:xenial',
            command='cat /shared/input.txt > /shared/output.bin',
    ) as f:
        binary_dir = f
        os.listdir(binary_dir)  # should work inside the context manager
    with pytest.raises(OSError):
        # outside the context manager; raises FileNotFoundError
        os.listdir(binary_dir)

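# A minimal sketch of the context-manager pattern the test above relies on
# (illustrative, not lofn's actual implementation): the temp directory stays
# alive while the caller reads it, then is removed on exit.
@contextlib.contextmanager
def _temp_output_dir(parent=None):
    path = tempfile.mkdtemp(dir=parent)
    try:
        yield path           # caller runs sc.binaryFiles(path) etc. here
    finally:
        shutil.rmtree(path)  # removal on exit explains the OSError above
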
def test_custom_tempfile_directory(spark_setup):
    """
    Test user defined tempfile directory.

    This allows setting the name of the parent directory for the temp
    directories lofn uses to store tempfiles.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])

    def unique_dir_name(root, count=0):
        """Recursively find a unique directory name for testing so we
        don't have to destroy anything that already exists."""
        if not os.path.exists(root):
            return root
        count += 1
        new_name = root
        components = root.split('_')
        if len(components) > 1:
            new_name = '_'.join(components[:-1])
        new_name = new_name + '_' + str(count)
        return unique_dir_name(new_name, count)

    # still writing inside /tmp to make sure we have write permissions
    tempdir_name = unique_dir_name('/tmp/lofntestdir')
    assert not os.path.exists(tempdir_name)
    test_object = DockerPipe(conf, temporary_directory_parent=tempdir_name)
    # using map_binary since the context manager keeps the temp dirs open,
    # giving us the chance to check their existence and use them before
    # everything is removed
    with test_object.map_binary(
            rdd=rdd,
            image_name='ubuntu:xenial',
            command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as binary_dir:
        # make sure the path has been created
        assert os.path.exists(tempdir_name)
        # make sure the specified directory is being used
        assert tempdir_name in binary_dir
    shutil.rmtree(tempdir_name)

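# Hedged sketch of what temporary_directory_parent presumably controls inside
# lofn (names here are illustrative, not lofn's internals): temp dirs get
# created under the user-supplied parent, which is why the
# `tempdir_name in binary_dir` assertion above holds.
def _make_temp_under(parent):
    os.makedirs(parent, exist_ok=True)   # create the parent on demand
    return tempfile.mkdtemp(dir=parent)  # e.g. /tmp/lofntestdir/tmpXXXX
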
def test_binary_map_context_manager_hdfs(spark_setup):
    """
    Test lofn binary map context manager through hadoop fs.

    We want a context manager because spark's rdd binary file function takes
    a directory of files and then reads them in, so we want to keep the temp
    files around until they are read, then destroy the temporary directory.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'word'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
            rdd=rdd,
            image_name='ubuntu:xenial',
            command='cat /shared/input.txt > /shared/output.bin',
    ) as f:
        binary_dir = f
        # should work inside the context manager
        subprocess.check_call(['hadoop', 'fs', '-ls', binary_dir])
    with pytest.raises(subprocess.CalledProcessError):
        # outside the context manager, check_call should raise on the
        # non-zero exit status
        subprocess.check_call(['hadoop', 'fs', '-ls', binary_dir])

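# Sketch of the subprocess semantics the HDFS test leans on, assuming the
# hadoop CLI is on PATH: call() returns the exit status, while check_call()
# raises CalledProcessError on a non-zero status.
def _hdfs_path_exists(path):
    # `hadoop fs -test -e` exits 0 when the path exists, non-zero otherwise
    return subprocess.call(['hadoop', 'fs', '-test', '-e', path]) == 0
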
def test_binary_reduce(spark_setup):
    """
    Test the binary reduce.

    Takes the output of binary map (a binary rdd) and writes each pair
    wholesale into two temp files for operations on the binary files.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
            rdd=rdd,
            image_name='ubuntu:xenial',
            command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as f:
        binary_dir = f
        mappedRDD = sc.binaryFiles(binary_dir)
        results = new_object.reduce_binary(
            rdd=mappedRDD,
            image_name='ubuntu:xenial',
            command='cat /shared/input_1.bin > /shared/output.bin; '
                    'cat /shared/input_2.bin >> /shared/output.bin',
        )
    assert not results.isalnum() and 3000 > sys.getsizeof(results) >= 2048

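# Hedged sketch of where the 2048..3000 window above presumably comes from:
# each map task emits 1024 random bytes, the pairwise reduce concatenates
# them, and sys.getsizeof reports the payload plus CPython's bytes-object
# overhead. The partition count here is an assumption for illustration.
def _expected_reduced_size(n_partitions=2, chunk=1024):
    payload = b'\x00' * (n_partitions * chunk)  # concatenated map outputs
    return sys.getsizeof(payload)               # e.g. 2048 + object header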