def test_user_volume(spark_setup):
    """
    Test user volume support.

    These are volumes that are mounted in docker for a whole file
    reference, shared among all mappers and reducers. The assertion for
    this relies on there being two partitions, as defined in the spark
    fixture 'local[2]'.
    """
    sc, conf = spark_setup
    tempdir = tempfile.mkdtemp()
    temp_input_file = os.path.join(tempdir, 'test.txt')
    with open(temp_input_file, 'w') as f:
        f.write("a\nb\nc")
    rdd = sc.parallelize(['a', 'b'])
    mapped_rdd = DockerPipe(
        conf, volumes={tempdir: {'bind': '/user_volume/'}}
    ).map(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt > /shared/output.txt; "
                "cat /user_volume/test.txt >> /shared/output.txt",
        rdd=rdd)
    result = DockerPipe(conf).reduce(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt | sort > /shared/output.txt",
        rdd=mapped_rdd
    )
    shutil.rmtree(tempdir)
    assert result == ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c']
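# A minimal sketch of the volumes mapping used above. This assumes
# DockerPipe forwards the dict in docker-py's binding format, where each
# host path keys a dict with a 'bind' target and, optionally, a
# read/write 'mode':
#
#     volumes = {
#         '/host/data': {'bind': '/user_volume/', 'mode': 'rw'},
#     }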
def test_custom_name(spark_setup):
    """
    Check that lofn can do a simple map and reduce with custom temp file
    names inside the container.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'test', 'test', 'words'])
    mapped_rdd = DockerPipe(conf).map(
        image_name='ubuntu:xenial',
        container_input_name='foo.txt',
        container_output_name='bar.txt',
        command="cat /shared/foo.txt | tr -s ' ' | tr ' ' '\\n' | sort | "
                "uniq -c | awk '$1=$1' > /shared/bar.txt",
        rdd=rdd)
    result = DockerPipe(conf).reduce(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt | awk '{cnt[$2]==0 ? cnt[$2]=$1 : "
                "cnt[$2]+=$1} END {for (_ in cnt) print cnt[_],_}'"
                " | sort -nr > /shared/output.txt",
        rdd=mapped_rdd)
    top_word = result[0].split()[1]
    top_word_count = int(result[0].split()[0])
    assert top_word == 'test'
    assert top_word_count == 3
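# A worked example of the flow above, assuming parallelize splits the
# four elements evenly across the two 'local[2]' partitions as
# ['test', 'test'] and ['test', 'words']: the mappers write "2 test" and
# "1 test\n1 words" to bar.txt, and the reducer's awk sums the counts
# per word, so the numerically sorted result starts with "3 test".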
def test_user_volume_on_yarn(spark_setup):
    """
    Test user volume support on yarn/hdfs.

    These are volumes that are mounted in docker for a whole file
    reference, shared among all mappers and reducers.
    """
    sc, conf = spark_setup
    tempdir = tempfile.mkdtemp()
    temp_input_file = os.path.join(tempdir, 'test.txt')
    with open(temp_input_file, 'w') as f:
        f.write("a\nb\nc")
    subprocess.call(['hadoop', 'fs', '-put', '{}'.format(tempdir)])
    user = getpass.getuser()
    hadoop_target = os.path.join('/user/', user + '/')
    hadoop_path = os.path.join(hadoop_target, os.path.basename(tempdir))
    rdd = sc.parallelize(['a', 'b'])
    mapped_rdd = DockerPipe(
        conf, volumes={hadoop_path: {'bind': '/user_volume/'}}
    ).map(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt > /shared/output.txt; "
                "cat /user_volume/test.txt >> /shared/output.txt",
        rdd=rdd)
    result = DockerPipe(conf).reduce(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt | sort > /shared/output.txt",
        rdd=mapped_rdd
    )
    shutil.rmtree(tempdir)
    subprocess.call(['hadoop', 'fs', '-rm', '-r', '{}'.format(hadoop_path)])
    assert result == ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c']
def test_docker_failure(spark_setup):
    """
    Test that docker failures are bubbled up, causing lofn to fail and
    report why.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    test_object = DockerPipe(conf)
    # since pyspark catches the exception (docker fails) this is surfaced
    # as a PythonException from spark, which is a Py4JJavaError
    from py4j.protocol import Py4JJavaError
    with pytest.raises(Py4JJavaError):
        mapped_RDD = test_object.map(
            image_name='ubuntu:xenial',
            command="cat /shared/does_not_exist.txt > /shared/output.txt",
            rdd=rdd)
        # need to perform an action for the map to happen
        mapped_RDD.collect()
def test_binary_map_rdd(spark_setup):
    """
    Test lofn binary map is handling binary as expected.

    We want to return a directory, which the user side presumably reads
    in using sc.binaryFiles, and the output should be partitions of
    tuples containing (path, contents).
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as f:
        binary_dir = f
        mappedRDD = sc.binaryFiles(binary_dir)
        results = mappedRDD.collect()[0]
    assert results[0].split(':')[0] == 'file' and not results[1].isalnum()
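# sc.binaryFiles yields (path, contents) pairs, with local paths rendered
# as 'file:...' URIs; a sketch of consuming the directory produced above
# ('process' is a hypothetical consumer):
#
#     for path, contents in sc.binaryFiles(binary_dir).collect():
#         process(contents)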
def test_binary_map_context_manager(spark_setup):
    """
    Test lofn binary map context manager.

    We want a context manager because spark's rdd binary file function
    takes a directory of files then reads them in, so we want to keep
    temp files until these are read, then destroy the temporary
    directory.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='cat /shared/input.txt > /shared/output.bin',
    ) as f:
        binary_dir = f
        os.listdir(binary_dir)  # should work inside the context manager
    with pytest.raises(OSError):
        # this is outside of the context manager, raises FileNotFoundError
        os.listdir(binary_dir)
def test_custom_tempfile_directory(spark_setup):
    """
    Test user defined tempfile directory.

    This allows setting the name of the parent directory for the temp
    directories used to store tempfiles by lofn.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])

    def unique_dir_name(root, count=0):
        """Recursive function to find a unique directory name for
        testing, so we don't have to destroy anything."""
        if not os.path.exists(root):
            return root
        else:
            count += 1
            new_name = root
            components = root.split('_')
            if len(components) > 1:
                new_name = '_'.join(components[:-1])
            new_name = new_name + '_' + str(count)
            return unique_dir_name(new_name, count)

    # still writing inside of /tmp to make sure we have write permissions
    tempdir_name = unique_dir_name('/tmp/lofntestdir')
    assert not os.path.exists(tempdir_name)
    test_object = DockerPipe(conf, temporary_directory_parent=tempdir_name)
    # using map_binary since the context manager gives us the ability to
    # keep the temp dirs open for us to check their existence and use
    # before removing it all
    with test_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as binary_dir:
        # make sure the path has been created
        assert os.path.exists(tempdir_name)
        # make sure the specified directory is being used
        assert tempdir_name in binary_dir
    shutil.rmtree(tempdir_name)
def test_reduce_udf(spark_setup):
    """
    Test reduce UDF for writing temp files.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])

    def reduce_udf(part_1, part_2):
        output = [part_1[0][0]] + [part_2[0][0]]
        return {'input.txt': output}

    mappedRDD = DockerPipe(conf).map(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt > /shared/output.txt",
        rdd=rdd)
    result = DockerPipe(conf).reduce(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt | sort > /shared/output.txt",
        rdd=mappedRDD,
        reduce_udf=reduce_udf
    )
    assert result == ['t', 'w']
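# The reduce_udf above illustrates the apparent contract: it receives two
# partitions' worth of records and returns a dict mapping container file
# names to contents, which get staged under /shared/ before the command
# runs. A sketch of a udf that concatenates both partitions verbatim (a
# minimal assumption for illustration, not the library's documented API):
#
#     def concat_udf(part_1, part_2):
#         return {'input.txt': list(part_1) + list(part_2)}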
def test_binary_map_context_manager_on_yarn(spark_setup):
    """
    Test lofn binary map context manager on yarn/hdfs.

    We want a context manager because spark's rdd binary file function
    takes a directory of files then reads them in, so we want to keep
    temp files until these are read, then destroy the temporary
    directory.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'word'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='cat /shared/input.txt > /shared/output.bin'
    ) as f:
        binary_dir = f
        # should work inside the context manager
        subprocess.call(['hadoop', 'fs', '-ls', '{}'.format(binary_dir)])
    with pytest.raises(subprocess.CalledProcessError):
        # this is outside of the context manager, so it should exit with
        # non-zero status
        subprocess.check_call(['hadoop', 'fs', '-ls', '{}'.format(binary_dir)])
def test_map_udf(spark_setup):
    """
    Test map UDF for writing temp files.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])

    def map_udf(input_split):
        part1 = input_split[0][0]
        part2 = input_split[0][1:]
        return {'1.txt': part1, '2.txt': part2}

    mappedRDD = DockerPipe(conf).map(
        image_name='ubuntu:xenial',
        command="cat /shared/1.txt > /shared/output.txt",
        rdd=rdd,
        map_udf=map_udf)
    result = DockerPipe(conf).reduce(
        image_name='ubuntu:xenial',
        command="cat /shared/input.txt | sort > /shared/output.txt",
        rdd=mappedRDD
    )
    assert result == ['t', 'w']
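# map_udf follows the same pattern on the map side: it takes one
# partition's input split and returns a dict of container file names to
# contents, letting a single command read several staged files. A sketch
# that stages each partition twice (an assumption for illustration, not
# documented behavior):
#
#     def duplicate_udf(input_split):
#         data = list(input_split)
#         return {'copy_1.txt': data, 'copy_2.txt': data}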
def test_binary_reduce(spark_setup):
    """
    Test the binary reduce.

    Takes output from binary map, a binary rdd, and writes them wholesale
    into two temp files for operations on the binary files.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as f:
        binary_dir = f
        mappedRDD = sc.binaryFiles(binary_dir)
        results = new_object.reduce_binary(
            rdd=mappedRDD,
            image_name='ubuntu:xenial',
            command='cat /shared/input_1.bin > /shared/output.bin; '
                    'cat /shared/input_2.bin >> /shared/output.bin'
        )
    assert not results.isalnum() and 3000 > sys.getsizeof(results) >= 2048
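# The size bounds above follow from the setup: each of the two 'local[2]'
# partitions emits 1024 random bytes, the reduce concatenates them into a
# single 2048-byte payload, and sys.getsizeof adds only the bytes
# object's small header, keeping the total well under 3000.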
def test_user_volume_bad_input_format(spark_setup):
    """
    Test exception case of user volume input, failing to provide a
    dictionary.
    """
    sc, conf = spark_setup
    tempdir = tempfile.mkdtemp()
    temp_input_file = os.path.join(tempdir, 'test.txt')
    with open(temp_input_file, 'w') as f:
        f.write("a\nb\nc")
    rdd = sc.parallelize(['a', 'b'])
    with pytest.raises(TypeError):
        DockerPipe(
            conf, volumes=(tempdir, ('bind', '/user_volume/'))
        ).map(
            image_name='ubuntu:xenial',
            command="cat /shared/input.txt > /shared/output.txt; "
                    "cat /user_volume/test.txt >> /shared/output.txt",
            rdd=rdd)
    shutil.rmtree(tempdir)
def test_user_volume_illegal_input(spark_setup):
    """
    Test exception case of user volume input, using invalid keyword
    'binding' instead of 'bind'.
    """
    sc, conf = spark_setup
    tempdir = tempfile.mkdtemp()
    temp_input_file = os.path.join(tempdir, 'test.txt')
    with open(temp_input_file, 'w') as f:
        f.write("a\nb\nc")
    rdd = sc.parallelize(['a', 'b'])
    with pytest.raises(ValueError):
        DockerPipe(
            conf, volumes={tempdir: {'binding': '/user_volume/'}}
        ).map(
            image_name='ubuntu:xenial',
            command="cat /shared/input.txt > /shared/output.txt; "
                    "cat /user_volume/test.txt >> /shared/output.txt",
            rdd=rdd)
    shutil.rmtree(tempdir)
def map_udf(partition):
    """
    Write each partition's paired reads into two container input files.

    Each partition is a list of joined pairs of the form:

        [ (index, ([read_1], [read_2])) ]
    """
    return {
        'input_1.txt': ('\n'.join(p[1][0]) for p in partition),
        'input_2.txt': ('\n'.join(p[1][1]) for p in partition)
    }


with DockerPipe(SparkConf(), volumes={
    '/data/gmap': {
        'bind': '/data/gmap'
    }
}).map_binary(
        image_name='gsnap_samtools',
        command="gsnap -B 5 -A sam -N 1 -t 4 -s splicesites "
                "--sam-multiple-primaries --maxsearch=1000 --npaths=100 -D "
                "/data/gmap -d ref_genome /shared/input_1.txt "
                "/shared/input_2.txt > /shared/output.txt; "
                "samtools view /shared/output.txt -b -o /shared/output.bin",
        docker_options=['--ipc=host'],
        rdd=joined_rdd,
        map_udf=map_udf) as bamfiles:
    rdd = sc.binaryFiles(bamfiles)
    results = DockerPipe(SparkConf()).reduce_binary(
        rdd=rdd,
        command='samtools merge /shared/output.bin /shared/input_1.bin '
                '/shared/input_2.bin',
        image_name='gsnap_samtools')

with open('SRR1975008.bam', 'wb') as fh:
    fh.write(results)
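# For reference, 'joined_rdd' above is assumed to carry paired-end reads
# keyed by index, in the shape the map_udf docstring describes. A sketch
# of one way such an RDD could be built from two read RDDs (names here
# are hypothetical):
#
#     joined_rdd = indexed_reads_1.join(indexed_reads_2)
#
# where each indexed_reads_* RDD holds (index, [reads]) pairs.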
    parser.add_argument('input_file', help="An input text file from which "
                                           "to count words")
    parser.add_argument('-o', '--output',
                        help="Output file name. Default is "
                             "'word_counts.txt'.",
                        default='word_counts.txt')
    return parser.parse_args()


arguments = command_line()
conf = SparkConf()
sc = SparkContext(conf=conf)
rdd = sc.textFile(arguments.input_file)
mappedRDD = DockerPipe(SparkConf()).map(
    image_name='ubuntu:xenial',
    command="cat /shared/input.txt | tr -s ' ' | tr ' ' '\\n' | sort | "
            "uniq -c | awk '$1=$1' > /shared/output.txt",
    rdd=rdd)


def transform(partition):
    """
    Nest each partition in a list so it gets reduced faster; otherwise
    the reduce takes one pair of elements at a time to start.
    """
    return [list(partition)]


mappedRDD = mappedRDD.mapPartitions(transform)
result = DockerPipe(SparkConf()).reduce(
    image_name='ubuntu:xenial',