Example 1
def test_binary_map_rdd(spark_setup):
    """
    Test lofn binary map is handling binary as expected. We want to return a
    directory, user side reads in using sc.binaryFiles presumably, and the
    output should be partitions of tuples containing (path, contents).
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])

    new_object = DockerPipe(conf)
    with new_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as f:
        binary_dir = f
        mappedRDD = sc.binaryFiles(binary_dir)
        results = mappedRDD.collect()[0]
    # the path should be a 'file:' URI and the contents should look like raw bytes
    assert results[0].split(':')[0] == 'file' and not results[1].isalnum()
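All of the examples on this page assume a pytest fixture named spark_setup that yields a (SparkContext, conf) pair, along with the standard-library imports they use (os, sys, shutil, subprocess) and the DockerPipe import. A minimal sketch of such a fixture is shown below; the fixture scope, the contents of conf, and the lofn import path are assumptions, not part of the library's documented API.

import pytest
from pyspark import SparkConf, SparkContext


@pytest.fixture(scope='session')
def spark_setup():
    """Yield a local SparkContext and a lofn configuration dict (assumed)."""
    spark_conf = SparkConf().setMaster('local[*]').setAppName('lofn-tests')
    sc = SparkContext(conf=spark_conf)
    conf = {}  # placeholder for the DockerPipe configuration; its real
    # structure is not shown in these examples
    yield sc, conf
    sc.stop()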
Example 2
def test_binary_map_context_manager(spark_setup):
    """
    Test lofn binary map context manager. We want a context manager
    because spark's rdd binary file function takes a dictionary of files then
    reads them in, so we want to keep temp files until these are read
     then destroy the temporary directory.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='cat /shared/input.txt > /shared/output.bin',
    ) as f:
        binary_dir = f
        os.listdir(binary_dir)  # should work inside the context manager
    with pytest.raises(OSError):
        # outside the context manager the directory has been removed, so this
        # raises FileNotFoundError (a subclass of OSError)
        os.listdir(binary_dir)
Example 3
def test_custom_tempfile_directory(spark_setup):
    """
    Test user defined tempfile directory. This allows setting the
    name of the parent directory for the temp directories used to store
    tempfiles by lofn.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])

    def unique_dir_name(root, count=0):
        """Recursive function to find a unique name for testing so we don't
        have to destroy anything"""
        if not os.path.exists(root):
            return root
        else:
            count += 1
            new_name = root
            components = root.split('_')
            if len(components) > 1:
                new_name = '_'.join(components[:-1])
            new_name = new_name + '_' + str(count)
            return unique_dir_name(new_name, count)
    # still writing inside of /tmp to make sure we have write permissions
    tempdir_name = unique_dir_name('/tmp/lofntestdir')
    assert not os.path.exists(tempdir_name)
    test_object = DockerPipe(conf, temporary_directory_parent=tempdir_name)
    # using map_binary since its context manager lets us keep the temp dirs
    # around long enough to check their existence and use them before
    # everything is removed
    with test_object.map_binary(
            rdd=rdd,
            image_name='ubuntu:xenial',
            command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as binary_dir:
        # make sure the path has been created
        assert os.path.exists(tempdir_name)
        # make sure the specified directory is being used
        assert tempdir_name in binary_dir
    shutil.rmtree(tempdir_name)
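The recursive unique_dir_name helper above is only test scaffolding; a shorter, non-recursive way to get the same guarantee (sketched below as an alternative, not part of lofn) is to append a random suffix until the path is free.

import os
import uuid


def unique_dir_name(root):
    # Append a random hex suffix until the candidate path does not exist;
    # a single uuid4 suffix is effectively always unique already.
    candidate = root
    while os.path.exists(candidate):
        candidate = '{}_{}'.format(root, uuid.uuid4().hex)
    return candidate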
Example 4
def test_binary_map_context_manager(spark_setup):
    """
    Test lofn binary map context manager. We want a context manager
    because spark's rdd binary file function takes a dictionary of files then
    reads them in, so we want to keep temp files until these are read
     then destroy the temporary directory.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'word'])
    new_object = DockerPipe(conf)
    with new_object.map_binary(
        rdd=rdd,
        image_name='ubuntu:xenial',
        command='cat /shared/input.txt > /shared/output.bin'
    ) as f:
        binary_dir = f
        # should succeed inside the context manager, while the directory exists
        subprocess.check_call(['hadoop', 'fs', '-ls', binary_dir])
    with pytest.raises(subprocess.CalledProcessError):
        # outside the context manager the directory is gone, so the listing
        # exits with a non-zero exit status
        subprocess.check_call(['hadoop', 'fs', '-ls', binary_dir])
Example 5
def test_binary_reduce(spark_setup):
    """
    Test the binary reduce. Takes output from binary map, a binary rdd,
    and writes them wholesale into two temp files for operations on the
    binary files.
    """
    sc, conf = spark_setup
    rdd = sc.parallelize(['test', 'words'])

    new_object = DockerPipe(conf)
    with new_object.map_binary(
            rdd=rdd,
            image_name='ubuntu:xenial',
            command='head -c 1024 </dev/urandom > /shared/output.bin',
    ) as f:
        binary_dir = f
        mappedRDD = sc.binaryFiles(binary_dir)
        results = new_object.reduce_binary(
            rdd=mappedRDD,
            image_name='ubuntu:xenial',
            command='cat /shared/input_1.bin > /shared/output.bin; '
                    'cat /shared/input_2.bin >> /shared/output.bin'
        )
    # the reduced output should be the two 1024-byte map outputs concatenated
    # (at least 2048 bytes, with a small allowance for Python object overhead
    # in getsizeof) and should still look like raw bytes
    assert not results.isalnum() and 3000 > sys.getsizeof(results) >= 2048
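Taken together, these tests suggest the typical end-to-end pattern for the API they exercise: map_binary yields a directory of per-record binary outputs, sc.binaryFiles turns that directory into a binary RDD, and reduce_binary folds it down to a single byte string. The sketch below is assembled only from the calls shown above; the conf contents, the sc fixture, and the final output filename are assumptions for illustration.

# Hypothetical end-to-end sketch built from the calls exercised above; conf
# and sc are assumed to come from the spark_setup fixture.
pipe = DockerPipe(conf)
with pipe.map_binary(
        rdd=sc.parallelize(['test', 'words']),
        image_name='ubuntu:xenial',
        command='head -c 1024 </dev/urandom > /shared/output.bin',
) as binary_dir:
    mapped = sc.binaryFiles(binary_dir)
    merged = pipe.reduce_binary(
        rdd=mapped,
        image_name='ubuntu:xenial',
        command='cat /shared/input_1.bin /shared/input_2.bin '
                '> /shared/output.bin',
    )
# merged holds the raw bytes of the final /shared/output.bin
with open('merged_output.bin', 'wb') as out:
    out.write(merged)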