Example #1
class TestMapReduceJob(TestCase):
    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_mr_job_command_generation_with_arguments(self):
        _job_name = "test_mr_job_%s" % uuid.uuid4()

        _base_dir = HDFS(os.path.join("/tmp", _job_name))
        _base_dir.create_directory()
        try:
            jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'hadoop-mapreduce-examples.jar')
            # configure job inputs
            _job_input = HDFS(os.path.join(_base_dir.path, "input"))
            _job_input.create_directory()
            LocalFS(os.path.join(
                os.path.dirname(__file__),
                'resources',
                'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(
                _job_input.path
            )

            # configure job output
            _job_output = HDFS(os.path.join(_base_dir.path, "output"))
            if not os.path.exists(jar):
                self.skipTest("'%s' not found" % jar)

            job = MapReduce.prepare_mapreduce_job(jar=jar,
                                                  main_class="wordcount",
                                                  name=_job_name) \
                .with_config_option("split.by", "'\\t'") \
                .with_number_of_reducers(3) \
                .with_arguments(_job_input.path, _job_output.path)
            _command_submission_result = job.run()
            _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
            self.assertTrue(_job_output.exists(), "Error: empty job output")
            # check counters
            self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                    counter='HDFS: Number of write operations'))
            self.assertEqual(1, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(3, _job_status.counter(group='Job Counters', counter='Launched reduce tasks'))
            self.assertEqual(2168, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _base_dir.delete_directory()
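
Every snippet in this listing gates on has_command(...) from merlin.common.test_utils, whose implementation is not shown here. A minimal stand-in, assuming it only needs to probe the shell PATH, could look like this:

# Hypothetical stand-in for merlin.common.test_utils.has_command;
# the library's real implementation may differ.
import subprocess


def has_command(command):
    """Return True if `command` resolves on the current PATH."""
    # 'which' exits with status 0 when the command is found.
    return subprocess.call(['which', command],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE) == 0
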
Example #2
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

import socket

from unittest2 import TestCase, skipUnless

from merlin.tools.distcp import DistCp
import merlin.common.shell_command_executor as shell
from merlin.common.test_utils import has_command


@skipUnless(has_command('hadoop'), "Hadoop client should be installed")
class TestDistCp(TestCase):
    def setUp(self):
        super(TestDistCp, self).setUp()
        shell.execute_shell_command('hadoop fs', '-mkdir /tmp/foo')
        shell.execute_shell_command('hadoop fs', '-mkdir /tmp/bar')
        shell.execute_shell_command('hadoop fs', '-touchz /tmp/foo/test.txt')
        shell.execute_shell_command('hadoop fs', '-touchz /tmp/foo/test2.txt')

    def test_command(self):
        _host = "sandbox.hortonworks.com"
        cmd = DistCp().take(path="hdfs://{host}:8020/tmp/foo".format(host=_host)).copy_to(
            path="hdfs://{host}:8020/tmp/bar".format(host=_host)
        ).use(
            mappers=12
        ).update_destination(
Example #3
                               "{0}:{1}".format(CLUSTER_NAME,
                                                PORT), "--topic", "test123"
                           ])


class KafkaThreadConsumer(threading.Thread):
    def run(self):
        Kafka.run_consumer(
            name='kafka.consumer.ConsoleConsumer',
            args=[
                "--zookeeper sandbox.hortonworks.com:2181 --from-beginning --topic test123"
            ])


@skipUnless(
    has_command("kafka-run-class.sh") and has_command("netstat"),
    "./kafka/bin should be set to $PATH and netstat should be installed")
class TestKafka(unittest2.TestCase):
    def test_broker(self):
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
        local = LocalFS("/tmp/kafka-test")
        if not local.exists():
            local.create_directory()
        thread = KafkaThreadBroker()
        thread.daemon = True
        thread.start()
        sleep(TIME)
        cmd = shell.execute_shell_command('netstat -lntu')
        self.assertTrue("9010" in cmd.stdout, cmd.stdout)
        local.delete_directory()
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
Example #4
# for additional information regarding copyright ownership and licensing.
#

from tempfile import NamedTemporaryFile
import uuid

from unittest2.case import TestCase, skipUnless, skip
from merlin.common.configurations import Configuration
from merlin.common.shell_command_executor import execute_shell_command
from merlin.fs.hdfs import HDFS
from merlin.tools.pig import Pig, TaskOptions
from merlin.common.test_utils import has_command
TEZ_MODE_ENABLE = True


@skipUnless(has_command('pig'), "pig client should be installed")
class TestPigClient(TestCase):
    @skip(
        "Parameter substitution does not supported while trying to run commands from string"
    )
    def test_run_commands_from_string(self):
        _test_id = str(uuid.uuid4())
        _output_dir = "/tmp/data_{}".format(_test_id)
        _input_dir = self.copy_file_from_local(
            self.temp_file("hello,world,world", ".txt"))

        commands = "A = load '$input_dir' using PigStorage(',');"
        commands += "B = foreach A generate \$0 as id;"
        commands += "STORE B into '$output_dir';"
        try:
            _pig = Pig.load_commands_from_string(commands) \
Example #5
import socket

from unittest2.case import skipUnless
from merlin.common.test_utils import has_command
from merlin.tools.webhcat import WebHCatalog, TableProperties
import merlin.common.shell_command_executor as shell

import unittest2


WEBHCAT_IS_INSTALLED_AND_ENABLED = True
HOST = "{0}:50111".format("sandbox.hortonworks.com")
USER = "******"


@skipUnless(has_command('hive') and WEBHCAT_IS_INSTALLED_AND_ENABLED, "Hive and WebHCatalog clients should be installed and enabled")
class TestWebHCatalog(unittest2.TestCase):
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command('hive -e \"drop database if EXISTS testdb CASCADE\"')
        shell.execute_shell_command('hive -e \"create database testdb\"')
        c = 'hive -e \"create table testdb.some_table(strings STRING) ' \
            'ROW FORMAT DELIMITED ' \
            'FIELDS TERMINATED BY \\",\\" ' \
            'STORED AS TEXTFILE\"'
        shell.execute_shell_command(c)

    def test_get_property(self):
        output = WebHCatalog(host=HOST, username=USER).table_properties(database="testdb",
                                                                        table="some_table").get_property("table")
        self.assertEquals(output, "some_table")
Example #6
from tempfile import NamedTemporaryFile
import uuid

import unittest2
from unittest2.case import skipUnless, expectedFailure

from merlin.common.exceptions import HiveCommandError
from merlin.common.shell_command_executor import execute_shell_command
import merlin.fs.cli.hdfs_commands as hdfs
from merlin.tools.hive import Hive
from merlin.common.test_utils import has_command
import merlin.common.shell_command_executor as shell


@skipUnless(has_command('hive'), "Hive client should be installed")
class TestExecuteQuery(unittest2.TestCase):
    
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command('hive -e "drop database if EXISTS testdb CASCADE;"')
    
    def test_execute_query_string(self):
        hive = Hive.load_queries_from_string("show tables").with_hive_conf("A", "B").add_hivevar("A", "B") \
            .define_variable("A", "B")
        res = hive.run()
        self.assertEqual(res.is_ok(), True)

    def test_create_database(self):
        db_exist = False
        try:
Example #7

MYSQLDUMP = True
MYSQL_SERVER = "sandbox.hortonworks.com"
BASE_DIR = "/tmp"
RUN_TEST = True
HBASE_IS_RUNNING = True
HIVE_IS_RUNNING = True
ZOOKEEPER_IS_RUNNING = False
USER = "******"

# Must be equal to the text in resources/rdbms.password
PASSWORD = "******"


@skipUnless(has_command('sqoop') and has_command('mysql') and RUN_TEST,
            "sqoop and mysql clients should be installed and mysql must have user 'root' with password 'root'. Also we must add to /var/lib/sqoop jar with jdbc driver")
class TestSqoop(TestCase):
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command('hadoop fs', '-rm -r {0}/rdbms.password'.format(BASE_DIR))
        shell.execute_shell_command('hadoop fs', '-copyFromLocal',
                                    os.path.join(os.path.dirname(__file__),
                                                 'resources/sqoop/rdbms.password'),
                                    BASE_DIR)
        shell.execute_shell_command('hadoop fs', '-mkdir', os.path.join(BASE_DIR, "data_custom_directory"))
        shell.execute_shell_command('hadoop fs', '-copyFromLocal',
                                    os.path.join(os.path.dirname(__file__),
                                                 'resources/sqoop/data_to_export.txt'),
                                    os.path.join(BASE_DIR, "data_custom_directory"))
        shell.execute_shell_command(
Example #9
class TestSparkAppSubmit(TestCase):
    masters = [
        SparkMaster.local(),
        SparkMaster.local(1),
        SparkMaster.yarn_client(),
        SparkMaster.yarn_cluster()
    ]

    input_path = os.path.join(os.path.dirname(__file__), "resources", "spark",
                              "input.txt")

    def _spark_application_template_(self, master):
        return SparkApplication().application(
            application_jar=os.path.join(os.path.dirname(__file__),
                                         "resources", "spark",
                                         "SparkExample.jar"),
            main_class="example.spark.WordCounter").master(master)

    def spark_app_config_template(self, master, name=str(uuid.uuid4())):
        _config = Configuration.create()
        _config.set(section=name,
                    key=TaskOptions.SPARK_APP_CONFIG_MASTER,
                    value=master)
        _config.set(section=name,
                    key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR,
                    value=os.path.join(os.path.dirname(__file__), "resources",
                                       "spark", "SparkExample.jar"))
        _config.set(section=name,
                    key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS,
                    value="example.spark.WordCounter")
        return _config

    @skipUnless(has_command('spark-submit'),
                "Cannot find spark-submit command-line utility")
    def test_spark_app_submit(self):
        # self.run_test(application=self._spark_application_template_(SparkMaster.local()))
        self._run_(application=self._spark_application_template_(
            SparkMaster.local(1)))
        # self.run_test(application=self._spark_application_template_(SparkMaster.yarn_cluster()))
        # self.run_test(application=self._spark_application_template_(SparkMaster.yarn_client()))

    @skipUnless(has_command('spark-submit'),
                "Cannot find spark-submit command-line utility")
    def test_preconfigured_spark_app_submit(self):
        section = str(uuid.uuid4())
        _app_config = self.spark_app_config_template(
            master=SparkMaster.local(1), name=section)
        self._run_(
            application=SparkApplication(config=_app_config, name=section))

    def _run_(self, application, test_id=str(uuid.uuid4())):
        basedir = LocalFS(os.path.join("/tmp", "test_spark", test_id))
        try:
            basedir.create_directory()
            _app_input = self.input_path
            _app_output_dir = os.path.join(basedir.path, "output")
            status = application.run('file:' + _app_input,
                                     'file:' + _app_output_dir)
            self.assertTrue(status.is_ok(), status.stderr())
            self.assertTrue(os.path.exists(_app_output_dir), status.stderr())
        finally:
            basedir.delete_directory()
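
SparkExample.jar and its example.spark.WordCounter entry point are not part of this listing. As a rough illustration of the word-count logic these tests appear to drive (input path in, output directory out), a PySpark equivalent might look like the following sketch:

# Hypothetical PySpark equivalent of example.spark.WordCounter; the real
# jar's logic is not shown in this listing.
import sys

from pyspark import SparkContext


def main(input_path, output_dir):
    sc = SparkContext(appName="WordCounter")
    counts = (sc.textFile(input_path)
              .flatMap(lambda line: line.split())   # one record per word
              .map(lambda word: (word, 1))
              .reduceByKey(lambda a, b: a + b))     # sum counts per word
    counts.saveAsTextFile(output_dir)
    sc.stop()


if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
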
Example #10
class TestMapReduceStreamingJob(TestCase):
    @classmethod
    def setUpClass(cls):
        path_to_file = os.path.join(os.path.dirname(__file__), 'resources',
                                 'mapreduce', 'mapper.py')
        cls._delete_carriage_return(path_to_file)
        path_to_file = os.path.join(os.path.dirname(__file__), 'resources',
                                 'mapreduce', 'reducer.py')
        cls._delete_carriage_return(path_to_file)

    @staticmethod
    def _delete_carriage_return(path_to_file):
        new_file = open(path_to_file + ".new", 'w')
        for line in open(path_to_file):
            line = line.replace('\r', '')
            new_file.write(line)
        new_file.close()
        os.remove(path_to_file)
        os.rename(new_file.name, path_to_file)

    def _run_and_assert(self, job):
        command_result = job.run()
        command_result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        return _job_status

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_streaming_job(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # counters
            self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Spilled Records'),
                             "counters['Map-Reduce Framework']['Spilled Records']")
            self.assertEqual(143, _job_status.counter(group='Map-Reduce Framework', counter='Reduce output records'),
                             "counters['Map-Reduce Framework']['Reduce output records']")
            self.assertEqual(370, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_streaming_job_with_multiple_inputs(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:

            job = self._template_streaming_job_(base_dir=_job_basedir.path)

            _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
            _additional_datasource.create_directory()
            LocalFS(os.path.join(os.path.dirname(__file__), 'resources',
                                 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(
                _additional_datasource.path)
            job.take(_additional_datasource.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # check counters
            self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_streaming_job_without_reducer(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path, map_only_job=True)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("Cannot run map-only job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())

            #   check counters
            self.assertEqual(2, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(11, _job_status.counter(group='Map-Reduce Framework', counter='Map input records'))
            self.assertEqual(3252, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _job_basedir.delete_directory()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
        if not os.path.exists(HADOOP_STREAMING_JAR):
            self.skip("Cannot allocate %s" % HADOOP_STREAMING_JAR)
        _hdfs_basdir = HDFS(base_dir)
        if not _hdfs_basdir.exists():
            _hdfs_basdir.create_directory()
        _job_input = HDFS(os.path.join(_hdfs_basdir.path, "input"))
        _job_input.create_directory()
        _job_output = HDFS(os.path.join(_hdfs_basdir.path, "output"))
        home = os.path.dirname(__file__)
        _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
        _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')

        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(
            _job_input.path
        )

        return MapReduce.prepare_streaming_job(name="test-mr-streaming-job{}".format(str(uuid.uuid4())), jar=HADOOP_STREAMING_JAR) \
            .take(_job_input.path) \
            .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \
            .save(_job_output.path)
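
The streaming jobs above rely on resources/mapreduce/mapper.py and reducer.py, which are not reproduced here. A typical Hadoop Streaming word-count pair reads lines from stdin and writes tab-separated key/value pairs to stdout, along these lines (a sketch, not the actual resource files):

# Hypothetical stand-ins for resources/mapreduce/mapper.py and reducer.py;
# the real scripts shipped with these tests are not shown in this listing.
import sys
from itertools import groupby


def mapper():
    # Emit "<word>\t1" for every word read from stdin.
    for line in sys.stdin:
        for word in line.split():
            sys.stdout.write("%s\t1\n" % word)


def reducer():
    # Hadoop Streaming sorts mapper output by key, so identical words arrive
    # on consecutive lines and can be summed with groupby.
    pairs = (line.rstrip("\n").split("\t", 1) for line in sys.stdin)
    for word, group in groupby(pairs, key=lambda kv: kv[0]):
        sys.stdout.write("%s\t%d\n" % (word, sum(int(n) for _, n in group)))
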
Example #11
class TestSqoop(TestCase):
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command(
            'hadoop fs', '-rm -r {0}/rdbms.password'.format(BASE_DIR))
        shell.execute_shell_command(
            'hadoop fs', '-copyFromLocal',
            os.path.join(os.path.dirname(__file__),
                         'resources/sqoop/rdbms.password'), BASE_DIR)
        shell.execute_shell_command(
            'hadoop fs', '-mkdir',
            os.path.join(BASE_DIR, "data_custom_directory"))
        shell.execute_shell_command(
            'hadoop fs', '-copyFromLocal',
            os.path.join(os.path.dirname(__file__),
                         'resources/sqoop/data_to_export.txt'),
            os.path.join(BASE_DIR, "data_custom_directory"))
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'DROP DATABASE IF EXISTS sqoop_tests'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} -e'.format(
                USER, PASSWORD, MYSQL_SERVER), "'CREATE DATABASE sqoop_tests'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'CREATE TABLE IF NOT EXISTS table_name(id INT(11) NOT NULL AUTO_INCREMENT,"
            "last_name varchar(255) NOT NULL, first_name varchar(255), city varchar(255),"
            "PRIMARY KEY (id))'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'CREATE TABLE IF NOT EXISTS table_name_second(id INT(11) NOT NULL AUTO_INCREMENT,"
            "last_name varchar(255) NOT NULL, first_name varchar(255), city varchar(255),"
            "PRIMARY KEY (id))'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'CREATE TABLE IF NOT EXISTS stag(id INT(11) NOT NULL AUTO_INCREMENT,"
            "last_name varchar(255) NOT NULL, first_name varchar(255), city varchar(255),"
            "PRIMARY KEY (id))'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name) VALUES ('Bob')\"")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name, first_name, city) VALUES ('Alex','Log','New York')\""
        )
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name, first_name, city) VALUES ('Merry','Log','New York')\""
        )
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name, first_name, city) VALUES ('Bob','Log','New York')\""
        )
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"delimiter //\ncreate procedure p(in p_id INT, in p_last_name varchar(255), "
            "in p_first_name varchar(255), in p_city varchar(255)) begin insert into table_name_second("
            "id, last_name, first_name, city) values(p_id,p_last_name,p_first_name,p_city);\nend//\""
        )

    def test_import_table(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopImport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).from_rdbms().table(
                table="table_name", where="id>2",
                columns="id,last_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(BASE_DIR)).run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_export_table(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)).run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'SELECT * FROM table_name_second'")
            self.assertNotEqual(
                result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'DELETE FROM table_name_second'")

    @skipUnless(
        MYSQLDUMP,
        "mysqldump utility should be installed on individual node machines")
    def test_export_table_with_direct_mode(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(
                    BASE_DIR)).with_direct_mode().run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'SELECT * FROM table_name_second'")
            self.assertNotEqual(
                result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'DELETE FROM table_name_second'")

    def test_export_table_with_batch(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(
                    BASE_DIR)).with_batch().with_hadoop_properties(
                        sqoop_export_records_per_statement="10").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'SELECT * FROM table_name_second'")
            self.assertNotEqual(
                result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'DELETE FROM table_name_second'")

    def test_export_table_with_encoding(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(
                    BASE_DIR)).with_encoding(input_null_string="NNN").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'SELECT * FROM table_name_second'")
            self.assertNotEqual(
                result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'DELETE FROM table_name_second'")

    def test_export_table_with_staging(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(
                    BASE_DIR)).with_staging_table(staging_table="stag").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'SELECT * FROM table_name_second'")
            self.assertNotEqual(
                result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'DELETE FROM table_name_second'")
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD, MYSQL_SERVER), "'DELETE FROM stag'")

    def test_export_table_with_call(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).to_rdbms().from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)).call(
                    stored_procedure="p").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'SELECT * FROM table_name_second'")
            self.assertNotEqual(
                result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'DELETE FROM table_name_second'")

    def test_export_table_with_update(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(config=Configuration.load(
                metastore=metastore, readonly=False, accepts_nulls=True
            )).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(
                    BASE_DIR)).with_update(update_key="id",
                                           update_mode="allowinsert").run()
            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'SELECT * FROM table_name_second'")
            self.assertNotEqual(
                result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.
                format(USER, PASSWORD,
                       MYSQL_SERVER), "'DELETE FROM table_name_second'")

    def test_import_query(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests"
            ).query(
                query="'SELECT * FROM table_name WHERE $CONDITIONS AND id>$id'",
                split_by="id",
                id="2").to_hdfs(
                    target_dir="{0}/custom_directory".format(BASE_DIR)).run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_to_sequencefile(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(BASE_DIR)
                ).use_file_format(file_format="--as-sequencefile").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_to_avrodatafile(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(BASE_DIR)
                ).use_file_format(file_format="--as-avrodatafile").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*.avro'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    @skipUnless(
        MYSQLDUMP,
        "mysqldump utility should be installed on individual node machines")
    def test_import_with_direct_mode(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(
                        BASE_DIR)).with_direct_mode().run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_compress(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests"
            ).table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).with_compress(
                compression_codec="org.apache.hadoop.io.compress.BZip2Codec"
            ).run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*.bz2'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_incremental(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(BASE_DIR)).run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(
                        BASE_DIR)).with_incremental(incremental="append",
                                                    last_value="5",
                                                    check_column="id").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_null(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(
                        BASE_DIR)).with_encoding(null_string="N").run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_connection_manager(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(BASE_DIR)
                ).with_attr(
                    connection_manager="org.apache.sqoop.manager.MySQLManager"
                ).run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_enclosing(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                    target_dir="{0}/custom_directory".format(BASE_DIR)
                ).with_input_parsing(escaped_by="\\").with_output_parsing(
                    escaped_by="\\", mysql_delimiters=True).run()

            self.assertEquals(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs',
                '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(
                result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    @skipUnless(
        has_command('hbase') and HBASE_IS_RUNNING and ZOOKEEPER_IS_RUNNING,
        "hbase client should be installed")
    def test_import_to_hbase(self):
        cmd = Sqoop.import_data().from_rdbms(
            host=MYSQL_SERVER,
            rdbms="mysql",
            username="******",
            password_file="{0}/rdbms.password".format(BASE_DIR),
            database="sqoop_tests").table(table="table_name").to_hbase(
                hbase_table="custom_table",
                hbase_create_table=True,
                hbase_row_key="id",
                column_family="f1").run()

        self.assertEquals(cmd.status, 0, cmd.stderr)
        # HDP Cluster has another path to HBase data
        # result = shell.execute_shell_command('hadoop fs', '-du -s /user/hive/warehouse/table_name/part-m-*')
        # self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)

    @skipUnless(
        has_command('hive') and HIVE_IS_RUNNING,
        "hive client should be installed")
    def test_import_to_hive(self):
        _path = HDFS(os.path.join('/user', getpass.getuser(), 'table_name'))
        try:
            if _path.exists():
                _path.delete(recursive=_path.is_directory())
                # shell.execute_shell_command('hadoop fs', '-rm -r /user/', getpass.getuser(), '/table_name')
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(
                    table="table_name").to_hive().run()

            # self.assertEquals(cmd.status, 0, cmd.stderr)
            # result = shell.execute_shell_command('hadoop fs', '-du -s /user/hive/warehouse/table_name/part-m-*')
            # self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:

            shell.execute_shell_command(
                'hive', "-e 'DROP TABLE IF EXISTS table_name'")

    @classmethod
    def tearDownClass(cls):
        shell.execute_shell_command(
            'hadoop fs', '-rm -r {0}/rdbms.password'.format(BASE_DIR))
        shell.execute_shell_command(
            'hadoop fs', '-rm -r {0}/data_custom_directory'.format(BASE_DIR))
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'DROP DATABASE IF EXISTS sqoop_tests'")
Example #12
MYSQLDUMP = True
MYSQL_SERVER = "sandbox.hortonworks.com"
BASE_DIR = "/tmp"
RUN_TEST = True
HBASE_IS_RUNNING = True
HIVE_IS_RUNNING = True
ZOOKEEPER_IS_RUNNING = False
USER = "******"

# Must be equal to the text in resources/rdbms.password
PASSWORD = "******"


@skipUnless(
    has_command('sqoop') and has_command('mysql') and RUN_TEST,
    "sqoop and mysql clients should be installed and mysql must have user 'root' with password 'root'. Also we must add to /var/lib/sqoop jar with jdbc driver"
)
class TestSqoop(TestCase):
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command(
            'hadoop fs', '-rm -r {0}/rdbms.password'.format(BASE_DIR))
        shell.execute_shell_command(
            'hadoop fs', '-copyFromLocal',
            os.path.join(os.path.dirname(__file__),
                         'resources/sqoop/rdbms.password'), BASE_DIR)
        shell.execute_shell_command(
            'hadoop fs', '-mkdir',
            os.path.join(BASE_DIR, "data_custom_directory"))
        shell.execute_shell_command(
Example #13
#

from tempfile import NamedTemporaryFile
import uuid

from unittest2.case import TestCase, skipUnless, skip
from merlin.common.configurations import Configuration
from merlin.common.shell_command_executor import execute_shell_command
from merlin.fs.hdfs import HDFS
from merlin.tools.pig import Pig, TaskOptions
from merlin.common.test_utils import has_command

TEZ_MODE_ENABLE = True


@skipUnless(has_command("pig"), "pig client should be installed")
class TestPigClient(TestCase):
    @skip("Parameter substitution does not supported while trying to run commands from string")
    def test_run_commands_from_string(self):
        _test_id = str(uuid.uuid4())
        _output_dir = "/tmp/data_{}".format(_test_id)
        _input_dir = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))

        commands = "A = load '$input_dir' using PigStorage(',');"
        commands += "B = foreach A generate \$0 as id;"
        commands += "STORE B into '$output_dir';"
        try:
            _pig = (
                Pig.load_commands_from_string(commands)
                .with_parameter("input_dir", _input_dir)
                .with_parameter("output_dir", _output_dir)
Example #14
class KafkaThreadBroker(threading.Thread):
    def run(self):
        Kafka.start_broker(path_to_config=os.path.join(os.path.dirname(__file__), 'resources/kafka/server.properties'))


class KafkaThreadProducer(threading.Thread):
    def run(self):
        Kafka.run_producer(name='kafka.producer.ConsoleProducer', args=["--broker-list", "{0}:{1}".format(CLUSTER_NAME, PORT), "--topic", "test123"])


class KafkaThreadConsumer(threading.Thread):
    def run(self):
        Kafka.run_consumer(name='kafka.consumer.ConsoleConsumer', args=["--zookeeper sandbox.hortonworks.com:2181 --from-beginning --topic test123"])


@skipUnless(has_command("kafka-run-class.sh") and has_command("netstat"), "./kafka/bin should be set to $PATH and netstat should be installed")
class TestKafka(unittest2.TestCase):

    def test_broker(self):
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
        local = LocalFS("/tmp/kafka-test")
        if not local.exists():
            local.create_directory()
        thread = KafkaThreadBroker()
        thread.daemon = True
        thread.start()
        sleep(TIME)
        cmd = shell.execute_shell_command('netstat -lntu')
        self.assertTrue("9010" in cmd.stdout, cmd.stdout)
        local.delete_directory()
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
Example #15
from merlin.tools.flume import Flume
import merlin.common.shell_command_executor as shell

PORT = 41414

TIME_TO_OPEN_PORT = 60


class AgentThread(threading.Thread):
    def run(self):
        Flume.agent(agent="a1",
                    conf_file=os.path.join(os.path.dirname(__file__),
                                           'resources/flume/flume.conf')).run()


@skipUnless(has_command('flume-ng') and has_command('netstat'), "flume-ng and netstat should be installed")
class TestFlume(unittest2.TestCase):
    def setUp(self):
        super(TestFlume, self).setUp()
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))

    def test_agent(self):
        thread = AgentThread()
        thread.daemon = True
        thread.start()
        time.sleep(TIME_TO_OPEN_PORT)
        cmd = shell.execute_shell_command('netstat -lntu')
        self.assertTrue("41414" in cmd.stdout, cmd.stdout)

    def tearDown(self):
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
Example #16
class TestHDFS(TestCase):
    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_exists(self):
        self.assertTrue(HDFS("/tmp").exists())
        self.assertFalse(HDFS("/tmp_12345").exists())

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_is_dir(self):
        self.assertTrue(HDFS("/tmp").is_directory(), "/tmp is not a dir")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_list_files(self):
        basedir = HDFS("/tmp")
        new_file = HDFS("/tmp/test.txt")
        try:
            new_file.create(directory=False)
            self.assertTrue(new_file.exists(), "File was not created")
            files = basedir.list_files()
            self.assertTrue(new_file in files)
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_recursive_list_files(self):
        basedir = HDFS("/tmp")
        new_folder = HDFS("/tmp/test123")
        new_file = HDFS("/tmp/test123/test.txt")
        try:
            new_folder.create(directory=True)
            self.assertTrue(new_folder.exists(), "Folder was not created")
            new_file.create(directory=False)
            self.assertTrue(new_file.exists(), "File was not created")
            files = basedir.recursive_list_files()
            self.assertTrue(new_file in files)
            self.assertTrue(new_folder in files)
        finally:
            new_folder.delete(recursive=True)
            self.assertFalse(new_file.exists(), "Folder was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_create(self):
        new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        # test new file creation
        try:
            new_file.create(directory=False)
            self.assertTrue(new_file.exists(), "File was not created")
            self.assertFalse(new_file.is_directory(), "New file should not be a directory")
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not removed")
        # test new folder creation
        try:
            new_dir.create(directory=True)
            self.assertTrue(new_dir.exists(), "Directory was not created")
            self.assertTrue(new_dir.is_directory(), "New file should be a directory")
        finally:
            new_dir.delete(recursive=True)
            self.assertFalse(new_dir.exists(), "Directory was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_create_directory(self):
        new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        self.assertFalse(new_dir.exists(), "Directory is already exists")
        try:
            new_dir.create_directory()
            self.assertTrue(new_dir.exists(), "Directory was not created")
            self.assertTrue(new_dir.is_directory())
        finally:
            new_dir.delete(recursive=True)
            self.assertFalse(new_dir.exists(), "Directory was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_create_file(self):
        new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        self.assertFalse(new_file.exists(), "File is already exists")
        try:
            new_file.create_file()
            self.assertTrue(new_file.exists(), "File was not created")
            self.assertFalse(new_file.is_directory(), "New file should not be a folder")
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def should_create_directory_recursively(self):
        _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
        _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
        _dir = HDFS(_path)
        self.assertFalse(_dir.exists(), "Folder is already exists")
        try:
            _dir.create_directory(recursive=True)
            self.assertTrue(_dir.exists(), "Folder was not created")
            self.assertTrue(_dir.is_directory(), "New file should be a directory")
        finally:
            HDFS(_base_dir).delete_directory()
            self.assertFalse(_dir.exists(), "File was not removed")
            self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def should_create_file_recursively(self):
        _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
        _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()), 'file.txt')
        _file = HDFS(_path)
        self.assertFalse(_file.exists(), "File is already exists")
        try:
            _file.create_file(recursive=True)
            self.assertTrue(_file.exists(), "File was not created")
            self.assertFalse(_file.is_directory(), "New file should not be a directory")
        finally:
            HDFS(_base_dir).delete_directory()
            self.assertFalse(_file.exists(), "File was not removed")
            self.assertFalse(HDFS(_base_dir).exists(), "Bse dir was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def should_raise_error_mkdir_not_recursive(self):
        _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
        _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
        _dir = HDFS(_path)
        self.assertFalse(HDFS(_base_dir).exists(), "Folder already exists")
        try:
            self.assertRaises(FileSystemException, _dir.create_directory, recursive=False)
        finally:
            self.assertFalse(_dir.exists(), "File was created")


    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_to_local(self):
        new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        local_path = os.path.join("/tmp", "copied_from_hdfs")
        self.assertFalse(os.path.exists(local_path))
        try:
            new_file.create_file()
            self.assertTrue(new_file.exists(), "File was not created")
            new_file.copy_to_local(local_path)
            self.assertTrue(os.path.exists(local_path), "File was not copied from HDFS")
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not removed")
            os.remove(local_path)
            self.assertFalse(os.path.exists(local_path))

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_file(self):
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            _file.create_file()
            self.assertTrue(_file.exists(), "original file not found")
            self.assertFalse(dst.exists(), "destination file already exists")
            _file.create()
            _file.copy(dst)
            self.assertTrue(dst.exists(), "file was not copied")
            self.assertTrue(_file.exists(), "original file should not be deleted")
        finally:
            _file.delete()
            dst.delete()
            self.assertFalse(_file.exists(), "File was not deleted")
            self.assertFalse(dst.exists(), "destination file was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_empty_dir(self):
        _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        try:
            _dir.create(directory=True)
            self.assertTrue(_dir.exists(), "directory not found")
            self.assertFalse(dst.exists(), "dst directory is already exists")
            _dir.copy(dst)
            self.assertTrue(dst.exists(), "directory was not copied")
        finally:
            _dir.delete(True)
            dst.delete(True)
            self.assertFalse(_dir.exists(), "File was not deleted")
            self.assertFalse(dst.exists(), "File was not deleted")

    def _create_non_empty_dir_(self, path):
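        # Helper: populates the given HDFS path with five entries, alternating
        # between sub-directories and files, and returns the directory handle.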
        _dir = HDFS(path)
        _dir.create_directory()
        self.assertTrue(_dir.exists(), "source directory not found")
        for i in range(5):
            _file = HDFS(os.path.join(path, str(uuid.uuid4())))
            _file.create(directory=(i % 2 == 0))
            self.assertTrue(_file.exists(), "File was not created")
        return _dir

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_non_empty_dir(self):
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        _dir = None
        try:
            _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
            self.assertFalse(dst.exists(), "dst directory is already exists")
            _dir.copy(dst)
            self.assertTrue(dst.exists(), "directory was not copied")
            self.assertTrue(_dir.exists(), "original directory should not be deleted")
        finally:
            if _dir:
                _dir.delete_directory()
                self.assertFalse(_dir.exists(), "Folder was not deleted")
            dst.delete_directory()
            self.assertFalse(dst.exists(), "Dst Folder was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_move_file(self):
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            _file.create_file()
            self.assertTrue(_file.exists(), "File was not created")
            self.assertFalse(dst.exists(), "Destination file should not exist")
            _file.move(dst.path)
            self.assertFalse(_file.exists(), "Original file should be deleted")
            self.assertTrue(dst.exists(), "Destination file should be created")
        finally:
            _file.delete()
            dst.delete()
            self.assertFalse(_file.exists(), "File was not deleted")
            self.assertFalse(dst.exists(), "destination file was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_move_empty_dir(self):
        _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        try:
            _dir.create(directory=True)
            self.assertTrue(_dir.exists(), "directory not found")
            self.assertFalse(dst.exists(), "destination directory is already exists")
            _dir.move(dst.path)
            self.assertFalse(_dir.exists(), "Original directory was not removed")
            self.assertTrue(dst.exists(), "destination directory was not created")
        finally:
            _dir.delete(True)
            dst.delete(True)
            self.assertFalse(_dir.exists(), "File was not deleted")
            self.assertFalse(dst.exists(), "File was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_move_non_empty_dir(self):
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        _dir = None
        try:
            _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
            self.assertFalse(dst.exists(), "dst directory is already exists")
            _dir.move(dst.path)
            self.assertFalse(_dir.exists(), "original directory should be deleted")
            self.assertTrue(dst.exists(), "directory move operation failed")
        finally:
            if _dir:
                _dir.delete_directory()
                self.assertFalse(_dir.exists(), "Folder was not deleted")
            dst.delete_directory()
            self.assertFalse(dst.exists(), "Dst Folder was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_file_size(self):
        local = LocalFS(os.path.realpath(__file__))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            local.copy_to_hdfs(hdfs_file.path)
            self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
            self.assertEqual(hdfs_file.size(), local.size())
        finally:
            hdfs_file.delete()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_dir_size(self):
        local_basedir = os.path.dirname(os.path.realpath(__file__))
        local = LocalFS(os.path.join(local_basedir, "resources", "test_dir_size"))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            local.copy_to_hdfs(hdfs_file.path)
            self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
            expected_fsize = local.size()
            self.assertEqual(hdfs_file.size(), expected_fsize)
        finally:
            hdfs_file.delete(recursive=True)

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_merge(self):
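        # merge() is expected to concatenate the files under the HDFS directory
        # into a single local file (similar to 'hadoop fs -getmerge').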
        basedir = os.path.dirname(os.path.realpath(__file__))
        local = LocalFS(os.path.join(basedir, "resources", "test_merge"))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        merged_file = LocalFS(os.path.join(basedir, "resources", "merged.txt"))
        try:
            local.copy_to_hdfs(hdfs_file.path)
            self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
            hdfs_file.merge(merged_file.path)
            self.assertTrue(merged_file.exists(), "merged file was not copied to local fs")
        finally:
            hdfs_file.delete_directory()
            merged_file.delete()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_delete_file(self):
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        _file.create_file()
        self.assertTrue(_file.exists(), "Target file can not be found")
        _file.delete()
        self.assertFalse(_file.exists(), "Target file was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_delete_dir(self):
        local = LocalFS(os.path.dirname(os.path.realpath(__file__)))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        local.copy_to_hdfs(hdfs_file.path)
        self.assertTrue(hdfs_file.exists(), "Target HDFS dir does not exists")
        hdfs_file.delete(recursive=True)
        self.assertFalse(hdfs_file.exists(), "Target HDFS dir was not deleted")

    # TODO: file permissions can differ across Hadoop versions
    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_permissions(self):
        self.assertEqual("drwxr-xr-x", HDFS("/").permissions(), "Root dir permissions should be 'drwxr-xr-x'")
        # Permissions to '/tmp' folder are different on different CDH versions
        # self.assertEqual("drwxrwxrwt", HDFS("/tmp").permissions(), "Tmp dir permissions should be 'drwxrwxrwxt'")
        hbase_file = HDFS("/hbase/hbase.id")
        if hbase_file.exists():
            self.assertEqual("-rw-r--r--",
                             hbase_file.permissions(),
                             "/hbase/hbase.id permissions should be '-rw-r--r--'")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_replicas(self):
        self.assertEqual('0', HDFS("/").replicas(), "Root dir replicas should be 0")
        self.assertNotEqual('0', HDFS("/tmp").replicas(), "/tmp dir replicas should not be 0")
        name = uuid.uuid4()
        hdfs_file = HDFS("/tmp/{0}".format(name))
        hdfs_file.create_file()
        try:
            self.assertTrue(hdfs_file.exists(), "File was not created")
            shell.execute_shell_command('hadoop fs', '-setrep -w 1 /tmp/{0}'.format(name))
            self.assertEqual('1',
                             hdfs_file.replicas(),
                             "Number of replicas of the file should be 1")
        finally:
            hdfs_file.delete()
            self.assertFalse(hdfs_file.exists())

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_owner(self):
        self.assertEqual('hdfs', HDFS("/").owner(), "ERROR: Root dir owner")
        self.assertEqual('hdfs', HDFS("/tmp").owner(), "ERROR: /tmp dir owner")
        hbase_file = HDFS("/hbase/hbase.id")
        if hbase_file.exists():
            self.assertEqual('hbase', hbase_file.owner(), "ERROR: /hbase/hbase.id file owner")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_modification_time(self):
        now = datetime.now().strftime("%Y-%m-%d")
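        # Compare on date granularity only; note this check can be flaky right around midnight.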
        _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            _dir.create_directory()
            _file.create_file()
            self.assertTrue(_dir.exists(), "Dir was not created")
            self.assertTrue(_file.exists(), "File was not created")
            self.assertEqual(now, _dir.modification_time().strftime("%Y-%m-%d"), "Error: dir modification time")
            self.assertEqual(now, _file.modification_time().strftime("%Y-%m-%d"), "Error: File modification time")
        finally:
            _dir.delete_directory()
            _file.delete()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_distcp(self):
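        # distcp operates on fully-qualified hdfs:// URIs; the sandbox hostname
        # used below is environment-specific.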
        directory = HDFS("/tmp/bar")
        directory.create(directory=True)
        new_file = HDFS("/tmp/test_dist.txt")
        new_file.create(directory=False)
        _host = "sandbox.hortonworks.com"
        try:
            self.assertTrue(new_file.exists(), "File was not created")
            _file = HDFS("hdfs://{host}:8020/tmp/test_dist.txt".format(host=_host))
            _file.distcp(dest="hdfs://{host}:8020/tmp/bar/test_dist.txt".format(host=_host))
            file_after_copy = HDFS("/tmp/bar/test_dist.txt")
            self.assertTrue(file_after_copy.exists(), "File was not copied")
        finally:
            new_file.delete()
            directory.delete(recursive=True)
            self.assertFalse(new_file.exists(), "File was not deleted")
            self.assertFalse(directory.exists(), "Directory was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_description(self):
        directory = HDFS("/tmp/bar")
        try:
            directory.create()
            self.assertEqual(directory.get_description().name, "/tmp/bar")
            self.assertEqual(directory.get_description().size, 0)
            self.assertEqual(directory.get_description().owner, getpass.getuser())
            self.assertEqual(directory.get_description().create_date, None)

        finally:
            directory.delete(recursive=True)
            self.assertFalse(directory.exists(), "Directory was not deleted")