class TestMapReduceJob(TestCase):
    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_mr_job_command_generation_with_arguments(self):
        _job_name = "test_mr_job_%s" % uuid.uuid4()
        _base_dir = HDFS(os.path.join("/tmp", _job_name))
        _base_dir.create_directory()
        try:
            jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce',
                               'hadoop-mapreduce-examples.jar')
            # configure job inputs
            _job_input = HDFS(os.path.join(_base_dir.path, "input"))
            _job_input.create_directory()
            LocalFS(
                os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(_job_input.path)

            # configure job output
            _job_output = HDFS(os.path.join(_base_dir.path, "output"))
            if not os.path.exists(jar):
                self.skipTest("'%s' not found" % jar)

            job = MapReduce.prepare_mapreduce_job(jar=jar,
                                                  main_class="wordcount",
                                                  name=_job_name) \
                .with_config_option("split.by", "'\\t'") \
                .with_number_of_reducers(3) \
                .with_arguments(_job_input.path, _job_output.path)
            _command_submission_result = job.run()
            _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job failed")
            self.assertTrue(_job_output.exists(), "Error: empty job output")
            # check counters
            self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                    counter='HDFS: Number of write operations'))
            self.assertEqual(1, _job_status.counter(group='Job Counters',
                                                    counter='Launched map tasks'))
            self.assertEqual(3, _job_status.counter(group='Job Counters',
                                                    counter='Launched reduce tasks'))
            self.assertEqual(2168, _job_status.counter(group='File Input Format Counters',
                                                       counter='Bytes Read'))
        finally:
            _base_dir.delete_directory()
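
# For orientation only: the fluent builder above should boil down to a plain
# `hadoop jar` call roughly like the sketch below. Treating with_config_option()
# as a -D generic option and with_number_of_reducers(3) as
# -D mapreduce.job.reduces=3 is an assumption about merlin's command rendering,
# not its verified output; the paths are illustrative placeholders.
import merlin.common.shell_command_executor as shell
shell.execute_shell_command(
    'hadoop jar', 'resources/mapreduce/hadoop-mapreduce-examples.jar', 'wordcount',
    "-D split.by='\\t'", '-D mapreduce.job.reduces=3',
    '/tmp/test_mr_job_<uuid>/input', '/tmp/test_mr_job_<uuid>/output')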
import socket

from unittest2 import TestCase, skipUnless

from merlin.tools.distcp import DistCp
import merlin.common.shell_command_executor as shell
from merlin.common.test_utils import has_command


@skipUnless(has_command('hadoop'), "Hadoop client should be installed")
class TestDistCp(TestCase):
    def setUp(self):
        super(TestDistCp, self).setUp()
        shell.execute_shell_command('hadoop fs', '-mkdir /tmp/foo')
        shell.execute_shell_command('hadoop fs', '-mkdir /tmp/bar')
        shell.execute_shell_command('hadoop fs', '-touchz /tmp/foo/test.txt')
        shell.execute_shell_command('hadoop fs', '-touchz /tmp/foo/test2.txt')

    def test_command(self):
        _host = "sandbox.hortonworks.com"
        cmd = DistCp().take(
            path="hdfs://{host}:8020/tmp/foo".format(host=_host)
        ).copy_to(
            path="hdfs://{host}:8020/tmp/bar".format(host=_host)
        ).use(
            mappers=12
        ).update_destination(
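
# The DistCp builder above corresponds, roughly, to the stock distcp CLI, where
# -m caps the mapper count and -update copies only what is missing or changed
# at the destination. This is a sketch of the assumed mapping, not merlin's
# verbatim output:
import merlin.common.shell_command_executor as shell
shell.execute_shell_command(
    'hadoop distcp', '-update', '-m 12',
    'hdfs://sandbox.hortonworks.com:8020/tmp/foo',
    'hdfs://sandbox.hortonworks.com:8020/tmp/bar')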
import socket

from unittest2.case import skipUnless
import unittest2

from merlin.common.test_utils import has_command
from merlin.tools.webhcat import WebHCatalog, TableProperties
import merlin.common.shell_command_executor as shell

WEBHCAT_IS_INSTALLED_AND_ENABLED = True
HOST = "{0}:50111".format("sandbox.hortonworks.com")
USER = "******"


@skipUnless(has_command('hive') and WEBHCAT_IS_INSTALLED_AND_ENABLED,
            "Hive and WebHCatalog clients should be installed and enabled")
class TestWebHCatalog(unittest2.TestCase):
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command('hive -e "drop database if EXISTS testdb CASCADE"')
        shell.execute_shell_command('hive -e "create database testdb"')
        c = 'hive -e "create table testdb.some_table(strings STRING) ' \
            'ROW FORMAT DELIMITED ' \
            'FIELDS TERMINATED BY \\",\\" ' \
            'STORED AS TEXTFILE"'
        shell.execute_shell_command(c)

    def test_get_property(self):
        output = WebHCatalog(host=HOST, username=USER).table_properties(
            database="testdb", table="some_table").get_property("table")
        self.assertEqual(output, "some_table")
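
# Under the hood the WebHCatalog wrapper presumably hits the standard WebHCat
# (Templeton) DDL endpoint; a hand-rolled equivalent would look roughly like
# this. The URL layout comes from the stock WebHCat REST API, not from
# merlin's source, and the response shape shown is only indicative:
import json
import urllib2

url = "http://{0}/templeton/v1/ddl/database/testdb/table/some_table/property?user.name={1}".format(HOST, USER)
response = json.load(urllib2.urlopen(url))  # e.g. {"database": "testdb", "table": "some_table", "properties": {...}}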
from tempfile import NamedTemporaryFile
import uuid

import unittest2
from unittest2.case import skipUnless, expectedFailure

from merlin.common.exceptions import HiveCommandError
from merlin.common.shell_command_executor import execute_shell_command
import merlin.fs.cli.hdfs_commands as hdfs
from merlin.tools.hive import Hive
from merlin.common.test_utils import has_command
import merlin.common.shell_command_executor as shell


@skipUnless(has_command('hive'), "Hive client should be installed")
class TestExecuteQuery(unittest2.TestCase):
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command('hive -e "drop database if EXISTS testdb CASCADE;"')

    def test_execute_query_string(self):
        hive = Hive.load_queries_from_string("show tables") \
            .with_hive_conf("A", "B") \
            .add_hivevar("A", "B") \
            .define_variable("A", "B")
        res = hive.run()
        self.assertEqual(res.is_ok(), True)

    def test_create_database(self):
        db_exist = False
        try:
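
# For orientation: the fluent calls in test_execute_query_string should map
# onto the stock hive CLI switches roughly as below. The mapping is an
# assumption about merlin's command rendering; the flags themselves
# (--hiveconf, --hivevar, --define, -e) are standard hive CLI options:
execute_shell_command(
    'hive', '--hiveconf A=B', '--hivevar A=B', '--define A=B', '-e "show tables"')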
class TestSparkAppSubmit(TestCase):
    masters = [SparkMaster.local(),
               SparkMaster.local(1),
               SparkMaster.yarn_client(),
               SparkMaster.yarn_cluster()]
    input_path = os.path.join(os.path.dirname(__file__), "resources", "spark", "input.txt")

    def _spark_application_template_(self, master):
        return SparkApplication().application(
            application_jar=os.path.join(os.path.dirname(__file__),
                                         "resources", "spark", "SparkExample.jar"),
            main_class="example.spark.WordCounter").master(master)

    def spark_app_config_template(self, master, name=str(uuid.uuid4())):
        _config = Configuration.create()
        _config.set(section=name, key=TaskOptions.SPARK_APP_CONFIG_MASTER, value=master)
        _config.set(section=name,
                    key=TaskOptions.SPARK_APP_CONFIG_APPLICATION_JAR,
                    value=os.path.join(os.path.dirname(__file__),
                                       "resources", "spark", "SparkExample.jar"))
        _config.set(section=name,
                    key=TaskOptions.SPARK_APP_CONFIG_MAIN_CLASS,
                    value="example.spark.WordCounter")
        return _config

    @skipUnless(has_command('spark-submit'), "Cannot find spark-submit command-line utility")
    def test_spark_app_submit(self):
        # self.run_test(application=self._spark_application_template_(SparkMaster.local()))
        self._run_(application=self._spark_application_template_(SparkMaster.local(1)))
        # self.run_test(application=self._spark_application_template_(SparkMaster.yarn_cluster()))
        # self.run_test(application=self._spark_application_template_(SparkMaster.yarn_client()))

    @skipUnless(has_command('spark-submit'), "Cannot find spark-submit command-line utility")
    def test_preconfigured_spark_app_submit(self):
        section = str(uuid.uuid4())
        _app_config = self.spark_app_config_template(master=SparkMaster.local(1), name=section)
        self._run_(application=SparkApplication(config=_app_config, name=section))

    def _run_(self, application, test_id=str(uuid.uuid4())):
        basedir = LocalFS(os.path.join("/tmp", "test_spark", test_id))
        try:
            basedir.create_directory()
            _app_input = self.input_path
            _app_output_dir = os.path.join(basedir.path, "output")
            status = application.run('file:' + _app_input, 'file:' + _app_output_dir)
            self.assertTrue(status.is_ok(), status.stderr())
            self.assertTrue(os.path.exists(_app_output_dir), status.stderr())
        finally:
            basedir.delete_directory()
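
# Rough CLI shape of what SparkApplication.run() submits for the local[1]
# master. The flag names come from the stock spark-submit interface; how
# merlin assembles them is an assumption, and <test_id> stands in for the
# generated uuid:
import merlin.common.shell_command_executor as shell
shell.execute_shell_command(
    'spark-submit', '--master local[1]', '--class example.spark.WordCounter',
    'resources/spark/SparkExample.jar',
    'file:resources/spark/input.txt', 'file:/tmp/test_spark/<test_id>/output')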
class TestMapReduceStreamingJob(TestCase):
    @classmethod
    def setUpClass(cls):
        path_to_file = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'mapper.py')
        cls._delete_carriage_return(path_to_file)
        path_to_file = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'reducer.py')
        cls._delete_carriage_return(path_to_file)

    @staticmethod
    def _delete_carriage_return(path_to_file):
        # strip Windows line endings so the streaming scripts run on the cluster
        with open(path_to_file + ".new", 'w') as new_file:
            for line in open(path_to_file):
                new_file.write(line.replace('\r', ''))
        os.remove(path_to_file)
        os.rename(path_to_file + ".new", path_to_file)

    def _run_and_assert(self, job):
        command_result = job.run()
        command_result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
        _job_status = job.status()
        self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        return _job_status

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_streaming_job(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # check counters
            self.assertEqual(740,
                             _job_status.counter(group='Map-Reduce Framework', counter='Spilled Records'),
                             "counters['Map-Reduce Framework']['Spilled Records']")
            self.assertEqual(143,
                             _job_status.counter(group='Map-Reduce Framework', counter='Reduce output records'),
                             "counters['Map-Reduce Framework']['Reduce output records']")
            self.assertEqual(370,
                             _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_streaming_job_with_multiple_inputs(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path)
            _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
            _additional_datasource.create_directory()
            LocalFS(
                os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(_additional_datasource.path)
            job.take(_additional_datasource.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # check counters
            self.assertEqual(740,
                             _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_streaming_job_without_reducer(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path, map_only_job=True)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("Cannot run map-only job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # check counters
            self.assertEqual(2, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(11, _job_status.counter(group='Map-Reduce Framework', counter='Map input records'))
            self.assertEqual(3252, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _job_basedir.delete_directory()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
        if not os.path.exists(HADOOP_STREAMING_JAR):
            self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
        _hdfs_basedir = HDFS(base_dir)
        if not _hdfs_basedir.exists():
            _hdfs_basedir.create_directory()
        _job_input = HDFS(os.path.join(_hdfs_basedir.path, "input"))
        _job_input.create_directory()
        _job_output = HDFS(os.path.join(_hdfs_basedir.path, "output"))
        home = os.path.dirname(__file__)
        _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
        _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')
        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(_job_input.path)
        return MapReduce.prepare_streaming_job(
            name="test-mr-streaming-job{}".format(str(uuid.uuid4())),
            jar=HADOOP_STREAMING_JAR) \
            .take(_job_input.path) \
            .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \
            .save(_job_output.path)
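
# Rough CLI equivalent of the job returned by _template_streaming_job_. The
# option names come from the stock hadoop-streaming interface (a map-only run
# would drop -reducer and set the reduce count to 0); how merlin orders them
# is an assumption, and the paths are illustrative placeholders:
import merlin.common.shell_command_executor as shell
shell.execute_shell_command(
    'hadoop jar', HADOOP_STREAMING_JAR,
    '-input /tmp/<uuid>/input', '-output /tmp/<uuid>/output',
    '-file resources/mapreduce/mapper.py', '-file resources/mapreduce/reducer.py',
    '-mapper mapper.py', '-reducer reducer.py')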
MYSQLDUMP = True
MYSQL_SERVER = "sandbox.hortonworks.com"
BASE_DIR = "/tmp"
RUN_TEST = True
HBASE_IS_RUNNING = True
HIVE_IS_RUNNING = True
ZOOKEEPER_IS_RUNNING = False
USER = "******"
# Must be equal to the text in resources/rdbms.password
PASSWORD = "******"


@skipUnless(has_command('sqoop') and has_command('mysql') and RUN_TEST,
            "sqoop and mysql clients should be installed, mysql must have user 'root' "
            "with password 'root', and a jar with the JDBC driver must be added to /var/lib/sqoop")
class TestSqoop(TestCase):
    @classmethod
    def setUpClass(cls):
        shell.execute_shell_command(
            'hadoop fs', '-rm -r {0}/rdbms.password'.format(BASE_DIR))
        shell.execute_shell_command(
            'hadoop fs', '-copyFromLocal',
            os.path.join(os.path.dirname(__file__), 'resources/sqoop/rdbms.password'),
            BASE_DIR)
        shell.execute_shell_command(
            'hadoop fs', '-mkdir', os.path.join(BASE_DIR, "data_custom_directory"))
        shell.execute_shell_command(
            'hadoop fs', '-copyFromLocal',
            os.path.join(os.path.dirname(__file__), 'resources/sqoop/data_to_export.txt'),
            os.path.join(BASE_DIR, "data_custom_directory"))
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} -e'.format(USER, PASSWORD, MYSQL_SERVER),
            "'DROP DATABASE IF EXISTS sqoop_tests'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} -e'.format(USER, PASSWORD, MYSQL_SERVER),
            "'CREATE DATABASE sqoop_tests'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'CREATE TABLE IF NOT EXISTS table_name(id INT(11) NOT NULL AUTO_INCREMENT,"
            "last_name varchar(255) NOT NULL, first_name varchar(255), city varchar(255),"
            "PRIMARY KEY (id))'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'CREATE TABLE IF NOT EXISTS table_name_second(id INT(11) NOT NULL AUTO_INCREMENT,"
            "last_name varchar(255) NOT NULL, first_name varchar(255), city varchar(255),"
            "PRIMARY KEY (id))'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "'CREATE TABLE IF NOT EXISTS stag(id INT(11) NOT NULL AUTO_INCREMENT,"
            "last_name varchar(255) NOT NULL, first_name varchar(255), city varchar(255),"
            "PRIMARY KEY (id))'")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name) VALUES ('Bob')\"")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name, first_name, city) VALUES ('Alex','Log','New York')\"")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name, first_name, city) VALUES ('Merry','Log','New York')\"")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"INSERT INTO table_name (last_name, first_name, city) VALUES ('Bob','Log','New York')\"")
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                USER, PASSWORD, MYSQL_SERVER),
            "\"delimiter //\ncreate procedure p(in p_id INT, in p_last_name varchar(255), "
            "in p_first_name varchar(255), in p_city varchar(255)) begin insert into table_name_second("
            "id, last_name, first_name, city) values(p_id,p_last_name,p_first_name,p_city);\nend//\"")

    def test_import_table(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopImport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).from_rdbms().table(
                table="table_name", where="id>2", columns="id,last_name"
            ).to_hdfs(target_dir="{0}/custom_directory".format(BASE_DIR)).run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_export_table(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)).run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'SELECT * FROM table_name_second'")
            self.assertNotEqual(result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM table_name_second'")

    @skipUnless(MYSQLDUMP, "mysqldump utility should be installed on individual node machines")
    def test_export_table_with_direct_mode(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)
            ).with_direct_mode().run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'SELECT * FROM table_name_second'")
            self.assertNotEqual(result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM table_name_second'")

    def test_export_table_with_batch(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)
            ).with_batch().with_hadoop_properties(
                sqoop_export_records_per_statement="10").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'SELECT * FROM table_name_second'")
            self.assertNotEqual(result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM table_name_second'")

    def test_export_table_with_encoding(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)
            ).with_encoding(input_null_string="NNN").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'SELECT * FROM table_name_second'")
            self.assertNotEqual(result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM table_name_second'")

    def test_export_table_with_staging(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)
            ).with_staging_table(staging_table="stag").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'SELECT * FROM table_name_second'")
            self.assertNotEqual(result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM table_name_second'")
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM stag'")

    def test_export_table_with_call(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).to_rdbms().from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)
            ).call(stored_procedure="p").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'SELECT * FROM table_name_second'")
            self.assertNotEqual(result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM table_name_second'")

    def test_export_table_with_update(self):
        try:
            metastore = IniFileMetaStore(file=os.path.join(
                os.path.dirname(__file__), 'resources/sqoop/custom.ini'))
            cmd = SqoopExport.load_preconfigured_job(
                config=Configuration.load(metastore=metastore,
                                          readonly=False,
                                          accepts_nulls=True)
            ).to_rdbms().table(table="table_name_second").from_hdfs(
                export_dir="{0}/data_custom_directory".format(BASE_DIR)
            ).with_update(update_key="id", update_mode="allowinsert").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'SELECT * FROM table_name_second'")
            self.assertNotEqual(result.stdout.split(' ')[0], 'Empty', result.stdout)
        finally:
            shell.execute_shell_command(
                'mysql --user {0} --password={1} --host={2} sqoop_tests -e'.format(
                    USER, PASSWORD, MYSQL_SERVER),
                "'DELETE FROM table_name_second'")

    def test_import_query(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests"
            ).query(
                query="'SELECT * FROM table_name WHERE $CONDITIONS AND id>$id'",
                split_by="id", id="2"
            ).to_hdfs(target_dir="{0}/custom_directory".format(BASE_DIR)).run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_to_sequencefile(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).use_file_format(file_format="--as-sequencefile").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_to_avrodatafile(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).use_file_format(file_format="--as-avrodatafile").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*.avro'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    @skipUnless(MYSQLDUMP, "mysqldump utility should be installed on individual node machines")
    def test_import_with_direct_mode(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).with_direct_mode().run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_compress(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests"
            ).table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).with_compress(
                compression_codec="org.apache.hadoop.io.compress.BZip2Codec"
            ).run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*.bz2'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_incremental(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)).run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).with_incremental(incremental="append", last_value="5", check_column="id").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_null(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).with_encoding(null_string="N").run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_connection_manager(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).with_attr(
                connection_manager="org.apache.sqoop.manager.MySQLManager"
            ).run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    def test_import_with_enclosing(self):
        try:
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hdfs(
                target_dir="{0}/custom_directory".format(BASE_DIR)
            ).with_input_parsing(escaped_by="\\").with_output_parsing(
                escaped_by="\\", mysql_delimiters=True).run()
            self.assertEqual(cmd.status, 0, cmd.stderr)
            result = shell.execute_shell_command(
                'hadoop fs', '-du -s {0}/custom_directory/part-m-*'.format(BASE_DIR))
            self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command(
                'hadoop fs', '-rm -r {0}/custom_directory'.format(BASE_DIR))

    @skipUnless(has_command('hbase') and HBASE_IS_RUNNING and ZOOKEEPER_IS_RUNNING,
                "hbase client should be installed and HBase/ZooKeeper should be running")
    def test_import_to_hbase(self):
        cmd = Sqoop.import_data().from_rdbms(
            host=MYSQL_SERVER, rdbms="mysql", username="******",
            password_file="{0}/rdbms.password".format(BASE_DIR),
            database="sqoop_tests").table(table="table_name").to_hbase(
            hbase_table="custom_table", hbase_create_table=True,
            hbase_row_key="id", column_family="f1").run()
        self.assertEqual(cmd.status, 0, cmd.stderr)
        # HDP clusters keep HBase data under a different path
        # result = shell.execute_shell_command('hadoop fs', '-du -s /user/hive/warehouse/table_name/part-m-*')
        # self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)

    @skipUnless(has_command('hive') and HIVE_IS_RUNNING, "hive client should be installed")
    def test_import_to_hive(self):
        _path = HDFS(os.path.join('/user', getpass.getuser(), 'table_name'))
        try:
            if _path.exists():
                _path.delete(recursive=_path.is_directory())
            # shell.execute_shell_command('hadoop fs', '-rm -r /user/', getpass.getuser(), '/table_name')
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER, rdbms="mysql", username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests").table(table="table_name").to_hive().run()
            # self.assertEqual(cmd.status, 0, cmd.stderr)
            # result = shell.execute_shell_command('hadoop fs', '-du -s /user/hive/warehouse/table_name/part-m-*')
            # self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command('hive', "-e 'DROP TABLE IF EXISTS table_name'")

    @classmethod
    def tearDownClass(cls):
        shell.execute_shell_command(
            'hadoop fs', '-rm -r {0}/rdbms.password'.format(BASE_DIR))
        shell.execute_shell_command(
            'hadoop fs', '-rm -r {0}/data_custom_directory'.format(BASE_DIR))
        shell.execute_shell_command(
            'mysql --user {0} --password={1} --host={2} -e'.format(USER, PASSWORD, MYSQL_SERVER),
            "'DROP DATABASE IF EXISTS sqoop_tests'")
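
# Rough CLI shape of the simplest preconfigured import above. The flag names
# come from the stock sqoop interface; how merlin merges custom.ini into the
# final command line is an assumption:
shell.execute_shell_command(
    'sqoop import',
    '--connect jdbc:mysql://{0}/sqoop_tests'.format(MYSQL_SERVER),
    '--username {0}'.format(USER),
    '--password-file {0}/rdbms.password'.format(BASE_DIR),
    '--table table_name', '--where "id>2"', '--columns "id,last_name"',
    '--target-dir {0}/custom_directory'.format(BASE_DIR))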
from tempfile import NamedTemporaryFile
import uuid

from unittest2.case import TestCase, skipUnless, skip

from merlin.common.configurations import Configuration
from merlin.common.shell_command_executor import execute_shell_command
from merlin.fs.hdfs import HDFS
from merlin.tools.pig import Pig, TaskOptions
from merlin.common.test_utils import has_command

TEZ_MODE_ENABLE = True


@skipUnless(has_command("pig"), "pig client should be installed")
class TestPigClient(TestCase):
    @skip("Parameter substitution is not supported when running commands from a string")
    def test_run_commands_from_string(self):
        _test_id = str(uuid.uuid4())
        _output_dir = "/tmp/data_{}".format(_test_id)
        _input_dir = self.copy_file_from_local(self.temp_file("hello,world,world", ".txt"))
        commands = "A = load '$input_dir' using PigStorage(',');"
        commands += "B = foreach A generate \$0 as id;"
        commands += "STORE B into '$output_dir';"
        try:
            _pig = (
                Pig.load_commands_from_string(commands)
                .with_parameter("input_dir", _input_dir)
                .with_parameter("output_dir", _output_dir)
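
# On the plain pig CLI, parameters are passed with -param and are honored for
# script files run with -f, which is presumably why the string-based variant
# above is skipped. A sketch with a hypothetical wordcount.pig script (the
# file name and paths are illustrative, not from merlin):
execute_shell_command(
    'pig', '-param input_dir=/tmp/input.txt',
    '-param output_dir=/tmp/data_<uuid>', '-f wordcount.pig')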
class KafkaThreadBroker(threading.Thread):
    def run(self):
        Kafka.start_broker(path_to_config=os.path.join(os.path.dirname(__file__),
                                                       'resources/kafka/server.properties'))


class KafkaThreadProducer(threading.Thread):
    def run(self):
        Kafka.run_producer(name='kafka.producer.ConsoleProducer',
                           args=["--broker-list", "{0}:{1}".format(CLUSTER_NAME, PORT),
                                 "--topic", "test123"])


class KafkaThreadConsumer(threading.Thread):
    def run(self):
        Kafka.run_consumer(name='kafka.consumer.ConsoleConsumer',
                           args=["--zookeeper sandbox.hortonworks.com:2181 "
                                 "--from-beginning --topic test123"])


@skipUnless(has_command("kafka-run-class.sh") and has_command("netstat"),
            "kafka/bin should be added to $PATH and netstat should be installed")
class TestKafka(unittest2.TestCase):
    def test_broker(self):
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
        local = LocalFS("/tmp/kafka-test")
        if not local.exists():
            local.create_directory()
        thread = KafkaThreadBroker()
        thread.daemon = True
        thread.start()
        sleep(TIME)
        cmd = shell.execute_shell_command('netstat -lntu')
        self.assertTrue(str(PORT) in cmd.stdout, cmd.stdout)
        local.delete_directory()
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
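
# Kafka.run_producer/run_consumer presumably delegate to the stock launcher
# script; a direct invocation would look roughly like this (the delegation is
# an assumption about merlin, not verified against its source):
import merlin.common.shell_command_executor as shell
shell.execute_shell_command(
    'kafka-run-class.sh', 'kafka.producer.ConsoleProducer',
    '--broker-list sandbox.hortonworks.com:9010', '--topic test123')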
from merlin.tools.flume import Flume
import merlin.common.shell_command_executor as shell

PORT = 41414
TIME_TO_OPEN_PORT = 60


class AgentThread(threading.Thread):
    def run(self):
        Flume.agent(agent="a1",
                    conf_file=os.path.join(os.path.dirname(__file__),
                                           'resources/flume/flume.conf')).run()


@skipUnless(has_command('flume-ng') and has_command('netstat'),
            "flume-ng and netstat clients should be installed")
class TestFlume(unittest2.TestCase):
    def setUp(self):
        super(TestFlume, self).setUp()
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))

    def test_agent(self):
        thread = AgentThread()
        thread.daemon = True
        thread.start()
        time.sleep(TIME_TO_OPEN_PORT)
        cmd = shell.execute_shell_command('netstat -lntu')
        self.assertTrue(str(PORT) in cmd.stdout, cmd.stdout)

    def tearDown(self):
        shell.execute_shell_command('fuser -k -n tcp {0}'.format(PORT))
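
# Flume.agent(...).run() presumably wraps the stock launcher; a direct call
# would look roughly like this (flag names from the flume-ng CLI, the wrapping
# itself is an assumption about merlin):
shell.execute_shell_command(
    'flume-ng agent', '--name a1',
    '--conf-file resources/flume/flume.conf')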
class TestHDFS(TestCase):
    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_exists(self):
        self.assertTrue(HDFS("/tmp").exists())
        self.assertFalse(HDFS("/tmp_12345").exists())

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_is_dir(self):
        self.assertTrue(HDFS("/tmp").is_directory(), "/tmp is not a dir")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_list_files(self):
        basedir = HDFS("/tmp")
        new_file = HDFS("/tmp/test.txt")
        try:
            new_file.create(directory=False)
            self.assertTrue(new_file.exists(), "File was not created")
            files = basedir.list_files()
            self.assertTrue(new_file in files)
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_recursive_list_files(self):
        basedir = HDFS("/tmp")
        new_folder = HDFS("/tmp/test123")
        new_file = HDFS("/tmp/test123/test.txt")
        try:
            new_folder.create(directory=True)
            self.assertTrue(new_folder.exists(), "Folder was not created")
            new_file.create(directory=False)
            self.assertTrue(new_file.exists(), "File was not created")
            files = basedir.recursive_list_files()
            self.assertTrue(new_file in files)
            self.assertTrue(new_folder in files)
        finally:
            new_folder.delete(recursive=True)
            self.assertFalse(new_file.exists(), "File was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_create(self):
        new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        # test new file creation
        try:
            new_file.create(directory=False)
            self.assertTrue(new_file.exists(), "File was not created")
            self.assertFalse(new_file.is_directory(), "New file should not be a directory")
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not removed")
        # test new folder creation
        try:
            new_dir.create(directory=True)
            self.assertTrue(new_dir.exists(), "Directory was not created")
            self.assertTrue(new_dir.is_directory(), "New file should be a directory")
        finally:
            new_dir.delete(recursive=True)
            self.assertFalse(new_dir.exists(), "Directory was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_create_directory(self):
        new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        self.assertFalse(new_dir.exists(), "Directory already exists")
        try:
            new_dir.create_directory()
            self.assertTrue(new_dir.exists(), "Directory was not created")
            self.assertTrue(new_dir.is_directory())
        finally:
            new_dir.delete(recursive=True)
            self.assertFalse(new_dir.exists(), "Directory was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_create_file(self):
        new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        self.assertFalse(new_file.exists(), "File already exists")
        try:
            new_file.create_file()
            self.assertTrue(new_file.exists(), "File was not created")
            self.assertFalse(new_file.is_directory(), "New file should not be a folder")
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def should_create_directory_recursively(self):
        _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
        _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
        _dir = HDFS(_path)
        self.assertFalse(_dir.exists(), "Folder already exists")
        try:
            _dir.create_directory(recursive=True)
            self.assertTrue(_dir.exists(), "Folder was not created")
            self.assertTrue(_dir.is_directory(), "New file should be a directory")
        finally:
            HDFS(_base_dir).delete_directory()
            self.assertFalse(_dir.exists(), "File was not removed")
            self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def should_create_file_recursively(self):
        _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
        _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()), 'file.txt')
        _file = HDFS(_path)
        self.assertFalse(_file.exists(), "File already exists")
        try:
            _file.create_file(recursive=True)
            self.assertTrue(_file.exists(), "File was not created")
            self.assertFalse(_file.is_directory(), "New file should not be a directory")
        finally:
            HDFS(_base_dir).delete_directory()
            self.assertFalse(_file.exists(), "File was not removed")
            self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def should_raise_error_mkdir_not_recursive(self):
        _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
        _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
        _dir = HDFS(_path)
        self.assertFalse(HDFS(_base_dir).exists(), "Folder already exists")
        try:
            self.assertRaises(FileSystemException, _dir.create_directory, recursive=False)
        finally:
            self.assertFalse(_dir.exists(), "File was created")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_to_local(self):
        new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        local_path = os.path.join("/tmp", "copied_from_hdfs")
        self.assertFalse(os.path.exists(local_path))
        try:
            new_file.create_file()
            self.assertTrue(new_file.exists(), "File was not created")
            new_file.copy_to_local(local_path)
            self.assertTrue(os.path.exists(local_path), "File was not copied from HDFS")
        finally:
            new_file.delete()
            self.assertFalse(new_file.exists(), "File was not removed")
            os.remove(local_path)
            self.assertFalse(os.path.exists(local_path))

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_file(self):
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            _file.create_file()
            self.assertTrue(_file.exists(), "original file not found")
            self.assertFalse(dst.exists(), "destination file already exists")
            _file.copy(dst)
            self.assertTrue(dst.exists(), "file was not copied")
            self.assertTrue(_file.exists(), "original file should not be deleted")
        finally:
            _file.delete()
            dst.delete()
            self.assertFalse(_file.exists(), "File was not deleted")
            self.assertFalse(dst.exists(), "destination file was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_empty_dir(self):
        _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        try:
            _dir.create(directory=True)
            self.assertTrue(_dir.exists(), "directory not found")
            self.assertFalse(dst.exists(), "dst directory already exists")
            _dir.copy(dst)
            self.assertTrue(dst.exists(), "directory was not copied")
        finally:
            _dir.delete(True)
            dst.delete(True)
            self.assertFalse(_dir.exists(), "Directory was not deleted")
            self.assertFalse(dst.exists(), "Dst directory was not deleted")

    def _create_non_empty_dir_(self, path):
        _dir = HDFS(path)
        _dir.create_directory()
        self.assertTrue(_dir.exists(), "source directory not found")
        for i in range(5):
            _file = HDFS(os.path.join(path, str(uuid.uuid4())))
            _file.create(directory=(i % 2 == 0))
            self.assertTrue(_file.exists(), "File was not created")
        return _dir

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_copy_non_empty_dir(self):
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        _dir = None
        try:
            _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
            self.assertFalse(dst.exists(), "dst directory already exists")
            _dir.copy(dst)
            self.assertTrue(dst.exists(), "directory was not copied")
            self.assertTrue(_dir.exists(), "original directory should not be deleted")
        finally:
            if _dir:
                _dir.delete_directory()
                self.assertFalse(_dir.exists(), "Folder was not deleted")
            dst.delete_directory()
            self.assertFalse(dst.exists(), "Dst folder was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_move_file(self):
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            _file.create_file()
            self.assertTrue(_file.exists(), "File was not created")
            self.assertFalse(dst.exists(), "Destination file should not exist")
            _file.move(dst.path)
            self.assertFalse(_file.exists(), "Original file should be deleted")
            self.assertTrue(dst.exists(), "Destination file should be created")
        finally:
            _file.delete()
            dst.delete()
            self.assertFalse(_file.exists(), "File was not deleted")
            self.assertFalse(dst.exists(), "destination file was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_move_empty_dir(self):
        _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        try:
            _dir.create(directory=True)
            self.assertTrue(_dir.exists(), "directory not found")
            self.assertFalse(dst.exists(), "destination directory already exists")
            _dir.move(dst.path)
            self.assertFalse(_dir.exists(), "Original directory was not removed")
            self.assertTrue(dst.exists(), "destination directory was not created")
        finally:
            _dir.delete(True)
            dst.delete(True)
            self.assertFalse(_dir.exists(), "Directory was not deleted")
            self.assertFalse(dst.exists(), "Dst directory was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_move_non_empty_dir(self):
        dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
        _dir = None
        try:
            _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
            self.assertFalse(dst.exists(), "dst directory already exists")
            _dir.move(dst.path)
            self.assertFalse(_dir.exists(), "original directory should be deleted")
            self.assertTrue(dst.exists(), "directory move operation failed")
        finally:
            if _dir:
                _dir.delete_directory()
                self.assertFalse(_dir.exists(), "Folder was not deleted")
            dst.delete_directory()
            self.assertFalse(dst.exists(), "Dst folder was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_file_size(self):
        local = LocalFS(os.path.realpath(__file__))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            local.copy_to_hdfs(hdfs_file.path)
            self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
            self.assertEqual(hdfs_file.size(), local.size())
        finally:
            hdfs_file.delete()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_dir_size(self):
        local_basedir = os.path.dirname(os.path.realpath(__file__))
        local = LocalFS(os.path.join(local_basedir, "resources", "test_dir_size"))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            local.copy_to_hdfs(hdfs_file.path)
            self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
            expected_fsize = local.size()
            self.assertEqual(hdfs_file.size(), expected_fsize)
        finally:
            hdfs_file.delete(recursive=True)

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_merge(self):
        basedir = os.path.dirname(os.path.realpath(__file__))
        local = LocalFS(os.path.join(basedir, "resources", "test_merge"))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        merged_file = LocalFS(os.path.join(basedir, "resources", "merged.txt"))
        try:
            local.copy_to_hdfs(hdfs_file.path)
            self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
            hdfs_file.merge(merged_file.path)
            self.assertTrue(merged_file.exists(), "merged file was not copied to local fs")
        finally:
            hdfs_file.delete_directory()
            merged_file.delete()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_delete_file(self):
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        _file.create_file()
        self.assertTrue(_file.exists(), "Target file can not be found")
        _file.delete()
        self.assertFalse(_file.exists(), "Target file was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_delete_dir(self):
        local = LocalFS(os.path.dirname(os.path.realpath(__file__)))
        hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        local.copy_to_hdfs(hdfs_file.path)
        self.assertTrue(hdfs_file.exists(), "Target HDFS dir does not exist")
        hdfs_file.delete(recursive=True)
        self.assertFalse(hdfs_file.exists(), "Target HDFS dir was not deleted")

    # todo FIXIT: file permissions can differ between hadoop versions
    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_permissions(self):
        self.assertEqual("drwxr-xr-x", HDFS("/").permissions(),
                         "Root dir permissions should be 'drwxr-xr-x'")
        # Permissions of the '/tmp' folder differ between CDH versions
        # self.assertEqual("drwxrwxrwt", HDFS("/tmp").permissions(), "Tmp dir permissions should be 'drwxrwxrwt'")
        hbase_file = HDFS("/hbase/hbase.id")
        if hbase_file.exists():
            self.assertEqual("-rw-r--r--", hbase_file.permissions(),
                             "/hbase/hbase.id permissions should be '-rw-r--r--'")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_replicas(self):
        self.assertEqual('0', HDFS("/").replicas(), "Root dir replicas should be 0")
        self.assertNotEqual('0', HDFS("/tmp").replicas(), "/tmp replicas should not be 0")
        name = uuid.uuid4()
        hdfs_file = HDFS("/tmp/{0}".format(name))
        hdfs_file.create_file()
        shell.execute_shell_command('hadoop dfs', '-setrep -w 1 /tmp/{0}'.format(name))
        if hdfs_file.exists():
            self.assertEqual('1', hdfs_file.replicas(), "Number of replicas must be 1")
            hdfs_file.delete()
            self.assertFalse(hdfs_file.exists())

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_owner(self):
        self.assertEqual('hdfs', HDFS("/").owner(), "ERROR: Root dir owner")
        self.assertEqual('hdfs', HDFS("/tmp").owner(), "ERROR: /tmp dir owner")
        hbase_file = HDFS("/hbase/hbase.id")
        if hbase_file.exists():
            self.assertEqual('hbase', HDFS("/hbase/hbase.id").owner(),
                             "ERROR: /hbase/hbase.id file owner")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_modification_time(self):
        now = datetime.now().strftime("%Y-%m-%d")
        _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            _dir.create_directory()
            _file.create_file()
            self.assertTrue(_dir.exists(), "Dir was not created")
            self.assertTrue(_file.exists(), "File was not created")
            self.assertEqual(now, _dir.modification_time().strftime("%Y-%m-%d"),
                             "Error: dir modification time")
            self.assertEqual(now, _file.modification_time().strftime("%Y-%m-%d"),
                             "Error: file modification time")
        finally:
            _dir.delete_directory()
            _file.delete()

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_distcp(self):
        directory = HDFS("/tmp/bar")
        directory.create()
        new_file = HDFS("/tmp/test_dist.txt")
        new_file.create(directory=False)
        _host = "sandbox.hortonworks.com"
        try:
            self.assertTrue(new_file.exists(), "File was not created")
            _file = HDFS("hdfs://{host}:8020/tmp/test_dist.txt".format(host=_host))
            _file.distcp(dest="hdfs://{host}:8020/tmp/bar/test_dist.txt".format(host=_host))
            file_after_copy = HDFS("/tmp/bar/test_dist.txt")
            self.assertTrue(file_after_copy.exists(), "File was not copied")
        finally:
            new_file.delete()
            directory.delete(recursive=True)
            self.assertFalse(new_file.exists(), "File was not deleted")
            self.assertFalse(directory.exists(), "Directory was not deleted")

    @skipUnless(has_command('hadoop'), "Hadoop client should be installed")
    def test_get_description(self):
        directory = HDFS("/tmp/bar")
        try:
            directory.create()
            self.assertEqual(directory.get_description().name, "/tmp/bar")
            self.assertEqual(directory.get_description().size, 0)
            self.assertEqual(directory.get_description().owner, getpass.getuser())
            self.assertEqual(directory.get_description().create_date, None)
        finally:
            directory.delete(recursive=True)
            self.assertFalse(directory.exists(), "Directory was not deleted")
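
# For orientation: the HDFS(...) operations exercised above are, presumably,
# thin wrappers over plain `hadoop fs` calls such as these. The exact mapping
# is an assumption about merlin's implementation; the flags themselves are
# standard hadoop fs options and the paths are illustrative:
shell.execute_shell_command('hadoop fs', '-test -e /tmp')                   # exists()
shell.execute_shell_command('hadoop fs', '-mkdir -p /tmp/a/b')              # create_directory(recursive=True)
shell.execute_shell_command('hadoop fs', '-copyToLocal /tmp/f /tmp/local')  # copy_to_local()
shell.execute_shell_command('hadoop fs', '-mv /tmp/a /tmp/b')               # move()
shell.execute_shell_command('hadoop fs', '-du -s /tmp')                     # size()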