Example #1
 def test_delete_dir(self):
     local = LocalFS(os.path.dirname(os.path.realpath(__file__)))
     hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     local.copy_to_hdfs(hdfs_file.path)
     self.assertTrue(hdfs_file.exists(), "Target HDFS dir does not exist")
     hdfs_file.delete(recursive=True)
     self.assertFalse(hdfs_file.exists(), "Target HDFS dir was not deleted")
Example #2
def apply_hdfs_snapshot(config):
    """Creates initial directory structure on HDFS and applies ACL rules """
    _hdfs_snapshot = FsSnapshot.load_from_config(
        config, fs_section=CONFIG_HDFS_DIRS_KEY, acl_section=CONFIG_ACLS_KEY)
    _hdfs_snapshot.apply(
        mkdir_command=lambda path: HDFS(path).create_directory(recursive=True),
        apply_acls_command=lambda path, acls: HDFS(path).apply_acl(acls))
Example #3
 def test_file_size(self):
     local = LocalFS(os.path.realpath(__file__))
     hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     try:
         local.copy_to_hdfs(hdfs_file.path)
         self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
         self.assertEqual(hdfs_file.size(), local.size())
     finally:
         hdfs_file.delete()
Example #4
 def should_raise_error_mkdir_not_recursive(self):
     _base_dir = os.path.join("/tmp", str(uuid.uuid4()))
     _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
     _dir = HDFS(_path)
     self.assertFalse(HDFS(_base_dir).exists(), "Base dir already exists")
     try:
         self.assertRaises(FileSystemException, _dir.create_directory, recursive=False)
     finally:
         self.assertFalse(_dir.exists(), "Directory should not have been created")
Example #5
 def _create_non_empty_dir_(self, path):
     _dir = HDFS(path)
     _dir.create_directory()
     self.assertTrue(_dir.exists(), "source directory not found")
     for i in range(5):
         _file = HDFS(os.path.join(path, str(uuid.uuid4())))
         _file.create(directory=(i % 2 == 0))
         self.assertTrue(_file.exists(), "File was not created")
     return _dir
Example #6
 def test_get_permissions(self):
     self.assertEqual("drwxr-xr-x", HDFS("/").permissions(), "Root dir permissions should be 'drwxr-xr-x'")
     # Permissions to '/tmp' folder are different on different CDH versions
     # self.assertEqual("drwxrwxrwt", HDFS("/tmp").permissions(), "Tmp dir permissions should be 'drwxrwxrwxt'")
     hbase_file = HDFS("/hbase/hbase.id")
     if hbase_file.exists():
         self.assertEqual(
             "-rw-r--r--", hbase_file.permissions(), "/hbase/hbase.id permissions should be '-rw-r--r--'"
         )
Example #7
 def test_create_directory(self):
     new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     self.assertFalse(new_dir.exists(), "Directory already exists")
     try:
         new_dir.create_directory()
         self.assertTrue(new_dir.exists(), "Directory was not created")
         self.assertTrue(new_dir.is_directory())
     finally:
         new_dir.delete(recursive=True)
         self.assertFalse(new_dir.exists(), "Directory was not removed")
Example #8
 def test_create_file(self):
     new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     self.assertFalse(new_file.exists(), "File already exists")
     try:
         new_file.create_file()
         self.assertTrue(new_file.exists(), "File was not created")
         self.assertFalse(new_file.is_directory(), "New file should not be a folder")
     finally:
         new_file.delete()
         self.assertFalse(new_file.exists(), "File was not removed")
Example #9
 def test_dir_size(self):
     local_basedir = os.path.dirname(os.path.realpath(__file__))
     local = LocalFS(os.path.join(local_basedir, "resources", "test_dir_size"))
     hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     try:
         local.copy_to_hdfs(hdfs_file.path)
         self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
         expected_fsize = local.size()
         self.assertEqual(hdfs_file.size(), expected_fsize)
     finally:
         hdfs_file.delete(recursive=True)
Example #10
 def should_create_file_recursively(self):
     _base_dir = os.path.join('/tmp', str(uuid.uuid4()))
     _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()), 'file.txt')
     _file = HDFS(_path)
     self.assertFalse(_file.exists(), "File already exists")
     try:
         _file.create_file(recursive=True)
         self.assertTrue(_file.exists(), "File was not created")
         self.assertFalse(_file.is_directory(), "New file should not be a directory")
     finally:
         HDFS(_base_dir).delete_directory()
         self.assertFalse(_file.exists(), "File was not removed")
         self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")
Example #11
 def test_copy_to_local(self):
     new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     local_path = os.path.join("/tmp", "copied_from_hdfs")
     self.assertFalse(os.path.exists(local_path))
     try:
         new_file.create_file()
         self.assertTrue(new_file.exists(), "File was not created")
         new_file.copy_to_local(local_path)
         self.assertTrue(os.path.exists(local_path), "File was not copied from HDFS")
     finally:
         new_file.delete()
         self.assertFalse(new_file.exists(), "File was not removed")
         os.remove(local_path)
         self.assertFalse(os.path.exists(local_path))
Example #12
 def test_list_files(self):
     basedir = HDFS("/tmp")
     new_file = HDFS("/tmp/test.txt")
     try:
         new_file.create(directory=False)
         self.assertTrue(new_file.exists(), "File was not created")
         files = basedir.list_files()
         self.assertTrue(new_file in files)
     finally:
         new_file.delete()
         self.assertFalse(new_file.exists(), "File was not deleted")
Example #13
    def test_streaming_job_without_reducer(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path, map_only_job=True)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("Cannot run map-only job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())

            #   check counters
            self.assertEqual(2, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(11, _job_status.counter(group='Map-Reduce Framework', counter='Map input records'))
            self.assertEqual(3252, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _job_basedir.delete_directory()
Example #14
    def test_get_description(self):
        directory = HDFS("/tmp/bar")
        try:
            directory.create()
            self.assertEqual(directory.get_description().name, "/tmp/bar")
            self.assertEqual(directory.get_description().size, 0)
            self.assertEqual(directory.get_description().owner, getpass.getuser())
            self.assertEqual(directory.get_description().create_date, None)

        finally:
            directory.delete(recursive=True)
            self.assertFalse(directory.exists(), "Directory was not deleted")
Example #15
    def test_run_preconfigured_job_without_parameters_substitution(self):
        _test_id = str(uuid.uuid4())
        _job_name = "TEST_PIG_{}".format(_test_id)
        _input_dir = self.copy_file_from_local(
            self.temp_file("hello,world,world", ".txt"))
        _output_dir = "/tmp/data_{}".format(_test_id)

        _commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
        _commands += "B = foreach A generate \$0 as id;"
        _commands += "STORE B into '{}';".format(_output_dir)
        # create job configuration. can also be loaded from .ini file
        _config = Configuration.create()
        _config.set(_job_name, TaskOptions.CONFIG_KEY_COMMANDS_STRING,
                    _commands)
        _config.set(_job_name, TaskOptions.CONFIG_KEY_LOG_BRIEF, 'enabled')
        _config.set(
            _job_name, TaskOptions.CONFIG_KEY_PARAMETER_VALUE,
            'input_dir={}\noutput_dir={}'.format(_input_dir, _output_dir))
        try:
            _pig = Pig.load_preconfigured_job(config=_config,
                                              job_name=_job_name)
            _result = _pig.run()
            _result.if_failed_raise(
                AssertionError("test_run_preconfigured_job failed"))
            self.assertTrue(
                HDFS(_output_dir).exists(), "Cannot find job output")
        finally:
            self.delete_file_in_hdfs(_input_dir)
            self.delete_file_in_hdfs(_output_dir)
Example #16
 def test_streaming_job(self):
     _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     try:
         job = self._template_streaming_job_(base_dir=_job_basedir.path)
         command_result = job.run()
         command_result.if_failed_raise(AssertionError("test_streaming_job_generated test failed"))
         _job_status = job.status()
         self.assertTrue(_job_status is not None and _job_status.is_succeeded())
         # counters
         self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Spilled Records'),
                          "counters['Map-Reduce Framework']['Spilled Records']")
         self.assertEqual(143, _job_status.counter(group='Map-Reduce Framework', counter='Reduce output records'),
                          "counters['Map-Reduce Framework']['Reduce output records']")
         self.assertEqual(370, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                          "counters['Map-Reduce Framework']['Reduce input records']")
     finally:
         _job_basedir.delete_directory()
Example #17
    def test_mr_job_command_generation_with_arguments(self):
        _job_name = "test_mr_job_%s" % uuid.uuid4()

        _base_dir = HDFS(os.path.join("/tmp", _job_name))
        _base_dir.create_directory()
        try:
            jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'hadoop-mapreduce-examples.jar')
            # configure job inputs
            _job_input = HDFS(os.path.join(_base_dir.path, "input"))
            _job_input.create_directory()
            LocalFS(os.path.join(
                os.path.dirname(__file__),
                'resources',
                'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(
                _job_input.path
            )

            # configure job output
            _job_output = HDFS(os.path.join(_base_dir.path, "output"))
            if not os.path.exists(jar):
                self.skipTest("'%s' not found" % jar)

            job = MapReduce.prepare_mapreduce_job(jar=jar,
                                                  main_class="wordcount",
                                                  name=_job_name) \
                .with_config_option("split.by", "'\\t'") \
                .with_number_of_reducers(3) \
                .with_arguments(
                _job_input.path,
                _job_output.path
            )
            _command_submission_result = job.run()
            _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
            self.assertTrue(_job_output.exists(), "Error: empty job output")
            #     check counters
            self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                    counter='HDFS: Number of write operations'))
            self.assertEqual(1, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(3, _job_status.counter(group='Job Counters', counter='Launched reduce tasks'))
            self.assertEqual(2168, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _base_dir.delete_directory()
Example #18
 def test_get_replicas(self):
     self.assertEqual("0", HDFS("/").replicas(), "Root dir replicas should be 0")
     self.assertNotEqual("0", HDFS("/tmp").replicas(), "dir replicas should be 0")
     name = uuid.uuid4()
     hdfs_file = HDFS("/tmp/{0}".format(name))
     hdfs_file.create_file()
     shell.execute_shell_command("hadoop dfs", "-setrep -w 1 /tmp/{0}".format(name))
     if hdfs_file.exists():
         self.assertEqual("1", hdfs_file.replicas(), "Number replicas of file must be 1")
         hdfs_file.delete()
         self.assertFalse(hdfs_file.exists())
Example #19
def load_file_from_local_to_hdfs(context):
    context['new_pathes'] = []
    for _file in LocalFS(
            os.path.join(os.path.dirname(__file__), "resources/tmp")):
        HDFS("/tmp/raw/{0}".format(parser_partition(_file.path))) \
            .create(directory=True)
        LocalFS(os.path.join(os.path.dirname(__file__),
                             "resources/tmp/{0}").format(_file.path)) \
            .copy_to_hdfs(hdfs_path="/tmp/raw/{0}/".format(parser_partition(_file.path)))
        context['new_pathes'].append("/tmp/raw/{0}".format(
            parser_partition(_file.path)))
Example #20
    def test_import_to_hive(self):
        _path = HDFS(os.path.join('/user', getpass.getuser(), 'table_name'))
        try:
            if _path.exists():
                _path.delete(recursive=_path.is_directory())
                # shell.execute_shell_command('hadoop fs', '-rm -r /user/', getpass.getuser(), '/table_name')
            cmd = Sqoop.import_data().from_rdbms(
                host=MYSQL_SERVER,
                rdbms="mysql",
                username="******",
                password_file="{0}/rdbms.password".format(BASE_DIR),
                database="sqoop_tests"
            ).table(
                table="table_name"
            ).to_hive().run()

            # self.assertEquals(cmd.status, 0, cmd.stderr)
            # result = shell.execute_shell_command('hadoop fs', '-du -s /user/hive/warehouse/table_name/part-m-*')
            # self.assertNotEqual(result.stdout.split(' ')[0], '0', result.stdout)
        finally:
            shell.execute_shell_command('hive', "-e 'DROP TABLE IF EXISTS table_name'")
Example #21
    def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
        if not os.path.exists(HADOOP_STREAMING_JAR):
            self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
        _hdfs_basedir = HDFS(base_dir)
        if not _hdfs_basedir.exists():
            _hdfs_basedir.create_directory()
        _job_input = HDFS(os.path.join(_hdfs_basedir.path, "input"))
        _job_input.create_directory()
        _job_output = HDFS(os.path.join(_hdfs_basedir.path, "output"))
        home = os.path.dirname(__file__)
        _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
        _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')

        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(
            _job_input.path
        )

        return MapReduce.prepare_streaming_job(name="test-mr-streaming-job{}".format(str(uuid.uuid4())), jar=HADOOP_STREAMING_JAR) \
            .take(_job_input.path) \
            .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \
            .save(_job_output.path)
Example #22
 def test_merge(self):
     basedir = os.path.dirname(os.path.realpath(__file__))
     local = LocalFS(os.path.join(basedir, "resources", "test_merge"))
     hdfs_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     merged_file = LocalFS(os.path.join(basedir, "resources", "merged.txt"))
     try:
         local.copy_to_hdfs(hdfs_file.path)
         self.assertTrue(hdfs_file.exists(), "Local file was not copied to HDFS")
         hdfs_file.merge(merged_file.path)
         self.assertTrue(merged_file.exists(), "merged file was not copied to local fs")
     finally:
         hdfs_file.delete_directory()
         merged_file.delete()
Example #23
 def test_move_non_empty_dir(self):
     dst = HDFS("/tmp/dst_" + str(uuid.uuid4()))
     _dir = None
     try:
         _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
         self.assertFalse(dst.exists(), "dst directory already exists")
         _dir.move(dst.path)
         self.assertFalse(_dir.exists(), "original directory should be deleted")
         self.assertTrue(dst.exists(), "directory move operation failed")
     finally:
         if _dir:
             _dir.delete_directory()
             self.assertFalse(_dir.exists(), "Folder was not deleted")
         dst.delete_directory()
         self.assertFalse(dst.exists(), "Dst Folder was not deleted")
Example #24
 def test_run_commands_from_file(self):
     _test_id = str(uuid.uuid4())
     _inputs = self.copy_file_from_local(
         self.temp_file("hello,world,world", ".txt"))
     commands = "A = load '$input_dir' using PigStorage(',');"
     commands += "B = foreach A generate \$0 as id;"
     commands += "STORE B into '$output_dir';"
     files_s = self.temp_file(commands)
     _output_dir = "/tmp/data_{}".format(_test_id)
     try:
         pig = Pig.load_commands_from_file(files_s) \
             .with_parameter("input_dir", _inputs) \
             .with_parameter("output_dir", _output_dir)
         self.assertTrue(pig.run().is_ok())
         self.assertTrue(HDFS(_output_dir).exists())
     finally:
         self.delete_local(files_s)
         self.delete_file_in_hdfs(_output_dir)
         self.delete_file_in_hdfs(_inputs)
Example #25
    def test_run_commands_from_string_without_param_substitution(self):
        _test_id = str(uuid.uuid4())
        _output_dir = "/tmp/data_{}".format(_test_id)
        _input_dir = self.copy_file_from_local(
            self.temp_file("hello,world,world", ".txt"))

        commands = "A = load '{}' using PigStorage(',');".format(_input_dir)
        commands += "B = foreach A generate \$0 as id;"
        commands += "STORE B into '{}';".format(_output_dir)
        try:
            _pig = Pig.load_commands_from_string(commands)
            _result = _pig.run()
            _result.if_failed_raise(
                AssertionError("test_run_commands_from_string failed"))
            self.assertTrue(
                HDFS(_output_dir).exists(), "Cannot find job output")
        finally:
            self.delete_file_in_hdfs(_input_dir)
            self.delete_file_in_hdfs(_output_dir)
Example #26
    def test_streaming_job_with_multiple_inputs(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:

            job = self._template_streaming_job_(base_dir=_job_basedir.path)

            _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
            _additional_datasource.create_directory()
            LocalFS(os.path.join(os.path.dirname(__file__), 'resources',
                                 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(
                _additional_datasource.path)
            job.take(_additional_datasource.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # check counters
            self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()
Example #27
 def test_recursive_list_files(self):
     basedir = HDFS("/tmp")
     new_folder = HDFS("/tmp/test123")
     new_file = HDFS("/tmp/test123/test.txt")
     try:
         new_folder.create(directory=True)
         self.assertTrue(new_folder.exists(), "Folder was not created")
         new_file.create(directory=False)
         self.assertTrue(new_file.exists(), "File was not created")
         files = basedir.recursive_list_files()
         self.assertTrue(new_file in files)
         self.assertTrue(new_folder in files)
     finally:
         new_folder.delete(recursive=True)
         self.assertFalse(new_file.exists(), "File was not deleted")
Example #28
 def test_get_modification_time(self):
     now = datetime.now().strftime("%Y-%m-%d")
     _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     try:
         _dir.create_directory()
         _file.create_file()
         self.assertTrue(_dir.exists(), "Dir was not created")
         self.assertTrue(_file.exists(), "File was not created")
         self.assertEqual(now, _dir.modification_time().strftime("%Y-%m-%d"), "Error: dir modification time")
         self.assertEqual(now, _file.modification_time().strftime("%Y-%m-%d"), "Error: File modification time")
     finally:
         _dir.delete_directory()
         _file.delete()
Example #29

from merlin.fs.hdfs import HDFS

BASE_DIR = "/tmp"

if __name__ == "__main__":
    # Cleans resources after flow.
    hdfs_file = HDFS("{0}/data_to_export".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)

    hdfs_file = HDFS("{0}/data_from_import".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)

    hdfs_file = HDFS("{0}/rdbms.password".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete()

Example #30
def on_flow_failed(context):
    hdfs_file = HDFS("{0}/raw".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)
Example #31
def load_file_on_hdfs(context):
    _hdfs = HDFS('/tmp/raw')
    context['files_on_HDFS'] = []
    for _file in _hdfs.recursive_list_files():
        if not _file.is_directory():
            context['files_on_HDFS'].append(_file.get_description())
Example #32
 def test_distcp(self):
     directory = HDFS("/tmp/bar")
     directory.create()
     new_file = HDFS("/tmp/test_dist.txt")
     new_file.create(directory=False)
     _host = "sandbox.hortonworks.com"
     try:
         self.assertTrue(new_file.exists(), "File was not created")
         _file = HDFS("hdfs://{host}:8020/tmp/test_dist.txt".format(host=_host))
         _file.distcp(dest="hdfs://{host}:8020/tmp/bar/test_dist.txt".format(host=_host))
         file_after_copy = HDFS("/tmp/bar/test_dist.txt")
         self.assertTrue(file_after_copy.exists(), "File was not copied")
     finally:
         new_file.delete()
         directory.delete(recursive=True)
         self.assertFalse(new_file.exists(), "File was not deleted")
         self.assertFalse(directory.exists(), "Directory was not deleted")
Example #33
    def test_apply_hdfs_snapshot(self):
        _config_file = os.path.join(os.path.dirname(__file__),
                                    'resources',
                                    'bootsrap',
                                    'bootstrap.ini')
        _raw_sales_dir = HDFS('/tmp/raw/sales')
        _raw_users_dir = HDFS('/tmp/raw/users')
        _raw_tmp_dir = HDFS('/tmp/raw/tmp')
        try:
            # run bootstrap script
            metastore = IniFileMetaStore(file=_config_file)
            _config = Configuration.load(metastore)
            apply_hdfs_snapshot(_config)

            # asserts
            # assert directories were created
            self.assertTrue(_raw_sales_dir.exists(), "Directory '/tmp/raw/sales' was not created")
            self.assertTrue(_raw_users_dir.exists(), "Directory '/tmp/raw/users' was not created")
            self.assertTrue(_raw_tmp_dir.exists(), "Directory '/tmp/raw/tmp' was not created")
            # assert acls were applied
            sales_dir_acls = _raw_sales_dir.get_acls()
            users_dir_acls = _raw_users_dir.get_acls()

            self.assertIsNotNone(sales_dir_acls, '/tmp/raw/sales : ACLs were not applied')
            self.assertTrue('group:sys-pii:r-x' in sales_dir_acls, '/tmp/raw/sales : pii acl was not applied')
            self.assertTrue('group:sales:r--' in sales_dir_acls, '/tmp/raw/sales : sales acl was not applied')

            self.assertIsNotNone(users_dir_acls, '/tmp/raw/users : ACLs were not applied')
            self.assertTrue('group:sys-pii:r-x' in users_dir_acls, '/tmp/raw/users : pii acl was not applied')
        finally:
            _test_basedir = HDFS('/tmp/raw')
            _test_basedir.delete_directory()
            self.assertFalse(_test_basedir.exists(), "ERROR: clean up failed")
Example #34
File: setup.py  Project: epam/Merlin

import os

from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS


if __name__ == "__main__":
    _basedir = HDFS(os.path.join('/tmp', 'scd.active'))

    _basedir.create_directory()
    _scd_active_snapshot = LocalFS(os.path.join(os.path.dirname(__file__), 'resources', 'scd.active.csv'))
    _scd_active_snapshot.copy_to_hdfs(_basedir.path)

Example #35
File: flow.py  Project: epam/Merlin
        file.write(action_name)
        file.close()


if __name__ == '__main__':
    log = get_logger("SCD")

    # Prepare paths
    _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig')
    _scd_active_snapshot = '/tmp/scd.active/scd.active.csv'
    _scd_updates = os.path.join(os.path.dirname(__file__), 'resources', 'scd.update.csv')
    _hdfs_job_output = '/tmp/scd.updated'

    _local_folder_to_monitor = LocalFS(os.path.join(os.path.dirname(__file__), 'resources'))
    _hdfs_basedir = HDFS('/tmp/scd.active')
    _hdfs_tmpdir = HDFS('/tmp/scd.tmp')
    _hdfs_tmpdir.create_directory()

    if _scd_updates and LocalFS(_scd_updates).exists():

        # Checks whether a file recording the last failed step exists
        # and reads that step
        step = 'Copying scd updates to raw area on HDFS'
        if os.path.isfile('resources/step'):
            file = open('resources/step', 'r')
            step = file.read()
            file.close()

        flow = FlowRegistry.flow('Flow')

        # Runs flow
Example #36

from merlin.tools.hive import Hive
from ConfigParser import RawConfigParser
from merlin.fs.localfs import LocalFS
from merlin.fs.hdfs import HDFS
from merlin.fs.ftp import ftp_client
import os

BASE_DIR = "/tmp"

if __name__ == "__main__":

    # create empty directory '/tmp/raw' on HDFS
    hdfs_file = HDFS("{0}/raw".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)
    hdfs_file.create(directory=True)

    # create empty directory '/tmp/base_dir' on FTP
    config = RawConfigParser()
    config.read(
        os.path.join(os.path.dirname(__file__), "resources/ftp_config.ini"))
    host_download = config.get("ftp", "host.download")
    user_name = config.get("ftp", "user.name")
    password = config.get("ftp", "password")
    path = config.get("ftp", "path")
    ftp = ftp_client(host=host_download,
                     login=user_name,
                     password=password,
Example #37
 def test_create(self):
     new_file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     # test new file creation
     try:
         new_file.create(directory=False)
         self.assertTrue(new_file.exists(), "File was not created")
         self.assertFalse(new_file.is_directory(), "New file should not be a directory")
     finally:
         new_file.delete()
         self.assertFalse(new_file.exists(), "File was not removed")
     # test new folder creation
     try:
         new_dir.create(directory=True)
         self.assertTrue(new_dir.exists(), "Directory was not created")
         self.assertTrue(new_dir.is_directory(), "New file should be a directory")
     finally:
         new_dir.delete(recursive=True)
         self.assertFalse(new_dir.exists(), "Directory was not removed")
Example #38

from merlin.tools.hive import Hive
from ConfigParser import RawConfigParser
from merlin.fs.localfs import LocalFS
from merlin.fs.hdfs import HDFS
from merlin.fs.ftp import ftp_client
import os

BASE_DIR = "/tmp"

if __name__ == "__main__":

    hdfs_file = HDFS("{0}/raw".format(BASE_DIR))
    if hdfs_file.exists():
        hdfs_file.delete(recursive=True)

    config = RawConfigParser()
    config.read(os.path.join(os.path.dirname(__file__), "resources/ftp_config.ini"))
    host_download = config.get("ftp", "host.download")
    user_name = config.get("ftp", "user.name")
    password = config.get("ftp", "password")
    path = config.get("ftp", "path")
    ftp = ftp_client(host=host_download,
                     login=user_name,
                     password=password,
                     path="/tmp")

    if ftp.exists():
Example #39
 def test_get_owner(self):
     self.assertEqual("hdfs", HDFS("/").owner(), "ERROR: Root dir owner")
     self.assertEqual("hdfs", HDFS("/tmp").owner(), "ERROR: /tmp dir owner")
     hbase_file = HDFS("/hbase/hbase.id")
     if hbase_file.exists():
         self.assertEqual("hbase", HDFS("/hbase/hbase.id").owner(), "ERROR: /hbase/hbase.id dir owner")