Example #1
 def _create_non_empty_dir_(self, path):
     _dir = HDFS(path)
     _dir.create_directory()
     self.assertTrue(_dir.exists(), "source directory not found")
     for i in range(5):
         _file = HDFS(os.path.join(path, str(uuid.uuid4())))
         _file.create(directory=(i % 2 == 0))
         self.assertTrue(_file.exists(), "File was not created")
     return _dir
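A minimal usage sketch (not part of the original suite): a hypothetical test that builds a populated directory with the helper above and tears it down with the delete(recursive=True) call seen in the other examples.

 def test_non_empty_dir_teardown(self):
     # hypothetical test; uses only HDFS methods shown in these examples
     _dir = self._create_non_empty_dir_(os.path.join("/tmp", str(uuid.uuid4())))
     try:
         self.assertTrue(_dir.is_directory(), "Expected a directory")
     finally:
         _dir.delete(recursive=True)
         self.assertFalse(_dir.exists(), "Directory was not removed")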
Example #2
 def test_create_directory(self):
     new_dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     self.assertFalse(new_dir.exists(), "Directory already exists")
     try:
         new_dir.create_directory()
         self.assertTrue(new_dir.exists(), "Directory was not created")
         self.assertTrue(new_dir.is_directory())
     finally:
         new_dir.delete(recursive=True)
         self.assertFalse(new_dir.exists(), "Directory was not removed")
Example #3
 def should_create_directory_recursively(self):
     _base_dir = os.path.join("/tmp", str(uuid.uuid4()))
     _path = os.path.join(_base_dir, str(uuid.uuid4()), str(uuid.uuid4()))
     _dir = HDFS(_path)
     self.assertFalse(_dir.exists(), "Folder already exists")
     try:
         _dir.create_directory(recursive=True)
         self.assertTrue(_dir.exists(), "Folder was not created")
         self.assertTrue(_dir.is_directory(), "Created path should be a directory")
     finally:
         HDFS(_base_dir).delete_directory()
         self.assertFalse(_dir.exists(), "Directory was not removed")
         self.assertFalse(HDFS(_base_dir).exists(), "Base dir was not removed")
Example #4
 def test_get_modification_time(self):
     now = datetime.now().strftime("%Y-%m-%d")
     _dir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     _file = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
     try:
         _dir.create_directory()
         _file.create_file()
         self.assertTrue(_dir.exists(), "Dir was not created")
         self.assertTrue(_file.exists(), "File was not created")
         self.assertEqual(now, _dir.modification_time().strftime("%Y-%m-%d"), "Error: dir modification time")
         self.assertEqual(now, _file.modification_time().strftime("%Y-%m-%d"), "Error: File modification time")
     finally:
         _dir.delete_directory()
         _file.delete()
Example #5
    def test_mr_job_command_generation_with_arguments(self):
        _job_name = "test_mr_job_%s" % uuid.uuid4()

        _base_dir = HDFS(os.path.join("/tmp", _job_name))
        _base_dir.create_directory()
        try:
            jar = os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'hadoop-mapreduce-examples.jar')
            # configure job inputs
            _job_input = HDFS(os.path.join(_base_dir.path, "input"))
            _job_input.create_directory()
            LocalFS(
                os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(_job_input.path)

            # configure job output
            _job_output = HDFS(os.path.join(_base_dir.path, "output"))
            if not os.path.exists(jar):
                self.skipTest("'%s' not found" % jar)

            job = MapReduce.prepare_mapreduce_job(jar=jar,
                                                  main_class="wordcount",
                                                  name=_job_name) \
                .with_config_option("split.by", "'\\t'") \
                .with_number_of_reducers(3) \
                .with_arguments(_job_input.path, _job_output.path)
            _command_submission_result = job.run()
            _command_submission_result.if_failed_raise(AssertionError("Cannot run MR job"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded(), "MR job Failed")
            self.assertTrue(_job_output.exists(), "Error: empty job output")
            # check counters
            self.assertEqual(6, _job_status.counter(group='File System Counters',
                                                    counter='HDFS: Number of write operations'))
            self.assertEqual(1, _job_status.counter(group='Job Counters', counter='Launched map tasks'))
            self.assertEqual(3, _job_status.counter(group='Job Counters', counter='Launched reduce tasks'))
            self.assertEqual(2168, _job_status.counter(group='File Input Format Counters', counter='Bytes Read'))
        finally:
            _base_dir.delete_directory()
Example #6
    def test_streaming_job_with_multiple_inputs(self):
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path)

            _additional_datasource = HDFS(os.path.join(_job_basedir.path, "input2"))
            _additional_datasource.create_directory()
            LocalFS(
                os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
            ).copy_to_hdfs(_additional_datasource.path)
            job.take(_additional_datasource.path)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("test_streaming_job_with_multiple_inputs test failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
            # check counters
            self.assertEqual(740, _job_status.counter(group='Map-Reduce Framework', counter='Reduce input records'),
                             "counters['Map-Reduce Framework']['Reduce input records']")
        finally:
            _job_basedir.delete_directory()
Example #7
    def _template_streaming_job_(self, base_dir="/tmp", map_only_job=False):
        if not os.path.exists(HADOOP_STREAMING_JAR):
            self.skipTest("Cannot allocate %s" % HADOOP_STREAMING_JAR)
        _hdfs_basedir = HDFS(base_dir)
        if not _hdfs_basedir.exists():
            _hdfs_basedir.create_directory()
        _job_input = HDFS(os.path.join(_hdfs_basedir.path, "input"))
        _job_input.create_directory()
        _job_output = HDFS(os.path.join(_hdfs_basedir.path, "output"))
        home = os.path.dirname(__file__)
        _mapper = os.path.join(home, 'resources', 'mapreduce', 'mapper.py')
        _reducer = os.path.join(home, 'resources', 'mapreduce', 'reducer.py')

        LocalFS(
            os.path.join(os.path.dirname(__file__), 'resources', 'mapreduce', 'raw-data.txt')
        ).copy_to_hdfs(
            _job_input.path
        )

        return MapReduce.prepare_streaming_job(
            name="test-mr-streaming-job{}".format(uuid.uuid4()),
            jar=HADOOP_STREAMING_JAR) \
            .take(_job_input.path) \
            .process_with(mapper=_mapper, reducer=None if map_only_job else _reducer) \
            .save(_job_output.path)
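For contrast, a hedged sketch of a map-only run of the template above. This is a hypothetical test, relying only on the run(), if_failed_raise(), status(), is_succeeded() and delete_directory() calls used elsewhere in this section.

    def test_map_only_streaming_job(self):
        # hypothetical map-only variant: reducer=None via map_only_job=True
        _job_basedir = HDFS(os.path.join("/tmp", str(uuid.uuid4())))
        try:
            job = self._template_streaming_job_(base_dir=_job_basedir.path,
                                                map_only_job=True)
            command_result = job.run()
            command_result.if_failed_raise(AssertionError("Map-only streaming job failed"))
            _job_status = job.status()
            self.assertTrue(_job_status is not None and _job_status.is_succeeded())
        finally:
            _job_basedir.delete_directory()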
Example #8
File: setup.py Project: epam/Merlin
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# See the NOTICE file and the LICENSE file distributed with this work
# for additional information regarding copyright ownership and licensing.
#

import os

from merlin.fs.hdfs import HDFS
from merlin.fs.localfs import LocalFS


if __name__ == "__main__":
    _basedir = HDFS(os.path.join('/tmp', 'scd.active'))

    _basedir.create_directory()
    _scd_active_snapshot = LocalFS(os.path.join(os.path.dirname(__file__), 'resources', 'scd.active.csv'))
    _scd_active_snapshot.copy_to_hdfs(_basedir.path)
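A short follow-up sketch (an assumption, not code from the project): verifying the upload with the exists() call used throughout these examples. It assumes copy_to_hdfs() keeps the local basename for the target file.

    # hypothetical check, continuing the __main__ block above
    _uploaded = HDFS(os.path.join(_basedir.path, 'scd.active.csv'))
    assert _uploaded.exists(), "scd.active.csv was not copied to HDFS"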

Example #9
File: flow.py Project: epam/Merlin
if __name__ == '__main__':
    log = get_logger("SCD")

    # Prepare paths
    _pig_script = os.path.join(os.path.dirname(__file__), 'scd_processing.pig')
    _scd_active_snapshot = '/tmp/scd.active/scd.active.csv'
    _scd_updates = os.path.join(os.path.dirname(__file__), 'resources', 'scd.update.csv')
    _hdfs_job_output = '/tmp/scd.updated'

    _local_folder_to_monitor = LocalFS(os.path.join(os.path.dirname(__file__), 'resources'))
    _hdfs_basedir = HDFS('/tmp/scd.active')
    _hdfs_tmpdir = HDFS('/tmp/scd.tmp')
    _hdfs_tmpdir.create_directory()

    if _scd_updates and LocalFS(_scd_updates).exists():

        # Check whether a file recording the last failed step exists;
        # if so, resume the flow from that step
        step = 'Copying scd updates to raw area on HDFS'
        if os.path.isfile('resources/step'):
            with open('resources/step', 'r') as step_file:
                step = step_file.read()

        flow = FlowRegistry.flow('Flow')

        # Run the flow
        _context = flow.run(action=step,