Example #1
    def test_success(self, mock_getsize):
        """Tests calling ScaleFileManager.upload_files() successfully"""
        def new_getsize(path):
            return 100
        mock_getsize.side_effect = new_getsize

        workspace = storage_test_utils.create_workspace()
        file_1 = ScaleFile()
        file_1.set_basic_fields('file.txt', 100, None)  # Scale should auto-detect text/plain
        remote_path_1 = 'my/remote/path/file.txt'
        local_path_1 = 'my/local/path/file.txt'
        file_1.file_path = remote_path_1
        file_2 = ScaleFile()
        file_2.set_basic_fields('file.json', 100, 'application/json')
        remote_path_2 = 'my/remote/path/2/file.json'
        local_path_2 = 'my/local/path/2/file.json'
        file_2.file_path = remote_path_2
        workspace.upload_files = MagicMock()

        files = [FileUpload(file_1, local_path_1), FileUpload(file_2, local_path_2)]
        models = ScaleFile.objects.upload_files(workspace, files)

        workspace.upload_files.assert_called_once_with([FileUpload(file_1, local_path_1),
                                                        FileUpload(file_2, local_path_2)])

        self.assertEqual('file.txt', models[0].file_name)
        self.assertEqual(remote_path_1, models[0].file_path)
        self.assertEqual('text/plain', models[0].media_type)
        self.assertEqual(workspace.id, models[0].workspace_id)
        self.assertEqual('file.json', models[1].file_name)
        self.assertEqual(remote_path_2, models[1].file_path)
        self.assertEqual('application/json', models[1].media_type)
        self.assertEqual(workspace.id, models[1].workspace_id)
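The mock_getsize argument implies that the test class patches os.path.getsize with a decorator that is not shown in this snippet. A minimal sketch of the assumed setup follows; the class name is illustrative, not the project's actual code.

# Assumed patching setup for the test above (class name is illustrative; only the
# decorator and signature matter here).
from django.test import TestCase
from mock import patch


class TestScaleFileManager(TestCase):

    @patch('os.path.getsize')
    def test_success(self, mock_getsize):
        mock_getsize.side_effect = lambda path: 100
        # ... remainder of the test body as shown above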
Example #2
    def test_successfully(self, mock_copy, mock_chmod, mock_exists, mock_makedirs):
        """Tests calling HostBroker.upload_files() successfully"""

        def new_exists(path):
            return False
        mock_exists.side_effect = new_exists

        volume_path = os.path.join('the', 'volume', 'path')
        file_name_1 = 'my_file.txt'
        file_name_2 = 'my_file.json'
        local_path_file_1 = os.path.join('my_dir_1', file_name_1)
        local_path_file_2 = os.path.join('my_dir_2', file_name_2)
        workspace_path_file_1 = os.path.join('my_wrk_dir_1', file_name_1)
        workspace_path_file_2 = os.path.join('my_wrk_dir_2', file_name_2)
        full_workspace_path_file_1 = os.path.join(volume_path, workspace_path_file_1)
        full_workspace_path_file_2 = os.path.join(volume_path, workspace_path_file_2)

        file_1 = storage_test_utils.create_file(file_path=workspace_path_file_1)
        file_2 = storage_test_utils.create_file(file_path=workspace_path_file_2)
        file_1_up = FileUpload(file_1, local_path_file_1)
        file_2_up = FileUpload(file_2, local_path_file_2)

        # Call method to test
        self.broker.upload_files(volume_path, [file_1_up, file_2_up])

        # Check results
        two_calls = [call(os.path.dirname(full_workspace_path_file_1), mode=0755),
                     call(os.path.dirname(full_workspace_path_file_2), mode=0755)]
        mock_makedirs.assert_has_calls(two_calls)
        two_calls = [call(local_path_file_1, full_workspace_path_file_1),
                     call(local_path_file_2, full_workspace_path_file_2)]
        mock_copy.assert_has_calls(two_calls)
        two_calls = [call(full_workspace_path_file_1, 0644), call(full_workspace_path_file_2, 0644)]
        mock_chmod.assert_has_calls(two_calls)
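The four mock arguments imply a stack of patch decorators that is not shown here. Stacked @patch decorators inject mocks bottom-up, so the decorator closest to the function supplies the first mock parameter. A sketch of the assumed stack; the patched module path is a guess:

    # Assumed decorator stack (the module path 'storage.brokers.host_broker' is a guess).
    # The bottom decorator maps to the first mock argument, so shutil.copy -> mock_copy.
    @patch('storage.brokers.host_broker.os.makedirs')
    @patch('storage.brokers.host_broker.os.path.exists')
    @patch('storage.brokers.host_broker.os.chmod')
    @patch('storage.brokers.host_broker.shutil.copy')
    def test_successfully(self, mock_copy, mock_chmod, mock_exists, mock_makedirs):
        pass  # body as shown above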
Example #3
    def test_upload_files(self, mock_client_class):
        """Tests uploading files successfully"""

        s3_object_1 = MagicMock()
        s3_object_2 = MagicMock()
        mock_client = MagicMock(S3Client)
        mock_client.get_object.side_effect = [s3_object_1, s3_object_2]
        mock_client_class.return_value.__enter__ = Mock(return_value=mock_client)

        file_name_1 = 'my_file.txt'
        file_name_2 = 'my_file.json'
        local_path_file_1 = os.path.join('my_dir_1', file_name_1)
        local_path_file_2 = os.path.join('my_dir_2', file_name_2)
        workspace_path_file_1 = os.path.join('my_wrk_dir_1', file_name_1)
        workspace_path_file_2 = os.path.join('my_wrk_dir_2', file_name_2)

        file_1 = storage_test_utils.create_file(file_path=workspace_path_file_1, media_type='text/plain')
        file_2 = storage_test_utils.create_file(file_path=workspace_path_file_2, media_type='application/json')
        file_1_up = FileUpload(file_1, local_path_file_1)
        file_2_up = FileUpload(file_2, local_path_file_2)

        # Call method to test
        mo = mock_open()
        with patch('__builtin__.open', mo, create=True):
            self.broker.upload_files(None, [file_1_up, file_2_up])

        # Check results
        self.assertTrue(s3_object_1.upload_file.called)
        self.assertTrue(s3_object_2.upload_file.called)
        self.assertEqual(s3_object_1.upload_file.call_args[0][1]['ContentType'], 'text/plain')
        self.assertEqual(s3_object_2.upload_file.call_args[0][1]['ContentType'], 'application/json')
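For reference, a mock's call_args is an (args, kwargs) pair, so call_args[0][1] is the second positional argument passed to upload_file. The assertions above can therefore be written more explicitly, as in this sketch (it assumes the broker passes the extra-arguments dict positionally, as the indexing implies):

        # Unpack the recorded call instead of indexing into call_args directly.
        args, kwargs = s3_object_1.upload_file.call_args
        extra_args = args[1]  # second positional argument
        self.assertEqual(extra_args['ContentType'], 'text/plain')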
Example #4
    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the file upload process.
        """

        local_path = options.get('local_path')  # local file to upload (option name assumed)
        remote_path = options.get('remote_path')
        workspace_name = options.get('workspace')

        logger.info('Command starting: scale_upload_file')
        logger.info(' - Workspace: %s', workspace_name)

        # Validate the file paths
        file_name = os.path.basename(local_path)
        if not os.path.exists(local_path):
            logger.error('Local file does not exist: %s', local_path)
            sys.exit(1)

        # Attempt to fetch the workspace model
        try:
            workspace = Workspace.objects.get(name=workspace_name)
        except Workspace.DoesNotExist:
            logger.exception('Workspace does not exist: %s', workspace_name)
            sys.exit(1)

        # Attempt to set up a file model
        try:
            scale_file = ScaleFile.objects.get(file_name=file_name)
        except ScaleFile.DoesNotExist:
            scale_file = ScaleFile()
            scale_file.update_uuid(file_name)
        scale_file.file_path = remote_path

        try:
            ScaleFile.objects.upload_files(
                workspace, [FileUpload(scale_file, local_path)])
        except:
            logger.exception('Unknown error occurred, exit code 1 returning')
            sys.exit(1)
        logger.info('Command completed: scale_upload_file')
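A hypothetical invocation of this command through Django's call_command; the command name matches the log messages above, but the option names are assumptions:

# Option names are illustrative assumptions based on the handle() body above.
from django.core.management import call_command

call_command('scale_upload_file',
             local_path='/tmp/my_file.txt',
             remote_path='my/remote/path/my_file.txt',
             workspace='raw-data')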
Example #5
    def store_file(self, local_path, data_types, workspace, remote_path):
        """Stores the given local source file in the workspace

        :param local_path: The absolute local path of the source file to store
        :type local_path: str
        :param data_types: The data type tags of the source file
        :type data_types: list of str
        :param workspace: The workspace to use for storing the source file
        :type workspace: :class:`storage.models.Workspace`
        :param remote_path: The relative path for storing the source file
        :type remote_path: str
        :returns: The model of the saved source file
        :rtype: :class:`source.models.SourceFile`
        """

        file_name = os.path.basename(local_path)

        # Check for duplicate file, else create new file
        # TODO: fix race condition with many files with same name?
        try:
            src_file = SourceFile.objects.get(file_name=file_name)
            # Duplicate files that are deleted should be stored again
            if not src_file.is_deleted:
                raise DuplicateFile('\'%s\' already exists' % file_name)
        except SourceFile.DoesNotExist:
            src_file = SourceFile()  # New file

        # Add a stable identifier based on the file name
        src_file.update_uuid(file_name)

        # Add tags and store the new/updated source file
        for tag in data_types:
            src_file.add_data_type_tag(tag)

        src_file.file_path = remote_path

        return ScaleFile.objects.upload_files(
            workspace, [FileUpload(src_file, local_path)])[0]
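A minimal usage sketch of store_file(); the owning object ('parser'), the workspace name, and the paths are illustrative assumptions:

# 'parser' stands in for whatever object exposes store_file(); all values are illustrative.
workspace = Workspace.objects.get(name='raw-data')
src_file = parser.store_file('/tmp/20190101_scene.h5',                  # local source file
                             ['hdf5', 'scene'],                         # data type tags
                             workspace,
                             'ingest/2019/01/01/20190101_scene.h5')     # relative remote path
# src_file is the saved SourceFile model, with a UUID derived from the file name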
Example #6
    def upload_files(self, file_entries, input_file_ids, job_exe, workspace):
        """Uploads the given local product files into the workspace.

        :param file_entries: List of files to upload
        :type file_entries: list[:class:`product.types.ProductFileMetadata`]
        :param input_file_ids: List of identifiers for files used to produce the given file entries
        :type input_file_ids: list of int
        :param job_exe: The job_exe model with the related job and job_type fields
        :type job_exe: :class:`job.models.JobExecution`
        :param workspace: The workspace to use for storing the product files
        :type workspace: :class:`storage.models.Workspace`
        :returns: The list of the saved product models
        :rtype: list of :class:`storage.models.ScaleFile`
        """

        # Build a list of UUIDs for the input files
        input_files = ScaleFile.objects.filter(pk__in=input_file_ids).values(
            'uuid', 'id').order_by('uuid')
        input_file_uuids = [f['uuid'] for f in input_files]

        # Get property names and values as strings
        properties = job_exe.job.get_job_data().get_all_properties()

        # Product UUID will be based in part on input data (UUIDs of input files and name/value pairs of input
        # properties)
        input_strings = input_file_uuids
        input_strings.extend(properties)

        # Determine if any input files are non-operational products
        input_products = ScaleFile.objects.filter(
            id__in=[f['id'] for f in input_files], file_type='PRODUCT')
        input_products_operational = all(
            [f.is_operational for f in input_products])

        source_started = job_exe.job.source_started
        source_ended = job_exe.job.source_ended
        source_sensor_class = job_exe.job.source_sensor_class
        source_sensor = job_exe.job.source_sensor
        source_collection = job_exe.job.source_collection
        source_task = job_exe.job.source_task
        if not source_started:
            # Compute the overall start and stop times for all file_entries
            source_files = FileAncestryLink.objects.get_source_ancestors(
                [f['id'] for f in input_files])
            start_times = [f.data_started for f in source_files]
            end_times = [f.data_ended for f in source_files]
            start_times.sort()
            end_times.sort(reverse=True)
            if start_times:
                source_started = start_times[0]
            if end_times:
                source_ended = end_times[0]

        products_to_save = []
        for entry in file_entries:
            product = ProductFile.create()
            product.job_exe = job_exe
            product.job = job_exe.job
            product.job_type = job_exe.job.job_type
            product.is_operational = input_products_operational and job_exe.job.job_type.is_operational
            file_name = os.path.basename(entry.local_path)
            file_size = os.path.getsize(entry.local_path)
            product.set_basic_fields(file_name, file_size, entry.media_type)
            product.file_path = entry.remote_path
            product.job_output = entry.output_name

            # Add a stable identifier based on the job type, input files, input properties, and file name
            # This is designed to remain stable across re-processing the same type of job on the same inputs
            product.update_uuid(job_exe.job.job_type.id, file_name,
                                *input_strings)

            # Add temporal info to product if available
            if entry.data_start:
                product.data_started = parse_datetime(entry.data_start)
            if entry.data_end:
                product.data_ended = parse_datetime(entry.data_end)

            if entry.geojson:
                geom, props = geo_utils.parse_geo_json(entry.geojson)
                product.geometry = geom
                if props:
                    product.meta_data = props
                product.center_point = geo_utils.get_center_point(geom)

            # Add recipe info to product if available.
            job_recipe = Recipe.objects.get_recipe_for_job(job_exe.job_id)
            if job_recipe:
                product.recipe_id = job_recipe.recipe.id
                product.recipe_type = job_recipe.recipe.recipe_type
                product.recipe_node = job_recipe.node_name

                # Add batch info to product if available.
                try:
                    from batch.models import BatchJob
                    product.batch_id = BatchJob.objects.get(
                        job_id=job_exe.job_id).batch_id
                except BatchJob.DoesNotExist:
                    product.batch_id = None

            # Allow override, if set via side-car metadata, otherwise take derived values from above
            product.source_started = entry.source_started if entry.source_started else source_started
            product.source_ended = entry.source_ended if entry.source_ended else source_ended

            # Supplemental source metadata
            product.source_sensor_class = entry.source_sensor_class if entry.source_sensor_class else source_sensor_class
            product.source_sensor = entry.source_sensor if entry.source_sensor else source_sensor
            product.source_collection = entry.source_collection if entry.source_collection else source_collection
            product.source_task = entry.source_task if entry.source_task else source_task

            # Update product model with details derived from the job_type
            product.meta_data['url'] = product.url
            product.meta_data['job_name'] = job_exe.job_type.name
            product.meta_data[
                'job_version'] = job_exe.job_type.get_job_version()
            product.meta_data[
                'package_version'] = job_exe.job_type.get_package_version()

            products_to_save.append(FileUpload(product, entry.local_path))

        return ScaleFile.objects.upload_files(workspace, products_to_save)
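The source_started/source_ended fallback above sorts the ancestor times and takes the first and last elements; a close equivalent that also skips ancestors with missing timestamps is sketched below (not the project's code):

        # Earliest data_started and latest data_ended across the source ancestors,
        # ignoring ancestors with missing values.
        start_times = [f.data_started for f in source_files if f.data_started]
        end_times = [f.data_ended for f in source_files if f.data_ended]
        source_started = min(start_times) if start_times else source_started
        source_ended = max(end_times) if end_times else source_ended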
Example #7
    def test_successfully(self, mock_copy, mock_chmod, mock_exists,
                          mock_makedirs):
        """Tests calling NfsBroker.upload_files() successfully"""
        def new_exists(path):
            return False

        mock_exists.side_effect = new_exists

        volume_path = os.path.join('the', 'volume', 'path')
        file_name_1 = 'my_file.txt'
        file_name_2 = 'my_file.json'
        local_path_file_1 = os.path.join('my_dir_1', file_name_1)
        local_path_file_2 = os.path.join('my_dir_2', file_name_2)
        workspace_path_file_1 = os.path.join('my_wrk_dir_1', file_name_1)
        workspace_path_file_2 = os.path.join('my_wrk_dir_2', file_name_2)
        full_workspace_path_file_1 = os.path.join(volume_path,
                                                  workspace_path_file_1)
        full_workspace_path_file_2 = os.path.join(volume_path,
                                                  workspace_path_file_2)

        file_1 = storage_test_utils.create_file(
            file_path=workspace_path_file_1)
        file_2 = storage_test_utils.create_file(
            file_path=workspace_path_file_2)
        file_1_up = FileUpload(file_1, local_path_file_1)
        file_2_up = FileUpload(file_2, local_path_file_2)

        # Call method to test
        mountstats_data = """16 36 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
17 36 0:16 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
18 36 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=32977500k,nr_inodes=8244375,mode=755
19 17 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
20 18 0:17 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
21 18 0:11 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
22 36 0:18 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,mode=755
23 17 0:19 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:8 - tmpfs tmpfs rw,mode=755
24 23 0:20 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
25 17 0:21 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:19 - pstore pstore rw
26 23 0:22 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpuset
27 23 0:23 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpu,cpuacct
28 23 0:24 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
29 23 0:25 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
30 23 0:26 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,freezer
31 23 0:27 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,net_cls,net_prio
32 23 0:28 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,blkio
33 23 0:29 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,perf_event
34 23 0:30 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,hugetlb
35 17 0:31 / /sys/kernel/config rw,relatime shared:20 - configfs configfs rw
36 0 253:0 / / rw,relatime shared:1 - xfs /dev/mapper/vg_root-lv_root rw,attr2,inode64,noquota
14 36 0:14 / /users rw,relatime shared:22 - autofs systemd-1 rw,fd=29,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
39 16 0:34 / /proc/sys/fs/binfmt_misc rw,relatime shared:25 - autofs systemd-1 rw,fd=37,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
41 18 0:13 / /dev/mqueue rw,relatime shared:26 - mqueue mqueue rw
40 17 0:6 / /sys/kernel/debug rw,relatime shared:27 - debugfs debugfs rw
42 18 0:35 / /dev/hugepages rw,relatime shared:28 - hugetlbfs hugetlbfs rw
43 36 0:36 / /var/lib/nfs/rpc_pipefs rw,relatime shared:29 - rpc_pipefs sunrpc rw
44 16 0:37 / /proc/fs/nfsd rw,relatime shared:30 - nfsd nfsd rw
45 36 8:2 / /boot rw,relatime shared:31 - xfs /dev/sda2 rw,attr2,inode64,noquota
46 14 0:40 / /users rw,relatime shared:32 - nfs4 users:/users rw,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=14,retrans=2,sec=sys,local_lock=none
49 39 0:38 / /proc/sys/fs/binfmt_misc rw,relatime shared:35 - binfmt_misc binfmt_misc rw
48 38 0:42 / %s rw,relatime shared:34 - nfs4 fserver:/exports/my_dir_1 rw,vers=4.0,rsize=1048576,wsize=1048576,namlen=255,hard,proto=tcp,port=0,timeo=14,retrans=2,sec=sys,local_lock=none
""" % (os.path.abspath(volume_path), )
        mo = mock_open(read_data=mountstats_data)
        # need to patch readlines() since only read() is patched in mock_open
        mo.return_value.readlines.return_value = mo.return_value.read.return_value.split(
            '\n')
        with patch('__builtin__.open', mo, create=True) as pmo:
            self.broker.upload_files(volume_path, [file_1_up, file_2_up])

        # Check results
        two_calls = [
            call(os.path.dirname(full_workspace_path_file_1), mode=0755),
            call(os.path.dirname(full_workspace_path_file_2), mode=0755)
        ]
        mock_makedirs.assert_has_calls(two_calls)
        two_calls = [
            call(local_path_file_1, full_workspace_path_file_1),
            call(local_path_file_2, full_workspace_path_file_2)
        ]
        mock_copy.assert_has_calls(two_calls)
        two_calls = [
            call(full_workspace_path_file_1, 0644),
            call(full_workspace_path_file_2, 0644)
        ]
        mock_chmod.assert_has_calls(two_calls)
Example #8
    def upload_files(self, file_entries, input_file_ids, job_exe, workspace):
        """Uploads the given local product files into the workspace.

        :param file_entries: List of files where each file is a tuple of (absolute local path, workspace path for
            storing the file, media_type, output_name)
        :type file_entries: list of tuple(str, str, str, str)
        :param input_file_ids: List of identifiers for files used to produce the given file entries
        :type input_file_ids: list of int
        :param job_exe: The job_exe model with the related job and job_type fields
        :type job_exe: :class:`job.models.JobExecution`
        :param workspace: The workspace to use for storing the product files
        :type workspace: :class:`storage.models.Workspace`
        :returns: The list of the saved product models
        :rtype: list of :class:`storage.models.ScaleFile`
        """

        # Build a list of UUIDs for the input files
        input_files = ScaleFile.objects.filter(pk__in=input_file_ids).values('uuid', 'id').order_by('uuid')
        input_file_uuids = [f['uuid'] for f in input_files]

        # Get property names and values as strings
        properties = job_exe.job.get_job_data().get_all_properties()

        # Product UUID will be based in part on input data (UUIDs of input files and name/value pairs of input
        # properties)
        input_strings = input_file_uuids
        input_strings.extend(properties)

        # Determine if any input files are non-operational products
        input_products = ScaleFile.objects.filter(id__in=[f['id'] for f in input_files], file_type='PRODUCT')
        input_products_operational = all([f.is_operational for f in input_products])

        # Compute the overall start and stop times for all file_entries
        source_files = FileAncestryLink.objects.get_source_ancestors([f['id'] for f in input_files])
        start_times = [f.data_started for f in source_files]
        end_times = [f.data_ended for f in source_files]
        start_times.sort()
        end_times.sort(reverse=True)

        products_to_save = []
        for entry in file_entries:
            local_path = entry[0]
            remote_path = entry[1]
            media_type = entry[2]
            output_name = entry[3]

            product = ProductFile.create()
            product.job_exe = job_exe
            product.job = job_exe.job
            product.job_type = job_exe.job.job_type
            product.is_operational = input_products_operational and job_exe.job.job_type.is_operational
            file_name = os.path.basename(local_path)
            file_size = os.path.getsize(local_path)
            product.set_basic_fields(file_name, file_size, media_type)
            product.file_path = remote_path
            product.job_output = output_name

            # Add a stable identifier based on the job type, input files, input properties, and file name
            # This is designed to remain stable across re-processing the same type of job on the same inputs
            product.update_uuid(job_exe.job.job_type.id, file_name, *input_strings)

            # Add geospatial info to product if available
            if len(entry) > 4:
                geo_metadata = entry[4]
                target_date = None
                if 'data_started' in geo_metadata:
                    product.data_started = parse_datetime(geo_metadata['data_started'])
                    target_date = product.data_started
                if 'data_ended' in geo_metadata:
                    product.data_ended = parse_datetime(geo_metadata['data_ended'])
                    if target_date is None:
                        target_date = product.data_ended
                if target_date is None:
                    target_date = product.created
                if 'geo_json' in geo_metadata:
                    geom, props = geo_utils.parse_geo_json(geo_metadata['geo_json'])
                    product.geometry = geom
                    if props:
                        product.meta_data = props
                    product.center_point = geo_utils.get_center_point(geom)

            # Add recipe info to product if available.
            job_recipe = Recipe.objects.get_recipe_for_job(job_exe.job_id)
            if job_recipe:
                product.recipe_id = job_recipe.recipe.id
                product.recipe_type = job_recipe.recipe.recipe_type
                product.recipe_job = job_recipe.job_name

                # Add batch info to product if available.
                try:
                    from batch.models import BatchJob
                    product.batch_id = BatchJob.objects.get(job_id=job_exe.job_id).batch_id
                except BatchJob.DoesNotExist:
                    product.batch_id = None

            # Add start and stop times if available
            if start_times:
                product.source_started = start_times[0]

            if end_times:
                product.source_ended = end_times[0]

            products_to_save.append(FileUpload(product, local_path))

        return ScaleFile.objects.upload_files(workspace, products_to_save)
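An illustrative file_entries tuple for this older tuple-based signature, including the optional fifth geo_metadata element (all values are made up):

# (local path, workspace path, media type, output name, optional geo metadata)
entry = ('/tmp/outputs/result.tif',
         'products/2019/01/result.tif',
         'image/tiff',
         'output_image',
         {'data_started': '2019-01-01T00:00:00Z',
          'data_ended': '2019-01-01T01:00:00Z',
          'geo_json': {'type': 'Polygon',
                       'coordinates': [[[0.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.0, 0.0]]]}})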
Example #9
def move_files(file_ids, new_workspace=None, new_file_path=None):
    """Moves the given files to a different workspace/uri

    :param file_ids: List of ids of ScaleFile objects to move; should all be from the same workspace
    :type file_ids: [int]
    :param new_workspace: New workspace to move files to
    :type new_workspace: `storage.models.Workspace`
    :param new_file_path: New path for files
    :type new_file_path: string
    """

    try:
        messages = []
        files = ScaleFile.objects.all()
        files = files.select_related('workspace')
        files = files.defer('workspace__json_config')
        files = files.filter(id__in=file_ids).only('id', 'file_name',
                                                   'file_path', 'workspace')
        old_files = []
        old_workspace = files[0].workspace
        if new_workspace:
            # We need a local path to copy the file, try to get a direct path from the broker, if that fails we must
            # download the file and copy from there
            # TODO: a future refactor should make the brokers work off of file objects instead of paths so the extra
            # download is not necessary

            paths = old_workspace.get_file_system_paths(files)
            local_paths = []
            if paths:
                local_paths = paths
            else:
                file_downloads = []
                for file in files:
                    local_path = os.path.join('/tmp', file.file_name)
                    file_downloads.append(FileDownload(file, local_path,
                                                       False))
                    local_paths.append(local_path)
                ScaleFile.objects.download_files(file_downloads)

            uploads = []
            for file, path in zip(files, local_paths):
                old_path = file.file_path
                old_files.append(
                    ScaleFile(file_name=file.file_name,
                              file_path=file.file_path))
                file.file_path = new_file_path if new_file_path else file.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s',
                            old_path, file.workspace.name, file.file_path,
                            new_workspace.name)
                file_upload = FileUpload(file, path)
                uploads.append(file_upload)
                message = create_move_file_message(file_id=file.id)
                messages.append(message)

            ScaleFile.objects.upload_files(new_workspace, uploads)
        elif new_file_path:
            moves = []
            for file in files:
                logger.info('Moving %s to %s in workspace %s', file.file_path,
                            new_file_path, file.workspace.name)
                moves.append(FileMove(file, new_file_path))
                message = create_move_file_message(file_id=file.id)
                messages.append(message)

            ScaleFile.objects.move_files(moves)
        else:
            logger.info('No new workspace or file path. Doing nothing')

        CommandMessageManager().send_messages(messages)

        if new_workspace:
            # Copied files to new workspace, so delete file in old workspace (if workspace provides local path to do so)
            old_workspace.delete_files(old_files, update_model=False)

    except ScaleError as err:
        err.log()
        sys.exit(err.exit_code)
    except Exception as ex:
        exit_code = GENERAL_FAIL_EXIT_CODE
        err = get_error_by_exception(ex.__class__.__name__)
        if err:
            err.log()
            exit_code = err.exit_code
        else:
            logger.exception('Error performing move_files steps')
        sys.exit(exit_code)
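A hypothetical call to move_files(); the file IDs, workspace name, and path are illustrative:

# Copy three files into the 'archive' workspace under a new path prefix,
# then delete them from their old workspace.
archive_workspace = Workspace.objects.get(name='archive')
move_files([101, 102, 103], new_workspace=archive_workspace,
           new_file_path='archived/2019/01')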
Example #10
def perform_ingest(ingest_id):
    """Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    """

    ingest = _get_ingest(ingest_id)
    file_name = ingest.file_name

    if ingest.status in ['INGESTED', 'DUPLICATE']:
        logger.warning('%s already marked %s, nothing to do', file_name,
                       ingest.status)
        return

    _start_ingest(ingest)
    if ingest.status != 'INGESTING':
        return

    try:
        source_file = ingest.source_file
        if source_file.is_deleted:
            # Source file still marked as deleted, so we must copy/move/register the file
            source_file.set_basic_fields(file_name, ingest.file_size,
                                         ingest.media_type,
                                         ingest.get_data_type_tags())
            source_file.update_uuid(
                file_name)  # Add a stable identifier based on the file name
            source_file.workspace = ingest.workspace
            source_file.file_path = ingest.file_path
            source_file.is_deleted = False
            source_file.is_parsed = False
            source_file.deleted = None
            source_file.parsed = None

            if ingest.new_workspace:
                # We need a local path to copy the file, try to get a direct path from the broker, if that fails we must
                # download the file and copy from there
                # TODO: a future refactor should make the brokers work off of file objects instead of paths so the extra
                # download is not necessary
                paths = ingest.workspace.get_file_system_paths([source_file])
                if paths:
                    local_path = paths[0]
                else:
                    local_path = os.path.join('/tmp', file_name)
                    file_download = FileDownload(source_file, local_path,
                                                 False)
                    ScaleFile.objects.download_files([file_download])
                source_file.file_path = ingest.new_file_path if ingest.new_file_path else ingest.file_path
                logger.info('Copying %s in workspace %s to %s in workspace %s',
                            ingest.file_path, ingest.workspace.name,
                            source_file.file_path, ingest.new_workspace.name)
                file_upload = FileUpload(source_file, local_path)
                ScaleFile.objects.upload_files(ingest.new_workspace,
                                               [file_upload])
            elif ingest.new_file_path:
                logger.info('Moving %s to %s in workspace %s',
                            ingest.file_path, ingest.new_file_path,
                            ingest.workspace.name)
                file_move = FileMove(source_file, ingest.new_file_path)
                ScaleFile.objects.move_files([file_move])
            else:
                logger.info('Registering %s in workspace %s', ingest.file_path,
                            ingest.workspace.name)
                _save_source_file(source_file)

        if ingest.new_workspace:
            # Copied file to new workspace, so delete file in old workspace (if workspace provides local path to do so)
            file_with_old_path = SourceFile.create()
            file_with_old_path.file_name = file_name
            file_with_old_path.file_path = ingest.file_path
            paths = ingest.workspace.get_file_system_paths(
                [file_with_old_path])
            if paths:
                _delete_file(paths[0])

    except Exception:
        _complete_ingest(ingest, 'ERRORED')
        raise

    _complete_ingest(ingest, 'INGESTED')
    logger.info('Ingest successful for %s', file_name)
Example #11
    def upload_files(self, file_entries, input_file_ids, job_exe, workspace):
        """Uploads the given local product files into the workspace.

        :param file_entries: List of files where each file is a tuple of (absolute local path, workspace path for
            storing the file, media_type)
        :type file_entries: list of tuple(str, str, str)
        :param input_file_ids: List of identifiers for files used to produce the given file entries
        :type input_file_ids: list of int
        :param job_exe: The job_exe model with the related job and job_type fields
        :type job_exe: :class:`job.models.JobExecution`
        :param workspace: The workspace to use for storing the product files
        :type workspace: :class:`storage.models.Workspace`
        :returns: The list of the saved product models
        :rtype: list of :class:`product.models.ProductFile`
        """

        # Build a list of UUIDs for the input files
        input_files = ScaleFile.objects.filter(pk__in=input_file_ids).values(
            'uuid', 'id').order_by('uuid')
        input_file_uuids = [f['uuid'] for f in input_files]

        # Determine if any input files are non-operational products
        input_products = ProductFile.objects.filter(
            file__in=[f['id'] for f in input_files])
        input_products_operational = all(
            [f.is_operational for f in input_products])

        products_to_save = []
        for entry in file_entries:
            local_path = entry[0]
            remote_path = entry[1]
            media_type = entry[2]

            product = ProductFile()
            product.job_exe = job_exe
            product.job = job_exe.job
            product.job_type = job_exe.job.job_type
            product.is_operational = input_products_operational and job_exe.job.job_type.is_operational
            product.media_type = media_type
            product.file_path = remote_path

            # Add a stable identifier based on the job type, input files, and file name
            # This is designed to remain stable across re-processing the same type of job on the same inputs
            file_name = os.path.basename(local_path)
            product.update_uuid(job_exe.job.job_type.id, file_name,
                                *input_file_uuids)

            # Add geospatial info to product if available
            if len(entry) > 3:
                geo_metadata = entry[3]
                target_date = None
                if 'data_started' in geo_metadata:
                    product.data_started = parse_datetime(
                        geo_metadata['data_started'])
                    target_date = product.data_started
                if 'data_ended' in geo_metadata:
                    product.data_ended = parse_datetime(
                        geo_metadata['data_ended'])
                    if target_date is None:
                        target_date = product.data_ended
                if target_date is None:
                    target_date = product.created
                if 'geo_json' in geo_metadata:
                    geom, props = geo_utils.parse_geo_json(
                        geo_metadata['geo_json'])
                    product.geometry = geom
                    product.meta_data = props
                    product.center_point = geo_utils.get_center_point(geom)

            products_to_save.append(FileUpload(product, local_path))

        return ScaleFile.objects.upload_files(workspace, products_to_save)
Example #12
    def _generate_input_metadata(self, job_exe):
        """Generate the input metadata file for the job execution

        :param job_exe: The job_exe model with the related job and job_type fields
        :type job_exe: :class:`job.models.JobExecution`
        """

        job_interface = job_exe.job_type.get_job_interface()

        if not job_interface.needs_input_metadata():
            return

        # Generate input metadata dict
        input_metadata = {}
        config = job_exe.get_execution_configuration()
        if 'input_files' in config.get_dict():
            input_metadata['JOB'] = {}
            input_data = job_exe.job.get_input_data()
            for i in input_data.values.keys():
                if type(input_data.values[i]) is JsonValue:
                    input_metadata['JOB'][i] = input_data.values[i].value
                elif type(input_data.values[i]) is FileValue:
                    input_metadata['JOB'][i] = [
                        ScaleFile.objects.get(pk=f)._get_url()
                        for f in input_data.values[i].file_ids
                    ]
        if job_exe.recipe_id and job_exe.recipe.has_input():
            input_metadata['RECIPE'] = {}
            input_data = job_exe.recipe.get_input_data()
            for i in input_data.values.keys():
                if type(input_data.values[i]) is JsonValue:
                    input_metadata['RECIPE'][i] = input_data.values[i].value
                elif type(input_data.values[i]) is FileValue:
                    input_metadata['RECIPE'][i] = [
                        ScaleFile.objects.get(pk=f)._get_url()
                        for f in input_data.values[i].file_ids
                    ]

        workspace_names = config.get_input_workspace_names()
        workspace_models = {
            w.name: w
            for w in Workspace.objects.get_workspaces(names=workspace_names)
        }

        input_metadata_id = None
        if input_metadata:
            file_name = '%d-input_metadata.json' % job_exe.job.id
            local_path = os.path.join(SCALE_JOB_EXE_INPUT_PATH, 'tmp',
                                      file_name)
            with open(local_path, 'w') as metadata_file:
                json.dump(input_metadata, metadata_file)
                try:
                    scale_file = ScaleFile.objects.get(file_name=file_name)
                except ScaleFile.DoesNotExist:
                    scale_file = ScaleFile()
                    scale_file.update_uuid(file_name)
                remote_path = self._calculate_remote_path(job_exe)
                scale_file.file_path = remote_path

                for workspace in workspace_models.values():
                    try:
                        if not input_metadata_id:
                            ScaleFile.objects.upload_files(
                                workspace,
                                [FileUpload(scale_file, local_path)])
                            input_metadata_id = ScaleFile.objects.get(
                                file_name=file_name).id
                            data = job_exe.job.get_job_data()
                            data.add_file_input('INPUT_METADATA_MANIFEST',
                                                input_metadata_id)
                            job_exe.job.input = data.get_dict()
                            job_exe.job.save()
                    except:
                        continue
                if not input_metadata_id:
                    logger.error('Error uploading input_metadata manifest for job_exe %d',
                                 job_exe.job.id)
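An illustrative shape of the generated input metadata manifest, following the JOB/RECIPE structure built above (input names, values, and the URL are made up):

# JSON inputs keep their raw values; file inputs become lists of file URLs.
input_metadata = {
    'JOB': {
        'version': '1.0',
        'input_image': ['http://scale.example.com/files/1234/'],
    },
    'RECIPE': {
        'threshold': 0.75,
    },
}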