Example #1
    def set_basic_fields(self,
                         file_name,
                         file_size,
                         media_type=None,
                         data_type=None):
        """Sets the basic fields for the Scale file

        :param file_name: The name of the file
        :type file_name: string
        :param file_size: The size of the file in bytes
        :type file_size: long
        :param media_type: The IANA media type of the file
        :type media_type: string
        :param data_type: The set of data type tags for the file
        :type data_type: set
        """

        if not media_type:
            media_type = get_media_type(file_name)

        self.file_name = file_name
        self.file_size = file_size
        self.media_type = media_type
        if data_type:
            for tag in data_type:
                self.add_data_type_tag(tag)
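
get_media_type is imported from elsewhere in the codebase and its body is not shown in these examples. As a rough guide to what it does, here is a minimal stand-in sketch, assuming it wraps the standard library's mimetypes.guess_type with a generic binary fallback (the helper and constant here are hypothetical reconstructions, not the project's actual code):

    import mimetypes

    # Hypothetical stand-in for the get_media_type helper used throughout
    # these examples: guess the IANA media type from the file name and fall
    # back to the generic binary type for unrecognized extensions.
    UNKNOWN_MEDIA_TYPE = 'application/octet-stream'

    def get_media_type(file_name):
        media_type = mimetypes.guess_type(file_name, strict=False)[0]
        return media_type or UNKNOWN_MEDIA_TYPE

    print(get_media_type('report.json'))  # application/json
    print(get_media_type('capture.dat'))  # application/octet-stream
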
Example #2
    def create_ingest(self,
                      file_name,
                      workspace,
                      scan_id=None,
                      strike_id=None):
        """Creates a new ingest for the given file name. The database save is the caller's responsibility.

        :param file_name: The name of the file being ingested
        :type file_name: string
        :param workspace: The workspace where the file is being ingested
        :type workspace: :class:`storage.models.Workspace`
        :param scan_id: ID of the Scan process that discovered the file, if any
        :type scan_id: int
        :param strike_id: ID of the Strike process that discovered the file, if any
        :type strike_id: int
        :returns: The new ingest model
        :rtype: :class:`ingest.models.Ingest`
        """

        ingest = Ingest()

        if scan_id:
            ingest.scan_id = scan_id
        if strike_id:
            ingest.strike_id = strike_id

        ingest.file_name = file_name
        ingest.media_type = get_media_type(ingest.file_name)
        ingest.workspace = workspace

        return ingest
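
A detail worth noting in the guards above: the check "if scan_id:" treats every falsy value as absent, so an ID of 0 would be silently dropped. Auto-increment primary keys normally start at 1, so this is harmless in practice, but "is not None" states the intent more precisely. A small self-contained illustration of the difference:

    record = {}
    scan_id = 0  # falsy, but still a real value

    if scan_id:                # the guard used above: 0 is dropped
        record['truthy'] = scan_id
    if scan_id is not None:    # explicit guard: 0 is kept
        record['explicit'] = scan_id

    print(record)  # {'explicit': 0}
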
Example #3
    def upload_files(self, workspace, file_uploads):
        """Uploads the given files from the given local file system paths into the given workspace. Each ScaleFile model
        should have its file_path field populated with the relative location where the file should be stored within the
        workspace. This method will update the workspace and other fields (including possibly changing file_path) in
        each ScaleFile model and will save the models to the database.

        :param workspace: The workspace to upload files into
        :type workspace: :class:`storage.models.Workspace`
        :param file_uploads: List of files to upload
        :type file_uploads: [:class:`storage.brokers.broker.FileUpload`]
        :returns: The list of saved file models
        :rtype: [:class:`storage.models.ScaleFile`]

        :raises :class:`storage.exceptions.ArchivedWorkspace`: If the given workspace is archived (no longer active)
        :raises :class:`storage.exceptions.MissingRemoteMount`: If a required mount location is missing
        """

        if not workspace.is_active:
            raise ArchivedWorkspace('%s is no longer active' % workspace.name)

        file_list = []
        for file_upload in file_uploads:
            scale_file = file_upload.file
            media_type = scale_file.media_type

            # Determine file properties
            file_name = os.path.basename(file_upload.local_path)
            if not media_type:
                media_type = get_media_type(file_name)
            file_size = os.path.getsize(file_upload.local_path)

            scale_file.file_name = file_name
            scale_file.media_type = media_type
            scale_file.file_size = file_size
            scale_file.workspace = workspace
            scale_file.is_deleted = False
            scale_file.deleted = None

            file_list.append(scale_file)

        # Store files in workspace
        workspace.upload_files(file_uploads)

        # Populate the country list for all files that were saved
        for file_upload in file_uploads:
            scale_file = file_upload.file
            if scale_file.pk:
                scale_file.set_countries()
                scale_file.save()

        return file_list
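
The per-file property block above derives everything from the local path: the base name becomes the stored file name, the media type falls back to a guess from that name, and the size is read from the file system. A self-contained sketch of that derivation using only the standard library (the temporary file stands in for file_upload.local_path):

    import os
    import tempfile

    # A throwaway file standing in for file_upload.local_path
    with tempfile.NamedTemporaryFile(suffix='.json', delete=False) as f:
        f.write(b'{"key": "value"}')
        local_path = f.name

    file_name = os.path.basename(local_path)  # the name stored in the workspace
    file_size = os.path.getsize(local_path)   # size in bytes
    print('%s (%d bytes)' % (file_name, file_size))

    os.remove(local_path)  # clean up the throwaway file
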
Example #4
    def _create_ingest(self, file_name):
        """Creates a new ingest for the given file name. The database save is the caller's responsibility.

        :param file_name: The name of the file being ingested
        :type file_name: string
        :returns: The new ingest model
        :rtype: :class:`ingest.models.Ingest`
        """

        ingest = Ingest()
        ingest.file_name = file_name
        ingest.strike_id = self.strike_id
        ingest.media_type = get_media_type(file_name)
        ingest.workspace = self._monitored_workspace

        logger.info('New file on %s: %s', ingest.workspace.name, file_name)
        return ingest
Example #5
    def add_file(self, file_name, workspace, scan_id=None, strike_id=None):
        """Add file source metadata to ingest record

        :param file_name: File name excluding full path
        :type file_name: string
        :param workspace: The workspace where the file is being ingested
        :type workspace: :class:`storage.models.Workspace`
        :param scan_id: ID of the Scan process that discovered the file, if any
        :type scan_id: int
        :param strike_id: ID of the Strike process that discovered the file, if any
        :type strike_id: int
        """

        if scan_id:
            self.scan_id = scan_id
        if strike_id:
            self.strike_id = strike_id

        self.file_name = file_name
        self.media_type = get_media_type(self.file_name)
        self.workspace = workspace

        logger.info('New file on %s: %s', self.workspace.name, self.file_name)
Example #6
    def _complete_transfer(self, ingest, size):
        """Completes the transfer for the given ingest and updates the database

        :param ingest: The ingest model
        :type ingest: :class:`ingest.models.Ingest`
        :param size: Total size of the file in bytes
        :type size: long
        """

        file_name = ingest.file_name
        file_path = os.path.join(self.strike_dir, file_name)
        if ingest.status != 'TRANSFERRING':
            msg = 'Completing transfer for %s requires TRANSFERRING status'
            raise Exception(msg % file_path)
        logger.info('Transfer complete: %s', file_path)
        last_modified = os.path.getmtime(file_path)
        ingest.transfer_ended = datetime.utcfromtimestamp(last_modified)
        ingest.media_type = get_media_type(file_name)
        ingest.file_size = size

        # Check configuration for what to do with this file
        file_config = self.configuration.match_file_name(file_name)
        if file_config:
            for data_type in file_config[0]:
                ingest.add_data_type_tag(data_type)
            today = now()
            # Store file within workspace at /configuration_path/current_year/current_month/current_day/file_name
            year_dir = str(today.year)
            month_dir = '%02d' % today.month
            day_dir = '%02d' % today.day
            ingest.file_path = os.path.join(file_config[1], year_dir, month_dir, day_dir, file_name)
            ingest.workspace = file_config[2]
            ingest.ingest_path = os.path.join(self.rel_ingest_dir, file_name)
        ingest.status = 'TRANSFERRED'
        ingest.save()
        logger.info('Ingest marked as TRANSFERRED: %s', file_name)
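
The year/month/day directories built above shard the workspace path by ingest date, which keeps any one directory from growing without bound. A self-contained sketch of the same path construction, with datetime.utcnow() standing in for the now() helper used in the example:

    import os
    from datetime import datetime

    today = datetime.utcnow()  # stands in for the now() helper above
    year_dir = str(today.year)
    month_dir = '%02d' % today.month
    day_dir = '%02d' % today.day

    # e.g. ingest/2016/07/09/sample.h5
    print(os.path.join('ingest', year_dir, month_dir, day_dir, 'sample.h5'))
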
Example #7
    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method migrates existing data files into scale.
        """
        logger.info(u'Command starting: migratedata')

        workspace, workspace_path, local_path, data_types = None, None, None, []
        if options['workspace'] is not None and options['workspace_path'] is not None:
            workspace, workspace_path = options['workspace'], options['workspace_path']
            tmp = Workspace.objects.filter(name=workspace)
            if tmp.count() > 0:
                workspace = tmp.first()
            else:
                workspace = Workspace.objects.get(id=int(workspace))
        else:
            logger.error('Must specify workspace and workspace-path.')
            return False
        if options['data_type'] is not None:
            data_types.extend(options['data_type'])

        mnt_dirs = None
        if options['local_path'] is not None:
            local_path = options['local_path']
        else:  # mount
            mnt_dirs = "/tmp", tempfile.mkdtemp()
            workspace.setup_download_dir(*mnt_dirs)
            local_path = os.path.join(mnt_dirs[1], workspace_path)

        logger.info("Ingesting files from %s/%s", workspace.name,
                    workspace_path)
        filenames = self.generate_file_list(local_path, options['include'],
                                            options['exclude'])
        logger.info("Found %d files", len(filenames))

        # Prepare ingest records the same way Strike does
        ingest_records = {}
        for filename in filenames:
            logger.info("Generating ingest record for %s" % filename)
            ingest = Ingest()
            ingest.file_name = os.path.basename(filename)
            ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
            ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
            ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
            ingest.transfer_ended = timezone.now()
            ingest.media_type = get_media_type(filename)
            ingest.workspace = workspace
            for data_type in data_types:
                ingest.add_data_type_tag(data_type)
            ingest.status = 'TRANSFERRED'
            if options['no_commit']:
                s = IngestDetailsSerializer()
                logger.info(s.to_representation(ingest))
            else:
                ingest.save()
                ingest_records[filename] = ingest.id
        logging.info("Ingests records created")

        # start ingest tasks for all the files
        if not options['no_commit']:
            logging.info("Starting ingest tasks")
            for filename in filenames:
                ingest = Ingest.objects.get(id=ingest_records[filename])
                logging.info("Processing ingest %s" % ingest.file_name)
                with transaction.atomic():
                    ingest.ingest_started = timezone.now()
                    sf = ingest.source_file = SourceFile.create()
                    sf.update_uuid(ingest.file_name)
                    for tag in ingest.get_data_type_tags():
                        sf.add_data_type_tag(tag)
                    sf.media_type = ingest.media_type
                    sf.file_name = ingest.file_name
                    sf.file_size = ingest.file_size
                    sf.file_path = ingest.file_path
                    sf.workspace = workspace
                    sf.is_deleted = False
                    sf.deleted = None
                    sf.save()
                    sf.set_countries()
                    sf.save()
                    ingest.status = 'INGESTED'
                    ingest.ingest_ended = timezone.now()
                    ingest.source_file = sf
                    ingest.save()
                    IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)

        logger.info("Ingests processed, monitor the queue for triggered jobs.")

        if mnt_dirs is not None:
            workspace.cleanup_download_dir(*mnt_dirs)

        logger.info(u'Command completed: migratedata')
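
One thing to watch in this command: timezone.now() returns a timezone-aware datetime while datetime.utcfromtimestamp() returns a naive one, and Django warns when naive datetimes are saved while USE_TZ is enabled. A small sketch of the difference; the commented-out make_aware call is hypothetical usage that would require a configured Django settings module:

    import os
    import tempfile
    from datetime import datetime

    with tempfile.NamedTemporaryFile(delete=False) as f:
        path = f.name

    # utcfromtimestamp() yields a naive datetime: no tzinfo attached
    naive = datetime.utcfromtimestamp(os.path.getmtime(path))
    print(naive.tzinfo)  # None

    # Hypothetical fix, requires configured Django settings:
    # from django.utils import timezone
    # aware = timezone.make_aware(naive, timezone.utc)

    os.remove(path)
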
Example #8
    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method migrates existing data files into scale.
        """
        logger.info(u'Command starting: migratedata')

        workspace, workspace_path, local_path, data_types = None, None, None, []
        if options['workspace'] is not None and options['workspace_path'] is not None:
            workspace, workspace_path = options['workspace'], options['workspace_path']
            tmp = Workspace.objects.filter(name=workspace)
            if tmp.count() > 0:
                workspace = tmp.first()
            else:
                workspace = Workspace.objects.get(id=int(workspace))
        else:
            logger.error('Must specify workspace and workspace-path.')
            return False
        if options['data_type'] is not None:
            data_types.extend(options['data_type'])

        mnt_dirs = None
        if options['local_path'] is not None:
            local_path = options['local_path']
        else:  # mount
            mnt_dirs = "/tmp", tempfile.mkdtemp()
            workspace.setup_download_dir(*mnt_dirs)
            local_path = os.path.join(mnt_dirs[1], workspace_path)

        logger.info("Ingesting files from %s/%s", workspace.name, workspace_path)
        filenames = self.generate_file_list(local_path, options['include'], options['exclude'])
        logger.info("Found %d files", len(filenames))

        # Prepare ingest records the same way Strike does
        ingest_records = {}
        for filename in filenames:
            logger.info("Generating ingest record for %s" % filename)
            ingest = Ingest()
            ingest.file_name = os.path.basename(filename)
            ingest.transfer_path = filename
            ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
            ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
            ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
            ingest.transfer_ended = datetime.utcnow()
            ingest.media_type = get_media_type(filename)
            ingest.workspace = workspace
            for data_type in data_types:
                ingest.add_data_type_tag(data_type)
            ingest.status = 'TRANSFERRED'
            if options['no_commit']:
                s = IngestDetailsSerializer()
                logger.info(s.to_representation(ingest))
            else:
                ingest.save()
                ingest_records[filename] = ingest.id
        logging.info("Ingests records created")

        # start ingest tasks for all the files
        if not options['no_commit']:
            logging.info("Starting ingest tasks")
            for filename in filenames:
                ingest = Ingest.objects.get(id=ingest_records[filename])
                logging.info("Processing ingest %s" % ingest.file_name)
                with transaction.atomic():
                    ingest.ingest_started = datetime.utcnow()
                    sf = ingest.source_file = SourceFile()
                    sf.update_uuid(ingest.file_name)
                    for tag in ingest.get_data_type_tags():
                        sf.add_data_type_tag(tag)
                    sf.media_type = ingest.media_type
                    sf.file_name = ingest.file_name
                    sf.file_size = ingest.file_size
                    sf.file_path = ingest.file_path
                    sf.workspace = workspace
                    sf.is_deleted = False
                    sf.deleted = None
                    sf.save()
                    sf.set_countries()
                    sf.save()
                    ingest.status = 'INGESTED'
                    ingest.ingest_ended = datetime.utcnow()
                    ingest.source_file = sf
                    ingest.save()
                    IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)

        logging.info("Ingests processed, monitor the queue for triggered jobs.")

        if mnt_dirs is not None:
            workspace.cleanup_download_dir(*mnt_dirs)

        logger.info(u'Command completed: migratedata')
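
generate_file_list is defined elsewhere on this command, so its exact semantics are not shown here. A plausible minimal sketch, assuming include and exclude are fnmatch-style glob patterns applied during a recursive walk (both the signature and the pattern semantics are assumptions):

    import fnmatch
    import os

    def generate_file_list(local_path, include=None, exclude=None):
        """Recursively list files under local_path, filtered by glob patterns (assumed signature)."""
        matches = []
        for root, _dirs, files in os.walk(local_path):
            for name in files:
                if include and not fnmatch.fnmatch(name, include):
                    continue
                if exclude and fnmatch.fnmatch(name, exclude):
                    continue
                matches.append(os.path.join(root, name))
        return matches

    print(generate_file_list('.', include='*.py'))
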
Example #9
    def upload_files(self, upload_dir, work_dir, workspace, files_to_upload):
        """Uploads the given files in the given upload directory into the workspace. This method assumes that
        setup_upload_dir() has already been called with the same upload and work directories. The ScaleFile models will
        be saved in an atomic database transaction.

        :param upload_dir: Absolute path to the local directory of the files to upload
        :type upload_dir: str
        :param work_dir: Absolute path to a local work directory available to assist in uploading
        :type work_dir: str
        :param workspace: The workspace to upload files into
        :type workspace: :class:`storage.models.Workspace`
        :param files_to_upload: List of tuples (ScaleFile model, source path relative to upload directory, workspace
            path for storing the file)
        :type files_to_upload: list of (:class:`storage.models.ScaleFile`, str, str)
        :returns: The list of the saved file models
        :rtype: list of :class:`storage.models.ScaleFile`
        """

        upload_dir = os.path.normpath(upload_dir)
        work_dir = os.path.normpath(work_dir)
        workspace_work_dir = self._get_workspace_work_dir(work_dir, workspace)

        file_list = []
        wksp_upload_list = []  # Info to pass the workspace so it can upload files
        wksp_delete_list = []  # Info needed to delete the files if the database save fails
        for scale_file, upload_path, workspace_path in files_to_upload:
            full_upload_path = os.path.join(upload_dir, upload_path)
            media_type = scale_file.media_type

            # Determine file properties
            file_name = os.path.basename(full_upload_path)
            if not media_type:
                media_type = get_media_type(file_name)
            file_size = os.path.getsize(full_upload_path)

            scale_file.file_name = file_name
            scale_file.media_type = media_type
            scale_file.file_size = file_size
            scale_file.file_path = workspace_path
            scale_file.workspace = workspace
            scale_file.is_deleted = False
            scale_file.deleted = None

            file_list.append(scale_file)
            wksp_upload_list.append((upload_path, workspace_path))
            wksp_delete_list.append(workspace_path)

        try:
            # Store files in workspace
            workspace.upload_files(upload_dir, workspace_work_dir, wksp_upload_list)

            with transaction.atomic():
                for scale_file in file_list:
                    # save to create a pkey, update the country list, then save again
                    scale_file.save()
                    scale_file.set_countries()
                    scale_file.save()

            return file_list
        except Exception as ex:
            # Attempt to clean up failed files before propagating exception
            try:
                delete_work_dir = self._get_delete_work_dir(work_dir, workspace)
                logger.info('Creating %s', delete_work_dir)
                os.makedirs(delete_work_dir, mode=0o755)
                workspace.delete_files(delete_work_dir, wksp_delete_list)
            except Exception:
                # Failure to delete should not override ex
                logger.exception('Error cleaning up files that failed to upload')
            raise ex
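
The try/except above is a compensation pattern: files are written to remote storage first, and if the database transaction then fails, the just-uploaded files are deleted so that storage and the database stay consistent. A stripped-down sketch of that control flow, with stub callables standing in for workspace.upload_files, the model saves, and workspace.delete_files:

    def upload_with_compensation(uploads, upload_all, save_all, delete_all):
        """Upload files and persist records; delete the uploads if anything fails."""
        try:
            upload_all(uploads)        # write files to remote storage
            save_all(uploads)          # persist records (atomic in the real code)
            return uploads
        except Exception:
            try:
                delete_all(uploads)    # best-effort cleanup of the uploaded files
            except Exception:
                pass                   # cleanup failure must not mask the original error
            raise

The bare raise re-raises the original failure after cleanup, which is the same effect the method gets with raise ex.
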
Example #10
    def upload_files(self, upload_dir, work_dir, workspace, files_to_upload):
        """Uploads the given files in the given upload directory into the workspace. This method assumes that
        setup_upload_dir() has already been called with the same upload and work directories. The ScaleFile models will
        be saved in an atomic database transaction.

        :param upload_dir: Absolute path to the local directory of the files to upload
        :type upload_dir: str
        :param work_dir: Absolute path to a local work directory available to assist in uploading
        :type work_dir: str
        :param workspace: The workspace to upload files into
        :type workspace: :class:`storage.models.Workspace`
        :param files_to_upload: List of tuples (ScaleFile model, source path relative to upload directory, workspace
            path for storing the file)
        :type files_to_upload: list of (:class:`storage.models.ScaleFile`, str, str)
        :returns: The list of the saved file models
        :rtype: list of :class:`storage.models.ScaleFile`
        """

        upload_dir = os.path.normpath(upload_dir)
        workspace_work_dir = self._get_workspace_work_dir(work_dir, workspace)

        file_list = []
        wksp_upload_list = []   # Info to pass the workspace so it can upload files
        wksp_delete_list = []   # Info needed to delete the files if the database save fails
        for scale_file, upload_path, workspace_path in files_to_upload:
            full_upload_path = os.path.join(upload_dir, upload_path)
            media_type = scale_file.media_type

            # Determine file properties
            file_name = os.path.basename(full_upload_path)
            if not media_type:
                media_type = get_media_type(file_name)
            file_size = os.path.getsize(full_upload_path)

            scale_file.file_name = file_name
            scale_file.media_type = media_type
            scale_file.file_size = file_size
            scale_file.file_path = workspace_path
            scale_file.workspace = workspace
            scale_file.is_deleted = False
            scale_file.deleted = None

            file_list.append(scale_file)
            wksp_upload_list.append((upload_path, workspace_path))
            wksp_delete_list.append(workspace_path)

        try:
            # Store files in workspace
            workspace.upload_files(upload_dir, workspace_work_dir, wksp_upload_list)

            with transaction.atomic():
                for scale_file in file_list:
                    # save to create a pkey, update the country list, then save again
                    scale_file.save()
                    scale_file.set_countries()
                    scale_file.save()

            return file_list
        except Exception as ex:
            # Attempt to clean up failed files before propagating exception
            try:
                delete_work_dir = os.path.join(os.path.normpath(work_dir), 'delete', get_valid_filename(workspace.name))
                workspace.delete_files(delete_work_dir, wksp_delete_list)
            except Exception:
                # Failure to delete should not override ex
                logger.exception(u'Error cleaning up files that failed to upload')
            raise ex
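
The save/set_countries()/save sequence inside the transaction (also visible in the migratedata examples above) exists because a Django model needs a primary key before relations keyed on it, such as a many-to-many country list, can be populated: the first save assigns the pk, the enrichment uses it, and the second save persists the result. A sketch of the pattern with a stub model in place of ScaleFile (the set_countries internals here are an assumption):

    class StubFile(object):
        _next_pk = 1

        def __init__(self):
            self.pk = None
            self.countries = []

        def save(self):
            if self.pk is None:  # first save assigns the primary key
                self.pk = StubFile._next_pk
                StubFile._next_pk += 1

        def set_countries(self):
            # Placeholder enrichment that needs a pk (a real M2M row references it)
            assert self.pk is not None
            self.countries = ['USA']

    sf = StubFile()
    sf.save()            # create the row; pk is assigned
    sf.set_countries()   # relation work that requires the pk
    sf.save()            # persist the enriched model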