Example #1
def perform_ingest(ingest_id, mount):
    """Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: int
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: string
    """

    # TODO: refactor to combine _get_ingest(), _get_job_exe_id(), and _set_ingesting_status() in one database
    # transaction with as few queries as possible, include retries
    ingest = _get_ingest(ingest_id)
    job_exe_id = _get_job_exe_id(ingest)
    if not os.path.exists(SCALE_INGEST_MOUNT_PATH):
        logger.info('Creating %s', SCALE_INGEST_MOUNT_PATH)
        os.makedirs(SCALE_INGEST_MOUNT_PATH, mode=0755)
    dup_path = os.path.join(SCALE_INGEST_MOUNT_PATH, 'duplicate',
                            ingest.file_name)
    ingest_path = os.path.join(SCALE_INGEST_MOUNT_PATH, ingest.ingest_path)
    nfs_mount(mount, SCALE_INGEST_MOUNT_PATH, read_only=False)

    try:
        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path,
                    ingest.workspace.name)
        try:
            # TODO: future refactor: before copying file, grab existing source file (no lock) or create and save model
            # This guarantees that source file exists and can be used to check if file is duplicate
            # After this step, the source file should be marked as is_deleted so that it can't be used yet
            src_file = SourceFile.objects.store_file(
                ingest_path, ingest.get_data_type_tags(), ingest.workspace,
                ingest.file_path)

            _complete_ingest(ingest, 'INGESTED', src_file)
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i',
                           ingest_id,
                           exc_info=True)
            # TODO: future refactor: pass source file model in so source files have duplicate ingests tied to them
            _complete_ingest(ingest, 'DUPLICATE', None)
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            # TODO: future refactor: pass source file model in so source files have errored ingests tied to them
            # TODO: change ERRORED to FAILED
            _complete_ingest(ingest, 'ERRORED', None)
            raise  # File remains where it is so it can be processed again
    finally:
        nfs_umount(SCALE_INGEST_MOUNT_PATH)

    try:
        cleanup_job_exe(job_exe_id)
    except Exception:
        logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
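The DuplicateFile branch above relies on a _move_ingest_file helper that is not shown in these examples. A minimal sketch of what such a helper could look like, assuming it only needs to relocate the duplicate file to the pre-computed dup_path (the logging and directory creation here are assumptions, not the project's actual code):

import logging
import os
import shutil

logger = logging.getLogger(__name__)


def _move_ingest_file(ingest_path, dup_path):
    # Hypothetical sketch: relocate a duplicate file into the 'duplicate'
    # directory so it stays out of the normal ingest path
    logger.info('Moving %s to %s', ingest_path, dup_path)
    dup_dir = os.path.dirname(dup_path)
    if not os.path.exists(dup_dir):
        # Create the duplicate directory on first use
        os.makedirs(dup_dir)
    shutil.move(ingest_path, dup_path)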
Example #2
    def _cleanup(self, exe_id):
        '''Cleans up the work directory for the job. This method is safe and should not throw any exceptions.
        '''

        try:
            cleanup_job_exe(exe_id)
        except Exception:
            logger.exception('Job Execution %i: Error cleaning up', exe_id)
Example #3
    def handle(self, **options):
        '''See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the command.
        '''

        job_exe_id = options.get('job_exe_id')

        logger.info('Command starting: scale_cleanup - Job Execution ID: %i', job_exe_id)
        try:
            cleanup_job_exe(job_exe_id)
        except Exception:
            logger.exception('Error performing job execution cleanup')

            sys.exit(-1)

        logger.info('Command completed successfully: scale_cleanup')
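The handle() method above belongs to a Django management command named scale_cleanup (the name appears in its log messages). A hedged sketch of invoking it programmatically; it assumes the command exposes the job execution ID as the job_exe_id option that handle() reads, which is not shown here:

from django.core.management import call_command

# Hypothetical invocation: call_command forwards keyword arguments into the
# command's options dict, where handle() reads options.get('job_exe_id')
call_command('scale_cleanup', job_exe_id=42)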
Example #4
    def _run_processor(self, strike_id, throttle):
        '''Runs the given Strike processor

        :param strike_id: The ID of the Strike process to run
        :type strike_id: int
        :param throttle: The minimum delay time in seconds before subsequent reads of the directory
        :type throttle: int
        '''
        strike_proc = None

        # TODO: figure out how to guarantee only one Strike process runs at a time
        while self.running:
            secs_passed = 0
            try:
                if not strike_proc:
                    strike_proc = self._init_processor(strike_id)
                else:
                    self._reload_processor(strike_id, strike_proc)

                # Process the directory and record number of seconds used
                started = now()
                strike_proc.mount_and_process_dir()
                ended = now()

                secs_passed = (ended - started).total_seconds()
            except:
                logger.exception('Strike processor encountered error.')
            finally:
                if self.running:
                    # If process time takes less than user-specified time, throttle
                    if secs_passed < throttle:
                        # Delay until full throttle time reached
                        delay = math.ceil(throttle - secs_passed)
                        logger.debug('Pausing for %i seconds', delay)
                        time.sleep(delay)

        if self.job_exe_id:
            cleanup_job_exe(self.job_exe_id)
        logger.info('Strike processor has stopped running')

        # TODO: eventually implement a REST API call to permanently stop a Strike process, which should allow this
        # command line method to return 0 and complete successfully
        sys.exit(1)
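The loop above measures how long mount_and_process_dir() takes and then sleeps the remainder, so successive reads of the directory are at least throttle seconds apart. The pacing logic in isolation, as a standalone sketch (process_once and should_run are stand-ins for the strike processor call and self.running, not names from the project):

import logging
import math
import time

logger = logging.getLogger(__name__)


def throttled_loop(process_once, throttle, should_run):
    # Sketch of the pacing used in _run_processor: each iteration is
    # stretched to at least `throttle` seconds
    while should_run():
        started = time.time()
        try:
            process_once()
        except Exception:
            # The original logs and swallows errors so the loop keeps running
            logger.exception('Processor encountered error.')
        secs_passed = time.time() - started
        if should_run() and secs_passed < throttle:
            delay = math.ceil(throttle - secs_passed)
            logger.debug('Pausing for %i seconds', delay)
            time.sleep(delay)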
Example #5
def perform_ingest(ingest_id, mount):
    '''Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    '''

    job_exe_id = None
    upload_work_dir = None
    try:
        ingest = Ingest.objects.select_related().get(id=ingest_id)
        job_exe_id = JobExecution.objects.get_latest([ingest.job])[ingest.job.id].id
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))
        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            os.makedirs(ingest_work_dir, mode=0755)
        nfs_mount(mount, ingest_work_dir, read_only=False)

        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)
            # Atomically store file, mark INGESTED, and run ingest trigger rules
            with transaction.atomic():
                # TODO: It's possible that the file will be successfully moved into the workspace but this database
                # transaction might fail. This will result in a file that is in a workspace but doesn't have database
                # entries. Attempts to re-ingest will result in duplicate file errors.
                logger.info('Marking file as INGESTED: %i', ingest_id)
                ingest.source_file = src_file
                ingest.status = 'INGESTED'
                ingest.ingest_ended = timezone.now()
                ingest.save()
                logger.debug('Checking ingest trigger rules')
                for ingest_rule in get_ingest_rules():
                    ingest_rule.process_ingest(ingest, src_file.id)

            # Delete ingest file
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            ingest.status = 'DUPLICATE'
            ingest.save()
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            ingest.status = 'ERRORED'
            ingest.save()
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            if upload_work_dir and os.path.exists(upload_work_dir):
                logger.info('Deleting %s', upload_work_dir)
                shutil.rmtree(upload_work_dir)
        except:
            # Swallow exception so error from main try block isn't covered up
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)

        if job_exe_id:
            cleanup_job_exe(job_exe_id)
Example #6
def perform_ingest(ingest_id, mount):
    '''Performs the ingest for the given ingest ID

    :param ingest_id: The ID of the ingest to perform
    :type ingest_id: long
    :param mount: The file system to mount in the form of host:/dir/path
    :type mount: str
    '''

    job_exe_id = None
    upload_work_dir = None
    try:
        # TODO: refactor to combine _get_ingest(), _get_job_exe_id(), and _set_ingesting_status() in one database
        # transaction with as few queries as possible, include retries
        ingest = _get_ingest(ingest_id)
        job_exe_id = _get_job_exe_id(ingest)
        create_job_exe_dir(job_exe_id)
        ingest_work_dir = get_ingest_work_dir(job_exe_id)
        dup_path = os.path.join(ingest_work_dir, 'duplicate', ingest.file_name)
        ingest_path = os.path.join(ingest_work_dir, ingest.ingest_path)
        upload_work_dir = os.path.join(os.path.dirname(ingest_path), 'upload', str(ingest_id))
        if not os.path.exists(ingest_work_dir):
            logger.info('Creating %s', ingest_work_dir)
            os.makedirs(ingest_work_dir, mode=0755)
        nfs_mount(mount, ingest_work_dir, read_only=False)
        if not os.path.exists(upload_work_dir):
            logger.info('Creating %s', upload_work_dir)
            os.makedirs(upload_work_dir, mode=0755)

        # Check condition of the ingest
        ingest = _set_ingesting_status(ingest, ingest_path, dup_path)
        if ingest is None:
            return

        logger.info('Storing %s into %s on %s', ingest_path, ingest.file_path, ingest.workspace.name)
        try:
            # TODO: future refactor: before copying file, grab existing source file (no lock) or create and save model
            # This guarantees that source file exists and can be used to check if file is duplicate
            # After this step, the source file should be marked as is_deleted so that it can't be used yet
            src_file = SourceFile.objects.store_file(upload_work_dir, ingest_path, ingest.get_data_type_tags(),
                                                     ingest.workspace, ingest.file_path)

            _complete_ingest(ingest, 'INGESTED', src_file)
            _delete_ingest_file(ingest_path)
            logger.info('Ingest successful: %s', ingest_path)
        except DuplicateFile:
            logger.warning('Duplicate file detected: %i', ingest_id, exc_info=True)
            # TODO: future refactor: pass source file model in so source files have duplicate ingests tied to them
            _complete_ingest(ingest, 'DUPLICATE', None)
            _move_ingest_file(ingest_path, dup_path)
        except Exception:
            # TODO: have this delete the stored source file using some SourceFile.objects.delete_file method
            # TODO: future refactor: pass source file model in so source files have errored ingests tied to them
            # TODO: change ERRORED to FAILED
            _complete_ingest(ingest, 'ERRORED', None)
            raise  # File remains where it is so it can be processed again
    finally:
        try:
            # Try to clean up the upload directory
            if upload_work_dir and os.path.exists(upload_work_dir):
                upload_dir = os.path.join(upload_work_dir, 'upload')
                workspace_work_dir = os.path.join(upload_work_dir, 'work')
                if os.path.exists(workspace_work_dir):
                    ScaleFile.objects.cleanup_upload_dir(upload_dir, workspace_work_dir, ingest.workspace)
                    logger.info('Deleting %s', workspace_work_dir)
                    os.rmdir(workspace_work_dir)
                if os.path.exists(upload_dir):
                    logger.info('Deleting %s', upload_dir)
                    # Delete everything in upload dir
                    shutil.rmtree(upload_dir)
                logger.info('Deleting %s', upload_work_dir)
                os.rmdir(upload_work_dir)
        except:
            # Swallow exception so error from main try block isn't covered up
            logger.exception('Failed to delete upload work dir %s', upload_work_dir)

    try:
        if job_exe_id:
            cleanup_job_exe(job_exe_id)
    except Exception:
        logger.exception('Job Execution %i: Error cleaning up', job_exe_id)
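The last two examples wrap the upload-directory teardown in its own try/except inside the finally block, so a cleanup failure is logged but cannot hide an exception already propagating from the main block. That guard as a small standalone helper; the name _remove_dir_quietly is hypothetical:

import logging
import os
import shutil

logger = logging.getLogger(__name__)


def _remove_dir_quietly(path):
    # Hypothetical helper mirroring the finally blocks above: delete a work
    # directory, but log and swallow any error so it cannot mask an exception
    # that is already propagating from the surrounding try block
    try:
        if path and os.path.exists(path):
            logger.info('Deleting %s', path)
            shutil.rmtree(path)
    except Exception:
        logger.exception('Failed to delete work dir %s', path)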