Example #1
0
    def test_no_tags(self):
        '''Verifies get_data_type_tags() yields an empty set for a freshly created ingest'''

        self.assertSetEqual(Ingest().get_data_type_tags(), set())
Example #2
0
    def test_no_tags(self):
        '''Checks that an ingest created without tags reports an empty tag set'''

        untagged_ingest = Ingest()
        self.assertSetEqual(untagged_ingest.get_data_type_tags(), set())
Example #3
0
    def test_deduplicate_ingest_list_no_existing(self, ingests_by_scan):
        """Verifies S3Scanner._deduplicate_ingest_list() keeps every ingest when none exist already"""

        # NOTE(review): ingests_by_scan is presumably a mock injected by a patch decorator -- confirm
        ingests_by_scan.return_value = []

        candidates = [Ingest(file_name=name) for name in ('test1', 'test2')]
        deduplicated = S3Scanner._deduplicate_ingest_list(None, candidates)

        self.assertItemsEqual(candidates, deduplicated)
Example #4
0
    def test_deduplicate_ingest_list_with_duplicate_file_names(
            self, ingests_by_scan):
        """Tests calling S3Scanner._deduplicate_ingest_list() with duplicates

        Two ingests sharing the file name 'test1' are passed in and exactly one ingest with that name is
        expected back.
        """

        # No previously existing ingests are reported, so only the in-list duplicate is exercised
        ingests_by_scan.return_value = []

        ingests = [Ingest(file_name='test1'), Ingest(file_name='test1')]
        final_ingests = S3Scanner._deduplicate_ingest_list(None, ingests)

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(final_ingests), 1)
        self.assertEqual(final_ingests[0].file_name, 'test1')
Example #5
0
    def test_deduplicate_ingest_list_with_existing_no_other_dups(
            self, ingests_by_scan):
        """Tests calling S3Scanner._deduplicate_ingest_list() with existing and no other dups

        An ingest named 'test1' is reported as already existing, so only the 'test2' ingest is expected back.
        """

        ingests_by_scan.return_value = [Ingest(file_name='test1')]

        ingests = [Ingest(file_name='test1'), Ingest(file_name='test2')]
        final_ingests = S3Scanner._deduplicate_ingest_list(None, ingests)

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(len(final_ingests), 1)
        self.assertEqual(final_ingests[0].file_name, 'test2')
Example #6
0
    def test_tags(self):
        '''Verifies get_data_type_tags() returns each tag of a comma-separated data_type'''

        tagged_ingest = Ingest(data_type='A,B,c')

        self.assertSetEqual(tagged_ingest.get_data_type_tags(), {'A', 'B', 'c'})
Example #7
0
    def test_tags(self):
        """Verifies get_data_type_tags() returns the tags given through data_type_tags"""

        expected_tags = {'A', 'B', 'c'}
        tagged_ingest = Ingest(data_type_tags=['A', 'B', 'c'])

        self.assertSetEqual(tagged_ingest.get_data_type_tags(), expected_tags)
Example #8
0
    def test_same_tag(self):
        """Verifies that adding the same tag twice through add_data_type_tag() leaves a single tag"""

        ingest = Ingest()
        for _ in range(2):
            ingest.add_data_type_tag('Hello1')

        self.assertSetEqual(ingest.get_data_type_tags(), {'Hello1'})
Example #9
0
    def test_tags(self):
        '''Checks the tag set produced by get_data_type_tags() for the data_type "A,B,c"'''

        expected_tags = set(['A', 'B', 'c'])

        actual_tags = Ingest(data_type='A,B,c').get_data_type_tags()

        self.assertSetEqual(actual_tags, expected_tags)
Example #10
0
    def test_invalid(self):
        """Tests calling add_data_type_tag() with invalid tags

        Each invalid tag is expected to raise InvalidDataTypeTag.
        """

        ingest = Ingest()

        self.assertRaises(InvalidDataTypeTag, ingest.add_data_type_tag, 'my.invalid.tag')
        # Raw string keeps the backslashes literal; in a plain literal '\t' is a tab and '\i' is an invalid
        # escape sequence
        self.assertRaises(InvalidDataTypeTag, ingest.add_data_type_tag, r'my\invalid\tag!')
Example #11
0
    def _create_ingest(self, file_name):
        """Builds an unsaved ingest model for the given file name. Saving it to the database is left to the caller.

        :param file_name: The name of the file being ingested
        :type file_name: string
        :returns: The new ingest model
        :rtype: :class:`ingest.models.Ingest`
        """

        new_ingest = Ingest()
        new_ingest.file_name = file_name
        new_ingest.strike_id = self.strike_id
        new_ingest.media_type = get_media_type(file_name)
        new_ingest.workspace = self._monitored_workspace

        workspace_name = new_ingest.workspace.name
        logger.info('New file on %s: %s', workspace_name, file_name)
        return new_ingest
Example #12
0
    def test_valid(self):
        '''Verifies that valid tags added with add_data_type_tag() are reported by get_data_type_tags()'''

        ingest = Ingest()
        for valid_tag in ('Hello1', 'foo_BAR'):
            ingest.add_data_type_tag(valid_tag)

        self.assertSetEqual(ingest.get_data_type_tags(), {'Hello1', 'foo_BAR'})
Example #13
0
    def test_valid(self):
        '''Checks that add_data_type_tag() accepts the valid tags "Hello1" and "foo_BAR"'''

        expected_tags = set(['Hello1', 'foo_BAR'])

        ingest = Ingest()
        ingest.add_data_type_tag('Hello1')
        ingest.add_data_type_tag('foo_BAR')
        actual_tags = ingest.get_data_type_tags()

        self.assertSetEqual(actual_tags, expected_tags)
Example #14
0
    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method migrates existing data files into scale. It creates an ingest record for every file returned
        by generate_file_list() and, unless the ``no_commit`` option is set, saves each record, creates its source
        file and hands that source file to the ingest trigger handler.

        :returns: False when the workspace or workspace_path option is missing, otherwise None
        """
        logger.info(u'Command starting: migratedata')

        workspace, workspace_path, local_path, data_types = None, None, None, []
        if options['workspace'] is not None and options[
                'workspace_path'] is not None:
            workspace, workspace_path = options['workspace'], options[
                'workspace_path']
            # The workspace option is matched by name first and treated as a numeric id otherwise
            tmp = Workspace.objects.filter(name=workspace)
            if tmp.count() > 0:
                workspace = tmp.first()
            else:
                workspace = Workspace.objects.get(id=int(workspace))
        else:
            logger.error('Must specify workspace and workspace-path.')
            return False
        if options['data_type'] is not None:
            data_types.extend(options['data_type'])

        mnt_dirs = None
        if options['local_path'] is not None:
            local_path = options['local_path']
        else:  # mount
            mnt_dirs = "/tmp", tempfile.mkdtemp()
            workspace.setup_download_dir(*mnt_dirs)
            local_path = os.path.join(mnt_dirs[1], workspace_path)

        try:
            logger.info("Ingesting files from %s/%s", workspace.name,
                        workspace_path)
            filenames = self.generate_file_list(local_path, options['include'],
                                                options['exclude'])
            logger.info("Found %d files", len(filenames))

            # prepare for ingest ala strike
            ingest_records = {}
            for filename in filenames:
                logger.info("Generating ingest record for %s", filename)
                ingest = Ingest()
                ingest.file_name = os.path.basename(filename)
                ingest.file_path = os.path.join(
                    workspace_path, os.path.relpath(filename, local_path))
                # The file's last access time stands in for the transfer start time
                ingest.transfer_started = datetime.utcfromtimestamp(
                    os.path.getatime(filename))
                ingest.file_size = ingest.bytes_transferred = os.path.getsize(
                    filename)
                ingest.transfer_ended = timezone.now()
                ingest.media_type = get_media_type(filename)
                ingest.workspace = workspace
                for data_type in data_types:
                    ingest.add_data_type_tag(data_type)
                ingest.status = 'TRANSFERRED'
                if options['no_commit']:
                    # Dry run: log the serialized record instead of saving it
                    s = IngestDetailsSerializer()
                    logger.info(s.to_representation(ingest))
                else:
                    ingest.save()
                    ingest_records[filename] = ingest.id
            logger.info("Ingests records created")

            # start ingest tasks for all the files
            if not options['no_commit']:
                logger.info("Starting ingest tasks")
                for filename in filenames:
                    ingest = Ingest.objects.get(id=ingest_records[filename])
                    logger.info("Processing ingest %s", ingest.file_name)
                    with transaction.atomic():
                        ingest.ingest_started = timezone.now()
                        sf = ingest.source_file = SourceFile.create()
                        sf.update_uuid(ingest.file_name)
                        for tag in ingest.get_data_type_tags():
                            sf.add_data_type_tag(tag)
                        sf.media_type = ingest.media_type
                        sf.file_name = ingest.file_name
                        sf.file_size = ingest.file_size
                        sf.file_path = ingest.file_path
                        sf.workspace = workspace
                        sf.is_deleted = False
                        sf.deleted = None
                        sf.save()
                        sf.set_countries()
                        sf.save()
                        ingest.status = 'INGESTED'
                        ingest.ingest_ended = timezone.now()
                        ingest.source_file = sf
                        ingest.save()
                        IngestTriggerHandler().process_ingested_source_file(
                            ingest.source_file, ingest.ingest_ended)

            logger.info(
                "Ingests processed, monitor the queue for triggered jobs.")
        finally:
            # Always run the download dir cleanup for the directories set up above, even when ingesting fails
            if mnt_dirs is not None:
                workspace.cleanup_download_dir(*mnt_dirs)

        logger.info(u'Command completed: migratedata')
Example #15
0
    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method migrates existing data files into scale. It creates an ingest record for every file returned
        by generate_file_list() and, unless the ``no_commit`` option is set, saves each record, creates its source
        file and hands that source file to the ingest trigger handler.

        :returns: False when the workspace or workspace_path option is missing, otherwise None
        """
        logger.info(u'Command starting: migratedata')

        workspace, workspace_path, local_path, data_types = None, None, None, []
        if options['workspace'] is not None and options['workspace_path'] is not None:
            workspace, workspace_path = options['workspace'], options['workspace_path']
            # The workspace option is matched by name first and treated as a numeric id otherwise
            tmp = Workspace.objects.filter(name=workspace)
            if tmp.count() > 0:
                workspace = tmp.first()
            else:
                workspace = Workspace.objects.get(id=int(workspace))
        else:
            logger.error('Must specify workspace and workspace-path.')
            return False
        if options['data_type'] is not None:
            data_types.extend(options['data_type'])

        mnt_dirs = None
        if options['local_path'] is not None:
            local_path = options['local_path']
        else:  # mount
            mnt_dirs = "/tmp", tempfile.mkdtemp()
            workspace.setup_download_dir(*mnt_dirs)
            local_path = os.path.join(mnt_dirs[1], workspace_path)

        try:
            logger.info("Ingesting files from %s/%s", workspace.name, workspace_path)
            filenames = self.generate_file_list(local_path, options['include'], options['exclude'])
            logger.info("Found %d files", len(filenames))

            # prepare for ingest ala strike
            ingest_records = {}
            for filename in filenames:
                logger.info("Generating ingest record for %s", filename)
                ingest = Ingest()
                ingest.file_name = os.path.basename(filename)
                ingest.transfer_path = filename
                ingest.file_path = os.path.join(workspace_path, os.path.relpath(filename, local_path))
                # The file's last access time stands in for the transfer start time
                ingest.transfer_started = datetime.utcfromtimestamp(os.path.getatime(filename))
                ingest.file_size = ingest.bytes_transferred = os.path.getsize(filename)
                ingest.transfer_ended = datetime.utcnow()
                ingest.media_type = get_media_type(filename)
                ingest.workspace = workspace
                for data_type in data_types:
                    ingest.add_data_type_tag(data_type)
                ingest.status = 'TRANSFERRED'
                if options['no_commit']:
                    # Dry run: log the serialized record instead of saving it
                    s = IngestDetailsSerializer()
                    logger.info(s.to_representation(ingest))
                else:
                    ingest.save()
                    ingest_records[filename] = ingest.id
            logger.info("Ingests records created")

            # start ingest tasks for all the files
            if not options['no_commit']:
                logger.info("Starting ingest tasks")
                for filename in filenames:
                    ingest = Ingest.objects.get(id=ingest_records[filename])
                    logger.info("Processing ingest %s", ingest.file_name)
                    with transaction.atomic():
                        ingest.ingest_started = datetime.utcnow()
                        sf = ingest.source_file = SourceFile()
                        sf.update_uuid(ingest.file_name)
                        for tag in ingest.get_data_type_tags():
                            sf.add_data_type_tag(tag)
                        sf.media_type = ingest.media_type
                        sf.file_name = ingest.file_name
                        sf.file_size = ingest.file_size
                        sf.file_path = ingest.file_path
                        sf.workspace = workspace
                        sf.is_deleted = False
                        sf.deleted = None
                        sf.save()
                        sf.set_countries()
                        sf.save()
                        ingest.status = 'INGESTED'
                        ingest.ingest_ended = datetime.utcnow()
                        ingest.source_file = sf
                        ingest.save()
                        IngestTriggerHandler().process_ingested_source_file(ingest.source_file, ingest.ingest_ended)

            logger.info("Ingests processed, monitor the queue for triggered jobs.")
        finally:
            # Always run the download dir cleanup for the directories set up above, even when ingesting fails
            if mnt_dirs is not None:
                workspace.cleanup_download_dir(*mnt_dirs)

        logger.info(u'Command completed: migratedata')
Example #16
0
    def _process_file(self, file_name, ingest):
        '''Processes the given file in the Strike directory. The file_name
        argument represents a file in the Strike directory to process. If
        file_name is None, then the ingest argument represents an ongoing
        transfer where the file is unexpectedly not in the Strike directory.
        If file_name is not None and ingest is None, then this is a
        new transfer without an ingest record yet. If both arguments are None
        an exception is thrown.

        :param file_name: The name of the file to process (possibly None)
        :type file_name: str
        :param ingest: The ingest model for the file (possibly None)
        :type ingest: :class:`ingest.models.Ingest`
        :raises Exception: If both arguments are None, or if the ingest ends up with a status other than
            TRANSFERRING or TRANSFERRED
        '''
        if file_name is None and ingest is None:
            raise Exception('Nothing for Strike to process')
        if file_name is None:
            file_name = ingest.file_name
        file_path = os.path.join(self.strike_dir, file_name)
        final_name = self._final_filename(file_name)

        # Create ingest model for new transfer
        if ingest is None:
            msg = 'New file %s has arrived, creating ingest for %s'
            logger.info(msg, file_path, final_name)
            ingest = Ingest()
            # Ingest model should record the actual name of the file (no
            # temporary suffix)
            ingest.file_name = final_name
            ingest.strike_id = self.strike_id
            # TODO: investigate better way to get start time of transfer
            last_access = os.path.getatime(file_path)
            ingest.transfer_path = os.path.join(self.strike_dir, final_name)
            ingest.transfer_started = datetime.utcfromtimestamp(last_access)

        if ingest.status == 'TRANSFERRING':
            # Ensure that file is still in Strike dir as expected. This has to
            # run before the size lookup because os.path.getsize() raises
            # OSError for a missing file, which would skip this handling
            if not os.path.exists(file_path):
                msg = '%s was being transferred, but the file is now lost'
                logger.error(msg, file_path)
                ingest.status = 'ERRORED'
                ingest.save()
                logger.info('Ingest for %s marked as ERRORED', final_name)
                return

            # Update bytes transferred
            size = os.path.getsize(file_path)
            ingest.bytes_transferred = size

            if self._is_still_transferring(file_name):
                # Update with current progress of the transfer
                ingest.save()
                logger.info('%s is still transferring, progress updated', file_path)
            else:
                # Transfer is complete, will move on to next section
                self._complete_transfer(ingest, size)

        if ingest.status == 'TRANSFERRED':
            if ingest.ingest_path:
                self._prepare_file_for_ingest(ingest)
            else:
                self._defer_file(ingest)
        elif ingest.status != 'TRANSFERRING':
            # Interpolate the status so the exception message is readable
            msg = 'Strike not expecting to process file with status %s'
            raise Exception(msg % ingest.status)
Example #17
0
    def _process_file(self, file_name, ingest):
        '''Processes the given file in the Strike directory. The file_name
        argument represents a file in the Strike directory to process. If
        file_name is None, then the ingest argument represents an ongoing
        transfer where the file is unexpectedly not in the Strike directory.
        If file_name is not None and ingest is None, then this is a
        new transfer without an ingest record yet. If both arguments are None
        an exception is thrown.

        :param file_name: The name of the file to process (possibly None)
        :type file_name: str
        :param ingest: The ingest model for the file (possibly None)
        :type ingest: :class:`ingest.models.Ingest`
        :raises Exception: If both arguments are None, or if the ingest ends up with a status other than
            TRANSFERRING or TRANSFERRED
        '''
        if file_name is None and ingest is None:
            raise Exception('Nothing for Strike to process')
        if file_name is None:
            file_name = ingest.file_name
        file_path = os.path.join(self.strike_dir, file_name)
        final_name = self._final_filename(file_name)

        # Create ingest model for new transfer
        if ingest is None:
            msg = 'New file %s has arrived, creating ingest for %s'
            logger.info(msg, file_path, final_name)
            ingest = Ingest()
            # Ingest model should record the actual name of the file (no
            # temporary suffix)
            ingest.file_name = final_name
            ingest.strike_id = self.strike_id
            # TODO: investigate better way to get start time of transfer
            last_access = os.path.getatime(file_path)
            ingest.transfer_path = os.path.join(self.strike_dir, final_name)
            ingest.transfer_started = datetime.utcfromtimestamp(last_access)

        if ingest.status == 'TRANSFERRING':
            # Ensure that file is still in Strike dir as expected. This has to
            # run before the size lookup because os.path.getsize() raises
            # OSError for a missing file, which would skip this handling
            if not os.path.exists(file_path):
                msg = '%s was being transferred, but the file is now lost'
                logger.error(msg, file_path)
                ingest.status = 'ERRORED'
                ingest.save()
                logger.info('Ingest for %s marked as ERRORED', final_name)
                return

            # Update bytes transferred
            size = os.path.getsize(file_path)
            ingest.bytes_transferred = size

            if self._is_still_transferring(file_name):
                # Update with current progress of the transfer
                ingest.save()
                logger.info('%s is still transferring, progress updated', file_path)
            else:
                # Transfer is complete, will move on to next section
                self._complete_transfer(ingest, size)

        if ingest.status == 'TRANSFERRED':
            if ingest.ingest_path:
                self._prepare_file_for_ingest(ingest)
            else:
                self._defer_file(ingest)
        elif ingest.status != 'TRANSFERRING':
            # Interpolate the status so the exception message is readable
            msg = 'Strike not expecting to process file with status %s'
            raise Exception(msg % ingest.status)