Beispiel #1
0
    def enrich_item(self, enrichments, object_id, combined_index_doc, doc):
        """Enriches the media objects referenced in a single item.

        First, a media item will be retrieved from the source, then the
        registered and configured tasks will run. In case fetching the
        item fails, enrichment of the media item will be skipped. In case
        a specific media enrichment task fails, only that task is
        skipped, which means that we move on to the next task.

        :param enrichments: dict that receives the enrichment results.
        :param object_id: identifier of the object being enriched.
        :param combined_index_doc: combined-index representation of the item.
        :param doc: source document; must contain a non-empty 'media_urls'
            list of dicts with at least 'url' and 'original_url' keys.
        :raises SkipEnrichment: when the document has no 'media_urls'.
        :returns: ``enrichments`` with a 'media_urls' key added.
        """

        if not doc.get('media_urls', []):
            raise SkipEnrichment('No "media_urls" in document.')

        self.setup_http_session()

        # Check the settings to see if media should be fetched partially
        partial_fetch = self.enricher_settings.get('partial_media_fetch', False)

        media_urls_enrichments = []
        for media_item in doc['media_urls']:
            media_item_enrichment = {}

            content_type, content_length, media_file = self.fetch_media(
                media_item['original_url'],
                partial_fetch
            )

            # Make sure the fetched file is always closed, even when a
            # task raises something other than UnsupportedContentType.
            try:
                for task in self.enricher_settings['tasks']:
                    # Seek to the beginning of the file before starting a task
                    media_file.seek(0)
                    try:
                        self.available_tasks[task](media_item, content_type,
                                                   media_file,
                                                   media_item_enrichment,
                                                   object_id,
                                                   combined_index_doc,
                                                   doc)
                    except UnsupportedContentType:
                        # Only this task is skipped; move on to the next one.
                        log.debug('Skipping media enrichment task %s, '
                                  'content-type %s (object_id: %s, url %s) is not '
                                  'supported.' % (task, content_type, object_id,
                                                  media_item['original_url']))

                media_item_enrichment['url'] = media_item['url']
                media_item_enrichment['original_url'] = media_item['original_url']
                media_item_enrichment['content_type'] = content_type
                media_item_enrichment['size_in_bytes'] = content_length
            finally:
                media_file.close()

            media_urls_enrichments.append(media_item_enrichment)

        enrichments['media_urls'] = media_urls_enrichments

        return enrichments
Beispiel #2
0
    def enrich_item(self, item):
        """Enriches the media objects referenced in a single item.

        First, a media item will be retrieved from the source, then the
        registered and configured tasks will run. In case fetching the
        item fails, enrichment of the media item will be skipped. In case
        a specific media enrichment task fails, only that task is
        skipped, which means that we move on to the next task.

        :param item: model object carrying at least ``identifier_url`` and
            ``original_url``; enrichment results are set as attributes and
            the item is saved via ``item.db.save``.
        :raises Exception: when the item has no ``identifier_url``.
        :raises SkipEnrichment: when fetching the media item fails.
        """

        try:
            identifier = strip_scheme(item.identifier_url)
        except AttributeError:
            # Interpolate explicitly: passing the item as a second argument
            # (logging style) would leave the '%s' placeholder unexpanded.
            raise Exception('No identifier_url for item: %s' % item)

        # date_modified is optional on the item model.
        try:
            date_modified = item.date_modified
        except AttributeError:
            date_modified = None

        try:
            resource = self.fetch(
                item.original_url,
                identifier,
                date_modified,
            )
        except requests.HTTPError as e:
            raise SkipEnrichment(e)

        item.url = '%s/%s' % (RESOLVER_BASE_URL, urllib.quote(identifier))
        item.content_type = resource.content_type
        item.size_in_bytes = resource.file_size

        # A single task may be configured as a bare string; normalize to list.
        enrich_tasks = item.enricher_task
        if isinstance(enrich_tasks, basestring):
            enrich_tasks = [item.enricher_task]

        # The enricher tasks will be executed in the specified order; make
        # sure the fetched file is closed even when a task raises.
        try:
            for task in enrich_tasks:
                # Seek to the beginning of the file before starting a task
                resource.media_file.seek(0)
                self.available_tasks[task](self.source_definition).enrich_item(
                    item, resource.media_file)
        finally:
            resource.media_file.close()

        item.db.save(item)
Beispiel #3
0
    def enrich_item(self, item):
        """Enriches the media objects referenced in a single item.

        First, a media item will be retrieved from the source, then the
        registered and configured tasks will run. In case fetching the
        item fails, enrichment of the media item will be skipped. In case
        a specific media enrichment task fails, only that task is
        skipped, which means that we move on to the next task.

        A previously-enriched text extraction is reused from the
        'ori-enriched' GCS bucket when available; otherwise the original
        media is fetched, parsed with ``file_parser`` and the result is
        cached back to the bucket.

        :param item: model object carrying at least ``identifier_url`` and
            ``original_url``; enrichment results are set as attributes and
            the item is saved via ``item.db.save``.
        :raises Exception: when the item has no ``identifier_url``.
        :raises SkipEnrichment: when fetching the media item fails.
        """

        try:
            identifier = strip_scheme(item.identifier_url)
        except AttributeError:
            # Interpolate explicitly: passing the item as a second argument
            # (logging style) would leave the '%s' placeholder unexpanded.
            raise Exception('No identifier_url for item: %s' % item)

        # date_modified is optional on the item model.
        try:
            date_modified = item.date_modified
        except AttributeError:
            date_modified = None

        ori_enriched = GCSCachingMixin.factory('ori-enriched')
        if ori_enriched.exists(identifier):
            resource = ori_enriched.download_cache(identifier)
            try:
                item.text = json.load(resource.media_file)['data']

                # Adding the same text again for Elastic nesting
                item.text_pages = [{
                    'text': text,
                    'page_number': i
                } for i, text in enumerate(item.text, start=1) if text]
            except (ValueError, KeyError):
                # No json could be decoded or data not found, pass and parse again
                pass
            finally:
                # Close the cached file handle in all cases.
                resource.media_file.close()

        if not hasattr(item, 'text') or not item.text:
            try:
                resource = GCSCachingMixin.factory('ori-static').fetch(
                    item.original_url,
                    identifier,
                    date_modified,
                )
            except requests.HTTPError as e:
                raise SkipEnrichment(e)

            item.url = '%s/%s' % (RESOLVER_BASE_URL, urllib.quote(identifier))
            item.content_type = resource.content_type
            item.size_in_bytes = resource.file_size

            # Make sure file_object is actually on the disk for pdf parsing
            temporary_file = NamedTemporaryFile(delete=True)
            try:
                temporary_file.write(resource.read())
                temporary_file.seek(0, 0)

                if os.path.exists(temporary_file.name):
                    path = os.path.realpath(temporary_file.name)
                    item.text = file_parser(path, max_pages=100)
            finally:
                # delete=True removes the file on close; guard against
                # file_parser raising so the temp file never lingers.
                temporary_file.close()

            if hasattr(item, 'text') and item.text:
                # Adding the same text again for Elastic nesting
                item.text_pages = [{
                    'text': text,
                    'page_number': i
                } for i, text in enumerate(item.text, start=1) if text]

                # Save the enriched version to the ori-enriched bucket
                data = json.dumps({
                    'data': item.text,
                    'pages': len(item.text),
                })
                ori_enriched.upload(identifier,
                                    data,
                                    content_type='application/json')

        # A single task may be configured as a bare string; normalize to list.
        enrich_tasks = item.enricher_task
        if isinstance(enrich_tasks, basestring):
            enrich_tasks = [item.enricher_task]

        # The enricher tasks will be executed in the specified order
        for task in enrich_tasks:
            self.available_tasks[task](
                self.source_definition).enrich_item(item)

        item.db.save(item)