Beispiel #1
0
    def parse_item(self, image_path):
        """Build a picture item from the image stored at ``image_path``.

        The file is put into ``app.media``, its file metadata is set on the
        item, the IPTC metadata read from the file is handed to
        ``self.parse_meta`` and renditions are generated with
        ``no_custom_crops=True``.

        :param image_path: path of the image file to read
        :return: the new item dict, including its ``renditions``
        """
        image_name = os.path.basename(image_path)
        guessed_type = mimetypes.guess_type(image_path)[0]
        new_guid = utils.generate_guid(type=GUID_TAG)
        item = {
            'guid': new_guid,
            'uri': new_guid,
            config.VERSION: 1,
            ITEM_TYPE: CONTENT_TYPE.PICTURE,
            # the item keeps the type guessed from the file name
            'mimetype': guessed_type,
            'versioncreated': utcnow(),
        }

        with open(image_path, 'rb') as stream:
            _, media_type, file_meta = process_file_from_stream(stream, content_type=guessed_type)

            # rewind before each consumer so it reads the stream from the start
            stream.seek(0)
            media_id = app.media.put(stream, filename=image_name, content_type=media_type, metadata=file_meta)
            filemeta.set_filemeta(item, file_meta)

            stream.seek(0)
            iptc_meta = get_meta_iptc(stream)
            stream.seek(0)
            self.parse_meta(item, iptc_meta)

            spec = get_renditions_spec(no_custom_crops=True)
            item['renditions'] = generate_renditions(
                stream, media_id, [media_id], 'image', media_type, spec, url_for_media)

        return item
    def on_create(self, docs):
        """Turn every uploaded file in ``docs`` into an item.

        Per doc: the uploaded file is fetched, metadata is set, renditions are
        generated and an ``upload`` activity is added. A doc without media
        aborts with 400. If building the item fails, the error is logged,
        ``delete_file_on_error`` is called for each id in the inserted list
        and the request aborts with 500.

        :param docs: documents about to be created
        """
        for doc in docs:
            has_media = 'media' in doc and doc['media'] is not None
            if not has_media:
                abort(400, description="No media found")

            media_file, content_type, metadata = self.get_file_from_document(doc)
            inserted = [doc['media']]
            file_type = content_type.split('/')[0]

            self._set_metadata(doc)

            try:
                doc[ITEM_TYPE] = self.type_av.get(file_type)
                spec = get_renditions_spec()
                doc['renditions'] = generate_renditions(
                    media_file, doc['media'], inserted, file_type,
                    content_type, spec, url_for_media)
                doc['mimetype'] = content_type
                set_filemeta(doc, metadata)

                add_activity(
                    'upload', 'uploaded media {{ name }}', 'archive',
                    item=doc,
                    name=doc.get('headline', doc.get('mimetype')),
                    renditions=doc.get('renditions'))
            except Exception as err:
                logger.exception(err)
                # roll back every file recorded in the inserted list
                for stored_id in inserted:
                    delete_file_on_error(doc, stored_id)
                abort(500)
    def setUp(self):
        """Store the fixture image in media and post a picture item for it.

        The item is kept on ``self.item`` and posted to the ``archive``
        resource service.
        """
        super().setUp()
        fixtures_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "fixtures")
        fixture_path = os.path.normpath(
            os.path.join(fixtures_dir, self.filename))
        guessed_type = mimetypes.guess_type(fixture_path)[0]
        item_guid = utils.generate_guid(type=GUID_TAG)
        self.item = {
            "guid": item_guid,
            "version": 1,
            "_id": item_guid,
            ITEM_TYPE: CONTENT_TYPE.PICTURE,
            # the item keeps the type guessed from the file name
            "mimetype": guessed_type,
            "versioncreated": datetime.now(),
        }

        with open(fixture_path, "rb") as stream:
            _, media_type, file_meta = process_file_from_stream(
                stream, content_type=guessed_type)
            # rewind before each consumer of the stream
            stream.seek(0)
            media_id = app.media.put(
                stream,
                filename=self.filename,
                content_type=media_type,
                metadata=file_meta,
            )
            filemeta.set_filemeta(self.item, file_meta)
            stream.seek(0)
            spec = get_renditions_spec()
            self.item["renditions"] = generate_renditions(
                stream, media_id, [media_id], "image", media_type, spec,
                url_for_media)

        get_resource_service("archive").post([self.item])
Beispiel #4
0
 def crop_and_store_file(self, doc, content, filename, content_type):
     """Crop the uploaded file if needed, store it and generate avatar renditions.

     On success ``doc`` gets its ``media``, ``mimetype``, file metadata and
     ``renditions`` set.

     :param doc: document being created; passed to ``crop_image`` and updated in place
     :param content: stream holding the uploaded file
     :param filename: file name passed to ``crop_image``
     :param content_type: content type passed to ``process_file_from_stream``
     :raises SuperdeskApiError: internal error when storing the file or
         generating the renditions fails
     """
     # retrieve file name and metadata from file
     file_name, content_type, metadata = process_file_from_stream(
         content, content_type=content_type)
     # crop the file if needed, can change the image size
     was_cropped, out = crop_image(content, filename, doc)
     # the length in metadata could be updated if it was cropped
     if was_cropped:
         file_name, content_type, metadata_after_cropped = process_file_from_stream(
             out, content_type=content_type)
         # cropping resets the metadata, so carry the new length over to the
         # metadata read before cropping
         metadata['length'] = metadata_after_cropped['length']

     # Ids of stored files to roll back on failure. The list must exist before
     # the try block: if storing the file fails, the handler below would
     # otherwise hit an unbound name and hide the real error.
     inserted = []
     try:
         logger.debug('Going to save media file with %s ', file_name)
         out.seek(0)
         file_id = app.media.put(out,
                                 filename=file_name,
                                 content_type=content_type,
                                 resource=self.datasource,
                                 metadata=metadata)
         # record the id right away so any later failure cleans the file up
         inserted.append(file_id)
         doc['media'] = file_id
         doc['mimetype'] = content_type
         set_filemeta(doc, decode_metadata(metadata))
         file_type = content_type.split('/')[0]
         rendition_spec = config.RENDITIONS['avatar']
         renditions = generate_renditions(out, file_id, inserted, file_type,
                                          content_type, rendition_spec,
                                          url_for_media)
         doc['renditions'] = renditions
     except Exception as err:
         for stored_id in inserted:
             delete_file_on_error(doc, stored_id)
         raise SuperdeskApiError.internalError(
             'Generating renditions failed', exception=err)
Beispiel #5
0
 def crop_and_store_file(self, doc, content, filename, content_type):
     """Crop the uploaded file if needed, store it and generate avatar renditions.

     On success ``doc`` gets its ``media``, ``mimetype``, file metadata and
     ``renditions`` set.

     :param doc: document being created; passed to ``crop_image`` and updated in place
     :param content: stream holding the uploaded file
     :param filename: file name passed to ``crop_image``
     :param content_type: content type passed to ``process_file_from_stream``
     :raises SuperdeskApiError: internal error when storing the file or
         generating the renditions fails; the original error is logged
     """
     # retrieve file name and metadata from file
     file_name, content_type, metadata = process_file_from_stream(content, content_type=content_type)
     # crop the file if needed, can change the image size
     was_cropped, out = crop_image(content, filename, doc)
     # the length in metadata could be updated if it was cropped
     if was_cropped:
         file_name, content_type, metadata_after_cropped = process_file_from_stream(out, content_type=content_type)
         # cropping resets the metadata, so carry the new length over to the
         # metadata read before cropping
         metadata['length'] = metadata_after_cropped['length']

     # Ids of stored files to roll back on failure. The list must exist before
     # the try block: if storing the file fails, the handler below would
     # otherwise hit an unbound name and hide the real error.
     inserted = []
     try:
         logger.debug('Going to save media file with %s ', file_name)
         out.seek(0)
         file_id = app.media.put(out, filename=file_name, content_type=content_type,
                                 resource=self.datasource, metadata=metadata)
         # record the id right away so any later failure cleans the file up
         inserted.append(file_id)
         doc['media'] = file_id
         doc['mimetype'] = content_type
         set_filemeta(doc, decode_metadata(metadata))
         file_type = content_type.split('/')[0]
         rendition_spec = config.RENDITIONS['avatar']
         renditions = generate_renditions(out, file_id, inserted, file_type,
                                          content_type, rendition_spec, url_for_media)
         doc['renditions'] = renditions
     except Exception as err:
         logger.exception(err)
         for stored_id in inserted:
             delete_file_on_error(doc, stored_id)
         raise SuperdeskApiError.internalError('Generating renditions failed')
    def setUp(self):
        """Upload the fixture image and post a matching picture item.

        The item is kept on ``self.item`` and posted to the ``archive``
        resource service.
        """
        super().setUp()
        here = os.path.dirname(os.path.realpath(__file__))
        fixture_path = os.path.normpath(os.path.join(here, 'fixtures', self.filename))
        guessed_type = mimetypes.guess_type(fixture_path)[0]
        item_guid = utils.generate_guid(type=GUID_TAG)
        self.item = {
            'guid': item_guid,
            'version': 1,
            '_id': item_guid,
            ITEM_TYPE: CONTENT_TYPE.PICTURE,
            # the item keeps the type guessed from the file name
            'mimetype': guessed_type,
            'versioncreated': datetime.now(),
        }

        with open(fixture_path, 'rb') as stream:
            _, media_type, file_meta = process_file_from_stream(stream, content_type=guessed_type)
            # rewind before each consumer of the stream
            stream.seek(0)
            media_id = app.media.put(
                stream,
                filename=self.filename,
                content_type=media_type,
                metadata=file_meta,
            )
            filemeta.set_filemeta(self.item, file_meta)
            stream.seek(0)
            spec = get_renditions_spec()
            self.item['renditions'] = generate_renditions(
                stream, media_id, [media_id], 'image', media_type, spec, url_for_media)

        get_resource_service('archive').post([self.item])
    def setUp(self):
        """Prepare ``self.item``: a picture item backed by the fixture image.

        The fixture file is stored through ``app.media``, renditions are
        generated for it and the item is posted to the ``archive`` service.
        """
        super().setUp()
        module_dir = os.path.dirname(os.path.realpath(__file__))
        picture_path = os.path.normpath(
            os.path.join(module_dir, 'fixtures', self.filename))
        mime_guess = mimetypes.guess_type(picture_path)[0]
        picture_guid = utils.generate_guid(type=GUID_TAG)
        self.item = {
            'guid': picture_guid,
            'version': 1,
            '_id': picture_guid,
            ITEM_TYPE: CONTENT_TYPE.PICTURE,
            # the item keeps the type guessed from the file name
            'mimetype': mime_guess,
            'versioncreated': datetime.now(),
        }

        with open(picture_path, 'rb') as picture:
            _, content_type, file_metadata = process_file_from_stream(
                picture, content_type=mime_guess)
            # rewind before each consumer of the stream
            picture.seek(0)
            stored_id = app.media.put(
                picture, filename=self.filename,
                content_type=content_type, metadata=file_metadata)
            filemeta.set_filemeta(self.item, file_metadata)
            picture.seek(0)
            spec = get_renditions_spec()
            self.item['renditions'] = generate_renditions(
                picture, stored_id, [stored_id], 'image',
                content_type, spec, url_for_media)

        archive_service = get_resource_service('archive')
        archive_service.post([self.item])
Beispiel #8
0
    def on_create(self, docs):
        """Create the item that belongs to each uploaded file.

        For every doc the uploaded file is loaded, metadata is set, renditions
        are generated and an ``upload`` activity is recorded. Missing media
        aborts with 400. A failure while building the item is logged, each id
        in the inserted list goes to ``delete_file_on_error`` and the request
        aborts with 500.

        :param docs: documents about to be created
        """
        for doc in docs:
            media_missing = not ('media' in doc and doc['media'] is not None)
            if media_missing:
                abort(400, description="No media found")

            upload, content_type, metadata = self.get_file_from_document(doc)
            inserted = [doc['media']]
            file_type = content_type.split('/')[0]

            self._set_metadata(doc)

            try:
                doc[ITEM_TYPE] = self.type_av.get(file_type)
                spec = get_renditions_spec()
                doc['renditions'] = generate_renditions(
                    upload, doc['media'], inserted, file_type, content_type,
                    spec, url_for_media)
                doc['mimetype'] = content_type
                set_filemeta(doc, metadata)

                activity_name = doc.get('headline', doc.get('mimetype'))
                add_activity('upload',
                             'uploaded media {{ name }}',
                             'archive',
                             item=doc,
                             name=activity_name,
                             renditions=doc.get('renditions'))
            except Exception as err:
                logger.exception(err)
                # roll back every file recorded in the inserted list
                for stored_id in inserted:
                    delete_file_on_error(doc, stored_id)
                abort(500)
    def on_create(self, docs):
        """Create corresponding item on file upload.

        Videos are uploaded to the video server when ``VIDEO_SERVER_ENABLE`` is
        set; every other file gets its renditions generated locally. A doc
        without media aborts with 400. If filling in the item fails, the error
        is logged, the files recorded for this doc are passed to
        ``delete_file_on_error``, a video uploaded for this doc is deleted from
        the video server and the request aborts with 500.

        :param docs: documents about to be created
        :raises SuperdeskApiError: when the video server cannot be reached
        """

        for doc in docs:
            if 'media' not in doc or doc['media'] is None:
                abort(400, description="No media found")
            # Rollback state for this doc only. Each branch below fills in just
            # one of the two, so both need a default here; otherwise the error
            # handler hits an unbound name or reuses values left over from a
            # previous doc.
            inserted = []
            res = None
            # check content type of video by python-magic
            content_type = magic.from_buffer(doc['media'].read(1024),
                                             mime=True)
            doc['media'].seek(0)
            file_type = content_type.split('/')[0]
            if file_type == 'video' and app.config.get("VIDEO_SERVER_ENABLE"):
                if not self.videoEditor.check_video_server():
                    raise SuperdeskApiError(
                        message="Cannot connect to videoserver",
                        status_code=500)
                # upload media to video server
                res, renditions, metadata = self.upload_file_to_video_server(
                    doc)
                # get thumbnails for timeline bar
                self.videoEditor.get_timeline_thumbnails(doc.get('media'), 40)
            else:
                file, content_type, metadata = self.get_file_from_document(doc)
                inserted = [doc['media']]
                # if no_custom_crops is set to False the custom crops are generated automatically on media upload
                # see (SDESK-4742)
                rendition_spec = get_renditions_spec(
                    no_custom_crops=app.config.get("NO_CUSTOM_CROPS"))
                with timer('archive:renditions'):
                    renditions = generate_renditions(file, doc['media'],
                                                     inserted, file_type,
                                                     content_type,
                                                     rendition_spec,
                                                     url_for_media)
            try:
                self._set_metadata(doc)
                doc[ITEM_TYPE] = self.type_av.get(file_type)
                doc[ITEM_STATE] = CONTENT_STATE.PROGRESS
                doc['renditions'] = renditions
                doc['mimetype'] = content_type
                set_filemeta(doc, metadata)
                add_activity('upload',
                             'uploaded media {{ name }}',
                             'archive',
                             item=doc,
                             name=doc.get('headline', doc.get('mimetype')),
                             renditions=doc.get('renditions'))
            except Exception as io:
                logger.exception(io)
                for file_id in inserted:
                    delete_file_on_error(doc, file_id)
                # only set when this doc went to the video server
                if res:
                    self.videoEditor.delete(res.get('_id'))
                abort(500)
Beispiel #10
0
    def on_create(self, docs):
        """Create corresponding item on file upload.

        Videos are uploaded to the video server when ``VIDEO_SERVER_ENABLED``
        is set; every other file gets its renditions generated locally. A doc
        without media aborts with 400. If filling in the item fails, the error
        is logged, the files recorded for this doc are passed to
        ``delete_file_on_error``, a video uploaded for this doc is deleted from
        the video server and the request aborts with 500.

        :param docs: documents about to be created
        """

        for doc in docs:
            if "media" not in doc or doc["media"] is None:
                abort(400, description="No media found")
            # Rollback state for this doc only. Each branch below fills in just
            # one of the two, so both need a default here; otherwise the error
            # handler hits an unbound name or reuses values left over from a
            # previous doc.
            inserted = []
            res = None
            # detect the content type from the uploaded stream, then rewind it
            content_type = app.media._get_mimetype(doc["media"])
            doc["media"].seek(0)
            file_type = content_type.split("/")[0]
            if file_type == "video" and app.config.get("VIDEO_SERVER_ENABLED"):
                # upload media to video server
                res, renditions, metadata = self.upload_file_to_video_server(
                    doc)
                # get thumbnails for timeline bar
                self.video_editor.create_timeline_thumbnails(
                    doc.get("media"), 60)
            else:
                file, content_type, metadata = self.get_file_from_document(doc)
                inserted = [doc["media"]]
                # if no_custom_crops is set to False the custom crops are generated automatically on media upload
                # see (SDESK-4742)
                rendition_spec = get_renditions_spec(
                    no_custom_crops=app.config.get("NO_CUSTOM_CROPS"))
                with timer("archive:renditions"):
                    renditions = generate_renditions(file, doc["media"],
                                                     inserted, file_type,
                                                     content_type,
                                                     rendition_spec,
                                                     url_for_media)
            try:
                self._set_metadata(doc)
                doc[ITEM_TYPE] = self.type_av.get(file_type)
                doc[ITEM_STATE] = CONTENT_STATE.PROGRESS
                doc["renditions"] = renditions
                doc["mimetype"] = content_type
                set_filemeta(doc, metadata)
                add_activity(
                    "upload",
                    "uploaded media {{ name }}",
                    "archive",
                    item=doc,
                    name=doc.get("headline", doc.get("mimetype")),
                    renditions=doc.get("renditions"),
                )
            except Exception as io:
                logger.exception(io)
                for file_id in inserted:
                    delete_file_on_error(doc, file_id)
                # only set when this doc went to the video server
                if res:
                    self.video_editor.delete(res.get("_id"))
                abort(500)
    def parse_item(self, image_path):
        """Build a picture item from the image stored at ``image_path``.

        The file is put into ``app.media``, renditions are generated with
        ``no_custom_crops=True`` and the IPTC metadata read from the file is
        copied onto the item according to ``IPTC_MAPPING``. When both the IPTC
        creation date and time are present they become ``firstcreated``.

        :param image_path: path of the image file to read
        :return: the new item dict
        """
        image_name = os.path.basename(image_path)
        guessed_type = mimetypes.guess_type(image_path)[0]
        item_guid = utils.generate_guid(type=GUID_TAG)
        item = {
            'guid': item_guid,
            config.VERSION: 1,
            config.ID_FIELD: item_guid,
            ITEM_TYPE: CONTENT_TYPE.PICTURE,
            # the item keeps the type guessed from the file name
            'mimetype': guessed_type,
            'versioncreated': datetime.now(),
        }

        with open(image_path, 'rb') as stream:
            _, media_type, file_meta = process_file_from_stream(
                stream, content_type=guessed_type)
            # rewind before each consumer of the stream
            stream.seek(0)
            media_id = app.media.put(
                stream,
                filename=image_name,
                content_type=media_type,
                metadata=file_meta,
            )
            filemeta.set_filemeta(item, file_meta)
            stream.seek(0)
            metadata = get_meta_iptc(stream)
            stream.seek(0)
            spec = get_renditions_spec(no_custom_crops=True)
            item['renditions'] = generate_renditions(
                stream, media_id, [media_id], 'image', media_type, spec,
                url_for_media)

        try:
            date_created = metadata[TAG.DATE_CREATED]
            time_created = metadata[TAG.TIME_CREATED]
        except KeyError:
            # without both values there is no creation timestamp to set
            pass
        else:
            # we format proper ISO 8601 date so we can parse it with dateutil
            year, month, day = date_created[0:4], date_created[4:6], date_created[6:8]
            hours, minutes, seconds = time_created[0:2], time_created[2:4], time_created[4:6]
            tz_sign, tz_hours, tz_minutes = time_created[6], time_created[7:9], time_created[9:]
            datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(
                year, month, day, hours, minutes, seconds,
                tz_sign, tz_hours, tz_minutes)
            item['firstcreated'] = dateutil.parser.parse(datetime_created)

        # now we map IPTC metadata to superdesk metadata
        for iptc_key, item_key in IPTC_MAPPING.items():
            try:
                value = metadata[iptc_key]
            except KeyError:
                continue
            item[item_key] = value
        return item
Beispiel #12
0
 def on_create(self, docs):
     """Stamp new docs and stringify selected file metadata of embed items.

     After the parent ``on_create`` ran, every doc is passed to
     ``update_dates_for`` and gets ``original_creator`` set to the current
     user's ``_id`` as a string. For docs whose ``item_type`` is ``embed``,
     ``doc['meta']`` is stored as file metadata; its ``version``, ``width`` and
     ``height`` entries are then converted to strings whenever
     ``get_filemeta`` returns a truthy value for them.

     :param docs: documents about to be created
     """
     super().on_create(docs)
     for doc in docs:
         update_dates_for(doc)
         doc['original_creator'] = str(get_user().get('_id'))

         is_embed = doc.get('item_type') and doc['item_type'] == 'embed'
         if not is_embed:
             continue

         meta = doc['meta']
         set_filemeta(doc, meta)
         for field in ('version', 'width', 'height'):
             if get_filemeta(doc, field):
                 meta[field] = str(meta.get(field))
    def parse_item(self, image_path):
        """Create a picture item for the image file at ``image_path``.

        The file is stored via ``app.media`` and renditions are generated with
        ``no_custom_crops=True``. IPTC metadata read from the file is mapped
        onto the item through ``IPTC_MAPPING``; if it holds both a creation
        date and a creation time, ``firstcreated`` is set from them.

        :param image_path: path of the image file to read
        :return: the new item dict
        """
        base_name = os.path.basename(image_path)
        mime_guess = mimetypes.guess_type(image_path)[0]
        picture_guid = utils.generate_guid(type=GUID_TAG)
        item = {
            'guid': picture_guid,
            config.VERSION: 1,
            config.ID_FIELD: picture_guid,
            ITEM_TYPE: CONTENT_TYPE.PICTURE,
            # the item keeps the type guessed from the file name
            'mimetype': mime_guess,
            'versioncreated': datetime.now(),
        }

        with open(image_path, 'rb') as picture:
            _, content_type, file_metadata = process_file_from_stream(
                picture, content_type=mime_guess)
            # rewind before each consumer of the stream
            picture.seek(0)
            stored_id = app.media.put(
                picture, filename=base_name,
                content_type=content_type, metadata=file_metadata)
            filemeta.set_filemeta(item, file_metadata)
            picture.seek(0)
            metadata = get_meta_iptc(picture)
            picture.seek(0)
            spec = get_renditions_spec(no_custom_crops=True)
            item['renditions'] = generate_renditions(
                picture, stored_id, [stored_id], 'image',
                content_type, spec, url_for_media)

        try:
            date_created = metadata[TAG.DATE_CREATED]
            time_created = metadata[TAG.TIME_CREATED]
        except KeyError:
            # without both values there is no creation timestamp to set
            pass
        else:
            # we format proper ISO 8601 date so we can parse it with dateutil
            date_parts = (date_created[0:4], date_created[4:6], date_created[6:8])
            clock_parts = (time_created[0:2], time_created[2:4], time_created[4:6])
            offset_parts = (time_created[6], time_created[7:9], time_created[9:])
            datetime_created = '{}-{}-{}T{}:{}:{}{}{}:{}'.format(
                *date_parts, *clock_parts, *offset_parts)
            item['firstcreated'] = dateutil.parser.parse(datetime_created)

        # now we map IPTC metadata to superdesk metadata
        for source_key, dest_key in IPTC_MAPPING.items():
            try:
                mapped_value = metadata[source_key]
            except KeyError:
                continue
            item[dest_key] = mapped_value
        return item
Beispiel #14
0
def update_renditions(item, href, old_item, request_kwargs=None):
    """Attach renditions to an ingested item.

    When ``old_item`` already references an uploaded original, its renditions,
    mimetype and file metadata are copied to ``item`` and nothing is
    downloaded. Otherwise the file behind ``href`` is downloaded, stored in
    media and renditions are generated for it.

    On any error the exception is logged, every id collected in the inserted
    list is passed to ``app.media.delete`` and the error is re-raised.

    :param item: parsed item from source, updated in place
    :param href: reference to original
    :param old_item: the item that we have already ingested, if it exists
    :param request_kwargs: passed on to ``download_file_from_url``
    :return: None
    """
    stored_ids = []
    try:
        # If there is an existing set of renditions we keep those
        if old_item:
            previous = old_item.get("renditions", {})
            original = previous.get("original", {})
            if original.get("media", {}):
                item["renditions"] = old_item["renditions"]
                for key in ("mimetype", "filemeta", "filemeta_json"):
                    item[key] = old_item.get(key)
                return

        content, filename, content_type = download_file_from_url(href, request_kwargs)
        file_type, _subtype = content_type.split("/")
        metadata = process_file(content, file_type)
        stored_ids.append(
            app.media.put(
                content,
                filename=filename,
                content_type=content_type,
                metadata=metadata,
            )
        )
        spec = get_renditions_spec()
        item["renditions"] = generate_renditions(
            content, stored_ids[0], stored_ids, file_type, content_type,
            spec, app.media.url_for_media)
        item["mimetype"] = content_type
        set_filemeta(item, metadata)
    except Exception as err:
        logger.exception(err)
        for stored_id in stored_ids:
            app.media.delete(stored_id)
        raise
    def on_create(self, docs):
        """Create the item that belongs to each uploaded file.

        For every doc the uploaded file is loaded, metadata is set, the item
        state is set to ``CONTENT_STATE.PROGRESS``, renditions are generated
        and an ``upload`` activity is recorded. Missing media aborts with 400.
        A failure while building the item is logged, each id in the inserted
        list goes to ``delete_file_on_error`` and the request aborts with 500.

        :param docs: documents about to be created
        """
        for doc in docs:
            has_media = 'media' in doc and doc['media'] is not None
            if not has_media:
                abort(400, description="No media found")

            upload, content_type, metadata = self.get_file_from_document(doc)
            inserted = [doc['media']]
            file_type = content_type.split('/')[0]

            self._set_metadata(doc)

            try:
                doc[ITEM_TYPE] = self.type_av.get(file_type)
                doc[ITEM_STATE] = CONTENT_STATE.PROGRESS
                # if no_custom_crops is set to False the custom crops are generated automatically on media upload
                # see (SDESK-4742)
                skip_custom_crops = app.config.get("NO_CUSTOM_CROPS")
                spec = get_renditions_spec(no_custom_crops=skip_custom_crops)
                with timer('archive:renditions'):
                    renditions = generate_renditions(
                        upload, doc['media'], inserted, file_type,
                        content_type, spec, url_for_media)
                doc['renditions'] = renditions
                doc['mimetype'] = content_type
                set_filemeta(doc, metadata)

                activity_name = doc.get('headline', doc.get('mimetype'))
                add_activity('upload',
                             'uploaded media {{ name }}',
                             'archive',
                             item=doc,
                             name=activity_name,
                             renditions=doc.get('renditions'))
            except Exception as err:
                logger.exception(err)
                # roll back every file recorded in the inserted list
                for stored_id in inserted:
                    delete_file_on_error(doc, stored_id)
                abort(500)
Beispiel #16
0
def update_renditions(item, href, old_item):
    """Attach renditions to an ingested item.

    When ``old_item`` already references an uploaded original, its renditions,
    mimetype and file metadata are copied to ``item`` and nothing is
    downloaded. Otherwise the file behind ``href`` is downloaded, stored in
    media and renditions are generated for it.

    On any error every id collected in the inserted list is passed to
    ``app.media.delete`` and the error is re-raised.

    :param item: parsed item from source, updated in place
    :param href: reference to original
    :param old_item: the item that we have already ingested, if it exists
    :return: None
    """
    stored_ids = []
    try:
        # If there is an existing set of renditions we keep those
        if old_item:
            previous = old_item.get('renditions', {})
            original = previous.get('original', {})
            if original.get('media', {}):
                item['renditions'] = old_item['renditions']
                for key in ('mimetype', 'filemeta', 'filemeta_json'):
                    item[key] = old_item.get(key)
                return

        content, filename, content_type = download_file_from_url(href)
        file_type, _subtype = content_type.split('/')
        metadata = process_file(content, file_type)
        stored_ids.append(app.media.put(content, filename, content_type, metadata))
        spec = get_renditions_spec()
        item['renditions'] = generate_renditions(
            content, stored_ids[0], stored_ids, file_type, content_type,
            spec, url_for_media)
        item['mimetype'] = content_type
        set_filemeta(item, metadata)
    except Exception:
        for stored_id in stored_ids:
            app.media.delete(stored_id)
        raise
def update_renditions(item, href, old_item):
    """Set the renditions of ``item``, preferring the ones already ingested.

    If ``old_item`` carries an ``original`` rendition with a media reference,
    its renditions, mimetype and file metadata are assigned to ``item``; this
    avoids repeatedly downloading the same image and leaving the media entries
    orphaned. In every other case the original is downloaded from ``href``,
    stored through ``app.media.put`` and renditions are generated from it.

    On failure the exception is logged, every media id collected so far is
    deleted and the exception is re-raised.

    :param item: parsed item from source, updated in place
    :param href: reference to original
    :param old_item: the item that we have already ingested, if it exists
    :return: None
    """
    created_ids = []
    try:
        if old_item:
            original = old_item.get('renditions', {}).get('original', {})
            # If there is an existing set of renditions we keep those
            if original.get('media', {}):
                item['renditions'] = old_item['renditions']
                for field in ('mimetype', 'filemeta', 'filemeta_json'):
                    item[field] = old_item.get(field)
                return

        content, filename, content_type = download_file_from_url(href)
        # unpacking into two names requires exactly one '/' in the content type
        file_type, _subtype = content_type.split('/')
        metadata = process_file(content, file_type)
        file_guid = app.media.put(content, filename, content_type, metadata)
        created_ids.append(file_guid)
        rendition_spec = get_renditions_spec()
        item['renditions'] = generate_renditions(
            content, file_guid, created_ids, file_type, content_type, rendition_spec, app.media.url_for_media
        )
        item['mimetype'] = content_type
        set_filemeta(item, metadata)
    except Exception as e:
        logger.exception(e)
        # roll back whatever has been stored before propagating the error
        for created_id in created_ids:
            app.media.delete(created_id)
        raise
 def test_get_set_filemeta(self):
     """Metadata stored with set_filemeta is readable by key and as a whole."""
     target = {}
     set_filemeta(target, {'foo': 'bar'})
     single_value = get_filemeta(target, 'foo')
     self.assertEqual('bar', single_value)
     everything = get_filemeta(target)
     self.assertEqual({'foo': 'bar'}, everything)
Beispiel #19
0
    def parse(self, data, provider=None):
        """Parse fetched email data into a list of ingest items.

        Every tuple found in ``data`` is treated as holding a raw message
        (bytes) at index 1. The message yields one text item for the body and
        one picture item per image attachment that has a filename and whose
        detected content type is not gif/png. When at least one picture is
        extracted, a composite item referencing the text item and all pictures
        is added as well.

        If the provider config has ``formatted`` set, parsing is delegated to
        ``self._parse_formatted_email``.

        :param data: iterable of response parts; non-tuple entries are skipped
        :param provider: ingest provider, may be None
        :return: list of items: pictures first, then the composite item (if
            any), then the text item
        :raises IngestEmailError: created by ``IngestEmailError.emailParseError``
            from any exception raised while parsing
        """
        # ``provider`` defaults to None, so the config lookup must tolerate it
        provider_config = provider.get("config", {}) if provider else {}
        # If the channel is configured to process structured email generated from a google form
        if provider_config.get("formatted", False):
            return self._parse_formatted_email(data, provider)
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT, "versioncreated": utcnow()}

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item["headline"] = self.parse_header(msg["subject"])
                field_from = self.parse_header(msg["from"])
                item["original_source"] = field_from
                try:
                    addresses = email_regex.findall(field_from)
                    if addresses:
                        user = get_resource_service("users").get_user_by_email(addresses[0])
                        item["original_creator"] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    # senders without a user account are accepted; the item
                    # simply gets no original_creator
                    pass
                item["guid"] = msg["Message-ID"]
                date_tuple = email.utils.parsedate_tz(msg["Date"])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    item["firstcreated"] = dt.replace(tzinfo=timezone("utc"))

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    part_type = part.get_content_type()

                    if part_type == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            # if we don't know the charset just have a go!
                            text_body = body.decode() if charset is None else body.decode(charset)
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}"
                                .format(item["headline"], field_from, ex))
                        continue

                    if part_type == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            decoded = body.decode() if charset is None else body.decode(charset)
                            # assign only the sanitized result, so a failing
                            # sanitizer can never leave raw markup in html_body
                            html_body = sanitize_html(decoded)
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}"
                                .format(item["headline"], field_from, ex))
                        continue

                    if part.get_content_maintype() == "multipart":
                        continue
                    if part.get("Content-Disposition") is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != "image":
                        continue

                    attachment_name = part.get_filename()
                    if not attachment_name:
                        continue

                    content = io.BytesIO(part.get_payload(decode=True))
                    _, content_type, metadata = process_file_from_stream(content, part_type)
                    if content_type in ("image/gif", "image/png"):
                        continue
                    content.seek(0)
                    image_id = self.parser_app.media.put(
                        content,
                        filename=attachment_name,
                        content_type=content_type,
                        metadata=metadata)
                    renditions = {"baseImage": {"href": image_id}}

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = {
                            ITEM_TYPE: CONTENT_TYPE.COMPOSITE,
                            "guid": generate_guid(type=GUID_TAG),
                            "versioncreated": utcnow(),
                            "groups": [],
                            "headline": item["headline"],
                            "original_source": item["original_source"],
                        }
                        if "original_creator" in item:
                            comp_item["original_creator"] = item["original_creator"]

                        # create a reference to the item that stores the body of the email
                        item_ref = {
                            "guid": item["guid"],
                            "residRef": item["guid"],
                            "headline": item["headline"],
                            "location": "ingest",
                            "itemClass": "icls:text",
                            "original_source": item["original_source"],
                        }
                        if "original_creator" in item:
                            item_ref["original_creator"] = item["original_creator"]
                        refs.append(item_ref)

                    media_item = {
                        "guid": generate_guid(type=GUID_TAG),
                        "versioncreated": utcnow(),
                        ITEM_TYPE: CONTENT_TYPE.PICTURE,
                        "renditions": renditions,
                        "mimetype": content_type,
                    }
                    set_filemeta(media_item, metadata)
                    media_item["slugline"] = attachment_name
                    # only set when a text part was decoded before this attachment
                    if text_body is not None:
                        media_item["body_html"] = text_body
                    media_item["headline"] = item["headline"]
                    media_item["original_source"] = item["original_source"]
                    if "original_creator" in item:
                        media_item["original_creator"] = item["original_creator"]
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    media_ref = {
                        "guid": media_item["guid"],
                        "residRef": media_item["guid"],
                        "headline": attachment_name,
                        "location": "ingest",
                        "itemClass": "icls:picture",
                        "original_source": item["original_source"],
                    }
                    if "original_creator" in item:
                        media_ref["original_creator"] = item["original_creator"]
                    refs.append(media_ref)

            if html_body:
                item["body_html"] = html_body
            else:
                # text_body is still None when no text part could be decoded;
                # fall back to an empty body instead of failing on None.
                # NOTE(review): the plain text is not HTML-escaped here - confirm
                # that consumers of body_html expect that.
                item["body_html"] = "<pre>" + (text_body or "") + "</pre>"
                item[FORMAT] = FORMATS.PRESERVED

            # if there is composite item then add the main group and references
            if comp_item:
                comp_item["groups"].append({
                    "refs": [{"idRef": "main"}],
                    "id": "root",
                    "role": "grpRole:NEP",
                })
                comp_item["groups"].append({"refs": refs, "id": "main", "role": "grpRole:Main"})

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Beispiel #20
0
    def parse(self, data, provider=None):
        """Parse fetched email data into a list of ingest items.

        Every tuple found in ``data`` is treated as holding a raw message
        (bytes) at index 1. The message yields one text item for the body and
        one picture item per image attachment that has a filename and whose
        detected content type is not gif/png. When at least one picture is
        extracted, a composite item referencing the text item and all pictures
        is added as well.

        If the provider config has ``formatted`` set, parsing is delegated to
        ``self._parse_formatted_email``.

        :param data: iterable of response parts; non-tuple entries are skipped
        :param provider: ingest provider, may be None
        :return: list of items: pictures first, then the composite item (if
            any), then the text item
        :raises IngestEmailError: created by ``IngestEmailError.emailParseError``
            from any exception raised while parsing
        """
        # ``provider`` defaults to None, so the config lookup must tolerate it
        provider_config = provider.get('config', {}) if provider else {}
        # If the channel is configured to process structured email generated from a google form
        if provider_config.get('formatted', False):
            return self._parse_formatted_email(data, provider)
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT, 'versioncreated': utcnow()}

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                field_from = self.parse_header(msg['from'])
                item['original_source'] = field_from
                try:
                    addresses = email_regex.findall(field_from)
                    if addresses:
                        user = get_resource_service('users').get_user_by_email(addresses[0])
                        item['original_creator'] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    # senders without a user account are accepted; the item
                    # simply gets no original_creator
                    pass
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    item['firstcreated'] = dt.replace(tzinfo=timezone('utc'))

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    part_type = part.get_content_type()

                    if part_type == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            # if we don't know the charset just have a go!
                            text_body = body.decode() if charset is None else body.decode(charset)
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}"
                                .format(item['headline'], field_from, ex))
                        continue

                    if part_type == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            decoded = body.decode() if charset is None else body.decode(charset)
                            # assign only the cleaned result, so a failure in
                            # safe_html can never leave raw markup in html_body
                            html_body = self.safe_html(decoded)
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}"
                                .format(item['headline'], field_from, ex))
                        continue

                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue

                    attachment_name = part.get_filename()
                    if not attachment_name:
                        continue

                    content = io.BytesIO(part.get_payload(decode=True))
                    _, content_type, metadata = process_file_from_stream(content, part_type)
                    if content_type in ('image/gif', 'image/png'):
                        continue
                    content.seek(0)
                    image_id = self.parser_app.media.put(
                        content,
                        filename=attachment_name,
                        content_type=content_type,
                        metadata=metadata)
                    renditions = {'baseImage': {'href': image_id}}

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = {
                            ITEM_TYPE: CONTENT_TYPE.COMPOSITE,
                            'guid': generate_guid(type=GUID_TAG),
                            'versioncreated': utcnow(),
                            'groups': [],
                            'headline': item['headline'],
                            'original_source': item['original_source'],
                        }
                        if 'original_creator' in item:
                            comp_item['original_creator'] = item['original_creator']

                        # create a reference to the item that stores the body of the email
                        item_ref = {
                            'guid': item['guid'],
                            'residRef': item['guid'],
                            'headline': item['headline'],
                            'location': 'ingest',
                            'itemClass': 'icls:text',
                            'original_source': item['original_source'],
                        }
                        if 'original_creator' in item:
                            item_ref['original_creator'] = item['original_creator']
                        refs.append(item_ref)

                    media_item = {
                        'guid': generate_guid(type=GUID_TAG),
                        'versioncreated': utcnow(),
                        ITEM_TYPE: CONTENT_TYPE.PICTURE,
                        'renditions': renditions,
                        'mimetype': content_type,
                    }
                    set_filemeta(media_item, metadata)
                    media_item['slugline'] = attachment_name
                    # only set when a text part was decoded before this attachment
                    if text_body is not None:
                        media_item['body_html'] = text_body
                    media_item['headline'] = item['headline']
                    media_item['original_source'] = item['original_source']
                    if 'original_creator' in item:
                        media_item['original_creator'] = item['original_creator']
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    media_ref = {
                        'guid': media_item['guid'],
                        'residRef': media_item['guid'],
                        'headline': attachment_name,
                        'location': 'ingest',
                        'itemClass': 'icls:picture',
                        'original_source': item['original_source'],
                    }
                    if 'original_creator' in item:
                        media_ref['original_creator'] = item['original_creator']
                    refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                # text_body is still None when no text part could be decoded;
                # fall back to an empty body instead of failing on None.
                # NOTE(review): the plain text is not HTML-escaped here - confirm
                # that consumers of body_html expect that.
                item['body_html'] = '<pre>' + (text_body or '') + '</pre>'
                item[FORMAT] = FORMATS.PRESERVED

            # if there is composite item then add the main group and references
            if comp_item:
                comp_item['groups'].append({
                    'refs': [{'idRef': 'main'}],
                    'id': 'root',
                    'role': 'grpRole:NEP',
                })
                comp_item['groups'].append({'refs': refs, 'id': 'main', 'role': 'grpRole:Main'})

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)
Beispiel #21
0
 def test_get_set_filemeta(self):
     """Metadata stored with set_filemeta is readable by key and as a whole."""
     target = {}
     set_filemeta(target, {'foo': 'bar'})
     single_value = get_filemeta(target, 'foo')
     self.assertEqual('bar', single_value)
     everything = get_filemeta(target)
     self.assertEqual({'foo': 'bar'}, everything)
Beispiel #22
0
    def parse(self, data, provider=None):
        """Parse fetched email data into a list of ingest items.

        Every tuple found in ``data`` is treated as holding a raw message
        (bytes) at index 1. The message yields one text item for the body and
        one picture item per image attachment that has a filename and whose
        detected content type is not gif/png. When at least one picture is
        extracted, a composite item referencing the text item and all pictures
        is added as well.

        If the provider config has ``formatted`` set, parsing is delegated to
        ``self._parse_formatted_email``.

        :param data: iterable of response parts; non-tuple entries are skipped
        :param provider: ingest provider, may be None
        :return: list of items: pictures first, then the composite item (if
            any), then the text item
        :raises IngestEmailError: created by ``IngestEmailError.emailParseError``
            from any exception raised while parsing
        """
        # ``provider`` defaults to None, so the config lookup must tolerate it
        provider_config = provider.get('config', {}) if provider else {}
        # If the channel is configured to process structured email generated from a google form
        if provider_config.get('formatted', False):
            return self._parse_formatted_email(data, provider)
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT, 'versioncreated': utcnow()}

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                field_from = self.parse_header(msg['from'])
                item['original_source'] = field_from
                try:
                    addresses = email_regex.findall(field_from)
                    if addresses:
                        user = get_resource_service('users').get_user_by_email(addresses[0])
                        item['original_creator'] = user[eve.utils.config.ID_FIELD]
                except UserNotRegisteredException:
                    # senders without a user account are accepted; the item
                    # simply gets no original_creator
                    pass
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    item['firstcreated'] = dt.replace(tzinfo=timezone('utc'))

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    part_type = part.get_content_type()

                    if part_type == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            # if we don't know the charset just have a go!
                            text_body = body.decode() if charset is None else body.decode(charset)
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing text body for {0} from {1}: {2}".format(item['headline'],
                                                                                           field_from, ex))
                        continue

                    if part_type == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            decoded = body.decode() if charset is None else body.decode(charset)
                            # assign only the cleaned result, so a failure in
                            # safe_html can never leave raw markup in html_body
                            html_body = self.safe_html(decoded)
                        except Exception as ex:
                            logger.exception(
                                "Exception parsing html body for {0} from {1}: {2}".format(item['headline'],
                                                                                           field_from, ex))
                        continue

                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue

                    attachment_name = part.get_filename()
                    if not attachment_name:
                        continue

                    content = io.BytesIO(part.get_payload(decode=True))
                    _, content_type, metadata = process_file_from_stream(content, part_type)
                    if content_type in ('image/gif', 'image/png'):
                        continue
                    content.seek(0)
                    image_id = self.parser_app.media.put(content, filename=attachment_name,
                                                         content_type=content_type, metadata=metadata)
                    renditions = {'baseImage': {'href': image_id}}

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = {ITEM_TYPE: CONTENT_TYPE.COMPOSITE,
                                     'guid': generate_guid(type=GUID_TAG),
                                     'versioncreated': utcnow(),
                                     'groups': [],
                                     'headline': item['headline'],
                                     'original_source': item['original_source']}
                        if 'original_creator' in item:
                            comp_item['original_creator'] = item['original_creator']

                        # create a reference to the item that stores the body of the email
                        item_ref = {'guid': item['guid'], 'residRef': item['guid'],
                                    'headline': item['headline'], 'location': 'ingest',
                                    'itemClass': 'icls:text', 'original_source': item['original_source']}
                        if 'original_creator' in item:
                            item_ref['original_creator'] = item['original_creator']
                        refs.append(item_ref)

                    media_item = {'guid': generate_guid(type=GUID_TAG),
                                  'versioncreated': utcnow(),
                                  ITEM_TYPE: CONTENT_TYPE.PICTURE,
                                  'renditions': renditions,
                                  'mimetype': content_type}
                    set_filemeta(media_item, metadata)
                    media_item['slugline'] = attachment_name
                    # only set when a text part was decoded before this attachment
                    if text_body is not None:
                        media_item['body_html'] = text_body
                    media_item['headline'] = item['headline']
                    media_item['original_source'] = item['original_source']
                    if 'original_creator' in item:
                        media_item['original_creator'] = item['original_creator']
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    media_ref = {'guid': media_item['guid'], 'residRef': media_item['guid'],
                                 'headline': attachment_name, 'location': 'ingest', 'itemClass': 'icls:picture',
                                 'original_source': item['original_source']}
                    if 'original_creator' in item:
                        media_ref['original_creator'] = item['original_creator']
                    refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                # text_body is still None when no text part could be decoded;
                # fall back to an empty body instead of failing on None.
                # NOTE(review): the plain text is not HTML-escaped here - confirm
                # that consumers of body_html expect that.
                item['body_html'] = '<pre>' + (text_body or '') + '</pre>'
                item[FORMAT] = FORMATS.PRESERVED

            # if there is composite item then add the main group and references
            if comp_item:
                comp_item['groups'].append({'refs': [{'idRef': 'main'}], 'id': 'root', 'role': 'grpRole:NEP'})
                comp_item['groups'].append({'refs': refs, 'id': 'main', 'role': 'grpRole:Main'})

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            raise IngestEmailError.emailParseError(ex, provider)