Beispiel #1
0
    def parse_file(self, filename):
        """Parse a 7901 file into an ingest item dictionary.

        The first line is matched against the header pattern; when it matches,
        the source, provider sequence, priority, category and word count are
        copied from it. The remaining lines are then consumed as follows:

        * lines before the STX line are appended to ``slugline`` (trailing
          ``/`` and line endings removed);
        * the STX line supplies ``headline`` and starts ``body_html``;
        * once a body line contains the "not for publication" marker, the
          following lines go to ``ednote`` instead of the body;
        * the ETX line ends parsing.

        :param filename: path of the file to read (opened in binary mode)
        :return: the item dictionary built from the file
        :raises: the error produced by ``ParserError.IPTC7901ParserError`` for
            any failure while reading or parsing
        """
        try:
            item = {'type': 'preformatted'}
            item['guid'] = generate_guid(type=GUID_TAG)
            item['versioncreated'] = utcnow()

            with open(filename, 'rb') as f:
                lines = f.readlines()
            # parse first header line
            m = re.match(b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)', lines[0], flags=re.I)
            if m:
                item['original_source'] = m.group(1).decode()
                item['ingest_provider_sequence'] = m.group(2).decode()
                item['priority'] = self.map_priority(m.group(3).decode())
                item['anpa-category'] = {'qcode': self.map_category(m.group(4).decode())}
                # the pattern accepts an empty count; int('') would otherwise
                # reject the whole file
                word_count = m.group(5).decode()
                if word_count:
                    item['word_count'] = int(word_count)

            in_header = True
            in_text = False
            in_note = False
            for line in lines[1:]:
                marker = line[0:1]
                # STX starts the body of the story
                if marker == b'\x02':
                    # pick the rest of the line off as the headline
                    item['headline'] = line[1:].decode().rstrip('\r\n')
                    item['body_html'] = ''
                    in_text = True
                    in_header = False
                    continue
                # ETX denotes the end of the story
                if marker == b'\x03':
                    break
                # every remaining branch needs the text form, so decode once
                text = line.decode()
                if in_text:
                    if 'The following information is not for publication' in text:
                        in_note = True
                        in_text = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += text
                if in_note:
                    item['ednote'] += text
                    continue
                if in_header:
                    if 'slugline' not in item:
                        item['slugline'] = ''
                    item['slugline'] += text.rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            # keep the original failure attached as the explicit cause
            raise ParserError.IPTC7901ParserError(filename, ex) from ex
Beispiel #2
0
    def parse_file(self, filename):
        """Parse a 7901 file into an ingest item dictionary.

        The first line is matched against the header pattern; when it matches,
        the source, provider sequence, priority, category and word count are
        copied from it. The remaining lines are then consumed as follows:

        * lines before the STX line are appended to ``slugline`` (trailing
          ``/`` and line endings removed);
        * the STX line supplies ``headline`` and starts ``body_html``;
        * once a body line contains the "not for publication" marker, the
          following lines go to ``ednote`` instead of the body;
        * the ETX line ends parsing.

        :param filename: path of the file to read (opened in binary mode)
        :return: the item dictionary built from the file
        :raises: the error produced by ``ParserError.IPTC7901ParserError`` for
            any failure while reading or parsing
        """
        try:
            item = {"type": "preformatted"}
            item["guid"] = generate_guid(type=GUID_TAG)
            item["versioncreated"] = utcnow()

            with open(filename, "rb") as f:
                lines = f.readlines()
            # parse first header line
            m = re.match(b"\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)", lines[0], flags=re.I)
            if m:
                item["original_source"] = m.group(1).decode()
                item["ingest_provider_sequence"] = m.group(2).decode()
                item["priority"] = self.map_priority(m.group(3).decode())
                item["anpa_category"] = [{"qcode": self.map_category(m.group(4).decode())}]
                # the pattern accepts an empty count; int("") would otherwise
                # reject the whole file
                word_count = m.group(5).decode()
                if word_count:
                    item["word_count"] = int(word_count)

            in_header = True
            in_text = False
            in_note = False
            for line in lines[1:]:
                marker = line[0:1]
                # STX starts the body of the story
                if marker == b"\x02":
                    # pick the rest of the line off as the headline
                    item["headline"] = line[1:].decode().rstrip("\r\n")
                    item["body_html"] = ""
                    in_text = True
                    in_header = False
                    continue
                # ETX denotes the end of the story
                if marker == b"\x03":
                    break
                # every remaining branch needs the text form, so decode once
                text = line.decode()
                if in_text:
                    if "The following information is not for publication" in text:
                        in_note = True
                        in_text = False
                        item["ednote"] = ""
                        continue
                    item["body_html"] += text
                if in_note:
                    item["ednote"] += text
                    continue
                if in_header:
                    if "slugline" not in item:
                        item["slugline"] = ""
                    item["slugline"] += text.rstrip("/\r\n")
                    continue

            return item
        except Exception as ex:
            # keep the original failure attached as the explicit cause
            raise ParserError.IPTC7901ParserError(filename, ex) from ex
Beispiel #3
0
    def parse_email(self, data, provider):
        """Build ingest items from the response parts of a fetched email.

        Every tuple in ``data`` is parsed as a message from the bytes at
        index 1. The subject, sender, Message-ID and date populate the body
        item. Each non-GIF/PNG image attachment that has a filename is stored
        through ``self.parser_app.media.put`` and becomes a picture item; when
        at least one picture exists, a composite item referencing the body
        item and all pictures is added as well.

        :param data: iterable of response parts; non-tuple entries are ignored
        :param provider: passed on to the error raised on failure
        :return: list holding the picture items, then the composite item (if
            any), then the body item
        :raises: the error produced by ``IngestEmailError.emailParseError``
            for any failure during parsing
        """
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = dict()
            item['type'] = 'text'
            item['versioncreated'] = utcnow()

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                item['original_creator'] = self.parse_header(msg['from'])
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(
                        email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    part_type = part.get_content_type()

                    if part_type == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            charset = part.get_content_charset()
                            if charset is None:
                                text_body = body.decode()
                            else:
                                text_body = body.decode(charset)
                        except Exception:
                            # logger.exception records the active exception
                            # itself; passing it as an extra argument breaks
                            # the %-formatting of the message
                            logger.exception(
                                "Exception parsing text body for {0} from {1}"
                                .format(item['headline'],
                                        item['original_creator']))
                        continue

                    if part_type == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            if charset is None:
                                decoded = body.decode()
                            else:
                                decoded = body.decode(charset)
                            # assign only sanitised markup, so a failure in
                            # safe_html cannot leave raw HTML in html_body
                            html_body = self.safe_html(decoded)
                        except Exception:
                            logger.exception(
                                "Exception parsing text html for {0} from {1}"
                                .format(item['headline'],
                                        item['original_creator']))
                        continue

                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue

                    fileName = part.get_filename()
                    if not fileName:
                        continue

                    image = part.get_payload(decode=True)
                    content = io.BytesIO(image)
                    res = process_file_from_stream(
                        content, part.get_content_type())
                    _, content_type, metadata = res
                    if content_type in ('image/gif', 'image/png'):
                        continue
                    # rewind before handing the stream to the media storage
                    content.seek(0)
                    image_id = self.parser_app.media.put(
                        content,
                        filename=fileName,
                        content_type=content_type,
                        metadata=metadata)
                    renditions = {'baseImage': {'href': image_id}}

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = dict()
                        comp_item['type'] = 'composite'
                        comp_item['guid'] = generate_guid(type=GUID_TAG)
                        comp_item['versioncreated'] = utcnow()
                        comp_item['groups'] = []
                        comp_item['headline'] = item['headline']

                        # create a reference to the item that stores the body of the email
                        item_ref = {}
                        item_ref['guid'] = item['guid']
                        item_ref['residRef'] = item['guid']
                        item_ref['headline'] = item['headline']
                        item_ref['location'] = 'ingest'
                        item_ref['itemClass'] = 'icls:text'
                        refs.append(item_ref)

                    media_item = dict()
                    media_item['guid'] = generate_guid(type=GUID_TAG)
                    media_item['versioncreated'] = utcnow()
                    media_item['type'] = 'picture'
                    media_item['renditions'] = renditions
                    media_item['mimetype'] = content_type
                    media_item['filemeta'] = metadata
                    media_item['slugline'] = fileName
                    if text_body is not None:
                        media_item['body_html'] = text_body
                    media_item['headline'] = item['headline']
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    media_ref = {}
                    media_ref['guid'] = media_item['guid']
                    media_ref['residRef'] = media_item['guid']
                    media_ref['headline'] = fileName
                    media_ref['location'] = 'ingest'
                    media_ref['itemClass'] = 'icls:picture'
                    refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                item['body_html'] = text_body
                item['type'] = 'preformatted'

            # if there is composite item then add the main group and references
            if comp_item:
                grefs = {}
                grefs['refs'] = [{'idRef': 'main'}]
                grefs['id'] = 'root'
                grefs['role'] = 'grpRole:NEP'
                comp_item['groups'].append(grefs)

                grefs = {}
                grefs['refs'] = refs
                grefs['id'] = 'main'
                grefs['role'] = 'grpRole:Main'
                comp_item['groups'].append(grefs)

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            # keep the original failure attached as the explicit cause
            raise IngestEmailError.emailParseError(ex, provider) from ex
Beispiel #4
0
    def parse_email(self, data, provider):
        """Build ingest items from the response parts of a fetched email.

        Every tuple in ``data`` is parsed as a message from the bytes at index 1. The subject,
        sender, Message-ID and date populate the body item. Each non-GIF/PNG image attachment
        that has a filename is stored through ``self.parser_app.media.put`` and becomes a picture
        item; when at least one picture exists, a composite item referencing the body item and all
        pictures is added as well.

        :param data: iterable of response parts; non-tuple entries are ignored
        :param provider: passed on to the error raised on failure
        :return: list holding the picture items, then the composite item (if any), then the body item
        :raises: the error produced by ``IngestEmailError.emailParseError`` for any failure
        """
        try:
            new_items = []
            # create an item for the body text of the email
            # either text or html
            item = dict()
            item['type'] = 'text'
            item['versioncreated'] = utcnow()

            comp_item = None

            # a list to keep the references to the attachments
            refs = []

            html_body = None
            text_body = None

            for response_part in data:
                if not isinstance(response_part, tuple):
                    continue

                msg = email.message_from_bytes(response_part[1])
                item['headline'] = self.parse_header(msg['subject'])
                item['original_creator'] = self.parse_header(msg['from'])
                item['guid'] = msg['Message-ID']
                date_tuple = email.utils.parsedate_tz(msg['Date'])
                if date_tuple:
                    dt = datetime.datetime.utcfromtimestamp(email.utils.mktime_tz(date_tuple))
                    dt = dt.replace(tzinfo=timezone('utc'))
                    item['firstcreated'] = dt

                # this will loop through all the available multiparts in mail
                for part in msg.walk():
                    part_type = part.get_content_type()

                    if part_type == "text/plain":
                        body = part.get_payload(decode=True)
                        try:
                            # if we don't know the charset just have a go!
                            charset = part.get_content_charset()
                            text_body = body.decode() if charset is None else body.decode(charset)
                        except Exception:
                            # logger.exception records the active exception itself; passing it as
                            # an extra argument breaks the %-formatting of the message
                            logger.exception(
                                "Exception parsing text body for {0} from {1}".format(item['headline'],
                                                                                      item['original_creator']))
                        continue

                    if part_type == "text/html":
                        body = part.get_payload(decode=True)
                        try:
                            charset = part.get_content_charset()
                            decoded = body.decode() if charset is None else body.decode(charset)
                            # assign only sanitised markup, so a failure in safe_html cannot leave
                            # raw HTML in html_body
                            html_body = self.safe_html(decoded)
                        except Exception:
                            logger.exception(
                                "Exception parsing text html for {0} from {1}".format(item['headline'],
                                                                                      item['original_creator']))
                        continue

                    if part.get_content_maintype() == 'multipart':
                        continue
                    if part.get('Content-Disposition') is None:
                        continue
                    # we are only going to pull off image attachments at this stage
                    if part.get_content_maintype() != 'image':
                        continue

                    fileName = part.get_filename()
                    if not fileName:
                        continue

                    image = part.get_payload(decode=True)
                    content = io.BytesIO(image)
                    res = process_file_from_stream(content, part.get_content_type())
                    _, content_type, metadata = res
                    if content_type in ('image/gif', 'image/png'):
                        continue
                    # rewind before handing the stream to the media storage
                    content.seek(0)
                    image_id = self.parser_app.media.put(content, filename=fileName,
                                                         content_type=content_type, metadata=metadata)
                    renditions = {'baseImage': {'href': image_id}}

                    # if we have not got a composite item then create one
                    if not comp_item:
                        comp_item = dict()
                        comp_item['type'] = 'composite'
                        comp_item['guid'] = generate_guid(type=GUID_TAG)
                        comp_item['versioncreated'] = utcnow()
                        comp_item['groups'] = []
                        comp_item['headline'] = item['headline']

                        # create a reference to the item that stores the body of the email
                        item_ref = {}
                        item_ref['guid'] = item['guid']
                        item_ref['residRef'] = item['guid']
                        item_ref['headline'] = item['headline']
                        item_ref['location'] = 'ingest'
                        item_ref['itemClass'] = 'icls:text'
                        refs.append(item_ref)

                    media_item = dict()
                    media_item['guid'] = generate_guid(type=GUID_TAG)
                    media_item['versioncreated'] = utcnow()
                    media_item['type'] = 'picture'
                    media_item['renditions'] = renditions
                    media_item['mimetype'] = content_type
                    media_item['filemeta'] = metadata
                    media_item['slugline'] = fileName
                    if text_body is not None:
                        media_item['body_html'] = text_body
                    media_item['headline'] = item['headline']
                    new_items.append(media_item)

                    # add a reference to this item in the composite item
                    media_ref = {}
                    media_ref['guid'] = media_item['guid']
                    media_ref['residRef'] = media_item['guid']
                    media_ref['headline'] = fileName
                    media_ref['location'] = 'ingest'
                    media_ref['itemClass'] = 'icls:picture'
                    refs.append(media_ref)

            if html_body is not None:
                item['body_html'] = html_body
            else:
                item['body_html'] = text_body
                item['type'] = 'preformatted'

            # if there is composite item then add the main group and references
            if comp_item:
                grefs = {}
                grefs['refs'] = [{'idRef': 'main'}]
                grefs['id'] = 'root'
                grefs['role'] = 'grpRole:NEP'
                comp_item['groups'].append(grefs)

                grefs = {}
                grefs['refs'] = refs
                grefs['id'] = 'main'
                grefs['role'] = 'grpRole:Main'
                comp_item['groups'].append(grefs)

                new_items.append(comp_item)

            new_items.append(item)
            return new_items
        except Exception as ex:
            # keep the original failure attached as the explicit cause
            raise IngestEmailError.emailParseError(ex, provider) from ex
Beispiel #5
0
    def parse_file(self, filename):
        """Parse a 7901 file into an ingest item dictionary.

        The first line is matched against the header pattern; when it matches,
        the source, provider sequence, priority, category and word count are
        copied from it. The remaining lines are then consumed as follows:

        * lines before the STX line are appended to ``slugline`` (trailing
          ``/`` and line endings removed);
        * the STX line supplies ``headline`` and starts ``body_html``;
        * once a body line contains the "not for publication" marker, the
          following lines go to ``ednote`` instead of the body;
        * the ETX line ends parsing.

        :param filename: path of the file to read (opened in binary mode)
        :return: the item dictionary built from the file
        :raises: the error produced by ``ParserError.IPTC7901ParserError`` for
            any failure while reading or parsing
        """
        try:
            item = {'type': 'preformatted'}
            item['guid'] = generate_guid(type=GUID_TAG)
            item['versioncreated'] = utcnow()

            with open(filename, 'rb') as f:
                lines = f.readlines()
            # parse first header line
            m = re.match(
                b'\x01([a-zA-Z]*)([0-9]*) (.) (.) ([0-9]*) ([a-zA-Z0-9 ]*)',
                lines[0],
                flags=re.I)
            if m:
                item['original_source'] = m.group(1).decode()
                item['ingest_provider_sequence'] = m.group(2).decode()
                item['priority'] = self.map_priority(m.group(3).decode())
                item['anpa-category'] = {
                    'qcode': self.map_category(m.group(4).decode())
                }
                # the pattern accepts an empty count; int('') would otherwise
                # reject the whole file
                word_count = m.group(5).decode()
                if word_count:
                    item['word_count'] = int(word_count)

            in_header = True
            in_text = False
            in_note = False
            for line in lines[1:]:
                marker = line[0:1]
                # STX starts the body of the story
                if marker == b'\x02':
                    # pick the rest of the line off as the headline
                    item['headline'] = line[1:].decode().rstrip('\r\n')
                    item['body_html'] = ''
                    in_text = True
                    in_header = False
                    continue
                # ETX denotes the end of the story
                if marker == b'\x03':
                    break
                # every remaining branch needs the text form, so decode once
                text = line.decode()
                if in_text:
                    if ('The following information is not for publication'
                            in text):
                        in_note = True
                        in_text = False
                        item['ednote'] = ''
                        continue
                    item['body_html'] += text
                if in_note:
                    item['ednote'] += text
                    continue
                if in_header:
                    if 'slugline' not in item:
                        item['slugline'] = ''
                    item['slugline'] += text.rstrip('/\r\n')
                    continue

            return item
        except Exception as ex:
            # keep the original failure attached as the explicit cause
            raise ParserError.IPTC7901ParserError(filename, ex) from ex