Esempio n. 1
0
    def parse_item(self, tree):
        """Parse ``tree`` with the parent parser and derive an abstract.

        After the parent ``parse_item`` has produced the item, the
        ``contentMeta`` element is searched for ``iptc:subject`` children
        of type ``cpnat:organisation`` that carry a ``literal`` attribute.
        When at least one is found, ``item['abstract']`` is set to
        ``'FOR: <ORGANISATION>. <body text>'`` passed through
        ``format_maxlength`` with a limit argument of 200.

        :param tree: parsed XML element for the item.
        :return: the item produced by the parent parser, possibly with an
            ``abstract`` added.
        """
        item = super().parse_item(tree)
        content_meta = tree.find(self.qname('contentMeta'))

        matches = content_meta.xpath('./iptc:subject[@type="cpnat:organisation"][@literal]', namespaces=NS)
        if not matches:
            return item

        # Upper-case the organisation and drop trailing dots, since the
        # template appends its own '.' after the name.
        org_label = matches[0].get('literal').upper().rstrip('.')
        body_text = get_text(item['body_html']).replace('  ', ' ')
        item['abstract'] = format_maxlength('FOR: {}. {}'.format(org_label, body_text), 200)
        return item
Esempio n. 2
0
    def parse_item(self, tree):
        """Parse ``tree`` with the parent parser and enrich it with organisation data.

        The ``contentMeta`` element is searched for ``iptc:subject``
        children of type ``cpnat:organisation`` that carry a ``literal``
        attribute. When one is found, the first match is used to:

        * set ``item["abstract"]`` to ``"FOR: <ORGANISATION>. <body text>"``
          passed through ``format_maxlength`` with a limit argument of 200;
        * append a subject entry (name and qcode both equal to the literal,
          scheme ``cp.ORGANISATION``) to ``item["subject"]``, creating the
          list if it is missing.

        :param tree: parsed XML element for the item.
        :return: the item produced by the parent parser, possibly enriched.
        """
        item = super().parse_item(tree)
        content_meta = tree.find(self.qname("contentMeta"))

        org_query = './iptc:subject[@type="cpnat:organisation"][@literal]'
        matches = content_meta.xpath(org_query, namespaces=NS)
        if not matches:
            return item

        literal = matches[0].get("literal")

        # Upper-case the organisation and drop trailing dots, since the
        # template appends its own "." after the name.
        heading = literal.upper().rstrip(".")
        body_text = get_text(item["body_html"]).replace("  ", " ")
        item["abstract"] = format_maxlength("FOR: {}. {}".format(heading, body_text), 200)

        subjects = item.setdefault("subject", [])
        subjects.append({
            "name": literal,
            "qcode": literal,
            "scheme": cp.ORGANISATION,
        })
        return item
Esempio n. 3
0
 def _format_keyword(self, content, keywords, glue):
     """Add a ``Keyword`` child to ``content`` built from ``keywords``.

     Nothing is added when ``keywords`` is empty or ``None``. Otherwise the
     keywords are joined with ``glue`` and the result is passed through
     ``format_maxlength`` with a limit argument of 150.

     :param content: parent XML element receiving the ``Keyword`` child.
     :param keywords: iterable of keyword strings, or a falsy value.
     :param glue: separator string placed between keywords.
     """
     if not keywords:
         return

     # Compute the text before creating the element so a failure while
     # formatting leaves ``content`` untouched.
     keyword_text = format_maxlength(glue.join(keywords), 150)
     etree.SubElement(content, "Keyword").text = keyword_text
Esempio n. 4
0
    def _format_item(self, root, item, pub_seq_num, service, services) -> None:
        """Populate ``root`` with the XML representation of ``item``.

        A ``ContentItem`` child is created under ``root`` and filled with
        the item's content fields, while publish-level fields (``PublishID``,
        ``Services``, ``PscCodes``, ``PublishDateTime``, ...) are added
        directly to ``root``. Elements are appended in statement order, so
        the order of the statements below determines the document layout.

        :param root: parent XML element that receives the output.
        :param item: item dict to format. ``versioncreated``, ``type`` and
            ``language`` are read with ``item[...]`` and therefore must be
            present.
        :param pub_seq_num: integer publish sequence number; written as
            ``PublishID`` and, modulo 10**8 and zero-padded to 8 digits, as
            ``ContentItemID``.
        :param service: when truthy and the item is not a picture, written
            as ``PscCodes`` with ``Services`` set to ``Print``. Whenever it
            is truthy a ``Note`` element is also emitted.
        :param services: iterable of strings joined with ``","`` into the
            ``Note`` element when ``service`` is truthy.
        """
        if is_picture(item):
            # Pictures get an xsi:type-style attribute marking the element
            # as a PhotoContentItem, with the namespace bound to "d2p1".
            D2P1 = "http://www.w3.org/2001/XMLSchema-instance"
            content = etree.SubElement(
                root,
                "ContentItem",
                {"{%s}type" % D2P1: "PhotoContentItem"},
                nsmap={
                    "d2p1": D2P1,
                },
            )
        else:
            content = etree.SubElement(root, "ContentItem")
        # ``extra`` may be missing or None on the item; normalise to a dict.
        extra = item.get("extra") or {}

        # root system fields
        etree.SubElement(root, "Reschedule").text = "false"
        etree.SubElement(root, "IsRegional").text = "false"
        etree.SubElement(root, "CanAutoRoute").text = "true"
        etree.SubElement(root, "PublishID").text = str(pub_seq_num)
        etree.SubElement(root, "Username")
        etree.SubElement(root, "UseLocalsOut").text = "false"
        etree.SubElement(root, "UserProfileID").text = "0"
        etree.SubElement(root, "PublishOrder").text = "0"
        etree.SubElement(root, "NewCycle").text = "false"
        etree.SubElement(root, "OnlineResend").text = "false"

        # item system fields
        etree.SubElement(content, "AutoSaveID").text = "0"
        etree.SubElement(content, "Type").text = "0"
        etree.SubElement(content, "MediaType").text = "0"
        etree.SubElement(content, "Status").text = "0"

        # Services / PscCodes: pictures and explicit ``service`` values use
        # fixed service names; otherwise both are derived from the item.
        if is_picture(item):
            etree.SubElement(root, "Services").text = "Pictures"
            self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
            # Fall back to "Online" when the call above added no PscCodes.
            if root.find("PscCodes") is None:
                etree.SubElement(root, "PscCodes").text = "Online"
        elif service:
            etree.SubElement(root, "Services").text = "Print"
            etree.SubElement(root, "PscCodes").text = service
        else:
            self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
            self._format_services(root, item)

        is_broadcast = cp.is_broadcast(item)

        # content system fields
        # File name, slug and NewsCompID come from the original item, while
        # ContentItemID comes from the publish sequence number. Both ids are
        # reduced modulo 10**8 and zero-padded to 8 digits.
        orig = self._get_original_item(item)
        seq_id = "{:08d}".format(pub_seq_num % 100000000)
        item_id = "{:08d}".format(orig["unique_id"] % 100000000)
        etree.SubElement(content, "Name")
        etree.SubElement(content, "Cachable").text = "false"
        etree.SubElement(content, "FileName").text = filename(orig)
        etree.SubElement(content, "NewsCompID").text = item_id
        etree.SubElement(content, "SystemSlug").text = slug(orig)
        etree.SubElement(content, "ContentItemID").text = seq_id
        etree.SubElement(content, "ProfileID").text = "204"
        etree.SubElement(content, "SysContentType").text = "0"

        if is_picture(item):
            etree.SubElement(content, "PhotoContentItemID").text = item_id

        if extra.get(cp.FILENAME):
            etree.SubElement(content, "OrigTransRef").text = extra[cp.FILENAME]

        if service:
            etree.SubElement(content, "Note").text = ",".join(services)

        # timestamps
        firstpublished = item.get("firstpublished") or item["versioncreated"]
        etree.SubElement(root, "PublishDateTime").text = self._format_datetime(
            firstpublished
        )
        # Prefer the embargo stored under SCHEDULE_SETTINGS; when either key
        # is missing fall back to the item's "embargoed" value. The right-hand
        # side is evaluated before the element is created, so a KeyError
        # does not leave a stray EmbargoTime element behind.
        try:
            etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
                item[SCHEDULE_SETTINGS]["utc_embargo"],
                local=True,
            )
        except KeyError:
            etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
                item.get("embargoed"), local=True
            )
        etree.SubElement(content, "CreatedDateTime").text = self._format_datetime(
            firstpublished
        )  # SDCP-380
        etree.SubElement(content, "UpdatedDateTime").text = self._format_datetime(
            item["versioncreated"], rel=True
        )

        # obvious
        etree.SubElement(content, "ContentType").text = (
            "Photo" if is_picture(item) else item["type"].capitalize()
        )

        # SDCP-309
        # Headline prefers the extra HEADLINE2 value; non-picture items also
        # get the item's own headline as Headline2.
        etree.SubElement(content, "Headline").text = format_maxlength(
            extra.get(cp.HEADLINE2) or item.get("headline"), OUTPUT_LENGTH_LIMIT
        )
        if not is_picture(item):
            etree.SubElement(content, "Headline2").text = format_maxlength(
                item.get("headline"), OUTPUT_LENGTH_LIMIT
            )

        etree.SubElement(content, "SlugProper").text = item.get("slugline")
        etree.SubElement(content, "Credit").text = self._format_credit(item)
        etree.SubElement(content, "Source").text = item.get("source")

        content_html = self._format_content(item, is_broadcast)
        etree.SubElement(content, "DirectoryText").text = self._format_text(
            item.get("abstract")
        )
        etree.SubElement(content, "ContentText").text = self._format_html(content_html)
        # Language code: "2" when the language contains "fr", otherwise "1".
        etree.SubElement(content, "Language").text = (
            "2" if "fr" in item.get("language", "") else "1"
        )

        # For text items with content, the abstract-based DirectoryText set
        # above is replaced by the plain text of the formatted content, and
        # the three word-count fields all receive the same computed value.
        if item["type"] == "text" and content_html:
            content.find("DirectoryText").text = format_maxlength(
                get_text(content_html, "html", lf_on_block=False).replace("\n", " "),
                200,
            )
            word_count = str(get_word_count(content_html))
            etree.SubElement(content, "Length").text = word_count
            etree.SubElement(content, "WordCount").text = word_count
            etree.SubElement(content, "BreakWordCount").text = word_count

        # Keywords are emitted as Stocks only for the globenewswire source.
        if item.get("keywords") and item.get("source") == globenewswire.SOURCE:
            etree.SubElement(content, "Stocks").text = ",".join(item["keywords"])

        self._format_category_index(content, item)
        self._format_genre(content, item)
        self._format_urgency(content, item.get("urgency"), item["language"])
        # Picture keywords are separated by ", ", all others by ",".
        self._format_keyword(
            content,
            item.get("keywords"),
            ", " if item.get("type") == "picture" else ",",
        )
        self._format_dateline(content, item.get("dateline"))
        self._format_writethru(content, item)

        if item.get("byline"):
            etree.SubElement(content, "Byline").text = item["byline"]

        if is_picture(item):
            self._format_picture_metadata(content, item)
        else:
            # Editorial notes only apply to non-picture items.
            etree.SubElement(content, "EditorNote").text = item.get("ednote")
            if extra.get(cp.UPDATE):
                etree.SubElement(content, "UpdateNote").text = extra[cp.UPDATE]
            if extra.get(cp.CORRECTION):
                etree.SubElement(content, "Corrections").text = extra[cp.CORRECTION]

        if item.get("associations"):
            self._format_associations(content, item)
Esempio n. 5
0
    def _format_item(self, root, item, pub_seq_num, service, services):
        """Populate ``root`` with the XML representation of ``item``.

        A ``ContentItem`` child is created under ``root`` and filled with
        the item's content fields, while publish-level fields (``PublishID``,
        ``Services``, ``PscCodes``, ``PublishDateTime``, ...) are added
        directly to ``root``. Elements are appended in statement order, so
        the order of the statements below determines the document layout.

        :param root: parent XML element that receives the output.
        :param item: item dict to format. ``versioncreated``,
            ``firstcreated``, ``type`` and ``language`` are read with
            ``item[...]`` and therefore must be present.
        :param pub_seq_num: integer publish sequence number; written as
            ``PublishID`` and, modulo 10**8 and zero-padded to 8 digits, as
            ``NewsCompID``, ``ContentItemID`` and (for pictures)
            ``PhotoContentItemID``.
        :param service: when truthy and the item is not a picture, written
            as ``PscCodes`` with ``Services`` set to ``Print``. Whenever it
            is truthy a ``Note`` element is also emitted.
        :param services: iterable of strings joined with ``','`` into the
            ``Note`` element when ``service`` is truthy.
        """
        if is_picture(item):
            # Pictures get an xsi:type-style attribute marking the element
            # as a PhotoContentItem, with the namespace bound to 'd2p1'.
            D2P1 = 'http://www.w3.org/2001/XMLSchema-instance'
            content = etree.SubElement(root,
                                       'ContentItem',
                                       {'{%s}type' % D2P1: 'PhotoContentItem'},
                                       nsmap={
                                           'd2p1': D2P1,
                                       })
        else:
            content = etree.SubElement(root, 'ContentItem')
        # ``extra`` may be missing or None on the item; normalise to a dict.
        extra = item.get('extra') or {}

        # root system fields
        etree.SubElement(root, 'Reschedule').text = 'false'
        etree.SubElement(root, 'IsRegional').text = 'false'
        etree.SubElement(root, 'CanAutoRoute').text = 'true'
        etree.SubElement(root, 'PublishID').text = str(pub_seq_num)
        etree.SubElement(root, 'Username')
        etree.SubElement(root, 'UseLocalsOut').text = 'false'
        etree.SubElement(root, 'UserProfileID').text = '0'
        etree.SubElement(root, 'PublishOrder').text = '0'
        etree.SubElement(root, 'NewCycle').text = 'false'
        etree.SubElement(root, 'OnlineResend').text = 'false'

        # item system fields
        etree.SubElement(content, 'AutoSaveID').text = '0'
        etree.SubElement(content, 'Type').text = '0'
        etree.SubElement(content, 'MediaType').text = '0'
        etree.SubElement(content, 'Status').text = '0'

        # Services / PscCodes: pictures and explicit ``service`` values use
        # fixed service names; otherwise both are derived from the item.
        if is_picture(item):
            etree.SubElement(root, 'Services').text = 'Pictures'
            self._format_subject_code(root, item, 'PscCodes', 'destinations')
            # Fall back to 'Online' when the call above added no PscCodes.
            if root.find('PscCodes') is None:
                etree.SubElement(root, 'PscCodes').text = 'Online'
        elif service:
            etree.SubElement(root, 'Services').text = 'Print'
            etree.SubElement(root, 'PscCodes').text = service
        else:
            self._format_subject_code(root, item, 'PscCodes', 'destinations')
            self._format_services(root, item)

        # content system fields
        # The same sequence-derived id feeds NewsCompID and ContentItemID,
        # and the same formatted file name feeds FileName and SystemSlug.
        seq_id = '{:08d}'.format(pub_seq_num % 100000000)
        filename = self._format_filename(item)
        etree.SubElement(content, 'Name')
        etree.SubElement(content, 'Cachable').text = 'false'
        etree.SubElement(content, 'FileName').text = filename
        etree.SubElement(content, 'NewsCompID').text = seq_id
        etree.SubElement(content, 'SystemSlug').text = filename
        etree.SubElement(content, 'ContentItemID').text = seq_id
        etree.SubElement(content, 'ProfileID').text = '204'
        etree.SubElement(content, 'SysContentType').text = '0'

        if is_picture(item):
            etree.SubElement(content, 'PhotoContentItemID').text = seq_id

        if extra.get(cp.FILENAME):
            etree.SubElement(content, 'OrigTransRef').text = extra[cp.FILENAME]

        if service:
            etree.SubElement(content, 'Note').text = ','.join(services)

        # timestamps
        firstpublished = item.get('firstpublished') or item['versioncreated']
        etree.SubElement(
            root,
            'PublishDateTime').text = self._format_datetime(firstpublished)
        # Prefer the embargo stored under SCHEDULE_SETTINGS; when either key
        # is missing fall back to the item's 'embargoed' value. The right-hand
        # side is evaluated before the element is created, so a KeyError
        # does not leave a stray EmbargoTime element behind.
        try:
            etree.SubElement(content,
                             'EmbargoTime').text = self._format_datetime(
                                 item[SCHEDULE_SETTINGS]['utc_embargo'],
                                 local=True,
                             )
        except KeyError:
            etree.SubElement(content,
                             'EmbargoTime').text = self._format_datetime(
                                 item.get('embargoed'), local=True)
        etree.SubElement(content,
                         'CreatedDateTime').text = self._format_datetime(
                             item['firstcreated'])
        etree.SubElement(content,
                         'UpdatedDateTime').text = self._format_datetime(
                             item['versioncreated'], rel=True)

        # obvious
        etree.SubElement(content, 'ContentType').text = 'Photo' if is_picture(
            item) else item['type'].capitalize()

        # SDCP-309
        # Headline prefers the extra HEADLINE2 value; non-picture items also
        # get the item's own headline as Headline2.
        etree.SubElement(content, 'Headline').text = format_maxlength(
            extra.get(cp.HEADLINE2) or item.get('headline'),
            OUTPUT_LENGTH_LIMIT)
        if not is_picture(item):
            etree.SubElement(content, 'Headline2').text = format_maxlength(
                item.get('headline'), OUTPUT_LENGTH_LIMIT)

        etree.SubElement(content, 'SlugProper').text = item.get('slugline')
        etree.SubElement(content, 'Credit').text = self._format_credit(item)
        etree.SubElement(content, 'Source').text = item.get('source')

        etree.SubElement(content, 'DirectoryText').text = self._format_text(
            item.get('abstract'))
        etree.SubElement(content, 'ContentText').text = self._format_html(
            self._format_content(item))
        # Language code: '2' when the language contains 'fr', otherwise '1'.
        etree.SubElement(
            content,
            'Language').text = '2' if 'fr' in item.get('language', '') else '1'

        # For text items with a body, the abstract-based DirectoryText set
        # above is replaced by the plain text of the body. The word count
        # uses the item's stored value when truthy, else it is computed.
        if item['type'] == 'text' and item.get('body_html'):
            content.find('DirectoryText').text = format_maxlength(
                get_text(item['body_html'], 'html',
                         lf_on_block=False).replace('\n', ' '), 200)
            word_count = str(item['word_count'] if item.get('word_count') else
                             get_word_count(item['body_html']))
            etree.SubElement(content, 'Length').text = word_count
            etree.SubElement(content, 'WordCount').text = word_count
            etree.SubElement(content, 'BreakWordCount').text = word_count

        # Keywords are emitted as Stocks only for the globenewswire source.
        if item.get('keywords') and item.get('source') == globenewswire.SOURCE:
            etree.SubElement(content,
                             'Stocks').text = ','.join(item['keywords'])

        self._format_index(content, item)
        self._format_category(content, item)
        self._format_genre(content, item)
        self._format_urgency(content, item.get('urgency'), item['language'])
        # Picture keywords are separated by ', ', all others by ','.
        self._format_keyword(content, item.get('keywords'),
                             ', ' if item.get('type') == 'picture' else ',')
        self._format_dateline(content, item.get('dateline'))
        self._format_writethru(content, item.get('rewrite_sequence'),
                               item['language'])

        if item.get('byline'):
            etree.SubElement(content, 'Byline').text = item['byline']

        if is_picture(item):
            self._format_picture_metadata(content, item)
        else:
            # Editorial notes only apply to non-picture items.
            etree.SubElement(content, 'EditorNote').text = item.get('ednote')
            if extra.get(cp.UPDATE):
                etree.SubElement(content, 'UpdateNote').text = extra[cp.UPDATE]
            if extra.get(cp.CORRECTION):
                etree.SubElement(content,
                                 'Corrections').text = extra[cp.CORRECTION]

        if item.get('associations'):
            self._format_associations(content, item)