def parse_item(self, tree):
    """Parse ``tree`` with the parent parser and derive an abstract.

    When the ``contentMeta`` element contains an ``iptc:subject`` of type
    ``cpnat:organisation`` that carries a ``literal`` attribute, the item's
    ``abstract`` is replaced by ``FOR: <ORGANISATION>. <body text>`` passed
    through ``format_maxlength`` with a limit argument of 200.

    :param tree: element searched for the ``contentMeta`` child.
    :return: the item built by the parent ``parse_item``, with ``abstract``
        overridden when an organisation subject is present.
    """
    item = super().parse_item(tree)

    meta = tree.find(self.qname('contentMeta'))
    if meta is None:
        # ``find`` yields None when the element is absent; keep the parent
        # result instead of failing with AttributeError on ``None.xpath``.
        return item

    organisation = meta.xpath(
        './iptc:subject[@type="cpnat:organisation"][@literal]',
        namespaces=NS,
    )
    if organisation:
        # The organisation is upper-cased and its trailing dots dropped
        # because the template itself appends the ". " separator.
        # NOTE(review): as it appears here both arguments of ``replace`` are
        # a single space, which makes the call a no-op; the first argument
        # was presumably a double or non-breaking space -- confirm.
        item['abstract'] = format_maxlength('FOR: {}. {}'.format(
            organisation[0].get('literal').upper().rstrip('.'),
            get_text(item['body_html']).replace(' ', ' '),
        ), 200)

    return item
def parse_item(self, tree):
    """Parse ``tree`` with the parent parser and enrich it with organisation data.

    When the ``contentMeta`` element contains an ``iptc:subject`` of type
    ``cpnat:organisation`` that carries a ``literal`` attribute:

    * ``abstract`` is replaced by ``FOR: <ORGANISATION>. <body text>`` passed
      through ``format_maxlength`` with a limit argument of 200;
    * a subject entry using the literal as both ``name`` and ``qcode`` and
      ``cp.ORGANISATION`` as ``scheme`` is appended to ``item["subject"]``
      (the list is created when the key is missing).

    :param tree: element searched for the ``contentMeta`` child.
    :return: the item built by the parent ``parse_item``, possibly enriched.
    """
    item = super().parse_item(tree)

    meta = tree.find(self.qname("contentMeta"))
    if meta is None:
        # ``find`` yields None when the element is absent; keep the parent
        # result instead of failing with AttributeError on ``None.xpath``.
        return item

    organisation = meta.xpath(
        './iptc:subject[@type="cpnat:organisation"][@literal]', namespaces=NS
    )
    if not organisation:
        return item

    # Only the first matching subject is used.
    org_name = organisation[0].get("literal")

    # NOTE(review): as it appears here both arguments of ``replace`` are a
    # single space, which makes the call a no-op; the first argument was
    # presumably a double or non-breaking space -- confirm.
    item["abstract"] = format_maxlength(
        "FOR: {}. {}".format(
            org_name.upper().rstrip("."),
            get_text(item["body_html"]).replace(" ", " "),
        ),
        200,
    )
    item.setdefault("subject", []).append(
        {
            "name": org_name,
            "qcode": org_name,
            "scheme": cp.ORGANISATION,
        }
    )
    return item
def _format_keyword(self, content, keywords, glue): if keywords: etree.SubElement(content, "Keyword").text = format_maxlength( glue.join(keywords), 150 )
def _format_item(self, root, item, pub_seq_num, service, services) -> None:
    """Fill ``root`` and a new ``ContentItem`` child with the fields of ``item``.

    Elements are appended in the order of the statements below, so the
    statement order determines the order of the produced XML children.

    :param root: parent element; receives the ``ContentItem`` child and the
        publish-level fields (``PublishID``, ``Services``, ``PscCodes``,
        ``PublishDateTime``, ...).
    :param item: mapping describing the item being formatted.
    :param pub_seq_num: integer written to ``PublishID`` and, zero-padded to
        8 digits modulo 100000000, to ``ContentItemID``.
    :param service: when truthy and the item is not a picture, written to
        ``PscCodes`` with ``Services`` set to ``Print``; when truthy it also
        enables the ``Note`` element.
    :param services: iterable of strings joined with ``,`` into ``Note``
        (only read when ``service`` is truthy).
    """
    if is_picture(item):
        # Pictures get an xsi-style type attribute on the ContentItem.
        D2P1 = "http://www.w3.org/2001/XMLSchema-instance"
        content = etree.SubElement(
            root,
            "ContentItem",
            {"{%s}type" % D2P1: "PhotoContentItem"},
            nsmap={
                "d2p1": D2P1,
            },
        )
    else:
        content = etree.SubElement(root, "ContentItem")

    # ``extra`` may be missing or None on the item; normalise to a dict.
    extra = item.get("extra") or {}

    # root system fields
    etree.SubElement(root, "Reschedule").text = "false"
    etree.SubElement(root, "IsRegional").text = "false"
    etree.SubElement(root, "CanAutoRoute").text = "true"
    etree.SubElement(root, "PublishID").text = str(pub_seq_num)
    etree.SubElement(root, "Username")  # intentionally left without text
    etree.SubElement(root, "UseLocalsOut").text = "false"
    etree.SubElement(root, "UserProfileID").text = "0"
    etree.SubElement(root, "PublishOrder").text = "0"
    etree.SubElement(root, "NewCycle").text = "false"
    etree.SubElement(root, "OnlineResend").text = "false"

    # item system fields
    etree.SubElement(content, "AutoSaveID").text = "0"
    etree.SubElement(content, "Type").text = "0"
    etree.SubElement(content, "MediaType").text = "0"
    etree.SubElement(content, "Status").text = "0"

    # Services / PscCodes: pictures, an explicit service, or subject-derived.
    if is_picture(item):
        etree.SubElement(root, "Services").text = "Pictures"
        self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
        if root.find("PscCodes") is None:
            # The helper added no PscCodes element; fall back to "Online".
            etree.SubElement(root, "PscCodes").text = "Online"
    elif service:
        etree.SubElement(root, "Services").text = "Print"
        etree.SubElement(root, "PscCodes").text = service
    else:
        self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
        self._format_services(root, item)

    is_broadcast = cp.is_broadcast(item)

    # content system fields
    orig = self._get_original_item(item)
    # Both ids are reduced modulo 10**8 so they always fit 8 digits.
    seq_id = "{:08d}".format(pub_seq_num % 100000000)
    item_id = "{:08d}".format(orig["unique_id"] % 100000000)
    etree.SubElement(content, "Name")  # intentionally left without text
    etree.SubElement(content, "Cachable").text = "false"
    etree.SubElement(content, "FileName").text = filename(orig)
    etree.SubElement(content, "NewsCompID").text = item_id
    etree.SubElement(content, "SystemSlug").text = slug(orig)
    etree.SubElement(content, "ContentItemID").text = seq_id
    etree.SubElement(content, "ProfileID").text = "204"
    etree.SubElement(content, "SysContentType").text = "0"

    if is_picture(item):
        etree.SubElement(content, "PhotoContentItemID").text = item_id

    if extra.get(cp.FILENAME):
        etree.SubElement(content, "OrigTransRef").text = extra[cp.FILENAME]

    if service:
        etree.SubElement(content, "Note").text = ",".join(services)

    # timestamps
    # Fall back to the version timestamp when the item has no firstpublished.
    firstpublished = item.get("firstpublished") or item["versioncreated"]
    etree.SubElement(root, "PublishDateTime").text = self._format_datetime(
        firstpublished
    )

    # The right-hand side is evaluated before the element is created, so a
    # KeyError from the lookup leaves no element behind and the fallback
    # below adds the only EmbargoTime.
    try:
        etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
            item[SCHEDULE_SETTINGS]["utc_embargo"],
            local=True,
        )
    except KeyError:
        etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
            item.get("embargoed"), local=True
        )

    etree.SubElement(content, "CreatedDateTime").text = self._format_datetime(
        firstpublished
    )  # SDCP-380
    etree.SubElement(content, "UpdatedDateTime").text = self._format_datetime(
        item["versioncreated"], rel=True
    )

    # obvious
    etree.SubElement(content, "ContentType").text = (
        "Photo" if is_picture(item) else item["type"].capitalize()
    )

    # SDCP-309
    # Headline prefers the extra HEADLINE2 value; Headline2 (non-pictures
    # only) always carries the item headline.
    etree.SubElement(content, "Headline").text = format_maxlength(
        extra.get(cp.HEADLINE2) or item.get("headline"), OUTPUT_LENGTH_LIMIT
    )
    if not is_picture(item):
        etree.SubElement(content, "Headline2").text = format_maxlength(
            item.get("headline"), OUTPUT_LENGTH_LIMIT
        )

    etree.SubElement(content, "SlugProper").text = item.get("slugline")
    etree.SubElement(content, "Credit").text = self._format_credit(item)
    etree.SubElement(content, "Source").text = item.get("source")

    content_html = self._format_content(item, is_broadcast)
    # DirectoryText starts from the abstract; text items with content
    # overwrite it further down.
    etree.SubElement(content, "DirectoryText").text = self._format_text(
        item.get("abstract")
    )
    etree.SubElement(content, "ContentText").text = self._format_html(content_html)
    # "2" when the language string contains "fr", otherwise "1".
    etree.SubElement(content, "Language").text = (
        "2" if "fr" in item.get("language", "") else "1"
    )

    if item["type"] == "text" and content_html:
        # Replace the abstract-based DirectoryText with the body text,
        # newlines turned into spaces, passed to format_maxlength with 200.
        content.find("DirectoryText").text = format_maxlength(
            get_text(content_html, "html", lf_on_block=False).replace("\n", " "),
            200,
        )
        # The same word count is written to all three length fields.
        word_count = str(get_word_count(content_html))
        etree.SubElement(content, "Length").text = word_count
        etree.SubElement(content, "WordCount").text = word_count
        etree.SubElement(content, "BreakWordCount").text = word_count

    # Keywords are also emitted as Stocks for items from this one source.
    if item.get("keywords") and item.get("source") == globenewswire.SOURCE:
        etree.SubElement(content, "Stocks").text = ",".join(item["keywords"])

    self._format_category_index(content, item)
    self._format_genre(content, item)
    self._format_urgency(content, item.get("urgency"), item["language"])
    # Picture keywords are glued with ", ", everything else with ",".
    self._format_keyword(
        content,
        item.get("keywords"),
        ", " if item.get("type") == "picture" else ",",
    )
    self._format_dateline(content, item.get("dateline"))
    self._format_writethru(content, item)

    if item.get("byline"):
        etree.SubElement(content, "Byline").text = item["byline"]

    if is_picture(item):
        self._format_picture_metadata(content, item)
    else:
        # EditorNote is always emitted for non-pictures, even when empty.
        etree.SubElement(content, "EditorNote").text = item.get("ednote")
        if extra.get(cp.UPDATE):
            etree.SubElement(content, "UpdateNote").text = extra[cp.UPDATE]
        if extra.get(cp.CORRECTION):
            etree.SubElement(content, "Corrections").text = extra[cp.CORRECTION]

    if item.get("associations"):
        self._format_associations(content, item)
def _format_item(self, root, item, pub_seq_num, service, services):
    """Fill ``root`` and a new ``ContentItem`` child with the fields of ``item``.

    Elements are appended in the order of the statements below, so the
    statement order determines the order of the produced XML children.

    :param root: parent element; receives the ``ContentItem`` child and the
        publish-level fields (``PublishID``, ``Services``, ``PscCodes``,
        ``PublishDateTime``, ...).
    :param item: mapping describing the item being formatted.
    :param pub_seq_num: integer written to ``PublishID`` and, zero-padded to
        8 digits modulo 100000000, to the id fields of the content item.
    :param service: when truthy and the item is not a picture, written to
        ``PscCodes`` with ``Services`` set to ``Print``; when truthy it also
        enables the ``Note`` element.
    :param services: iterable of strings joined with ``,`` into ``Note``
        (only read when ``service`` is truthy).
    """
    if is_picture(item):
        # Pictures get an xsi-style type attribute on the ContentItem.
        D2P1 = 'http://www.w3.org/2001/XMLSchema-instance'
        content = etree.SubElement(root, 'ContentItem', {'{%s}type' % D2P1: 'PhotoContentItem'}, nsmap={
            'd2p1': D2P1,
        })
    else:
        content = etree.SubElement(root, 'ContentItem')

    # ``extra`` may be missing or None on the item; normalise to a dict.
    extra = item.get('extra') or {}

    # root system fields
    etree.SubElement(root, 'Reschedule').text = 'false'
    etree.SubElement(root, 'IsRegional').text = 'false'
    etree.SubElement(root, 'CanAutoRoute').text = 'true'
    etree.SubElement(root, 'PublishID').text = str(pub_seq_num)
    etree.SubElement(root, 'Username')  # intentionally left without text
    etree.SubElement(root, 'UseLocalsOut').text = 'false'
    etree.SubElement(root, 'UserProfileID').text = '0'
    etree.SubElement(root, 'PublishOrder').text = '0'
    etree.SubElement(root, 'NewCycle').text = 'false'
    etree.SubElement(root, 'OnlineResend').text = 'false'

    # item system fields
    etree.SubElement(content, 'AutoSaveID').text = '0'
    etree.SubElement(content, 'Type').text = '0'
    etree.SubElement(content, 'MediaType').text = '0'
    etree.SubElement(content, 'Status').text = '0'

    # Services / PscCodes: pictures, an explicit service, or subject-derived.
    if is_picture(item):
        etree.SubElement(root, 'Services').text = 'Pictures'
        self._format_subject_code(root, item, 'PscCodes', 'destinations')
        if root.find('PscCodes') is None:
            # The helper added no PscCodes element; fall back to 'Online'.
            etree.SubElement(root, 'PscCodes').text = 'Online'
    elif service:
        etree.SubElement(root, 'Services').text = 'Print'
        etree.SubElement(root, 'PscCodes').text = service
    else:
        self._format_subject_code(root, item, 'PscCodes', 'destinations')
        self._format_services(root, item)

    # content system fields
    # The sequence id is reduced modulo 10**8 so it always fits 8 digits;
    # it is reused for NewsCompID, ContentItemID and PhotoContentItemID.
    seq_id = '{:08d}'.format(pub_seq_num % 100000000)
    # The same formatted name feeds both FileName and SystemSlug.
    filename = self._format_filename(item)
    etree.SubElement(content, 'Name')  # intentionally left without text
    etree.SubElement(content, 'Cachable').text = 'false'
    etree.SubElement(content, 'FileName').text = filename
    etree.SubElement(content, 'NewsCompID').text = seq_id
    etree.SubElement(content, 'SystemSlug').text = filename
    etree.SubElement(content, 'ContentItemID').text = seq_id
    etree.SubElement(content, 'ProfileID').text = '204'
    etree.SubElement(content, 'SysContentType').text = '0'

    if is_picture(item):
        etree.SubElement(content, 'PhotoContentItemID').text = seq_id

    if extra.get(cp.FILENAME):
        etree.SubElement(content, 'OrigTransRef').text = extra[cp.FILENAME]

    if service:
        etree.SubElement(content, 'Note').text = ','.join(services)

    # timestamps
    # Fall back to the version timestamp when the item has no firstpublished.
    firstpublished = item.get('firstpublished') or item['versioncreated']
    etree.SubElement(
        root, 'PublishDateTime').text = self._format_datetime(firstpublished)

    # The right-hand side is evaluated before the element is created, so a
    # KeyError from the lookup leaves no element behind and the fallback
    # below adds the only EmbargoTime.
    try:
        etree.SubElement(content, 'EmbargoTime').text = self._format_datetime(
            item[SCHEDULE_SETTINGS]['utc_embargo'],
            local=True,
        )
    except KeyError:
        etree.SubElement(content, 'EmbargoTime').text = self._format_datetime(
            item.get('embargoed'), local=True)

    etree.SubElement(content, 'CreatedDateTime').text = self._format_datetime(
        item['firstcreated'])
    etree.SubElement(content, 'UpdatedDateTime').text = self._format_datetime(
        item['versioncreated'], rel=True)

    # obvious
    etree.SubElement(content, 'ContentType').text = 'Photo' if is_picture(
        item) else item['type'].capitalize()

    # SDCP-309
    # Headline prefers the extra HEADLINE2 value; Headline2 (non-pictures
    # only) always carries the item headline.
    etree.SubElement(content, 'Headline').text = format_maxlength(
        extra.get(cp.HEADLINE2) or item.get('headline'), OUTPUT_LENGTH_LIMIT)
    if not is_picture(item):
        etree.SubElement(content, 'Headline2').text = format_maxlength(
            item.get('headline'), OUTPUT_LENGTH_LIMIT)

    etree.SubElement(content, 'SlugProper').text = item.get('slugline')
    etree.SubElement(content, 'Credit').text = self._format_credit(item)
    etree.SubElement(content, 'Source').text = item.get('source')

    # DirectoryText starts from the abstract; text items with a body
    # overwrite it further down.
    etree.SubElement(content, 'DirectoryText').text = self._format_text(
        item.get('abstract'))
    etree.SubElement(content, 'ContentText').text = self._format_html(
        self._format_content(item))
    # '2' when the language string contains 'fr', otherwise '1'.
    etree.SubElement(
        content, 'Language').text = '2' if 'fr' in item.get('language', '') else '1'

    if item['type'] == 'text' and item.get('body_html'):
        # Replace the abstract-based DirectoryText with the body text,
        # newlines turned into spaces, passed to format_maxlength with 200.
        content.find('DirectoryText').text = format_maxlength(
            get_text(item['body_html'], 'html', lf_on_block=False).replace('\n', ' '),
            200)
        # Prefer a truthy stored word count; otherwise count the body.
        # The same value is written to all three length fields.
        word_count = str(item['word_count'] if item.get('word_count') else get_word_count(item['body_html']))
        etree.SubElement(content, 'Length').text = word_count
        etree.SubElement(content, 'WordCount').text = word_count
        etree.SubElement(content, 'BreakWordCount').text = word_count

    # Keywords are also emitted as Stocks for items from this one source.
    if item.get('keywords') and item.get('source') == globenewswire.SOURCE:
        etree.SubElement(content, 'Stocks').text = ','.join(item['keywords'])

    self._format_index(content, item)
    self._format_category(content, item)
    self._format_genre(content, item)
    self._format_urgency(content, item.get('urgency'), item['language'])
    # Picture keywords are glued with ', ', everything else with ','.
    self._format_keyword(content, item.get('keywords'), ', ' if item.get('type') == 'picture' else ',')
    self._format_dateline(content, item.get('dateline'))
    self._format_writethru(content, item.get('rewrite_sequence'), item['language'])

    if item.get('byline'):
        etree.SubElement(content, 'Byline').text = item['byline']

    if is_picture(item):
        self._format_picture_metadata(content, item)
    else:
        # EditorNote is always emitted for non-pictures, even when empty.
        etree.SubElement(content, 'EditorNote').text = item.get('ednote')
        if extra.get(cp.UPDATE):
            etree.SubElement(content, 'UpdateNote').text = extra[cp.UPDATE]
        if extra.get(cp.CORRECTION):
            etree.SubElement(content, 'Corrections').text = extra[cp.CORRECTION]

    if item.get('associations'):
        self._format_associations(content, item)