def __set_published_item_expiry(self, doc):
    """Set the expiry on a published item.

    The offset is taken from the publish schedule when present, otherwise
    from the embargo, and passed to ``get_expiry`` together with the
    item's desk and stage.

    :param dict doc: doc on which publishing action is performed
    """
    task = doc.get('task', {})
    schedule_offset = get_utc_schedule(doc, PUBLISH_SCHEDULE) or get_utc_schedule(doc, EMBARGO)
    doc['expiry'] = get_expiry(task.get('desk'), task.get('stage'), offset=schedule_offset)
def _set_item_expiry(self, updates, original):
    """Set the expiry for the item.

    :param dict updates: update doc on which publishing action is performed
    :param dict original: original doc being published
    """
    desk_id = original.get('task', {}).get('desk')
    stage_id = original.get('task', {}).get('stage')
    # Default to no offset: without this, `offset` is unbound (NameError)
    # when neither EMBARGO nor PUBLISH_SCHEDULE is present in either doc.
    offset = None
    if EMBARGO in updates or PUBLISH_SCHEDULE in updates:
        offset = get_utc_schedule(updates, PUBLISH_SCHEDULE) or get_utc_schedule(updates, EMBARGO)
    elif EMBARGO in original or PUBLISH_SCHEDULE in original:
        offset = get_utc_schedule(original, PUBLISH_SCHEDULE) or get_utc_schedule(original, EMBARGO)
    updates['expiry'] = get_expiry(desk_id, stage_id, offset=offset)
def __format_head(self, article, head):
    """Populate the NITF head element: title, tobject, docdata and keywords."""
    SubElement(head, 'title').text = article.get('headline', '')
    tobject = SubElement(head, 'tobject', {'tobject.type': 'news'})
    if 'genre' in article and len(article['genre']) > 0:
        SubElement(tobject, 'tobject.property',
                   {'tobject.property.type': article['genre'][0]['name']})
    self.__format_subjects(article, tobject)
    if article.get(EMBARGO):
        # Embargoed items carry a fixed status; expiry is the embargo time.
        docdata = SubElement(head, 'docdata', {'management-status': 'embargoed'})
        expire_norm = str(get_utc_schedule(article, EMBARGO).isoformat())
    else:
        docdata = SubElement(head, 'docdata',
                             {'management-status': article.get('pubstatus', '')})
        expire_norm = str(article.get('expiry', ''))
    SubElement(docdata, 'date.expire', {'norm': expire_norm})
    SubElement(docdata, 'urgency', {'ed-urg': str(article.get('urgency', ''))})
    SubElement(docdata, 'date.issue', {'norm': str(article.get('firstcreated', ''))})
    SubElement(docdata, 'doc-id', attrib={'id-string': article.get('guid', '')})
    ednote = article.get('ednote')
    if ednote:
        SubElement(docdata, 'ed-msg', {'info': ednote})
    self.__format_keywords(article, head)
def _format_news_management(self, formatted_article, news_item):
    """Create a NewsManagement element under *news_item*.

    :param dict formatted_article:
    :param Element news_item:
    """
    mgmt = SubElement(news_item, "NewsManagement")
    SubElement(mgmt, 'NewsItemType', {'FormalName': 'News'})
    SubElement(mgmt, 'FirstCreated').text = \
        formatted_article['firstcreated'].strftime('%Y%m%dT%H%M%S+0000')
    SubElement(mgmt, 'ThisRevisionCreated').text = \
        formatted_article['versioncreated'].strftime('%Y%m%dT%H%M%S+0000')
    if formatted_article.get(EMBARGO):
        # Embargoed items publish their real pubstatus as a future status change.
        SubElement(mgmt, 'Status', {'FormalName': 'Embargoed'})
        will_change = SubElement(mgmt, 'StatusWillChange')
        SubElement(will_change, 'FutureStatus', {'FormalName': formatted_article['pubstatus']})
        SubElement(will_change, 'DateAndTime').text = \
            get_utc_schedule(formatted_article, EMBARGO).isoformat()
    else:
        SubElement(mgmt, 'Status', {'FormalName': formatted_article['pubstatus']})
    if formatted_article.get('urgency'):
        SubElement(mgmt, 'Urgency', {'FormalName': str(formatted_article['urgency'])})
    instruction = 'Correction' if formatted_article['state'] == 'corrected' else 'Update'
    SubElement(mgmt, 'Instruction', {'FormalName': instruction})
    SubElement(mgmt, 'Property', {'FormalName': 'reuters.3rdPartyStyleGuideVersion', 'Value': '2.1'})
    usn = 'AAP' + str(int(formatted_article.get('unique_id', 1)) % 100000) + 'a'
    SubElement(mgmt, 'Property', {'FormalName': 'USN', 'Value': usn})
def test_get_utc_schedule(self):
    """get_utc_schedule returns the stored embargo datetime unchanged."""
    expected = utcnow() + timedelta(minutes=10)
    item = {'embargo': expected}
    self.assertEqual(get_utc_schedule(item, 'embargo'), expected)
def _format_newsmanagement(self, newsitem):
    """Creates the NewsManagement element and add it to `newsitem`.

    :param Element newsitem:
    """
    mgmt = SubElement(newsitem, 'NewsManagement')
    SubElement(mgmt, 'NewsItemType', {'FormalName': 'News'})
    SubElement(mgmt, 'FirstCreated').text = \
        self._article.get('firstcreated').strftime(self.DATETIME_FORMAT)
    SubElement(mgmt, 'ThisRevisionCreated').text = \
        self._article['versioncreated'].strftime(self.DATETIME_FORMAT)
    if self._article.get(EMBARGO):
        # While embargoed, the actual pubstatus is reported as a future status.
        SubElement(mgmt, 'Status', {'FormalName': 'Embargoed'})
        will_change = SubElement(mgmt, 'StatusWillChange')
        SubElement(will_change, 'FutureStatus',
                   {'FormalName': self._article.get('pubstatus', '').upper()})
        SubElement(will_change, 'DateAndTime').text = \
            get_utc_schedule(self._article, EMBARGO).isoformat()
    else:
        SubElement(mgmt, 'Status',
                   {'FormalName': self._article.get('pubstatus', '').upper()})
def format(self, article, subscriber, codes=None):
    """Build the parameters for the SMS InsertAlerts stored procedure.

    :return: returns the sequence number of the subscriber and the
        constructed parameter dictionary, as a one-element list of tuples
    """
    try:
        pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
        sms_message = article.get('sms_message', article.get('abstract', '')).replace('\'', '\'\'')
        # category = 1 is used to indicate a test message
        if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True:
            category = '1'
        else:
            category = article.get('anpa_category', [{}])[0].get('qcode').upper()
        odbc_item = {
            'Sequence': pub_seq_num,
            'Category': category,
            'Headline': BeautifulSoup(sms_message, 'html.parser').text,
            'Priority': map_priority(article.get('priority')),
        }
        body = self.append_body_footer(article)
        if article.get(EMBARGO):
            embargo = '{}{}'.format('Embargo Content. Timestamp: ',
                                    get_utc_schedule(article, EMBARGO).isoformat())
            body = embargo + body
        if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
            body = BeautifulSoup(body, "html.parser").text
        odbc_item['StoryText'] = body.replace('\'', '\'\'')  # @article_text
        odbc_item['ident'] = '0'
        return [(pub_seq_num, json.dumps(odbc_item))]
    except Exception as ex:
        raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def _format_news_management(self, article, news_item):
    """Create a NewsManagement element.

    :param dict article:
    :param Element news_item:
    """
    mgmt = SubElement(news_item, "NewsManagement")
    SubElement(mgmt, 'NewsItemType', {'FormalName': 'News'})
    SubElement(mgmt, 'FirstCreated').text = \
        article['firstcreated'].strftime('%Y%m%dT%H%M%S+0000')
    SubElement(mgmt, 'ThisRevisionCreated').text = \
        article['versioncreated'].strftime('%Y%m%dT%H%M%S+0000')
    if article.get(EMBARGO):
        # Embargoed: real pubstatus becomes a future status change.
        SubElement(mgmt, 'Status', {'FormalName': 'Embargoed'})
        will_change = SubElement(mgmt, 'StatusWillChange')
        SubElement(will_change, 'FutureStatus', {'FormalName': article['pubstatus']})
        SubElement(will_change, 'DateAndTime').text = \
            get_utc_schedule(article, EMBARGO).isoformat()
    else:
        SubElement(mgmt, 'Status', {'FormalName': article['pubstatus']})
    if article.get('urgency'):
        SubElement(mgmt, 'Urgency', {'FormalName': str(article['urgency'])})
    formal_name = 'Correction' if article['state'] == 'corrected' else 'Update'
    SubElement(mgmt, 'Instruction', {'FormalName': formal_name})
def _format_date_expire(self, article, docdata):
    """Set docdata's management-status and add a date.expire child element."""
    if article.get(EMBARGO):
        docdata.attrib["management-status"] = "embargoed"
        norm = str(get_utc_schedule(article, EMBARGO).isoformat())
    else:
        docdata.attrib["management-status"] = article.get("pubstatus", "")
        norm = str(article.get("expiry", ""))
    SubElement(docdata, "date.expire", {"norm": norm})
def __format_head(self, article, head):
    """Populate the NITF head element for *article*."""
    title = SubElement(head, 'title')
    title.text = article.get('headline', '')
    tobject = SubElement(head, 'tobject', {'tobject.type': 'news'})
    if 'genre' in article and len(article['genre']) > 0:
        SubElement(tobject, 'tobject.property',
                   {'tobject.property.type': article['genre'][0]['name']})
    self.__format_subjects(article, tobject)
    if article.get(EMBARGO):
        # Embargoed content: status is fixed and the expiry is the embargo time.
        docdata = SubElement(head, 'docdata', {'management-status': 'embargoed'})
        SubElement(docdata, 'date.expire',
                   {'norm': str(get_utc_schedule(article, EMBARGO).isoformat())})
    else:
        docdata = SubElement(head, 'docdata',
                             {'management-status': article.get('pubstatus', '')})
        SubElement(docdata, 'date.expire', {'norm': str(article.get('expiry', ''))})
    for tag, attr, key in (('urgency', 'ed-urg', 'urgency'),
                           ('date.issue', 'norm', 'firstcreated')):
        SubElement(docdata, tag, {attr: str(article.get(key, ''))})
    SubElement(docdata, 'doc-id', attrib={'id-string': article.get('guid', '')})
    if article.get('ednote'):
        SubElement(docdata, 'ed-msg', {'info': article.get('ednote', '')})
    self.__format_keywords(article, head)
def _format_date_expire(self, article, docdata):
    """Record the management status and expiry date on *docdata*."""
    embargoed = article.get(EMBARGO)
    if embargoed:
        docdata.attrib['management-status'] = 'embargoed'
        expire = str(get_utc_schedule(article, EMBARGO).isoformat())
    else:
        docdata.attrib['management-status'] = article.get('pubstatus', '')
        expire = str(article.get('expiry', ''))
    SubElement(docdata, 'date.expire', {'norm': expire})
def _set_item_expiry(self, updates, original):
    """Set the expiry for the item.

    :param dict updates: update doc on which publishing action is performed
    :param dict original: original doc being published
    """
    desk_id = original.get("task", {}).get("desk")
    stage_id = original.get("task", {}).get("stage")
    # Default to no offset: without this, `offset` is unbound (NameError) when
    # neither EMBARGO nor PUBLISH_SCHEDULE appears in updates or original.
    offset = None
    if EMBARGO in updates or PUBLISH_SCHEDULE in updates:
        offset = get_utc_schedule(updates, PUBLISH_SCHEDULE) or get_utc_schedule(updates, EMBARGO)
    elif EMBARGO in original or PUBLISH_SCHEDULE in original:
        offset = get_utc_schedule(original, PUBLISH_SCHEDULE) or get_utc_schedule(original, EMBARGO)

    if app.settings.get("PUBLISHED_CONTENT_EXPIRY_MINUTES"):
        # Global published-content expiry overrides desk/stage settings.
        updates["expiry"] = get_expiry_date(app.settings["PUBLISHED_CONTENT_EXPIRY_MINUTES"], offset=offset)
    else:
        updates["expiry"] = get_expiry(desk_id, stage_id, offset=offset)
def _set_item_expiry(self, updates, original):
    """Set the expiry for the item.

    :param dict updates: update doc on which publishing action is performed
    :param dict original: original doc being published
    """
    desk_id = original.get('task', {}).get('desk')
    stage_id = original.get('task', {}).get('stage')
    # Initialise so `offset` is defined even when neither EMBARGO nor
    # PUBLISH_SCHEDULE is present (previously raised NameError below).
    offset = None
    if EMBARGO in updates or PUBLISH_SCHEDULE in updates:
        offset = get_utc_schedule(updates, PUBLISH_SCHEDULE) or get_utc_schedule(updates, EMBARGO)
    elif EMBARGO in original or PUBLISH_SCHEDULE in original:
        offset = get_utc_schedule(original, PUBLISH_SCHEDULE) or get_utc_schedule(original, EMBARGO)

    if app.settings.get('PUBLISHED_CONTENT_EXPIRY_MINUTES'):
        # Global published-content expiry overrides desk/stage settings.
        updates['expiry'] = get_expiry_date(app.settings['PUBLISHED_CONTENT_EXPIRY_MINUTES'], offset=offset)
    else:
        updates['expiry'] = get_expiry(desk_id, stage_id, offset=offset)
def add_embargo(self, odbc_item, article):
    """Add the embargo text to the article if required.

    :param odbc_item: delivery item dict, mutated in place
    :param article: source article
    :return:
    """
    if not article.get(EMBARGO):
        return
    embargo = '{}{}\r\n'.format('Embargo Content. Timestamp: ',
                                get_utc_schedule(article, EMBARGO).isoformat())
    odbc_item['article_text'] = embargo + odbc_item['article_text']
def add_embargo(self, odbc_item, article):
    """Add the embargo text to the article if required.

    :param odbc_item: delivery item dict, mutated in place
    :param article: source article
    :return:
    """
    if article.get(EMBARGO):
        timestamp = get_utc_schedule(article, EMBARGO).isoformat()
        notice = '{}{}\r\n'.format('Embargo Content. Timestamp: ', timestamp)
        odbc_item['article_text'] = notice + odbc_item['article_text']
def get_subscribers(self, doc, target_media_type):
    """Get the subscribers for this document based on the target_media_type for article Correction.

    1. The article is sent to Subscribers (digital and wire) who has received the article previously.
    2. For subsequent takes, only published to previously published wire clients. Digital clients don't get
       individual takes but digital client takes package.
    3. If the item has embargo and is a future date then fetch active Wire Subscribers. Otherwise fetch
       Active Subscribers. After fetching exclude those who received the article previously from
       active subscribers list.
    4. If article has 'target_regions' property then exclude non-wire subscribers (via ``non_digital``).
    5. Filter the subscriber that have not received the article previously against publish filters and
       global filters for this document.

    :param doc: Document to correct
    :param target_media_type: dictate if the doc being queued is a Takes Package or an Individual Article.
        Valid values are - Wire, Digital. If Digital then the doc being queued is a Takes Package and
        if Wire then the doc being queues is an Individual Article.
    :return: (list, list, dict) List of filtered subscribers, List of subscribers that have not received
        item previously, dict of product codes per subscriber
    """
    subscribers, subscribers_yet_to_receive = [], []
    # step 1
    query = {'$and': [{'item_id': doc['item_id']},
                      {'publishing_action': {'$in': [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED]}}]}
    subscribers, subscriber_codes = self._get_subscribers_for_previously_sent_items(query)

    if subscribers:
        # step 2
        if not self.takes_package_service.get_take_package_id(doc):
            # Step 3
            query = {'is_active': True}
            if doc.get(EMBARGO) and get_utc_schedule(doc, EMBARGO) > utcnow():
                query['subscriber_type'] = SUBSCRIBER_TYPES.WIRE
            # Ta 04/05/16: Commenting out this section for ticket SD-4465
            # query['media_type'] = SUBSCRIBER_MEDIA_TYPES.MEDIA
            active_subscribers = list(get_resource_service('subscribers').get(req=None, lookup=query))
            # keep only active subscribers that did NOT already receive the item
            subscribers_yet_to_receive = [a for a in active_subscribers
                                          if not any(a[config.ID_FIELD] == s[config.ID_FIELD]
                                                     for s in subscribers)]

        if len(subscribers_yet_to_receive) > 0:
            # Step 4
            # NOTE(review): original docstring said 'targeted_for' but the code checks
            # 'target_regions' — confirm which property is intended.
            if doc.get('target_regions'):
                subscribers_yet_to_receive = list(self.non_digital(subscribers_yet_to_receive))
            # Step 5
            subscribers_yet_to_receive, codes = \
                self.filter_subscribers(doc, subscribers_yet_to_receive, target_media_type)
            if codes:
                subscriber_codes.update(codes)

    return subscribers, subscribers_yet_to_receive, subscriber_codes
def _format_item_meta(self, article, item_meta, item):
    """Fill itemMeta with the mandatory properties plus embargo, ednote and signal."""
    self._format_itemClass(article, item_meta)
    self._format_provider(item_meta)
    self._format_versioncreated(article, item_meta)
    self._format_firstcreated(article, item_meta)
    self._format_pubstatus(article, item_meta)

    if article.get(EMBARGO):
        embargoed = SubElement(item_meta, 'embargoed')
        embargoed.text = get_utc_schedule(article, EMBARGO).isoformat()

    # optional properties
    self._format_ednote(article, item_meta)
    self._format_signal(article, item_meta)
def get_subscribers(self, doc, target_media_type):
    """
    Get the subscribers for this document based on the target_media_type for article Correction.

    1. The article is sent to Subscribers (digital and wire) who has received the article previously.
    2. For subsequent takes, only published to previously published wire clients. Digital clients don't get
       individual takes but digital client takes package.
    3. If the item has embargo and is a future date then fetch active Wire Subscribers. Otherwise fetch
       Active Subscribers. After fetching exclude those who received the article previously from
       active subscribers list.
    4. If article has 'targeted_for' property then exclude subscribers of type Internet from Subscribers list.
    5. Filter the subscriber that have not received the article previously against publish filters and
       global filters for this document.

    :param doc: Document to correct
    :param target_media_type: dictate if the doc being queued is a Takes Package or an Individual Article.
        Valid values are - Wire, Digital. If Digital then the doc being queued is a Takes Package and
        if Wire then the doc being queues is an Individual Article.
    :return: (list, list, dict) List of filtered subscribers, List of subscribers that have not received
        item previously, dict of product codes per subscriber
    """
    subscribers, subscribers_yet_to_receive = [], []
    # step 1
    query = {'$and': [{'item_id': doc['item_id']},
                      {'publishing_action': {'$in': [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED]}}]}
    subscribers, subscriber_codes = self._get_subscribers_for_previously_sent_items(query)

    if subscribers:
        # step 2
        if not self.takes_package_service.get_take_package_id(doc):
            # Step 3
            query = {'is_active': True}
            if doc.get(EMBARGO) and get_utc_schedule(doc, EMBARGO) > utcnow():
                query['subscriber_type'] = SUBSCRIBER_TYPES.WIRE
            active_subscribers = list(get_resource_service('subscribers').get(req=None, lookup=query))
            # keep only active subscribers that did NOT already receive the item
            subscribers_yet_to_receive = [a for a in active_subscribers
                                          if not any(a[config.ID_FIELD] == s[config.ID_FIELD]
                                                     for s in subscribers)]

        if len(subscribers_yet_to_receive) > 0:
            # Step 4
            if doc.get('targeted_for'):
                subscribers_yet_to_receive = list(self.non_digital(subscribers_yet_to_receive))
            # Step 5: targeted items are filtered as Wire regardless of target_media_type
            subscribers_yet_to_receive, codes = \
                self.filter_subscribers(doc, subscribers_yet_to_receive,
                                        SUBSCRIBER_TYPES.WIRE if doc.get('targeted_for') else target_media_type)
            if codes:
                subscriber_codes.update(codes)

    return subscribers, subscribers_yet_to_receive, subscriber_codes
def _format_news_management(self, formatted_article, news_item):
    """Create a NewsManagement element.

    :param dict formatted_article:
    :param Element news_item:
    """
    news_management = SubElement(news_item, "NewsManagement")
    SubElement(news_management, 'NewsItemType', {'FormalName': 'News'})
    first_created = formatted_article['firstcreated'].strftime('%Y%m%dT%H%M%S+0000')
    SubElement(news_management, 'FirstCreated').text = first_created
    revision_created = formatted_article['versioncreated'].strftime('%Y%m%dT%H%M%S+0000')
    SubElement(news_management, 'ThisRevisionCreated').text = revision_created
    if formatted_article.get(EMBARGO):
        # Embargoed: real pubstatus is announced as a future status change.
        SubElement(news_management, 'Status', {'FormalName': 'Embargoed'})
        status_will_change = SubElement(news_management, 'StatusWillChange')
        SubElement(status_will_change, 'FutureStatus',
                   {'FormalName': formatted_article['pubstatus']})
        SubElement(status_will_change, 'DateAndTime').text = \
            get_utc_schedule(formatted_article, EMBARGO).isoformat()
    else:
        SubElement(news_management, 'Status', {'FormalName': formatted_article['pubstatus']})
    if formatted_article.get('urgency'):
        SubElement(news_management, 'Urgency',
                   {'FormalName': str(formatted_article['urgency'])})
    if formatted_article['state'] == 'corrected':
        SubElement(news_management, 'Instruction', {'FormalName': 'Correction'})
    else:
        SubElement(news_management, 'Instruction', {'FormalName': 'Update'})
    SubElement(news_management, 'Property',
               {'FormalName': 'reuters.3rdPartyStyleGuideVersion', 'Value': '2.1'})
    # USN: 'AAP' + unique_id modulo 100000 + trailing 'a'
    usn_value = 'AAP' + str(int(formatted_article.get('unique_id', 1)) % 100000) + 'a'
    SubElement(news_management, 'Property', {'FormalName': 'USN', 'Value': usn_value})
def _format_item_set(self, article, item_set, item_type):
    """Construct the item element (newsItem or packageItem) and append the item_meta and contentMeta entities

    :param dict article:
    :param element item_set:
    :param str item_type:
    """
    item = SubElement(item_set, item_type,
                      attrib={'standard': 'NewsML-G2', 'standardversion': '2.18',
                              'guid': article['guid'],
                              'version': str(article[superdesk.config.VERSION]),
                              XML_LANG: article.get('language', 'en'),
                              'conformance': 'power'})
    SubElement(item, 'catalogRef',
               attrib={'href': 'http://www.iptc.org/std/catalog/catalog.IPTC-G2-Standards_25.xml'})
    self._format_rights(item, article)

    # itemMeta: mandatory management properties
    item_meta = SubElement(item, 'itemMeta')
    self._format_itemClass(article, item_meta)
    self._format_provider(item_meta)
    self._format_versioncreated(article, item_meta)
    self._format_firstcreated(article, item_meta)
    self._format_pubstatus(article, item_meta)

    if article.get(EMBARGO):
        SubElement(item_meta, 'embargoed').text = \
            get_utc_schedule(article, EMBARGO).isoformat()

    # optional properties
    self._format_ednote(article, item_meta)
    self._format_signal(article, item_meta)

    # contentMeta: descriptive properties of the content itself
    content_meta = SubElement(item, 'contentMeta')
    SubElement(content_meta, 'urgency').text = str(article.get('urgency', 5))
    self._format_timestamps(article, content_meta)
    self._format_creator(article, content_meta)
    self._format_located(article, content_meta)
    self._format_subject(article, content_meta)
    self._format_genre(article, content_meta)
    self._format_slugline(article, content_meta)
    self._format_headline(article, content_meta)
    self._format_place(article, content_meta)
    self._format_category(article, content_meta)
    self._format_company_codes(article, content_meta, item)

    # media items additionally carry description and creditline
    if article[ITEM_TYPE] in {CONTENT_TYPE.PICTURE, CONTENT_TYPE.AUDIO, CONTENT_TYPE.VIDEO}:
        self._format_description(article, content_meta)
        self._format_creditline(article, content_meta)
    return item
def _format_item_set(self, article, item_set, item_type):
    """
    Construct the item element (newsItem or packageItem) and append the item_meta and contentMeta entities

    :param dict article:
    :param element item_set:
    :param str item_type:
    """
    item = SubElement(item_set, item_type,
                      attrib={'standard': 'NewsML-G2', 'standardversion': '2.18',
                              'guid': article['guid'],
                              'version': str(article[superdesk.config.VERSION]),
                              'xml:lang': article.get('language', 'en'),
                              'conformance': 'power'})
    SubElement(item, 'catalogRef',
               attrib={'href': 'http://www.iptc.org/std/catalog/catalog.IPTC-G2-Standards_25.xml'})
    self._format_rights(item, article)

    # itemMeta: mandatory management properties
    item_meta = SubElement(item, 'itemMeta')
    self._format_itemClass(article, item_meta)
    self._format_provider(item_meta)
    self._format_versioncreated(article, item_meta)
    self._format_firstcreated(article, item_meta)
    self._format_pubstatus(article, item_meta)

    if article.get(EMBARGO):
        SubElement(item_meta, 'embargoed').text = \
            get_utc_schedule(article, EMBARGO).isoformat()

    # optional properties
    self._format_ednote(article, item_meta)
    self._format_signal(article, item_meta)

    # contentMeta: descriptive properties of the content itself
    content_meta = SubElement(item, 'contentMeta')
    SubElement(content_meta, 'urgency').text = str(article.get('urgency', 5))
    self._format_timestamps(article, content_meta)
    self._format_creator(article, content_meta)
    self._format_located(article, content_meta)
    self._format_subject(article, content_meta)
    self._format_genre(article, content_meta)
    self._format_slugline(article, content_meta)
    self._format_headline(article, content_meta)
    self._format_place(article, content_meta)
    self._format_category(article, content_meta)
    self._format_company_codes(article, content_meta, item)

    # media items additionally carry description and creditline
    if article[ITEM_TYPE] in {CONTENT_TYPE.PICTURE, CONTENT_TYPE.AUDIO, CONTENT_TYPE.VIDEO}:
        self._format_description(article, content_meta)
        self._format_creditline(article, content_meta)
    return item
def enqueue_items(published_items):
    """Creates the corresponding entries in the publish queue for each item.

    Items whose publish schedule lies in the future are skipped; they are
    expected to be picked up by a later run.

    :param list published_items: the list of items marked for publishing
    """
    failed_items = {}
    current_utc = utcnow()

    for queue_item in published_items:
        try:
            schedule_utc_datetime = get_utc_schedule(queue_item, PUBLISH_SCHEDULE)
            if not schedule_utc_datetime or schedule_utc_datetime < current_utc:
                enqueue_item(queue_item)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are no longer swallowed; genuine failures are still recorded.
            logger.exception('Failed to queue item {}'.format(queue_item.get('_id')))
            failed_items[str(queue_item.get('_id'))] = queue_item

    # mark failed items as pending so that Celery tasks will try again
    if len(failed_items) > 0:
        logger.error('Failed to publish the following items: {}'.format(failed_items.keys()))
def _format_news_management(self, article, news_item):
    """Create a NewsManagement element.

    :param dict article:
    :param Element news_item:
    """
    news_management = SubElement(news_item, "NewsManagement")
    SubElement(news_management, "NewsItemType", {"FormalName": "News"})
    timestamp_format = "%Y%m%dT%H%M%S+0000"
    SubElement(news_management, "FirstCreated").text = \
        article["firstcreated"].strftime(timestamp_format)
    SubElement(news_management, "ThisRevisionCreated").text = \
        article["versioncreated"].strftime(timestamp_format)
    if article.get(EMBARGO):
        # Embargoed: actual pubstatus is reported as a future status change.
        SubElement(news_management, "Status", {"FormalName": "Embargoed"})
        status_will_change = SubElement(news_management, "StatusWillChange")
        SubElement(status_will_change, "FutureStatus", {"FormalName": article["pubstatus"]})
        SubElement(status_will_change, "DateAndTime").text = \
            get_utc_schedule(article, EMBARGO).isoformat()
    else:
        SubElement(news_management, "Status", {"FormalName": article["pubstatus"]})
    if article.get("urgency"):
        SubElement(news_management, "Urgency", {"FormalName": str(article["urgency"])})
    instruction = "Correction" if article["state"] == "corrected" else "Update"
    SubElement(news_management, "Instruction", {"FormalName": instruction})
def format(self, article, subscriber, codes=None):
    """
    Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure

    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    """
    try:
        pub_seq_num = superdesk.get_resource_service(
            'subscribers').generate_sequence_number(subscriber)
        # double single-quotes to escape them for the stored procedure
        sms_message = article.get('sms_message', article.get('abstract', '')).replace('\'', '\'\'')
        # category = 1 is used to indicate a test message
        # NOTE(review): assumes the first anpa_category entry has a qcode;
        # a missing qcode would raise AttributeError here — confirm upstream validation.
        category = '1' if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True \
            else article.get('anpa_category', [{}])[0].get('qcode').upper()
        odbc_item = {
            'Sequence': pub_seq_num,
            'Category': category,
            'Headline': BeautifulSoup(sms_message, 'html.parser').text,
            'Priority': map_priority(article.get('priority'))
        }
        body = self.append_body_footer(article)
        if article.get(EMBARGO):
            embargo = '{}{}'.format(
                'Embargo Content. Timestamp: ',
                get_utc_schedule(article, EMBARGO).isoformat())
            body = embargo + body

        if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
            body = BeautifulSoup(body, "html.parser").text

        odbc_item['StoryText'] = body.replace('\'', '\'\'')  # @article_text
        odbc_item['ident'] = '0'

        return [(pub_seq_num, json.dumps(odbc_item))]
    except Exception as ex:
        raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def enqueue_items(published_items):
    """Creates the corresponding entries in the publish queue for each item.

    Items with a publish schedule in the future are skipped; a later run
    is expected to pick them up once the schedule time has passed.

    :param list published_items: the list of items marked for publishing
    """
    failed_items = {}
    current_utc = utcnow()

    for queue_item in published_items:
        try:
            schedule_utc_datetime = get_utc_schedule(queue_item, PUBLISH_SCHEDULE)
            if not schedule_utc_datetime or schedule_utc_datetime < current_utc:
                enqueue_item(queue_item)
        except Exception:
            # Was a bare `except:`, which also caught SystemExit and
            # KeyboardInterrupt; narrowed to Exception.
            logger.exception('Failed to queue item {}'.format(queue_item.get('_id')))
            failed_items[str(queue_item.get('_id'))] = queue_item

    # mark failed items as pending so that Celery tasks will try again
    if len(failed_items) > 0:
        logger.error('Failed to publish the following items: {}'.format(failed_items.keys()))
def format(self, article, subscriber):
    """Format *article* as ANPA 1312 output, producing one document per anpa_category.

    :param dict article: article to format
    :param dict subscriber: subscriber the output is generated for
    :return: list of (sequence number, bytes) tuples
    :raises FormatterError.AnpaFormatterError: wraps any failure
    """
    try:
        docs = []
        for category in article.get('anpa_category'):
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            anpa = []

            # selector codes are only injected for those subscribers that are defined
            # in the mapper
            selectors = dict()
            SelectorcodeMapper().map(article, category.get('qcode').upper(),
                                     subscriber=subscriber,
                                     formatted_item=selectors)
            if 'selector_codes' in selectors and selectors['selector_codes']:
                anpa.append(b'\x05')
                anpa.append(selectors['selector_codes'].encode('ascii'))
                anpa.append(b'\x0D\x0A')

            # start of message header (syn syn soh)
            anpa.append(b'\x16\x16\x01')
            anpa.append(article.get('service_level', 'a').lower().encode('ascii'))

            # story number
            anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

            # field seperator
            anpa.append(b'\x0A')  # -LF
            anpa.append(map_priority(article.get('priority')).encode('ascii'))
            anpa.append(b'\x20')

            anpa.append(category['qcode'].encode('ascii'))

            anpa.append(b'\x13')
            # format identifier
            if article[ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED:
                anpa.append(b'\x12')
            else:
                anpa.append(b'\x11')
            anpa.append(b'\x20')

            # keyword (capped at 24 characters)
            keyword = 'bc-{}'.format(self.append_legal(article=article, truncate=True)).replace(' ', '-')
            keyword = keyword[:24] if len(keyword) > 24 else keyword
            anpa.append(keyword.encode('ascii'))
            anpa.append(b'\x20')

            # version field
            anpa.append(b'\x20')

            # reference field
            anpa.append(b'\x20')

            # filing date
            anpa.append('{}-{}'.format(article['_updated'].strftime('%m'),
                                       article['_updated'].strftime('%d')).encode('ascii'))
            anpa.append(b'\x20')

            # add the word count
            anpa.append(str(article.get('word_count', '0000')).zfill(4).encode('ascii'))
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x02')  # STX

            self._process_headline(anpa, article, category['qcode'].encode('ascii'))

            keyword = self.append_legal(article=article, truncate=True).encode('ascii', 'ignore')
            anpa.append(keyword)
            take_key = article.get('anpa_take_key', '').encode('ascii', 'ignore')
            anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            if BYLINE in article:
                anpa.append(article.get(BYLINE).encode('ascii', 'ignore'))
                anpa.append(b'\x0D\x0A')

            if article.get('dateline', {}).get('text'):
                anpa.append(article.get('dateline').get('text').encode('ascii', 'ignore'))

            body = self.append_body_footer(article)
            # embargoed content gets a textual notice prefixed to the body
            if article.get(EMBARGO):
                embargo = '{}{}'.format('Embargo Content. Timestamp: ',
                                        get_utc_schedule(article, EMBARGO).isoformat())
                body = embargo + body

            if article[ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED:
                anpa.append(body.encode('ascii', 'replace'))
            else:
                anpa.append(BeautifulSoup(body, "html.parser").text.encode('ascii', 'replace'))

            anpa.append(b'\x0D\x0A')
            if article.get('more_coming', False):
                anpa.append('MORE'.encode('ascii'))
            else:
                anpa.append(article.get('source', '').encode('ascii'))
            sign_off = article.get('sign_off', '').encode('ascii')
            anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x03')  # ETX

            # time and date
            anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

            anpa.append(b'\x04')  # EOT
            anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

            docs.append((pub_seq_num, b''.join(anpa)))

        return docs
    except Exception as ex:
        raise FormatterError.AnpaFormatterError(ex, subscriber)
def get_subscribers(self, doc, target_media_type):
    """
    Get the subscribers for this document based on the target_media_type for publishing.

    1. If the item has embargo and is a future date then fetch active Wire Subscribers.
       Otherwise get all active subscribers.
        a. Get the list of takes subscribers if Takes Package
    2. If takes package then subsequent takes are sent to same wire subscriber as first take.
    3. Filter the subscriber list based on the publish filter and global filters (if configured).
        a. Publish to takes package subscribers if the takes package is received by the subscriber.
        b. Rewrites are sent to subscribers that received the original item or the previous rewrite.

    :param dict doc: Document to publish/correct/kill
    :param str target_media_type: dictate if the doc being queued is a Takes Package or an Individual
        Article. Valid values are - Wire, Digital. If Digital then the doc being queued is a Takes
        Package and if Wire then the doc being queues is an Individual Article.
    :return: (list, list, dict) List of filtered subscribers, List of subscribers that have not
        received item previously (empty list in this case), dict of product codes per subscriber
    """
    subscribers, subscribers_yet_to_receive, takes_subscribers, rewrite_subscribers = [], [], [], []
    subscriber_codes, take_codes, codes, rewrite_codes = {}, {}, {}, {}
    first_take = None

    # Step 3b: resolve the item this doc rewrites (and its takes package, if any)
    rewrite_of = doc.get('rewrite_of')
    rewrite_take_package = None
    if rewrite_of:
        rewrite_of_item = get_resource_service('archive').find_one(req=None, _id=rewrite_of)
        if rewrite_of_item:
            if is_takes_package(rewrite_of_item):
                rewrite_take_package = rewrite_of_item
            else:
                rewrite_take_package = self.takes_package_service.get_take_package(rewrite_of_item)

    # Step 1
    query = {'is_active': True}
    if doc.get(EMBARGO) and get_utc_schedule(doc, EMBARGO) > utcnow():
        query['subscriber_type'] = SUBSCRIBER_TYPES.WIRE
    # Ta 04/05/16: Commenting out this section for ticket SD-4465
    # query['media_type'] = SUBSCRIBER_MEDIA_TYPES.MEDIA
    subscribers = list(get_resource_service('subscribers').get(req=None, lookup=query))

    if doc.get(ITEM_TYPE) in [CONTENT_TYPE.COMPOSITE] and doc.get(PACKAGE_TYPE) == TAKES_PACKAGE:
        # Step 1a
        query = {'$and': [{'item_id': doc['item_id']},
                          {'publishing_action': {'$in': [CONTENT_STATE.PUBLISHED,
                                                         CONTENT_STATE.CORRECTED]}}]}
        takes_subscribers, take_codes = self._get_subscribers_for_previously_sent_items(query)
        if rewrite_of and rewrite_take_package:
            # Step 3b
            query = {'$and': [{'item_id': rewrite_take_package.get(config.ID_FIELD)},
                              {'publishing_action': {'$in': [CONTENT_STATE.PUBLISHED,
                                                             CONTENT_STATE.CORRECTED]}}]}
            rewrite_subscribers, rewrite_codes = self._get_subscribers_for_previously_sent_items(query)

    # Step 2
    if doc.get(ITEM_TYPE) in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED]:
        # get first take
        first_take = self.takes_package_service.get_take_by_take_no(doc, 1)
        if str(doc['item_id']) == str(first_take):
            # if the current document is the first take then continue
            first_take = None
        if first_take:
            # if first take is published then subsequent takes should to same subscribers.
            query = {'$and': [{'item_id': first_take},
                              {'publishing_action': {'$in': [CONTENT_STATE.PUBLISHED]}}]}
            subscribers, subscriber_codes = self._get_subscribers_for_previously_sent_items(query)
        if rewrite_of:
            # Step 3b
            if rewrite_take_package and rewrite_take_package.get(config.ID_FIELD) == rewrite_of:
                item_ids = self.package_service.get_residrefs(rewrite_take_package)
            else:
                item_ids = [rewrite_of]
            query = {'$and': [{'item_id': {'$in': item_ids}},
                              {'publishing_action': {'$in': [CONTENT_STATE.PUBLISHED,
                                                             CONTENT_STATE.CORRECTED]}}]}
            rewrite_subscribers, rewrite_codes = self._get_subscribers_for_previously_sent_items(query)

    # Step 3: subsequent takes skip filtering — they mirror the first take's recipients
    if not first_take:
        subscribers, codes = self.filter_subscribers(doc, subscribers, target_media_type)

    if takes_subscribers:
        # Step 3a: takes subscribers take precedence; avoid duplicates by id
        subscribers_ids = set(s[config.ID_FIELD] for s in takes_subscribers)
        subscribers = takes_subscribers + [s for s in subscribers
                                           if s[config.ID_FIELD] not in subscribers_ids]

    if rewrite_subscribers:
        # Step 3b: same precedence/merge for rewrite recipients
        subscribers_ids = set(s[config.ID_FIELD] for s in rewrite_subscribers)
        subscribers = rewrite_subscribers + [s for s in subscribers
                                             if s[config.ID_FIELD] not in subscribers_ids]

    if take_codes:
        # join the codes
        subscriber_codes.update(take_codes)

    if rewrite_codes:
        # join the codes
        subscriber_codes.update(rewrite_codes)

    if codes:
        # join the codes
        subscriber_codes.update(codes)

    return subscribers, subscribers_yet_to_receive, subscriber_codes
def format(self, article, subscriber):
    """Format *article* as one ANPA 1312 wire message per ANPA category.

    :param dict article: article to format
    :param dict subscriber: subscriber the output is generated for (used to
        generate the published sequence number and map selector codes)
    :return: list of ``(pub_seq_num, bytes)`` tuples, one per category
    :raises FormatterError.AnpaFormatterError: wraps any failure
    """
    try:
        docs = []
        # NOTE(review): article.get('anpa_category') with no default will raise
        # TypeError when the key is missing — presumably callers guarantee it; verify.
        for category in article.get('anpa_category'):
            # each category gets its own subscriber sequence number
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            anpa = []

            # selector codes are only injected for those subscribers that are defined
            # in the mapper
            selectors = dict()
            SelectorcodeMapper().map(article, category.get('qcode').upper(),
                                     subscriber=subscriber,
                                     formatted_item=selectors)
            if 'selector_codes' in selectors and selectors['selector_codes']:
                anpa.append(b'\x05')  # ENQ precedes the selector codes
                anpa.append(selectors['selector_codes'].encode('ascii'))
                anpa.append(b'\x0D\x0A')

            # start of message header (syn syn soh)
            anpa.append(b'\x16\x16\x01')
            anpa.append(article.get('service_level', 'a').lower().encode('ascii'))

            # story number
            anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

            # field seperator
            anpa.append(b'\x0A')  # -LF
            anpa.append(map_priority(article.get('priority')).encode('ascii'))
            anpa.append(b'\x20')
            anpa.append(category['qcode'].encode('ascii'))
            anpa.append(b'\x13')

            # format identifier: DC2 for preformatted, DC1 otherwise
            if article[ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED:
                anpa.append(b'\x12')
            else:
                anpa.append(b'\x11')
            anpa.append(b'\x20')

            # keyword — 'bc-' slug, spaces to dashes, capped at 24 chars
            keyword = 'bc-{}'.format(self.append_legal(article=article, truncate=True)).replace(' ', '-')
            keyword = keyword[:24] if len(keyword) > 24 else keyword
            anpa.append(keyword.encode('ascii'))
            anpa.append(b'\x20')

            # version field
            anpa.append(b'\x20')
            # reference field
            anpa.append(b'\x20')

            # filing date (MM-DD of last update)
            anpa.append('{}-{}'.format(article['_updated'].strftime('%m'),
                                       article['_updated'].strftime('%d')).encode('ascii'))
            anpa.append(b'\x20')

            # add the word count
            anpa.append(str(article.get('word_count', '0000')).zfill(4).encode('ascii'))
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x02')  # STX

            self._process_headline(anpa, article, category['qcode'].encode('ascii'))

            keyword = self.append_legal(article=article, truncate=True).encode('ascii', 'ignore')
            anpa.append(keyword)
            # NOTE(review): if 'anpa_take_key' is present but None this raises
            # AttributeError; a later revision guards with `or ''` — confirm.
            take_key = article.get('anpa_take_key', '').encode('ascii', 'ignore')
            anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            if BYLINE in article:
                anpa.append(article.get(BYLINE).encode('ascii', 'ignore'))
                anpa.append(b'\x0D\x0A')

            if article.get('dateline', {}).get('text'):
                anpa.append(article.get('dateline').get('text').encode('ascii', 'ignore'))

            body = self.append_body_footer(article)
            if article.get(EMBARGO):
                # prepend an embargo notice ahead of the body text
                embargo = '{}{}'.format('Embargo Content. Timestamp: ',
                                        get_utc_schedule(article, EMBARGO).isoformat())
                body = embargo + body

            if article[ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED:
                anpa.append(body.encode('ascii', 'replace'))
            else:
                # strip HTML markup before transmission
                anpa.append(BeautifulSoup(body, "html.parser").text.encode('ascii', 'replace'))

            anpa.append(b'\x0D\x0A')
            if article.get('more_coming', False):
                anpa.append('MORE'.encode('ascii'))
            else:
                anpa.append(article.get('source', '').encode('ascii'))
            sign_off = article.get('sign_off', '').encode('ascii')
            anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x03')  # ETX

            # time and date
            anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

            anpa.append(b'\x04')  # EOT
            anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

            docs.append((pub_seq_num, b''.join(anpa)))

        return docs
    except Exception as ex:
        raise FormatterError.AnpaFormatterError(ex, subscriber)
def queue_transmission(self, doc, subscribers, subscriber_codes=None):
    """Format and queue the article for transmission to the passed subscribers.

    ::Important Note:: Format Type across Subscribers can repeat. But we can't have formatted item generated once
    based on the format_types configured across for all the subscribers as the formatted item must have a published
    sequence number generated by Subscriber.

    :param dict doc: document to queue for transmission
    :param list subscribers: List of subscriber dict.
    :param dict subscriber_codes: optional mapping of subscriber id -> product codes
    :return: (list, bool) tuple of list of missing formatters and boolean flag. True if queued else False
    """
    # avoid a shared mutable default argument
    if subscriber_codes is None:
        subscriber_codes = {}
    queued = False
    no_formatters = []
    for subscriber in subscribers:
        try:
            if doc[ITEM_TYPE] not in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED] and \
                    subscriber.get('subscriber_type', '') == SUBSCRIBER_TYPES.WIRE:
                # wire subscribers can get only text and preformatted stories
                continue

            for destination in subscriber['destinations']:
                embed_package_items = doc[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE and \
                    PACKAGE_TYPE not in doc and destination['config'].get('packaged', False)
                if embed_package_items:
                    doc = self._embed_package_items(doc)

                # Step 2(a)
                formatter = get_formatter(destination['format'], doc)
                if not formatter:  # if formatter not found then record it
                    no_formatters.append(destination['format'])
                    continue

                formatted_docs = formatter.format(doc, subscriber,
                                                  subscriber_codes.get(subscriber[config.ID_FIELD]))

                # normalise legacy (seq, item) tuples into the dict shape
                for idx, publish_data in enumerate(formatted_docs):
                    if not isinstance(publish_data, dict):
                        pub_seq_num, formatted_doc = publish_data
                        formatted_docs[idx] = {'published_seq_num': pub_seq_num,
                                               'formatted_item': formatted_doc}
                    else:
                        assert 'published_seq_num' in publish_data and 'formatted_item' in publish_data, \
                            "missing keys in publish_data"

                for publish_queue_item in formatted_docs:
                    publish_queue_item['item_id'] = doc['item_id']
                    publish_queue_item['item_version'] = doc[config.VERSION]
                    publish_queue_item['subscriber_id'] = subscriber[config.ID_FIELD]
                    publish_queue_item['codes'] = subscriber_codes.get(subscriber[config.ID_FIELD])
                    publish_queue_item['destination'] = destination
                    # publish_schedule is just to indicate in the queue item is create via scheduled item
                    publish_queue_item[PUBLISH_SCHEDULE] = get_utc_schedule(doc, PUBLISH_SCHEDULE) or None
                    publish_queue_item['unique_name'] = doc.get('unique_name', None)
                    publish_queue_item['content_type'] = doc.get('type', None)
                    publish_queue_item['headline'] = doc.get('headline', None)
                    publish_queue_item['publishing_action'] = self.published_state
                    publish_queue_item['ingest_provider'] = \
                        ObjectId(doc.get('ingest_provider')) if doc.get('ingest_provider') else None
                    if doc.get(PUBLISHED_IN_PACKAGE):
                        publish_queue_item[PUBLISHED_IN_PACKAGE] = doc[PUBLISHED_IN_PACKAGE]

                    # pre-encoded payloads are stored in blob storage, not in the queue
                    try:
                        encoded_item = publish_queue_item.pop('encoded_item')
                    except KeyError:
                        pass
                    else:
                        binary = io.BytesIO(encoded_item)
                        publish_queue_item['encoded_item_id'] = app.storage.put(binary)
                    publish_queue_item.pop(ITEM_STATE, None)
                    get_resource_service('publish_queue').post([publish_queue_item])
                    queued = True
        except Exception:
            # best-effort per subscriber: log and continue with the rest
            logger.exception(
                "Failed to queue item for id {} with headline {} for subscriber {}."
                .format(doc.get(config.ID_FIELD), doc.get('headline'), subscriber.get('name')))

    return no_formatters, queued
def queue_transmission(self, doc, subscribers, subscriber_codes=None):
    """Format and queue the article for transmission to the passed subscribers.

    ::Important Note:: Format Type across Subscribers can repeat. But we can't have formatted item generated once
    based on the format_types configured across for all the subscribers as the formatted item must have a published
    sequence number generated by Subscriber.

    :param dict doc: document to queue for transmission
    :param list subscribers: List of subscriber dict.
    :param dict subscriber_codes: optional mapping of subscriber id -> product codes
    :return: (list, bool) tuple of list of missing formatters and boolean flag. True if queued else False
    """
    # avoid a shared mutable default argument
    if subscriber_codes is None:
        subscriber_codes = {}
    queued = False
    no_formatters = []
    for subscriber in subscribers:
        try:
            if doc[ITEM_TYPE] not in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED] and \
                    subscriber.get('subscriber_type', '') == SUBSCRIBER_TYPES.WIRE:
                # wire subscribers can get only text and preformatted stories
                continue

            for destination in subscriber['destinations']:
                embed_package_items = doc[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE and \
                    PACKAGE_TYPE not in doc and destination['config'].get('packaged', False)
                if embed_package_items:
                    doc = self._embed_package_items(doc)

                # Step 2(a)
                formatter = get_formatter(destination['format'], doc)
                if not formatter:  # if formatter not found then record it
                    no_formatters.append(destination['format'])
                    continue

                formatted_docs = formatter.format(doc, subscriber)

                for pub_seq_num, formatted_doc in formatted_docs:
                    publish_queue_item = dict()
                    publish_queue_item['item_id'] = doc['item_id']
                    publish_queue_item['item_version'] = doc[config.VERSION]
                    publish_queue_item['formatted_item'] = formatted_doc
                    publish_queue_item['subscriber_id'] = subscriber[config.ID_FIELD]
                    publish_queue_item['codes'] = subscriber_codes.get(subscriber[config.ID_FIELD])
                    publish_queue_item['destination'] = destination
                    publish_queue_item['published_seq_num'] = pub_seq_num
                    # publish_schedule is just to indicate in the queue item is create via scheduled item
                    publish_queue_item[PUBLISH_SCHEDULE] = get_utc_schedule(doc, PUBLISH_SCHEDULE) or None
                    publish_queue_item['unique_name'] = doc.get('unique_name', None)
                    publish_queue_item['content_type'] = doc.get('type', None)
                    publish_queue_item['headline'] = doc.get('headline', None)
                    publish_queue_item['publishing_action'] = self.published_state
                    if doc.get(PUBLISHED_IN_PACKAGE):
                        publish_queue_item[PUBLISHED_IN_PACKAGE] = doc[PUBLISHED_IN_PACKAGE]

                    publish_queue_item.pop(ITEM_STATE, None)
                    get_resource_service('publish_queue').post([publish_queue_item])
                    queued = True
        except Exception:
            # best-effort per subscriber: log and continue with the rest
            logger.exception(
                "Failed to queue item for id {} with headline {} for subscriber {}."
                .format(doc.get(config.ID_FIELD), doc.get('headline'), subscriber.get('name')))

    return no_formatters, queued
def format(self, article, subscriber, codes=None):
    """Format *article* as ANPA 1312 wire output, one item per ANPA category.

    :param dict article: article to format (deep-copied, input is not mutated)
    :param dict subscriber: subscriber the output is generated for
    :param list codes: optional selector codes to prefix the message with
    :return: list of dicts with ``published_seq_num``, ``encoded_item`` (bytes)
        and ``formatted_item`` (ascii str) keys
    :raises FormatterError.AnpaFormatterError: wraps any failure
    """
    try:
        docs = []
        # work on a copy so header/body tweaks don't leak back to the caller
        formatted_article = deepcopy(article)
        for category in self._get_category_list(formatted_article.get('anpa_category')):
            mapped_source = self._get_mapped_source(formatted_article)
            formatted_article[config.ID_FIELD] = formatted_article.get('item_id',
                                                                       formatted_article.get(config.ID_FIELD))
            is_last_take = TakesPackageService().is_last_takes_package_item(formatted_article)
            is_first_part = formatted_article.get('sequence', 1) == 1
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            anpa = []

            if codes:
                anpa.append(b'\x05')  # ENQ precedes the selector codes
                anpa.append(' '.join(codes).encode('ascii'))
                anpa.append(b'\x0D\x0A')

            # start of message header (syn syn soh)
            anpa.append(b'\x16\x16\x01')
            anpa.append(get_service_level(category, formatted_article).encode('ascii'))

            # story number
            anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

            # field seperator
            anpa.append(b'\x0A')  # -LF
            anpa.append(map_priority(formatted_article.get('priority')).encode('ascii'))
            anpa.append(b'\x20')
            anpa.append(category['qcode'].lower().encode('ascii'))
            anpa.append(b'\x13')

            # format identifier: DC2 for preserved, DC1 otherwise
            if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
                anpa.append(b'\x12')
            else:
                anpa.append(b'\x11')
            anpa.append(b'\x20')

            # keyword — 'bc-' slug, spaces to dashes, capped at 24 chars
            keyword = 'bc-{}'.format(self.append_legal(article=formatted_article, truncate=True)).replace(' ', '-')
            keyword = keyword[:24] if len(keyword) > 24 else keyword
            anpa.append(keyword.encode('ascii'))
            anpa.append(b'\x20')

            # version field
            anpa.append(b'\x20')
            # reference field
            anpa.append(b'\x20')

            # filing date (MM-DD of last update)
            anpa.append('{}-{}'.format(formatted_article['_updated'].strftime('%m'),
                                       formatted_article['_updated'].strftime('%d')).encode('ascii'))
            anpa.append(b'\x20')

            # add the word count
            anpa.append(str(formatted_article.get('word_count', '0000')).zfill(4).encode('ascii'))
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x02')  # STX

            self._process_headline(anpa, formatted_article, category['qcode'].encode('ascii'))

            keyword = SluglineMapper().map(article=formatted_article,
                                           category=category['qcode'].upper(),
                                           truncate=True).encode('ascii', 'ignore')
            anpa.append(keyword)
            # `or ''` guards against an explicit None take key
            take_key = (formatted_article.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
            anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            if formatted_article.get(EMBARGO):
                embargo = '{}{}\r\n'.format('Embargo Content. Timestamp: ',
                                            get_utc_schedule(formatted_article, EMBARGO).isoformat())
                anpa.append(embargo.encode('ascii', 'replace'))

            if formatted_article.get('ednote', '') != '':
                ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote')))
                anpa.append(ednote.encode('ascii', 'replace'))

            if formatted_article.get(BYLINE):
                anpa.append(BeautifulSoup(formatted_article.get(BYLINE), 'html.parser').text.encode
                            ('ascii', 'ignore'))
                anpa.append(b'\x0D\x0A')

            if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                # preserved content is sent as plain text
                soup = BeautifulSoup(self.append_body_footer(formatted_article), "html.parser")
                anpa.append(soup.get_text().encode('ascii', 'replace'))
            else:
                body = to_ascii(formatted_article.get('body_html', ''))
                # we need to inject the dateline
                if is_first_part and formatted_article.get('dateline', {}).get('text') \
                        and not article.get('auto_publish', False):
                    soup = BeautifulSoup(body, "html.parser")
                    ptag = soup.find('p')
                    if ptag is not None:
                        ptag.insert(0, NavigableString(
                            '{} '.format(formatted_article.get('dateline').get('text')).encode('ascii', 'ignore')))
                        body = str(soup)
                anpa.append(self.get_text_content(body))
                if formatted_article.get('body_footer'):
                    anpa.append(self.get_text_content(to_ascii(formatted_article.get('body_footer', ''))))

            anpa.append(b'\x0D\x0A')
            # 'MORE' marks a non-final take; the last take carries the source
            if not is_last_take:
                anpa.append('MORE'.encode('ascii'))
            else:
                anpa.append(mapped_source.encode('ascii'))
            sign_off = (formatted_article.get('sign_off', '') or '').encode('ascii')
            anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x03')  # ETX

            # time and date
            anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

            anpa.append(b'\x04')  # EOT
            anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

            docs.append({'published_seq_num': pub_seq_num, 'encoded_item': b''.join(anpa),
                         'formatted_item': b''.join(anpa).decode('ascii')})

        return docs
    except Exception as ex:
        raise FormatterError.AnpaFormatterError(ex, subscriber)
def queue_transmission(self, doc, subscribers, subscriber_codes=None, associations=None, sent=False):
    """Method formats and then queues the article for transmission to the passed subscribers.

    ::Important Note:: Format Type across Subscribers can repeat. But we can't have formatted item generated once
    based on the format_types configured across for all the subscribers as the formatted item must have a published
    sequence number generated by Subscriber.

    :param dict doc: document to queue for transmission
    :param list subscribers: List of subscriber dict.
    :param dict subscriber_codes: mapping of subscriber id -> product codes
    :param dict associations: mapping of subscriber id -> associated item ids
    :param bool sent: when True, skip the association-resend logic
    :return : (list, bool) tuple of list of missing formatters and boolean flag. True if queued else False
    """
    # None sentinels avoid the shared-mutable-default pitfall
    if associations is None:
        associations = {}
    if subscriber_codes is None:
        subscriber_codes = {}
    try:
        # optionally re-send associated items depending on the configured policy
        # ("new" / "corrections" / "updates")
        if config.PUBLISH_ASSOCIATIONS_RESEND and not sent:
            is_correction = doc.get("state") in [
                "corrected", "being_corrected"
            ]
            is_update = doc.get("rewrite_of")
            is_new = not is_correction and not is_update

            if config.PUBLISH_ASSOCIATIONS_RESEND == "new" and is_new:
                self.resend_association_items(doc)
            elif config.PUBLISH_ASSOCIATIONS_RESEND == "corrections":
                self.resend_association_items(doc)
            elif config.PUBLISH_ASSOCIATIONS_RESEND == "updates" and not is_correction:
                self.resend_association_items(doc)

        queued = False
        no_formatters = []
        for subscriber in subscribers:
            try:
                if (doc[ITEM_TYPE] not in [
                        CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED
                ] and subscriber.get("subscriber_type", "") == SUBSCRIBER_TYPES.WIRE):
                    # wire subscribers can get only text and preformatted stories
                    continue

                for destination in self.get_destinations(subscriber):
                    embed_package_items = doc[
                        ITEM_TYPE] == CONTENT_TYPE.COMPOSITE and (
                            destination.get("config") or {}).get(
                                "packaged", False)
                    if embed_package_items:
                        doc = self._embed_package_items(doc)

                    # items already delivered inside a package are not re-sent
                    # to "packaged" destinations
                    if doc.get(PUBLISHED_IN_PACKAGE) and (
                            destination.get("config") or {}).get(
                                "packaged", False):
                        continue

                    # Step 2(a)
                    formatter = get_formatter(destination["format"], doc)

                    if not formatter:  # if formatter not found then record it
                        no_formatters.append(destination["format"])
                        continue

                    formatter.set_destination(destination, subscriber)
                    formatted_docs = formatter.format(
                        self.filter_document(doc), subscriber,
                        subscriber_codes.get(subscriber[config.ID_FIELD]))

                    # normalise legacy (seq, item) tuples into the dict shape
                    for idx, publish_data in enumerate(formatted_docs):
                        if not isinstance(publish_data, dict):
                            pub_seq_num, formatted_doc = publish_data
                            formatted_docs[idx] = {
                                "published_seq_num": pub_seq_num,
                                "formatted_item": formatted_doc,
                            }
                        else:
                            assert ("published_seq_num" in publish_data
                                    and "formatted_item" in publish_data
                                    ), "missing keys in publish_data"

                    for publish_queue_item in formatted_docs:
                        publish_queue_item["item_id"] = doc["item_id"]
                        publish_queue_item["item_version"] = doc[
                            config.VERSION]
                        publish_queue_item["subscriber_id"] = subscriber[
                            config.ID_FIELD]
                        publish_queue_item["codes"] = subscriber_codes.get(
                            subscriber[config.ID_FIELD])
                        publish_queue_item["destination"] = destination
                        # publish_schedule is just to indicate in the queue item is create via scheduled item
                        publish_queue_item[
                            PUBLISH_SCHEDULE] = get_utc_schedule(
                                doc, PUBLISH_SCHEDULE) or None
                        publish_queue_item["unique_name"] = doc.get(
                            "unique_name", None)
                        publish_queue_item["content_type"] = doc.get(
                            "type", None)
                        publish_queue_item["headline"] = doc.get(
                            "headline", None)
                        publish_queue_item[
                            "publishing_action"] = self.published_state
                        publish_queue_item["ingest_provider"] = (
                            ObjectId(doc.get("ingest_provider"))
                            if doc.get("ingest_provider") else None)
                        publish_queue_item[
                            "associated_items"] = associations.get(
                                subscriber[config.ID_FIELD], [])
                        publish_queue_item["priority"] = subscriber.get(
                            "priority")

                        if doc.get(PUBLISHED_IN_PACKAGE):
                            publish_queue_item[PUBLISHED_IN_PACKAGE] = doc[
                                PUBLISHED_IN_PACKAGE]

                        # pre-encoded payloads go to blob storage, not the queue doc
                        try:
                            encoded_item = publish_queue_item.pop(
                                "encoded_item")
                        except KeyError:
                            pass
                        else:
                            binary = io.BytesIO(encoded_item)
                            publish_queue_item[
                                "encoded_item_id"] = app.storage.put(
                                    binary)
                        publish_queue_item.pop(ITEM_STATE, None)
                        # content api delivery will be marked as SUCCESS in queue
                        get_resource_service("publish_queue").post(
                            [publish_queue_item])
                        queued = True
            except Exception:
                # best-effort per subscriber: log and continue with the rest
                logger.exception(
                    "Failed to queue item for id {} with headline {} for subscriber {}."
                    .format(doc.get(config.ID_FIELD), doc.get("headline"),
                            subscriber.get("name")))

        return no_formatters, queued
    except Exception:
        raise
def get_subscribers(self, doc, target_media_type):
    """Get the subscribers for this document based on the target_media_type for publishing.

    1. If the item has embargo and is a future date then fetch active Wire Subscribers.
       Otherwise get all active subscribers.
        a. Get the list of takes subscribers if Takes Package
    2. If takes package then subsequent takes are sent to same wire subscriber as first take.
    3. Filter the subscriber list based on the publish filter and global filters (if configured).
        a. Publish to takes package subscribers if the takes package is received by the subscriber.
        b. Rewrites are sent to subscribers that received the original item or the previous rewrite.

    :param dict doc: Document to publish/correct/kill
    :param str target_media_type: dictate if the doc being queued is a Takes Package or an Individual Article.
            Valid values are - Wire, Digital. If Digital then the doc being queued is a Takes Package and if Wire
            then the doc being queues is an Individual Article.
    :return: (list, list, dict) List of filtered subscriber,
            List of subscribers that have not received item previously (empty list in this case).
            List of product codes per subscriber
    """
    subscribers, subscribers_yet_to_receive, takes_subscribers, rewrite_subscribers = [], [], [], []
    subscriber_codes, take_codes, codes, rewrite_codes = {}, {}, {}, {}
    first_take = None

    # Step 3b: resolve what the rewrite points at (item or its takes package)
    rewrite_of = doc.get('rewrite_of')
    rewrite_take_package = None

    if rewrite_of:
        rewrite_of_item = get_resource_service('archive').find_one(
            req=None, _id=rewrite_of)
        if rewrite_of_item:
            if is_takes_package(rewrite_of_item):
                rewrite_take_package = rewrite_of_item
            else:
                rewrite_take_package = self.takes_package_service.get_take_package(
                    rewrite_of_item)

    # Step 1: embargoed future content only goes to wire subscribers
    query = {'is_active': True}
    if doc.get(EMBARGO) and get_utc_schedule(doc, EMBARGO) > utcnow():
        query['subscriber_type'] = SUBSCRIBER_TYPES.WIRE
        # Ta 04/05/16: Commenting out this section for ticket SD-4465
        # query['media_type'] = SUBSCRIBER_MEDIA_TYPES.MEDIA

    subscribers = list(
        get_resource_service('subscribers').get(req=None, lookup=query))

    if doc.get(ITEM_TYPE) in [CONTENT_TYPE.COMPOSITE
                              ] and doc.get(PACKAGE_TYPE) == TAKES_PACKAGE:
        # Step 1a: subscribers who already received earlier takes of this package
        query = {
            '$and': [{
                'item_id': doc['item_id']
            }, {
                'publishing_action': {
                    '$in': [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED]
                }
            }]
        }
        takes_subscribers, take_codes = self._get_subscribers_for_previously_sent_items(
            query)

        if rewrite_of and rewrite_take_package:
            # Step 3b
            query = {
                '$and': [{
                    'item_id': rewrite_take_package.get(config.ID_FIELD)
                }, {
                    'publishing_action': {
                        '$in': [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED]
                    }
                }]
            }
            rewrite_subscribers, rewrite_codes = self._get_subscribers_for_previously_sent_items(
                query)

    # Step 2
    if doc.get(ITEM_TYPE) in [
            CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED
    ]:
        # get first take
        first_take = self.takes_package_service.get_take_by_take_no(doc, 1)
        if str(doc['item_id']) == str(first_take):
            # if the current document is the first take then continue
            first_take = None

        if first_take:
            # if first take is published then subsequent takes should to same subscribers.
            query = {
                '$and': [{
                    'item_id': first_take
                }, {
                    'publishing_action': {
                        '$in': [CONTENT_STATE.PUBLISHED]
                    }
                }]
            }
            subscribers, subscriber_codes = self._get_subscribers_for_previously_sent_items(
                query)

        if rewrite_of:
            # Step 3b: find who got the original (or any take of its package)
            if rewrite_take_package and rewrite_take_package.get(
                    config.ID_FIELD) == rewrite_of:
                item_ids = self.package_service.get_residrefs(
                    rewrite_take_package)
            else:
                item_ids = [rewrite_of]

            query = {
                '$and': [{
                    'item_id': {
                        '$in': item_ids
                    }
                }, {
                    'publishing_action': {
                        '$in': [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED]
                    }
                }]
            }
            rewrite_subscribers, rewrite_codes = self._get_subscribers_for_previously_sent_items(
                query)

    # Step 3: apply publish/global filters unless locked to the first take's list
    if not first_take:
        subscribers, codes = self.filter_subscribers(
            doc, subscribers, target_media_type)

    if takes_subscribers:
        # Step 3a: takes subscribers take precedence, de-duplicated by id
        subscribers_ids = set(s[config.ID_FIELD] for s in takes_subscribers)
        subscribers = takes_subscribers + [
            s for s in subscribers
            if s[config.ID_FIELD] not in subscribers_ids
        ]

    if rewrite_subscribers:
        # Step 3b: likewise for subscribers of the rewritten item
        subscribers_ids = set(s[config.ID_FIELD] for s in rewrite_subscribers)
        subscribers = rewrite_subscribers + [
            s for s in subscribers
            if s[config.ID_FIELD] not in subscribers_ids
        ]

    if take_codes:
        # join the codes
        subscriber_codes.update(take_codes)

    if rewrite_codes:
        # join the codes
        subscriber_codes.update(rewrite_codes)

    if codes:
        # join the codes
        subscriber_codes.update(codes)

    return subscribers, subscribers_yet_to_receive, subscriber_codes
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Map a Superdesk *article* dict to a NINJS dict.

    :param dict article: source item
    :param dict subscriber: subscriber the output is generated for
    :param bool recursive: when True, associated items are transformed too
    :return dict: NINJS representation of the article
    """
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    # NOTE(review): bare except silently swallows all errors (including
    # KeyboardInterrupt) from byline extraction — consider `except Exception`.
    try:
        ninjs['byline'] = self._get_byline(article)
    except:
        pass

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # straight field-for-field copies
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    # associations: composites fetch their package refs; otherwise format the
    # article's own associations (also when not recursing, via the outer elif)
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs['associations'] = self._get_associations(article, subscriber)
            if 'associations' in article:
                ninjs['associations'].update(self._format_related(article, subscriber))
        elif article.get('associations', {}):
            ninjs['associations'] = self._format_related(article, subscriber)
    elif article.get('associations'):
        ninjs['associations'] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # NINJS priority defaults to 5 when unset
    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if article.get('abstract'):
        abstract = article.get('abstract')
        ninjs['description_html'] = abstract
        soup = BeautifulSoup(abstract, 'html.parser')
        ninjs['description_text'] = soup.get_text()
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    return ninjs
def queue_transmission(self, doc, subscribers, subscriber_codes=None, associations=None):
    """Method formats and then queues the article for transmission to the passed subscribers.

    ::Important Note:: Format Type across Subscribers can repeat. But we can't have formatted item generated once
    based on the format_types configured across for all the subscribers as the formatted item must have a published
    sequence number generated by Subscriber.

    :param dict doc: document to queue for transmission
    :param list subscribers: List of subscriber dict.
    :param dict subscriber_codes: optional mapping of subscriber id -> product codes
    :param dict associations: optional mapping of subscriber id -> associated item ids
    :return: (list, bool) tuple of list of missing formatters and boolean flag. True if queued else False
    """
    # None sentinels avoid the shared-mutable-default pitfall
    if subscriber_codes is None:
        subscriber_codes = {}
    if associations is None:
        associations = {}
    queued = False
    no_formatters = []
    for subscriber in subscribers:
        try:
            if doc[ITEM_TYPE] not in [CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED] and \
                    subscriber.get('subscriber_type', '') == SUBSCRIBER_TYPES.WIRE:
                # wire subscribers can get only text and preformatted stories
                continue

            for destination in self.get_destinations(subscriber):
                embed_package_items = doc[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE and \
                    (destination.get('config') or {}).get('packaged', False)
                if embed_package_items:
                    doc = self._embed_package_items(doc)

                # items already delivered inside a package are not re-sent
                # to "packaged" destinations
                if doc.get(PUBLISHED_IN_PACKAGE) and \
                        (destination.get('config') or {}).get('packaged', False):
                    continue

                # Step 2(a)
                formatter = get_formatter(destination['format'], doc)
                if not formatter:  # if formatter not found then record it
                    no_formatters.append(destination['format'])
                    continue

                formatted_docs = formatter.format(apply_schema(doc), subscriber,
                                                  subscriber_codes.get(subscriber[config.ID_FIELD]))

                # normalise legacy (seq, item) tuples into the dict shape
                for idx, publish_data in enumerate(formatted_docs):
                    if not isinstance(publish_data, dict):
                        pub_seq_num, formatted_doc = publish_data
                        formatted_docs[idx] = {'published_seq_num': pub_seq_num,
                                               'formatted_item': formatted_doc}
                    else:
                        assert 'published_seq_num' in publish_data and 'formatted_item' in publish_data, \
                            "missing keys in publish_data"

                for publish_queue_item in formatted_docs:
                    publish_queue_item['item_id'] = doc['item_id']
                    publish_queue_item['item_version'] = doc[config.VERSION]
                    publish_queue_item['subscriber_id'] = subscriber[config.ID_FIELD]
                    publish_queue_item['codes'] = subscriber_codes.get(subscriber[config.ID_FIELD])
                    publish_queue_item['destination'] = destination
                    # publish_schedule is just to indicate in the queue item is create via scheduled item
                    publish_queue_item[PUBLISH_SCHEDULE] = get_utc_schedule(doc, PUBLISH_SCHEDULE) or None
                    publish_queue_item['unique_name'] = doc.get('unique_name', None)
                    publish_queue_item['content_type'] = doc.get('type', None)
                    publish_queue_item['headline'] = doc.get('headline', None)
                    publish_queue_item['publishing_action'] = self.published_state
                    publish_queue_item['ingest_provider'] = \
                        ObjectId(doc.get('ingest_provider')) if doc.get('ingest_provider') else None
                    publish_queue_item['associated_items'] = associations.get(subscriber[config.ID_FIELD], [])
                    if doc.get(PUBLISHED_IN_PACKAGE):
                        publish_queue_item[PUBLISHED_IN_PACKAGE] = doc[PUBLISHED_IN_PACKAGE]

                    # pre-encoded payloads go to blob storage, not the queue doc
                    try:
                        encoded_item = publish_queue_item.pop('encoded_item')
                    except KeyError:
                        pass
                    else:
                        binary = io.BytesIO(encoded_item)
                        publish_queue_item['encoded_item_id'] = app.storage.put(binary)
                    publish_queue_item.pop(ITEM_STATE, None)
                    # content api delivery will be marked as SUCCESS in queue
                    get_resource_service('publish_queue').post([publish_queue_item])
                    queued = True
        except Exception:
            # best-effort per subscriber: log and continue with the rest
            logger.exception("Failed to queue item for id {} with headline {} for subscriber {}."
                             .format(doc.get(config.ID_FIELD), doc.get('headline'), subscriber.get('name')))

    return no_formatters, queued
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Map a Superdesk *article* dict to a NINJS dict.

    :param dict article: source item
    :param dict subscriber: subscriber the output is generated for
    :param bool recursive: when True, associated items are transformed too
    :return dict: NINJS representation of the article
    """
    ninjs = {
        "guid": article.get(GUID_FIELD, article.get("uri")),
        "version": str(article.get(config.VERSION, 1)),
        "type": self._get_type(article),
    }

    if article.get("byline"):
        ninjs["byline"] = article["byline"]

    located = article.get("dateline", {}).get("located", {})
    if located:
        ninjs["located"] = located.get("city", "")

    # straight field-for-field copies
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    if "body_text" not in article and "alt_text" in article:
        ninjs["body_text"] = article["alt_text"]

    if "title" in article:
        ninjs["headline"] = article["title"]

    if article.get("body_html"):
        ninjs["body_html"] = self.append_body_footer(article)

    if article.get("description"):
        ninjs["description_html"] = self.append_body_footer(article)

    if article.get("place"):
        ninjs["place"] = self._format_place(article)

    if article.get("profile"):
        ninjs["profile"] = self._format_profile(article["profile"])

    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(
                article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(
                    article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(
                article, subscriber)
    # NOTE(review): this branch tests `and recursive` inside the non-recursive
    # path, so it appears unreachable — confirm against the original control flow.
    elif article.get(ASSOCIATIONS) and recursive:
        ninjs[ASSOCIATIONS], extra_items = self._format_related(
            article, subscriber)
    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get("embargoed"):
        ninjs["embargoed"] = article["embargoed"].isoformat()
    if article.get(
            EMBARGO):  # embargo set in superdesk overrides ingested one
        ninjs["embargoed"] = get_utc_schedule(article, EMBARGO).isoformat()

    # NINJS priority defaults to 5 when unset
    if article.get("priority"):
        ninjs["priority"] = article["priority"]
    else:
        ninjs["priority"] = 5

    if article.get("subject"):
        ninjs["subject"] = self._get_subject(article)

    if article.get("anpa_category"):
        ninjs["service"] = self._get_service(article)

    if article.get("renditions"):
        ninjs["renditions"] = self._get_renditions(article)
    elif "url" in article:
        ninjs["renditions"] = self._generate_renditions(article)

    if "order" in article:
        ninjs["order"] = article["order"]

    # SDPA-317
    if "abstract" in article:
        abstract = article.get("abstract", "")
        ninjs["description_html"] = abstract
        ninjs["description_text"] = text_utils.get_text(abstract)
    elif article.get("description_text"):
        ninjs["description_text"] = article.get("description_text")

    if article.get("company_codes"):
        ninjs["organisation"] = [{
            "name": c.get("name", ""),
            "rel": "Securities Identifier",
            "symbols": [{
                "ticker": c.get("qcode", ""),
                "exchange": c.get("security_exchange", "")
            }],
        } for c in article["company_codes"]]
    elif "company" in article:
        ninjs["organisation"] = [{"name": article["company"]}]

    if article.get("rewrite_of"):
        ninjs["evolvedfrom"] = article["rewrite_of"]

    # fall back to vocabulary rights info when none was copied from the article
    if not ninjs.get("copyrightholder") and not ninjs.get(
            "copyrightnotice") and not ninjs.get("usageterms"):
        ninjs.update(
            superdesk.get_resource_service("vocabularies").get_rightsinfo(
                article))

    if article.get("genre"):
        ninjs["genre"] = self._get_genre(article)

    if article.get("flags", {}).get("marked_for_legal"):
        ninjs["signal"] = self._format_signal_cwarn()

    if article.get("signal"):
        ninjs.setdefault("signal", []).extend(
            [self._format_signal(signal) for signal in article["signal"]])

    if article.get("attachments"):
        ninjs["attachments"] = self._format_attachments(article)

    # derive counts/read time from whichever body representation is present
    if ninjs["type"] == CONTENT_TYPE.TEXT and ("body_html" in ninjs
                                               or "body_text" in ninjs):
        if "body_html" in ninjs:
            body_html = ninjs["body_html"]
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count,
                                                   article.get("language"))
        else:
            body_text = ninjs["body_text"]
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count,
                                                   article.get("language"))
        ninjs["charcount"] = char_count
        ninjs["wordcount"] = word_count
        ninjs["readtime"] = readtime

    if article.get("authors"):
        ninjs["authors"] = self._format_authors(article)

    if (article.get("schedule_settings") or {}).get("utc_publish_schedule"):
        ninjs["publish_schedule"] = article["schedule_settings"][
            "utc_publish_schedule"]

    # set description for custom embed field
    if article.get("extra"):
        ninjs["extra"] = article["extra"]
        for key, value in ninjs["extra"].items():
            if type(value) == dict and "embed" in value:
                value.setdefault("description", "")

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build a NINJS dict representation of *article* for *subscriber*.

    :param dict article: superdesk article to convert
    :param subscriber: destination subscriber (passed through to association
        formatting helpers)
    :param bool recursive: when True, composite packages pull in their
        association items; when False only directly attached associations
        are formatted
    :return dict: the NINJS representation
    """
    # Mandatory top-level fields; guid falls back to 'uri' when GUID is absent.
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # Fields copied 1:1 when present (None values are skipped, falsy ones kept).
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    # Media items without body text fall back to their alt text.
    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    # Associations: packages get their member items resolved (recursively),
    # plain items only format their directly related content.
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS].update(self._format_related(article, subscriber))
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # Default priority is 5 when unset/zero.
    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    # Company codes become NINJS organisations with securities symbols.
    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    # Fill in default rights info only when none of the rights fields are set.
    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    # Word/char counts and read time for text items with some body content.
    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build a NINJS dict representation of *article* for *subscriber*.

    :param dict article: superdesk article to convert
    :param subscriber: destination subscriber (passed to association helpers)
    :param bool recursive: when True, composite packages pull in their
        association items; when False only directly attached associations
        are formatted
    :return dict: the NINJS representation
    """
    # Mandatory top-level fields; guid falls back to 'uri' when GUID is absent.
    ninjs = {
        "guid": article.get(GUID_FIELD, article.get("uri")),
        "version": str(article.get(config.VERSION, 1)),
        "type": self._get_type(article),
    }

    # Byline is best-effort: a failure to derive it must not abort formatting.
    # Fix: was a bare `except:` which also swallowed SystemExit /
    # KeyboardInterrupt; narrowed to Exception (PEP 8 E722).
    try:
        ninjs["byline"] = self._get_byline(article)
    except Exception:
        pass

    located = article.get("dateline", {}).get("located", {})
    if located:
        ninjs["located"] = located.get("city", "")

    # Fields copied 1:1 when present (None values are skipped, falsy ones kept).
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    # Media items without body text fall back to their alt text.
    if "body_text" not in article and "alt_text" in article:
        ninjs["body_text"] = article["alt_text"]

    if "title" in article:
        ninjs["headline"] = article["title"]

    if article.get("body_html"):
        ninjs["body_html"] = self.append_body_footer(article)

    if article.get("description"):
        ninjs["description_html"] = self.append_body_footer(article)

    # Associations: packages get their member items resolved (recursively),
    # plain items only format their directly related content.
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs["associations"] = self._get_associations(article, subscriber)
            if "associations" in article:
                ninjs["associations"].update(self._format_related(article, subscriber))
        elif article.get("associations", {}):
            ninjs["associations"] = self._format_related(article, subscriber)
    elif article.get("associations"):
        ninjs["associations"] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs["embargoed"] = get_utc_schedule(article, EMBARGO).isoformat()

    # Default priority is 5 when unset/zero.
    if article.get("priority"):
        ninjs["priority"] = article["priority"]
    else:
        ninjs["priority"] = 5

    if article.get("subject"):
        ninjs["subject"] = self._get_subject(article)

    if article.get("anpa_category"):
        ninjs["service"] = self._get_service(article)

    if article.get("renditions"):
        ninjs["renditions"] = self._get_renditions(article)
    elif "url" in article:
        ninjs["renditions"] = self._generate_renditions(article)

    # SDPA-317
    if article.get("abstract"):
        abstract = article.get("abstract")
        ninjs["description_html"] = abstract
        soup = BeautifulSoup(abstract, "html.parser")
        ninjs["description_text"] = soup.get_text()
    elif article.get("description_text"):
        ninjs["description_text"] = article.get("description_text")

    # Company codes become NINJS organisations with securities symbols.
    if article.get("company_codes"):
        ninjs["organisation"] = [
            {
                "name": c.get("name", ""),
                "rel": "Securities Identifier",
                "symbols": [{"ticker": c.get("qcode", ""), "exchange": c.get("security_exchange", "")}],
            }
            for c in article["company_codes"]
        ]
    elif "company" in article:
        ninjs["organisation"] = [{"name": article["company"]}]

    return ninjs
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build a NINJS dict representation of *article* for *subscriber*.

    This variant expects ``_format_related`` to return a
    ``(associations, extra_items)`` pair; extra items are merged into the
    ``EXTRA_ITEMS`` key of the output.

    :param dict article: superdesk article to convert
    :param subscriber: destination subscriber (passed to association helpers)
    :param bool recursive: when True, composite packages pull in their
        association items; when False only directly attached associations
        are formatted
    :return dict: the NINJS representation
    """
    # Mandatory top-level fields; guid falls back to 'uri' when GUID is absent.
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article)
    }

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # Fields copied 1:1 when present (None values are skipped, falsy ones kept).
    for copy_property in self.direct_copy_properties:
        if article.get(copy_property) is not None:
            ninjs[copy_property] = article[copy_property]

    # Media items without body text fall back to their alt text.
    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_place(article)

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    # Associations: packages get their member items resolved (recursively),
    # plain items only format their directly related content; any extra
    # items produced by related formatting are collected separately.
    extra_items = None
    if recursive:
        if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
            if article.get(ASSOCIATIONS):
                associations, extra_items = self._format_related(article, subscriber)
                ninjs[ASSOCIATIONS].update(associations)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)

    if extra_items:
        ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # Default priority is 5 when unset/zero.
    if article.get('priority'):
        ninjs['priority'] = article['priority']
    else:
        ninjs['priority'] = 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    if 'abstract' in article:
        abstract = article.get('abstract', '')
        ninjs['description_html'] = abstract
        ninjs['description_text'] = text_utils.get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    # Company codes become NINJS organisations with securities symbols.
    if article.get('company_codes'):
        ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                  'symbols': [{'ticker': c.get('qcode', ''),
                                               'exchange': c.get('security_exchange', '')}]}
                                 for c in article['company_codes']]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    # Fill in default rights info only when none of the rights fields are set.
    if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
        ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

    if 'genre' in article:
        ninjs['genre'] = self._get_genre(article)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    # Word/char counts and read time for text items with some body content.
    if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
        if 'body_html' in ninjs:
            body_html = ninjs['body_html']
            word_count = text_utils.get_word_count(body_html)
            char_count = text_utils.get_char_count(body_html)
            readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
        else:
            body_text = ninjs['body_text']
            word_count = text_utils.get_text_word_count(body_text)
            char_count = len(body_text)
            readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
        ninjs['charcount'] = char_count
        ninjs['wordcount'] = word_count
        ninjs['readtime'] = readtime

    if article.get('authors'):
        ninjs['authors'] = self._format_authors(article)

    return ninjs
def format(self, article, subscriber):
    """
    Constructs a dictionary that represents the parameters passed to the IPNews InsertNews stored procedure

    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    """
    try:
        docs = []
        # One output document is produced per ANPA category on the article.
        for category in article.get('anpa_category'):
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            # Single quotes are doubled throughout for the SQL stored procedure.
            odbc_item = {'originator': article.get('source', None), 'sequence': pub_seq_num,
                         'category': category.get('qcode'),
                         'headline': article.get('headline', '').replace('\'', '\'\''),
                         'author': article.get('byline', '').replace('\'', '\'\''),
                         'keyword': self.append_legal(article=article, truncate=True).replace('\'', '\'\''),
                         'subject_reference': set_subject(category, article)}
            # Decompose the 8-digit IPTC subject reference into subject /
            # subject_matter / subject_detail lookups.
            if 'subject_reference' in odbc_item and odbc_item['subject_reference'] is not None \
                    and odbc_item['subject_reference'] != '00000000':
                odbc_item['subject'] = subject_codes[odbc_item['subject_reference'][:2] + '000000']
                if odbc_item['subject_reference'][2:5] != '000':
                    odbc_item['subject_matter'] = subject_codes[odbc_item['subject_reference'][:5] + '000']
                else:
                    odbc_item['subject_matter'] = ''
                if not odbc_item['subject_reference'].endswith('000'):
                    odbc_item['subject_detail'] = subject_codes[odbc_item['subject_reference']]
                else:
                    odbc_item['subject_detail'] = ''
            else:
                odbc_item['subject_reference'] = '00000000'
            odbc_item['take_key'] = article.get('anpa_take_key', '').replace('\'', '\'\'')  # @take_key
            odbc_item['usn'] = article.get('unique_id', None)  # @usn
            if article[ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED:  # @article_text
                odbc_item['article_text'] = self.append_body_footer(article).replace('\'', '\'\'')
            elif article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                # Convert HTML body to the legacy wire text format: each
                # paragraph starts with \x19\r\n and long lines are wrapped
                # at 80 characters with ' \r\n' continuations.
                soup = BeautifulSoup(self.append_body_footer(article), "html.parser")
                text = StringIO()
                for p in soup.findAll('p'):
                    text.write('\x19\r\n')
                    ptext = p.get_text('\n')
                    for l in ptext.split('\n'):
                        if len(l) > 80:
                            text.write(textwrap.fill(l, 80).replace('\n', ' \r\n'))
                        else:
                            text.write(l + ' \r\n')
                odbc_item['article_text'] = text.getvalue().replace('\'', '\'\'')
            if 'genre' in article and len(article['genre']) >= 1:
                odbc_item['genre'] = article['genre'][0].get('name', None)
            else:
                odbc_item['genre'] = 'Current'  # @genre
            if article.get(ITEM_TYPE, CONTENT_TYPE.TEXT) == CONTENT_TYPE.TEXT:
                odbc_item['texttab'] = 'x'
            elif article.get(ITEM_TYPE, None) == CONTENT_TYPE.PREFORMATTED:
                odbc_item['texttab'] = 't'
            odbc_item['wordcount'] = article.get('word_count', None)  # @wordcount
            odbc_item['news_item_type'] = 'News'
            odbc_item['priority'] = map_priority(article.get('priority'))  # @priority
            odbc_item['service_level'] = 'a'  # @service_level
            odbc_item['fullStory'] = 1
            odbc_item['ident'] = '0'  # @ident
            # Map selector codes and locator-based headline prefixes per category.
            SelectorcodeMapper().map(article, category.get('qcode').upper(),
                                     subscriber=subscriber,
                                     formatted_item=odbc_item)
            headline_prefix = LocatorMapper().map(article, category.get('qcode').upper())
            if headline_prefix:
                odbc_item['headline'] = '{}:{}'.format(headline_prefix, odbc_item['headline'])
            # Embargoed content gets a timestamp banner prepended to the body.
            if article.get(EMBARGO):
                embargo = '{}{}'.format('Embargo Content. Timestamp: ',
                                        get_utc_schedule(article, EMBARGO).isoformat())
                odbc_item['article_text'] = embargo + odbc_item['article_text']
            docs.append((pub_seq_num, json.dumps(odbc_item)))
        return docs
    except Exception as ex:
        raise FormatterError.AAPIpNewsFormatterError(ex, subscriber)
def _validate_associated_items(self, original_item, updates=None, validation_errors=None):
    """Validates associated items.

    This function will ensure that the unpublished content validates and none of
    the content is locked, also do not allow any killed or recalled or spiked content.

    :param dict original_item: item whose associations (and, for packages, residrefs)
        are validated
    :param dict updates: pending updates merged over the original associations
    :param list validation_errors: validation errors are appended if there are any.
    """
    if validation_errors is None:
        validation_errors = []
    if updates is None:
        updates = {}

    # merge associations
    associations = deepcopy(original_item.get(ASSOCIATIONS, {}))
    associations.update(updates.get(ASSOCIATIONS, {}))

    items = [value for value in associations.values()]
    if original_item[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE and self.publish_type == ITEM_PUBLISH:
        items.extend(self.package_service.get_residrefs(original_item))

    main_publish_schedule = get_utc_schedule(updates, PUBLISH_SCHEDULE) or get_utc_schedule(
        original_item, PUBLISH_SCHEDULE
    )

    for item in items:
        orig = None
        # Associations may be embedded dicts (with an _id) or plain id references.
        # Fix: was `type(item) == dict`; isinstance is the idiomatic type check.
        if isinstance(item, dict) and item.get(config.ID_FIELD):
            doc = item
            orig = super().find_one(req=None, _id=item[config.ID_FIELD])
            if not app.settings.get("COPY_METADATA_FROM_PARENT") and orig:
                doc = orig
            # Carry over the archive lock owner; orig may be None or lack the key.
            try:
                doc.update({"lock_user": orig["lock_user"]})
            except (TypeError, KeyError):
                pass
        elif item:
            doc = super().find_one(req=None, _id=item)
        else:
            continue

        if not doc:
            continue
        if not orig:
            orig = doc.copy()

        # Packages validate their members recursively.
        if original_item[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
            self._validate_associated_items(doc, validation_errors=validation_errors)

        # make sure no items are killed or recalled or spiked
        # using the latest version of the item from archive
        doc_item_state = orig.get(ITEM_STATE, CONTENT_STATE.PUBLISHED)
        if (
            doc_item_state
            in {
                CONTENT_STATE.KILLED,
                CONTENT_STATE.RECALLED,
                CONTENT_STATE.SPIKED,
            }
            or (doc_item_state == CONTENT_STATE.SCHEDULED and main_publish_schedule is None)
        ):
            validation_errors.append(_("Item cannot contain associated {state} item.").format(state=doc_item_state))
        elif doc_item_state == CONTENT_STATE.SCHEDULED:
            # A scheduled association must not publish later than the main item.
            item_schedule = get_utc_schedule(orig, PUBLISH_SCHEDULE)
            if main_publish_schedule < item_schedule:
                validation_errors.append(_("Associated item is scheduled later than current item."))

        if doc.get(EMBARGO):
            validation_errors.append(_("Item cannot have associated items with Embargo"))

        # don't validate items that already have published
        if doc_item_state not in [CONTENT_STATE.PUBLISHED, CONTENT_STATE.CORRECTED]:
            validate_item = {"act": self.publish_type, "type": doc[ITEM_TYPE], "validate": doc}
            if isinstance(item, dict):
                validate_item["embedded"] = True
            errors = get_resource_service("validate").post([validate_item], headline=True, fields=True)[0]
            if errors[0]:
                pre_errors = [
                    _("Associated item {name} {error}").format(name=doc.get("slugline", ""), error=error)
                    for error in errors[0]
                ]
                validation_errors.extend(pre_errors)

        if config.PUBLISH_ASSOCIATED_ITEMS:
            # check the locks on the items
            # NOTE(review): indexes original_item["lock_user"] directly — raises
            # KeyError if the parent has no lock_user; confirm callers guarantee it.
            if doc.get("lock_user"):
                if original_item["lock_user"] != doc["lock_user"]:
                    validation_errors.extend(
                        [
                            "{}: {}".format(
                                doc.get("headline", doc["_id"]), _("packaged item is locked by another user")
                            )
                        ]
                    )
                else:
                    # Fix: was a redundant `elif original_item["lock_user"] == doc["lock_user"]`;
                    # the two branches are exhaustive, so plain else is equivalent.
                    validation_errors.extend(
                        [
                            "{}: {}".format(
                                doc.get("headline", doc["_id"]),
                                _("packaged item is locked by you. Unlock it and try again"),
                            )
                        ]
                    )
def format(self, article, subscriber):
    """
    Constructs a dictionary that represents the parameters passed to the IPNews InsertNews stored procedure

    :return: returns the sequence number of the subscriber and the constructed parameter dictionary
    """
    try:
        docs = []
        # One output document is produced per ANPA category on the article.
        for category in article.get('anpa_category'):
            pub_seq_num = superdesk.get_resource_service(
                'subscribers').generate_sequence_number(subscriber)
            # Single quotes are doubled throughout for the SQL stored procedure.
            odbc_item = {
                'originator': article.get('source', None),
                'sequence': pub_seq_num,
                'category': category.get('qcode'),
                'headline': article.get('headline', '').replace('\'', '\'\''),
                'author': article.get('byline', '').replace('\'', '\'\''),
                'keyword': self.append_legal(article=article, truncate=True).replace('\'', '\'\''),
                'subject_reference': set_subject(category, article)
            }
            # Decompose the 8-digit IPTC subject reference into subject /
            # subject_matter / subject_detail lookups.
            if 'subject_reference' in odbc_item and odbc_item['subject_reference'] is not None \
                    and odbc_item['subject_reference'] != '00000000':
                odbc_item['subject'] = subject_codes[
                    odbc_item['subject_reference'][:2] + '000000']
                if odbc_item['subject_reference'][2:5] != '000':
                    odbc_item['subject_matter'] = subject_codes[
                        odbc_item['subject_reference'][:5] + '000']
                else:
                    odbc_item['subject_matter'] = ''
                if not odbc_item['subject_reference'].endswith('000'):
                    odbc_item['subject_detail'] = subject_codes[
                        odbc_item['subject_reference']]
                else:
                    odbc_item['subject_detail'] = ''
            else:
                odbc_item['subject_reference'] = '00000000'
            odbc_item['take_key'] = article.get(
                'anpa_take_key', '').replace('\'', '\'\'')  # @take_key
            odbc_item['usn'] = article.get('unique_id', None)  # @usn
            if article[
                    ITEM_TYPE] == CONTENT_TYPE.PREFORMATTED:  # @article_text
                odbc_item['article_text'] = self.append_body_footer(
                    article).replace('\'', '\'\'')
            elif article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                # Convert HTML body to the legacy wire text format: each
                # paragraph starts with \x19\r\n and long lines are wrapped
                # at 80 characters with ' \r\n' continuations.
                soup = BeautifulSoup(self.append_body_footer(article),
                                     "html.parser")
                text = StringIO()
                for p in soup.findAll('p'):
                    text.write('\x19\r\n')
                    ptext = p.get_text('\n')
                    for l in ptext.split('\n'):
                        if len(l) > 80:
                            text.write(
                                textwrap.fill(l, 80).replace('\n', ' \r\n'))
                        else:
                            text.write(l + ' \r\n')
                odbc_item['article_text'] = text.getvalue().replace(
                    '\'', '\'\'')
            if 'genre' in article and len(article['genre']) >= 1:
                odbc_item['genre'] = article['genre'][0].get('name', None)
            else:
                odbc_item['genre'] = 'Current'  # @genre
            if article.get(ITEM_TYPE, CONTENT_TYPE.TEXT) == CONTENT_TYPE.TEXT:
                odbc_item['texttab'] = 'x'
            elif article.get(ITEM_TYPE, None) == CONTENT_TYPE.PREFORMATTED:
                odbc_item['texttab'] = 't'
            odbc_item['wordcount'] = article.get('word_count', None)  # @wordcount
            odbc_item['news_item_type'] = 'News'
            odbc_item['priority'] = map_priority(
                article.get('priority'))  # @priority
            odbc_item['service_level'] = 'a'  # @service_level
            odbc_item['fullStory'] = 1
            odbc_item['ident'] = '0'  # @ident
            # Map selector codes and locator-based headline prefixes per category.
            SelectorcodeMapper().map(article,
                                     category.get('qcode').upper(),
                                     subscriber=subscriber,
                                     formatted_item=odbc_item)
            headline_prefix = LocatorMapper().map(
                article, category.get('qcode').upper())
            if headline_prefix:
                odbc_item['headline'] = '{}:{}'.format(
                    headline_prefix, odbc_item['headline'])
            # Embargoed content gets a timestamp banner prepended to the body.
            if article.get(EMBARGO):
                embargo = '{}{}'.format(
                    'Embargo Content. Timestamp: ',
                    get_utc_schedule(article, EMBARGO).isoformat())
                odbc_item[
                    'article_text'] = embargo + odbc_item['article_text']
            docs.append((pub_seq_num, json.dumps(odbc_item)))
        return docs
    except Exception as ex:
        raise FormatterError.AAPIpNewsFormatterError(ex, subscriber)