def format(self, article, subscriber, codes=None):
    """Render an article as one plain-text document.

    The text is framed as a ``KEYWORD:`` line, a ``HEADLINE:`` line and the
    article text. Single quotes in all three values are doubled.

    :param dict article: item to format
    :param subscriber: passed to the formatter error when formatting fails
    :param codes: not used by this formatter
    :return: one-element list ``[(0, json_string)]``; the JSON object has the
        single key ``article_text``
    :raises: the error built by ``FormatterError.AAPTextFormatterError`` for
        any exception raised while formatting
    """
    try:
        headline = get_text(article.get('headline', ''), content='html')
        formatted_doc = {
            'headline': headline.replace('\'', '\'\'').replace('\xA0', ' '),
            'keyword': article.get('slugline', '').replace('\'', '\'\''),
        }

        is_last_take = self.is_last_take(article)

        def source_body():
            # The body footer is only appended on the last take.
            if is_last_take:
                return self.append_body_footer(article)
            return article.get('body_html', '')

        if article.get(FORMAT) == FORMATS.PRESERVED:
            preserved = get_text(source_body(), content='html')
            formatted_doc['article_text'] = preserved.replace('\'', '\'\'')
        elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
            wrapped = self.get_wrapped_text_content(to_ascii(source_body()))
            formatted_doc['article_text'] = wrapped.replace('\'', '\'\'')

        self.refine_article_body(formatted_doc, article)

        # Frame the text output according to AAP requirement
        framed = ''.join((
            'KEYWORD: ', formatted_doc.get('keyword', ''), '\r\n',
            'HEADLINE: ', formatted_doc.get('headline', ''), '\r\n',
            ' ', formatted_doc.get('article_text', ''),
        ))
        return [(0, json.dumps({'article_text': framed}))]
    except Exception as ex:
        raise FormatterError.AAPTextFormatterError(ex, subscriber)
def format(self, article, subscriber, codes=None):
    """Build the parameter dictionary for the SMS InsertAlerts stored procedure.

    :param dict article: item to format
    :param subscriber: subscriber used to generate the sequence number; also
        passed to the formatter error on failure
    :param codes: not used by this formatter
    :return: one-element list ``[(sequence_number, json_string)]``
    :raises: the error built by ``FormatterError.AAPSMSFormatterError`` for
        any exception raised while formatting
    """
    try:
        subscribers_service = superdesk.get_resource_service('subscribers')
        pub_seq_num = subscribers_service.generate_sequence_number(subscriber)

        # Fall back to the abstract when the item has no 'sms_message' key.
        raw_message = article.get('sms_message', article.get('abstract', ''))
        sms_message = raw_message.replace('\'', '\'\'')

        if superdesk.app.config.get('TEST_SMS_OUTPUT', True) is True:
            # category = 1 is used to indicate a test message
            category = '1'
        else:
            category = article.get('anpa_category', [{}])[0].get('qcode').upper()

        # Keys are inserted in the order they are serialised.
        odbc_item = {}
        odbc_item['Sequence'] = pub_seq_num
        odbc_item['Category'] = category
        odbc_item['Headline'] = get_text(sms_message, content='html')
        odbc_item['Priority'] = map_priority(article.get('priority'))

        story = self.append_body_footer(article)
        if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
            story = get_text(story, content='html')
        odbc_item['StoryText'] = story.replace('\'', '\'\'')  # @article_text
        odbc_item['ident'] = '0'

        return [(pub_seq_num, json.dumps(odbc_item))]
    except Exception as ex:
        raise FormatterError.AAPSMSFormatterError(ex, subscriber)
def format_for_source(self, article, subscriber, source, codes=None):
    """Build the parameters for the IPNews InsertNews stored procedure.

    One document is produced per category returned by
    ``self._get_category_list``. ``article['source']`` is overwritten with
    ``source`` as a side effect.

    :param dict article: item to format
    :param subscriber: subscriber the documents are produced for
    :param source: value written to ``article['source']``
    :param codes: selector codes handed on to ``get_odbc_item``
    :return: list of ``(sequence_number, json_string)`` tuples
    :raises: the error built by ``FormatterError.AAPIpNewsFormatterError``
        for any exception raised inside the formatting loop
    """
    pass_through = article.get('auto_publish', False)
    try:
        docs = []
        for category in self._get_category_list(article.get('anpa_category')):
            # All NZN sourced content is AAP content for the AAP output formatted
            article['source'] = source
            pub_seq_num, odbc_item = self.get_odbc_item(article, subscriber, category, codes, pass_through)
            is_last_take = self.is_last_take(article)

            if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                raw = self.append_body_footer(article) if is_last_take else article.get('body_html', '')
                text = get_text(raw, content='html')
                odbc_item['article_text'] = text.replace('\'', '\'\'')
                odbc_item['texttab'] = 't'
            elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
                raw = self.append_body_footer(article) if is_last_take else article.get('body_html', '')
                text = self.get_wrapped_text_content(to_ascii(raw)).replace('\'', '\'\'')
                # On the first part, inject the dateline text (never for pass-through items).
                wants_dateline = (self.is_first_part(article)
                                  and 'dateline' in article
                                  and 'text' in article.get('dateline', {})
                                  and not pass_through)
                # NOTE(review): body[3:] drops three characters although the tested
                # prefix is a single space - confirm the intended indent width.
                if wants_dateline and text.startswith(' '):
                    dateline_text = article.get('dateline').get('text').replace('\'', '\'\'')
                    text = ' {} {}'.format(dateline_text, text[3:])
                odbc_item['article_text'] = text
                odbc_item['texttab'] = 'x'

            if self.is_first_part(article) and not pass_through:
                self.add_ednote(odbc_item, article)
                self.add_byline(odbc_item, article)

            if is_last_take:
                odbc_item['article_text'] += '\r\n' + article.get('source', '')
            else:
                odbc_item['article_text'] += '\r\nMORE'

            sign_off = article.get('sign_off', '') or ''
            if sign_off:
                odbc_item['article_text'] += ' ' + sign_off

            odbc_item['service_level'] = get_service_level(category, article)  # @service_level
            odbc_item['wordcount'] = article.get('word_count') or 0  # @wordcount
            odbc_item['priority'] = map_priority(article.get('priority'))  # @priority

            docs.append((pub_seq_num, json.dumps(odbc_item)))
        return docs
    except Exception as ex:
        raise FormatterError.AAPIpNewsFormatterError(ex, subscriber)
def add_byline(self, odbc_item, article):
    """Prepend the article byline to ``odbc_item['article_text']``.

    Nothing happens when the article has no (or an empty) byline. Markup is
    stripped from the byline, and single quotes are doubled.

    :param dict odbc_item: output item; its ``article_text`` is updated in place
    :param dict article: item providing the ``byline``
    :return: None
    """
    raw_byline = article.get('byline')
    if not raw_byline:
        return

    byline = get_text(raw_byline, content='html')
    # Prefix "By " unless the first two characters already spell "by"
    # (case-insensitive); bylines shorter than three characters are left alone.
    already_prefixed = byline[:2].upper() == 'BY'
    if len(byline) >= 3 and not already_prefixed:
        byline = 'By ' + byline

    header = ' {}\r\n\r\n'.format(byline).replace('\'', '\'\'')
    odbc_item['article_text'] = header + odbc_item['article_text']
def get_odbc_item(self, article, subscriber, category, codes, pass_through=False):
    """Construct an odbc_item with the common key/value pairs populated.

    ``article['headline']`` is replaced by its markup-free text as a side
    effect. Single quotes in the text values are doubled.

    :param dict article: item being formatted
    :param subscriber: subscriber used to generate the sequence number
    :param dict category: category entry; its ``qcode`` is read
    :param codes: selector codes, joined with spaces; a single space is used
        when falsy
    :param bool pass_through: when true the keyword is taken from the
        article's slugline instead of ``SluglineMapper``
    :return: tuple ``(pub_seq_num, odbc_item)``
    """
    article['headline'] = get_text(article.get('headline', ''), content='html')
    subscribers_service = superdesk.get_resource_service('subscribers')
    pub_seq_num = subscribers_service.generate_sequence_number(subscriber)

    # Populate in a fixed order so the serialised key order stays stable.
    odbc_item = {}
    odbc_item['originator'] = article.get('source', None)
    odbc_item['sequence'] = pub_seq_num
    odbc_item['category'] = category.get('qcode').lower()
    odbc_item['author'] = get_text(article.get('byline', '') or '', content='html').replace('\'', '\'\'')

    if pass_through:
        keyword = article.get('slugline', '') or ''
    else:
        keyword = SluglineMapper().map(article=article,
                                       category=category.get('qcode').upper(),
                                       truncate=True)
    odbc_item['keyword'] = keyword.replace('\'', '\'\'')

    odbc_item['subject_reference'] = set_subject(category, article)
    odbc_item['take_key'] = (article.get('anpa_take_key', '') or '').replace('\'', '\'\'')

    has_genre = 'genre' in article and len(article['genre']) >= 1
    odbc_item['genre'] = article['genre'][0].get('name', None) if has_genre else 'Current'  # @genre

    odbc_item['news_item_type'] = 'News'
    odbc_item['fullStory'] = 1
    odbc_item['ident'] = '0'  # @ident
    odbc_item['selector_codes'] = ' '.join(codes) if codes else ' '

    located_headline = LocatorMapper().get_formatted_headline(article, category.get('qcode').upper())
    odbc_item['headline'] = to_ascii(located_headline).replace('\'', '\'\'').replace('\xA0', ' ')

    self.expand_subject_codes(odbc_item)
    self.set_usn(odbc_item, article)

    return pub_seq_num, odbc_item
def populate(item, **kwargs):
    """Populate the abstract field from the start of the body.

    :param dict item: item to update in place
    :param kwargs: ignored
    :return: the same ``item``
    """
    body = item.get('body_html', None)
    if not body:
        item['abstract'] = 'No body found to use for abstract...'
        return item

    # get the list of sentences of the body
    sentences = p.split(body)
    if sentences:
        # chop the first sentence to size for abstract (64)
        # NOTE(review): the cut is applied before markup is stripped, so tags
        # count towards the 64 characters - confirm this is intended.
        first_sentence = sentences[0]
        item['abstract'] = get_text(first_sentence[:64]).strip()
    return item
def _sanitize_fields(self, doc, validator): """If maxlength or minlength is specified in the validator then remove any markups from that field :param doc: Article to be validated :param validator: Validation rule :return: updated article """ fields_to_check = ['minlength', 'maxlength'] schema = validator.get('schema', {}) for field in schema: if doc.get(field) and schema.get(field) and any(k in schema[field] for k in fields_to_check): try: doc[field] = get_text(doc[field]) except (ValueError, TypeError): # fails for json fields like subject, genre pass
def _format_content(self, article, news_item, nitf):
    """Add the ``contentSet`` element to the news item.

    Preserved-format articles get a plain-text ``inlineData`` child; text and
    composite items get an ``inlineXML`` child wrapping ``nitf``. Any other
    item gets an empty ``contentSet``.

    :param dict article: item being formatted
    :param Element news_item: parent element receiving the ``contentSet``
    :param Element nitf: NITF element appended under ``inlineXML``
    """
    content_set = SubElement(news_item, 'contentSet')

    if article.get(FORMAT) == FORMATS.PRESERVED:
        # Resolve the text first so no child is created if that step fails.
        plain_text = get_text(self.append_body_footer(article))
        holder = SubElement(content_set, 'inlineData', attrib={'contenttype': 'text/plain'})
        holder.text = plain_text
        return

    if article[ITEM_TYPE] in (CONTENT_TYPE.TEXT, CONTENT_TYPE.COMPOSITE):
        wrapper = SubElement(content_set, 'inlineXML', attrib={'contenttype': 'application/nitf+xml'})
        wrapper.append(nitf)
def _sanitize_fields(self, doc, validator): """If maxlength or minlength is specified in the validator then remove any markups from that field :param doc: Article to be validated :param validator: Validation rule :return: updated article """ fields_to_check = ['minlength', 'maxlength'] schema = validator.get('schema', {}) for field in schema: if doc.get(field) and schema.get(field) and any( k in schema[field] for k in fields_to_check): try: doc[field] = get_text(doc[field]) except (ValueError, TypeError): # fails for json fields like subject, genre pass
def map_html_to_xml(self, element, html):
    """Map the html text tags to xml.

    Each element that is a direct child of ``body`` in the parsed html is
    appended to ``element`` as a ``<p>`` holding that element's ASCII text.

    :param element: The xml element to populate
    :param html: the html to parse the text from
    :return: None
    """
    cleaned = html.replace('<br>', '<br/>').replace('</br>', '')
    # Drop control characters, keeping only line feed and carriage return.
    cleaned = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', cleaned)
    cleaned = cleaned.replace('\n', ' ')
    # Collapse runs of whitespace to a single space.
    cleaned = re.sub(r'\s\s+', ' ', cleaned)

    tree = parse_html(cleaned, content='html')
    for node in tree.xpath('//*'):
        parent = node.getparent()
        if parent is None or parent.tag != 'body':
            continue
        paragraph = etree.Element('p')
        node_markup = to_string(node, method='html')
        paragraph.text = to_ascii(get_text(node_markup, content='html'))
        element.append(paragraph)
def format_for_source(self, article, subscriber, source, codes=None):
    """Build one output document per category for the given source.

    ``article['source']`` is overwritten with ``source`` as a side effect.
    The ``category`` and ``selector_codes`` values are upper-cased.

    :param dict article: item to format
    :param subscriber: subscriber the documents are produced for
    :param source: value written to ``article['source']`` and appended after
        the text of the last take
    :param codes: selector codes handed on to ``get_odbc_item``
    :return: list of ``(sequence_number, json_string)`` tuples
    :raises: the error built by ``FormatterError.AAPNewscentreFormatterError``
        for any exception raised while formatting
    """
    try:
        pass_through = article.get('auto_publish', False)
        docs = []
        for category in self._get_category_list(article.get('anpa_category')):
            article['source'] = source
            pub_seq_num, odbc_item = self.get_odbc_item(article, subscriber, category, codes, pass_through)
            is_last_take = self.is_last_take(article)

            if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                raw = self.append_body_footer(article) if is_last_take else article.get('body_html', '')
                text = get_text(raw, content='html')
            else:
                raw = self.append_body_footer(article) if is_last_take else article.get('body_html', '')
                text = self.get_text_content(to_ascii(raw))
                # On the first part, inject the dateline text (never for pass-through items).
                wants_dateline = (self.is_first_part(article)
                                  and 'dateline' in article
                                  and 'text' in article.get('dateline', {})
                                  and not pass_through)
                # NOTE(review): body[3:] drops three characters although the tested
                # prefix is a single space - confirm the intended indent width.
                if wants_dateline and text.startswith(' '):
                    text = ' {} {}'.format(article.get('dateline').get('text'), text[3:])
            odbc_item['article_text'] = text.replace('\'', '\'\'')

            if self.is_first_part(article) and not pass_through:
                self.add_ednote(odbc_item, article)
                self.add_byline(odbc_item, article)

            if is_last_take:
                odbc_item['article_text'] += '\r\n' + source
            else:
                odbc_item['article_text'] += '\r\nMORE'

            sign_off = article.get('sign_off', '') or ''
            if sign_off:
                odbc_item['article_text'] += ' ' + sign_off

            for key in ('category', 'selector_codes'):
                odbc_item[key] = odbc_item.get(key, '').upper()

            docs.append((pub_seq_num, json.dumps(odbc_item)))
        return docs
    except Exception as ex:
        raise FormatterError.AAPNewscentreFormatterError(ex, subscriber)
def append_body_footer(self, article):
    """Return the article body with its body footer appended.

    Text and preformatted items use ``body_html``; audio, picture and video
    items use ``description``; other item types yield an empty body. The
    footer is only added when both the body and ``body_footer`` are truthy.

    For preserved-format items line endings are normalised to ``\\r\\n``,
    every ``<br>`` becomes a ``\\r\\n`` and the footer is added as plain text
    on its own line.

    Side effect: ``<br>`` in ``article['body_html']`` is rewritten to
    ``<br/>`` in place.

    :param dict article: item to read; must contain the item type key
    :return: body with the footer appended (whatever the selected field
        holds when it is falsy)
    """
    # Only normalise when there is a string to normalise: items may carry
    # ``body_html: None``, which must not raise here.
    body_html = article.get('body_html')
    if isinstance(body_html, str):
        article['body_html'] = body_html.replace('<br>', '<br/>')

    item_type = article[ITEM_TYPE]
    body = ''
    if item_type in (CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED):
        body = article.get('body_html', '')
    elif item_type in (CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO):
        body = article.get('description', '')

    is_preserved = article.get(FORMAT, '') == FORMATS.PRESERVED

    if body and is_preserved:
        # '\n' -> '\r\n'; an existing '\r\n' becomes '\r\r\n' and is collapsed back.
        body = body.replace('\n', '\r\n').replace('\r\r', '\r')
        parsed = parse_html(body, content='html')
        for br in parsed.xpath('//br'):
            # Move the line break into the tail so it survives removal of the tag.
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)
        body = etree.tostring(parsed, encoding="unicode")

    footer = article.get('body_footer')
    if body and footer:
        if is_preserved:
            body = '{}\r\n{}'.format(body, get_text(footer))
        else:
            body = '{}{}'.format(body, footer)
    return body
def ap_weather_format(item, **kwargs):
    """Reformat an AP global weather (Celsius) story into a fixed-width table.

    The item is updated in place: slugline, dateline, headline, subject and
    place are set, and ``body_html`` is replaced by a ``<pre>`` block holding
    a preamble plus the mapped table columns padded to a common width.

    :param dict item: AP sourced item whose slugline starts with
        ``WEA--GlobalWeather-Ce``
    :param kwargs: ignored
    :return: the same ``item``
    :raises: the error built by ``SuperdeskApiError.badRequestError`` when the
        item is not an AP weather table, or its text is empty or does not
        start with the Celsius header line
    """
    is_weather_slug = item.get('slugline', '').startswith('WEA--GlobalWeather-Ce')
    if not is_weather_slug or item.get('source', '') != 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    # An empty body has no header line either; reject it instead of raising IndexError.
    if not lines or lines[0] != 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # map of the columns to extract and the substitutions to apply to the column
    column_map = ({'index': 0}, {'index': 1}, {'index': 2},
                  {'index': 3, 'substitute': [('COND', 'CONDITIONS'), ('pc', 'partly cloudy'),
                                              ('clr', 'clear'), ('cdy', 'cloudy'),
                                              ('rn', 'rain'), ('sn', 'snow')]})
    # column index -> substitutions; only indexes present here are written out
    substitutions = {entry['index']: entry.get('substitute', ()) for entry in column_map}

    def _substitute(index, col):
        """Apply, in order, the substitutions mapped to the column at index."""
        for old, new in substitutions.get(index, ()):
            col = col.replace(old, new)
        return col

    # story preamble
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()
    output.write(preamble)

    # story is always datelined New York
    city = 'New York City'
    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    dateline = item.setdefault('dateline', {})
    dateline['located'] = located[0] if located else {'city_code': city, 'city': city,
                                                      'tz': 'UTC', 'dateline': 'city'}
    first_created = get_date(item['firstcreated'])
    dateline['date'] = datetime.fromtimestamp(first_created.timestamp(),
                                              tz=timezone(dateline['located']['tz']))
    dateline['source'] = 'AP'
    dateline['text'] = format_dateline_to_locmmmddsrc(dateline['located'], first_created,
                                                      source=item.get('original_source', 'AP'))

    # Build "Mon D" without the glibc-only '%-d' directive.
    local_date = dateline['date']
    item['headline'] = 'World Weather for ' + '{} {}'.format(local_date.strftime('%b'), local_date.day)
    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in (locator_map or {}).get('items', []) if x['qcode'] == 'US']

    # Split every line once; only lines with more than two cells are table rows.
    cell_splitter = re.compile(r'[;<]+')
    table_rows = []
    for line in lines:
        cells = cell_splitter.split(line)
        if len(cells) > 2:
            table_rows.append([_substitute(index, col) for index, col in enumerate(cells)])

    # tabular column max lengths (measured after substitution)
    columns = []
    for cells in table_rows:
        for index, col in enumerate(cells):
            if index < len(columns):
                columns[index] = max(columns[index], len(col))
            else:
                columns.append(len(col))

    for cells in table_rows:
        for index, col in enumerate(cells):
            if index in substitutions:
                output.write('{}'.format(col.lstrip('\t').ljust(columns[index] + 2)).rstrip('\r\n'))
        output.write('\r\n')

    item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build the ninjs dictionary for an article.

    :param dict article: item to transform
    :param subscriber: subscriber handed on to the association helpers
    :param bool recursive: when true, composite items get their associations
        from ``self._get_associations``
    :return: the ninjs dictionary
    """
    ninjs = {}
    ninjs['guid'] = article.get(GUID_FIELD, article.get('uri'))
    ninjs['version'] = str(article.get(config.VERSION, 1))
    ninjs['type'] = self._get_type(article)

    if article.get('byline'):
        ninjs['byline'] = article['byline']

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # Properties copied verbatim when the article has a non-None value.
    ninjs.update({name: article[name]
                  for name in self.direct_copy_properties
                  if article.get(name) is not None})

    if 'alt_text' in article and 'body_text' not in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    if article.get('place'):
        ninjs['place'] = self._format_qcodes(article['place'])

    if article.get('profile'):
        ninjs['profile'] = self._format_profile(article['profile'])

    # The item type is only inspected when recursing.
    if recursive and article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
        ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
        if article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS].update(self._format_related(article, subscriber))
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # A missing or falsy priority becomes 5.
    ninjs['priority'] = article.get('priority') or 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    abstract = article.get('abstract', '')
    if abstract:
        ninjs['description_html'] = abstract
        ninjs['description_text'] = get_text(abstract)
    elif article.get('description_text'):
        ninjs['description_text'] = article.get('description_text')

    if article.get('company_codes'):
        organisations = []
        for company in article['company_codes']:
            symbol = {'ticker': company.get('qcode', ''),
                      'exchange': company.get('security_exchange', '')}
            organisations.append({'name': company.get('name', ''),
                                  'rel': 'Securities Identifier',
                                  'symbols': [symbol]})
        ninjs['organisation'] = organisations
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    if article.get('rewrite_of'):
        ninjs['evolvedfrom'] = article['rewrite_of']

    # Fall back to the vocabulary rights info when none was set above.
    rights_keys = ('copyrightholder', 'copyrightnotice', 'usageterms')
    if not any(ninjs.get(key) for key in rights_keys):
        vocabularies = superdesk.get_resource_service('vocabularies')
        ninjs.update(vocabularies.get_rightsinfo(article))

    if article.get('genre'):
        ninjs['genre'] = self._format_qcodes(article['genre'])

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    return ninjs
def get_value(self, article):
    """Return the field's value as single-line plain text.

    :param article: mapping holding the value under ``self.field.name``
    :return: the value with markup removed and newlines replaced by spaces,
        or the raw value when the text extraction raises
        ``etree.XMLSyntaxError`` or ``ValueError``
    """
    raw_value = article[self.field.name]
    try:
        plain = get_text(raw_value)
        return plain.replace('\n', ' ')
    except (etree.XMLSyntaxError, ValueError):
        return raw_value
def plaintext_filter(value):
    """Filter out html from value.

    :param value: markup to convert
    :return: text on a single line (newlines become spaces) with surrounding
        whitespace stripped
    """
    text = get_text(value)
    single_line = text.replace('\n', ' ')
    return single_line.strip()
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """Build the ninjs dictionary for an article, including attachments.

    :param dict article: item to transform
    :param subscriber: subscriber handed on to the association helpers
    :param bool recursive: when true, composite items get their associations
        from ``self._get_associations``
    :return: the ninjs dictionary
    """
    ninjs = {
        'guid': article.get(GUID_FIELD, article.get('uri')),
        'version': str(article.get(config.VERSION, 1)),
        'type': self._get_type(article),
    }

    byline = article.get('byline')
    if byline:
        ninjs['byline'] = byline

    located = article.get('dateline', {}).get('located', {})
    if located:
        ninjs['located'] = located.get('city', '')

    # Copy over every direct property that has a non-None value.
    for name in self.direct_copy_properties:
        value = article.get(name)
        if value is not None:
            ninjs[name] = value

    if 'body_text' not in article and 'alt_text' in article:
        ninjs['body_text'] = article['alt_text']

    if 'title' in article:
        ninjs['headline'] = article['title']

    if article.get('body_html'):
        ninjs['body_html'] = self.append_body_footer(article)

    if article.get('description'):
        ninjs['description_html'] = self.append_body_footer(article)

    place = article.get('place')
    if place:
        ninjs['place'] = self._format_qcodes(place)

    profile = article.get('profile')
    if profile:
        ninjs['profile'] = self._format_profile(profile)

    # The item type is only inspected when recursing.
    if recursive and article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
        ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
        if article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS].update(self._format_related(article, subscriber))
    elif article.get(ASSOCIATIONS):
        ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

    if article.get(EMBARGO):
        ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

    # A missing or falsy priority becomes 5.
    priority = article.get('priority')
    ninjs['priority'] = priority if priority else 5

    if article.get('subject'):
        ninjs['subject'] = self._get_subject(article)

    if article.get('anpa_category'):
        ninjs['service'] = self._get_service(article)

    if article.get('renditions'):
        ninjs['renditions'] = self._get_renditions(article)
    elif 'url' in article:
        ninjs['renditions'] = self._generate_renditions(article)

    # SDPA-317
    abstract = article.get('abstract', '')
    description_text = article.get('description_text')
    if abstract:
        ninjs['description_html'] = abstract
        ninjs['description_text'] = get_text(abstract)
    elif description_text:
        ninjs['description_text'] = description_text

    if article.get('company_codes'):
        ninjs['organisation'] = [
            {
                'name': code.get('name', ''),
                'rel': 'Securities Identifier',
                'symbols': [{
                    'ticker': code.get('qcode', ''),
                    'exchange': code.get('security_exchange', ''),
                }],
            }
            for code in article['company_codes']
        ]
    elif 'company' in article:
        ninjs['organisation'] = [{'name': article['company']}]

    rewrite_of = article.get('rewrite_of')
    if rewrite_of:
        ninjs['evolvedfrom'] = rewrite_of

    # Fall back to the vocabulary rights info when none was set above.
    has_rights = (ninjs.get('copyrightholder')
                  or ninjs.get('copyrightnotice')
                  or ninjs.get('usageterms'))
    if not has_rights:
        vocabularies = superdesk.get_resource_service('vocabularies')
        ninjs.update(vocabularies.get_rightsinfo(article))

    genre = article.get('genre')
    if genre:
        ninjs['genre'] = self._format_qcodes(genre)

    if article.get('flags', {}).get('marked_for_legal'):
        ninjs['signal'] = self._format_signal_cwarn()

    if article.get('attachments'):
        ninjs['attachments'] = self._format_attachments(article)

    return ninjs
def _format_body_content(self, article, body_content):
    """Write the article body (with footer) into ``body_content``.

    Preserved-format articles are written as plain text in a single ``pre``
    child; everything else goes through ``self.map_html_to_xml``.

    :param dict article: item being formatted
    :param body_content: element that receives the body
    :return: None
    """
    if article.get(FORMAT) != FORMATS.PRESERVED:
        self.map_html_to_xml(body_content, self.append_body_footer(article))
        return

    # Resolve the text first so no child is created if that step fails.
    plain_text = get_text(self.append_body_footer(article))
    pre_element = SubElement(body_content, 'pre')
    pre_element.text = plain_text