def format(self, article, subscriber, codes=None):
        """Render the article as a framed plain-text document.

        The slugline, headline and body are reduced to text, every single quote
        is doubled, and the parts are joined as ``KEYWORD:`` / ``HEADLINE:`` /
        body, separated by CRLF.

        :param article: article to format
        :param subscriber: only used when reporting a failure
        :param codes: not used by this method
        :return: one-element list ``[(0, json_string)]``; the JSON object stores
            the framed text under ``article_text``
        :raises: ``FormatterError.AAPTextFormatterError(ex, subscriber)`` for any
            exception raised while formatting
        """
        try:
            formatted_doc = {}
            headline = get_text(article.get('headline', ''), content='html')
            formatted_doc['headline'] = headline.replace('\'', '\'\'').replace('\xA0', ' ')
            formatted_doc['keyword'] = article.get('slugline', '').replace('\'', '\'\'')

            # body formatting: the footer is only appended on the last take
            is_last_take = self.is_last_take(article)
            if article.get(FORMAT) == FORMATS.PRESERVED:
                body = get_text(
                    self.append_body_footer(article) if is_last_take else
                    article.get('body_html', ''),
                    content='html')
                formatted_doc['article_text'] = body.replace('\'', '\'\'')
            elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
                body = self.get_wrapped_text_content(
                    to_ascii(self.append_body_footer(article) if is_last_take
                             else article.get('body_html', ''))).replace('\'', '\'\'')
                formatted_doc['article_text'] = body

            self.refine_article_body(formatted_doc, article)

            # Frame the text output according to AAP requirement
            formatted_output = 'KEYWORD: ' + formatted_doc.get('keyword', '') + '\r\n'
            formatted_output += 'HEADLINE: ' + formatted_doc.get('headline', '') + '\r\n'
            formatted_output += '   ' + formatted_doc.get('article_text', '')

            return [(0, json.dumps({'article_text': formatted_output}))]
        except Exception as ex:
            raise FormatterError.AAPTextFormatterError(ex, subscriber)
    def format(self, article, subscriber, codes=None):
        # NOTE(review): the next two lines read like docstring text whose triple-quote
        # delimiters were lost; as written they are not valid Python.
        Constructs a dictionary that represents the parameters passed to the SMS InsertAlerts stored procedure
        :return: returns the sequence number of the subscriber and the constructed parameter dictionary
        # NOTE(review): the block ends with an `except` clause and the body below is indented one
        # level deeper, so a `try:` line presumably belongs here - confirm against the original.
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            # Falls back to the abstract when there is no sms_message; single quotes are doubled.
            sms_message = article.get('sms_message', article.get('abstract', '')).replace('\'', '\'\'')

            # category = 1 is used to indicate a test message
            # NOTE(review): the expression before 'TEST_SMS_OUTPUT' is truncated (unbalanced
            # parenthesis); presumably a settings lookup defaulting to True - confirm.
            category = '1' if'TEST_SMS_OUTPUT', True) is True \
                else article.get('anpa_category', [{}])[0].get('qcode').upper()

            odbc_item = {'Sequence': pub_seq_num, 'Category': category,
                         'Headline': get_text(sms_message, content='html'),
                         'Priority': map_priority(article.get('priority'))}

            body = self.append_body_footer(article)

            # Only text items have their body passed through get_text.
            if article[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                body = get_text(body, content='html')

            odbc_item['StoryText'] = body.replace('\'', '\'\'')  # @article_text
            odbc_item['ident'] = '0'

            # The item is returned JSON encoded, paired with the sequence number.
            return [(pub_seq_num, json.dumps(odbc_item))]
        except Exception as ex:
            raise FormatterError.AAPSMSFormatterError(ex, subscriber)
    def format_for_source(self, article, subscriber, source, codes=None):
        """Constructs a dictionary that represents the parameters passed to the IPNews InsertNews stored procedure

        One output document is built for every category yielded by ``_get_category_list``.

        :type article: object
        :param subscriber: passed on to ``get_odbc_item`` and used when reporting a failure
        :param source: value stored in ``article['source']`` before each document is built
        :param codes: passed on to ``get_odbc_item``
        :return: returns the sequence number of the subscriber and the constructed parameter dictionary
            (JSON encoded), one tuple per category
        :raises: ``FormatterError.AAPIpNewsFormatterError(ex, subscriber)`` for any exception
        """
        pass_through = article.get('auto_publish', False)
        try:
            docs = []
            for category in self._get_category_list(article.get('anpa_category')):
                # All NZN sourced content is AAP content for the AAP output formatted
                article['source'] = source
                pub_seq_num, odbc_item = self.get_odbc_item(article, subscriber, category, codes, pass_through)
                # determine if this is the last take
                is_last_take = self.is_last_take(article)

                if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                    body = get_text(
                        self.append_body_footer(article) if is_last_take else
                        article.get('body_html', ''), content='html')
                    odbc_item['article_text'] = body.replace('\'', '\'\'')
                    odbc_item['texttab'] = 't'
                elif article.get(FORMAT, FORMATS.HTML) == FORMATS.HTML:
                    body = self.get_wrapped_text_content(
                        to_ascii(self.append_body_footer(article) if is_last_take
                                 else article.get('body_html', ''))).replace('\'', '\'\'')
                    # if this is the first take and we have a dateline inject it
                    if self.is_first_part(article) and 'dateline' in article and 'text' in article.get('dateline', {})\
                            and not pass_through:
                        if body.startswith('   '):
                            # the dateline replaces the three-space lead-in of the body
                            body = '   {} {}'.format(article.get('dateline')
                                                     .get('text').replace('\'', '\'\''),
                                                     body[3:])

                    odbc_item['article_text'] = body
                    odbc_item['texttab'] = 'x'

                if self.is_first_part(article) and not pass_through:
                    self.add_ednote(odbc_item, article)
                    self.add_byline(odbc_item, article)

                # intermediate takes are flagged MORE, the last take is signed with the source
                if not is_last_take:
                    odbc_item['article_text'] += '\r\nMORE'
                else:
                    odbc_item['article_text'] += '\r\n' + article.get('source', '')
                sign_off = article.get('sign_off', '') or ''
                if len(sign_off) > 0:
                    odbc_item['article_text'] += ' ' + sign_off

                odbc_item['service_level'] = get_service_level(category, article)  # @service_level
                odbc_item['wordcount'] = article.get('word_count') or 0   # @wordcount
                odbc_item['priority'] = map_priority(article.get('priority'))  # @priority

                docs.append((pub_seq_num, json.dumps(odbc_item)))
            return docs
        except Exception as ex:
            raise FormatterError.AAPIpNewsFormatterError(ex, subscriber)
 def add_byline(self, odbc_item, article):
     """
     Add the byline to the article text

     The byline is reduced to text with ``get_text``. Unless its first two characters are
     ``BY`` (compared case-insensitively) and provided it is at least three characters long,
     it is prefixed with ``By ``. Single quotes are doubled and the result is prepended to
     ``odbc_item['article_text']``. Nothing happens when the article has no byline.

     :param odbc_item: output item whose ``article_text`` is updated in place
     :param article: article supplying the ``byline``
     """
     raw_byline = article.get('byline')
     if not raw_byline:
         # missing, None or empty byline: leave the text untouched
         return

     byline = get_text(raw_byline, content='html')
     if len(byline) >= 3 and byline[:2].upper() != 'BY':
         byline = 'By ' + byline
     byline = '   {}\r\n\r\n'.format(byline).replace('\'', '\'\'')
     odbc_item['article_text'] = byline + odbc_item['article_text']
    def get_odbc_item(self, article, subscriber, category, codes, pass_through=False):
        # NOTE(review): the lines down to ":param pass_through:" read like docstring text whose
        # triple-quote delimiters were lost; as written they are not valid Python.
        Construct an odbc_item with the common key value pairs populated, if pass_through is true then the headline
        original headline is maintained.
        :param article:
        :param subscriber:
        :param category:
        :param codes:
        :param pass_through:
        # The article's headline is overwritten in place with its get_text() rendering.
        article['headline'] = get_text(article.get('headline', ''), content='html')
        pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
        # NOTE(review): the dict(...) call below is incomplete - the line starting "truncate=True)"
        # closes a call whose opening (and keyword name) is not visible. Going by the fallback
        # expression it presumably derives a keyword from the slugline - confirm.
        odbc_item = dict(originator=article.get('source', None), sequence=pub_seq_num,
                         author=get_text(article.get('byline', '') or '', content='html').replace('\'', '\'\''),
                                                      truncate=True).replace('\'', '\'\'') if not pass_through else
                         (article.get('slugline', '') or '').replace('\'', '\'\''),
                         subject_reference=set_subject(category, article),
                         take_key=(article.get('anpa_take_key', '') or '').replace('\'', '\'\''))
        if 'genre' in article and len(article['genre']) >= 1:
            odbc_item['genre'] = article['genre'][0].get('name', None)
            # NOTE(review): as written the genre name is immediately overwritten with 'Current';
            # an `else:` line presumably preceded the next statement - confirm.
            odbc_item['genre'] = 'Current'  # @genre
        odbc_item['news_item_type'] = 'News'
        odbc_item['fullStory'] = 1
        odbc_item['ident'] = '0'  # @ident
        # A single space is used when no codes are supplied.
        odbc_item['selector_codes'] = ' '.join(codes) if codes else ' '

        headline = to_ascii(LocatorMapper().get_formatted_headline(article, category.get('qcode').upper()))
        # Single quotes are doubled and non-breaking spaces become plain spaces.
        odbc_item['headline'] = headline.replace('\'', '\'\'').replace('\xA0', ' ')

        self.set_usn(odbc_item, article)

        return pub_seq_num, odbc_item
def populate(item, **kwargs):
    """Populate the abstract field with the first sentence of the body.

    When the item has no (or an empty) ``body_html`` a placeholder abstract is
    stored instead and the body is not inspected.

    :param item: item dictionary, updated in place
    :param kwargs: accepted and ignored
    :return: the same ``item`` object
    """
    body_html = item.get('body_html', None)
    if not body_html:
        item['abstract'] = 'No body found to use for abstract...'
        return item

    # get the list of sentences of the body
    # (``p`` is defined at module level; presumably a compiled sentence-splitting pattern)
    sentences = p.split(body_html)

    # chop the first sentence to size for abstract (64)
    if sentences:
        item['abstract'] = get_text(sentences[0][:64]).strip()

    return item
def populate(item, **kwargs):
    """Populate the abstract field with the first sentence of the body.

    An item without a usable ``body_html`` gets a placeholder abstract and is
    returned straight away, without touching the body.

    :param item: item dictionary, updated in place
    :param kwargs: accepted and ignored
    :return: the same ``item`` object
    """
    if not item.get('body_html', None):
        item['abstract'] = 'No body found to use for abstract...'
    else:
        # get the list of sentences of the body
        # (``p`` comes from module scope; presumably a compiled pattern - confirm)
        sentences = p.split(item['body_html'])

        # chop the first sentence to size for abstract (64)
        if sentences:
            item['abstract'] = get_text(sentences[0][:64]).strip()

    return item
    def _sanitize_fields(self, doc, validator):
        """If maxlength or minlength is specified in the validator then remove any markups from that field

        :param doc: Article to be validated
        :param validator: Validation rule
        :return: updated article
        fields_to_check = ['minlength', 'maxlength']
        schema = validator.get('schema', {})
        for field in schema:
            if doc.get(field) and schema.get(field) and any(k in schema[field] for k in fields_to_check):
                    doc[field] = get_text(doc[field])
                except (ValueError, TypeError):
                    # fails for json fields like subject, genre
    def _format_content(self, article, news_item, nitf):
        """Adds the content set to the xml

        A ``contentSet`` element is created under ``news_item``. For articles whose format is
        ``FORMATS.PRESERVED`` the text of the body (with footer) is stored in a ``text/plain``
        ``inlineData`` child.

        :param dict article:
        :param Element newsItem:
        :param Element nitf:
        """
        # NOTE(review): `nitf` is never used and `inline` is never used after it is created in the
        # lines visible here; the method looks truncated - confirm against the original.
        content_set = SubElement(news_item, 'contentSet')
        if article.get(FORMAT) == FORMATS.PRESERVED:
            inline_data = get_text(self.append_body_footer(article))
            SubElement(content_set, 'inlineData',
                       attrib={'contenttype': 'text/plain'}).text = inline_data
            # NOTE(review): as written the inlineXML element is also created inside the PRESERVED
            # branch; an `else:` line presumably preceded it - confirm.
            inline = SubElement(content_set, 'inlineXML',
                                attrib={'contenttype': 'application/nitf+xml'})
    def _sanitize_fields(self, doc, validator):
        """If maxlength or minlength is specified in the validator then remove any markups from that field

        :param doc: Article to be validated
        :param validator: Validation rule
        :return: updated article
        fields_to_check = ['minlength', 'maxlength']
        schema = validator.get('schema', {})
        for field in schema:
            if doc.get(field) and schema.get(field) and any(
                    k in schema[field] for k in fields_to_check):
                    doc[field] = get_text(doc[field])
                except (ValueError, TypeError):
                    # fails for json fields like subject, genre
 def map_html_to_xml(self, element, html):
     """
     Map the html text tags to xml

     Every element that is a direct child of the parsed ``body`` is rendered to ASCII text
     and appended to ``element`` as a new ``p`` element.

     :param element: The xml element to populate
     :param html: the html to parse the text from
     """
     html = html.replace('<br>', '<br/>').replace('</br>', '')
     # strip ASCII control characters, keeping only \n and \r
     html = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', html)
     # newlines become spaces and runs of whitespace are collapsed to a single space
     html = html.replace('\n', ' ')
     html = re.sub(r'\s\s+', ' ', html)
     parsed = parse_html(html, content='html')
     for tag in parsed.xpath('//*'):
         parent = tag.getparent()
         if parent is not None and parent.tag == 'body':
             paragraph = etree.Element('p')
             paragraph.text = to_ascii(get_text(to_string(tag, method='html'), content='html'))
             # attach the paragraph; without this the element is never populated
             element.append(paragraph)
    def format_for_source(self, article, subscriber, source, codes=None):
        """Build the Newscentre output documents for the article, one per category.

        :param article: article to format; its ``source`` is overwritten with ``source``
        :param subscriber: passed on to ``get_odbc_item`` and used when reporting a failure
        :param source: source name, also appended to the text of the last take
        :param codes: passed on to ``get_odbc_item``
        :return: list of ``(sequence number, JSON string)`` tuples, one per category
        :raises: ``FormatterError.AAPNewscentreFormatterError(ex, subscriber)`` for any exception
        """
        try:
            pass_through = article.get('auto_publish', False)
            docs = []
            for category in self._get_category_list(article.get('anpa_category')):
                article['source'] = source
                pub_seq_num, odbc_item = self.get_odbc_item(article, subscriber, category, codes, pass_through)
                is_last_take = self.is_last_take(article)
                if article.get(FORMAT) == FORMATS.PRESERVED:  # @article_text
                    body = get_text(
                        self.append_body_footer(article) if is_last_take else
                        article.get('body_html', ''), content='html')
                    odbc_item['article_text'] = body.replace('\'', '\'\'')
                else:
                    body = self.get_text_content(
                        to_ascii(self.append_body_footer(article) if is_last_take else
                                 article.get('body_html', '')))

                    # on the first part the dateline replaces the three-space lead-in of the body
                    if self.is_first_part(article) and 'dateline' in article \
                            and 'text' in article.get('dateline', {}) and not pass_through:
                        if body.startswith('   '):
                            body = '   {} {}'.format(article.get('dateline').get('text'), body[3:])
                    odbc_item['article_text'] = body.replace('\'', '\'\'')

                if self.is_first_part(article) and not pass_through:
                    self.add_ednote(odbc_item, article)
                    self.add_byline(odbc_item, article)

                # intermediate takes are flagged MORE, the last take is signed with the source
                if not is_last_take:
                    odbc_item['article_text'] += '\r\nMORE'
                else:
                    odbc_item['article_text'] += '\r\n' + source
                sign_off = article.get('sign_off', '') or ''
                if len(sign_off) > 0:
                    odbc_item['article_text'] += ' ' + sign_off

                odbc_item['category'] = odbc_item.get('category', '').upper()
                odbc_item['selector_codes'] = odbc_item.get('selector_codes', '').upper()

                docs.append((pub_seq_num, json.dumps(odbc_item)))

            return docs
        except Exception as ex:
            raise FormatterError.AAPNewscentreFormatterError(ex, subscriber)
    def append_body_footer(self, article):
        # NOTE(review): the next lines down to ":return:" read like docstring text whose
        # triple-quote delimiters were lost; as written they are not valid Python.
        Checks if the article has any Public Service Announcements and if available appends each of them to the body.

        :return: body with public service announcements.
        # NOTE(review): `except KeyError:` below has no visible `try:` and no body; a `try:` line
        # here and a `pass` under the except were presumably lost - confirm.
        # The article's body_html is rewritten in place with '<br>' replaced by '<br/>'.
            article['body_html'] = article['body_html'].replace(
                '<br>', '<br/>')
        except KeyError:

        body = ''
        # NOTE(review): both `in [` lists below are unterminated - the item types that select
        # body_html versus description are not visible here.
        if article[ITEM_TYPE] in [
            body = article.get('body_html', '')
        elif article[ITEM_TYPE] in [
            body = article.get('description', '')

        if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
            # Normalise line endings, then turn each <br> into a CRLF kept in the text.
            body = body.replace('\n', '\r\n').replace('\r\r', '\r')
            parsed = parse_html(body, content='html')

            for br in parsed.xpath('//br'):
                br.tail = '\r\n' + br.tail if br.tail else '\r\n'

            # with_tail=False keeps the tail text (now holding the CRLF) when the <br> is removed
            etree.strip_elements(parsed, 'br', with_tail=False)
            body = etree.tostring(parsed, encoding="unicode")

        if body and article.get('body_footer'):
            footer = article.get('body_footer')
            if article.get(FORMAT, '') == FORMATS.PRESERVED:
                body = '{}\r\n{}'.format(body, get_text(footer))
                # NOTE(review): as written the next line appends the raw footer a second time for
                # preserved content; an `else:` line presumably preceded it - confirm.
                body = '{}{}'.format(body, footer)
        return body
def ap_weather_format(item, **kwargs):
    """Reformat an AP world weather table item.

    The item must have a slugline starting with ``WEA--GlobalWeather-Ce`` and the source
    ``AP``, and the first line of its body text must be the Celsius table marker; otherwise
    ``SuperdeskApiError.badRequestError`` is raised. The slugline, dateline, headline, subject
    and place of the item are overwritten, and ``body_html`` is replaced by the collected
    output wrapped in ``<pre>``.

    NOTE(review): several statements in this function are truncated in the lines visible
    here (see the inline notes); the description above covers only what can be read.

    :param item: item dictionary, updated in place
    :param kwargs: accepted and ignored
    :return: the same ``item`` object
    """
    if not item.get('slugline', '').startswith('WEA--GlobalWeather-Ce') or not item.get('source', '') == 'AP':
        raise SuperdeskApiError.badRequestError("Article should be an AP sourced weather table")
    item['slugline'] = 'WORLD WEATHER'

    text = get_text(item['body_html'], content='html')
    lines = text.splitlines()
    # NOTE(review): lines[0] raises IndexError when the body text is empty, before the
    # `if lines:` guard further down is reached.
    if not lines[0] == 'BC-WEA--Global Weather-Celsius,<':
        raise SuperdeskApiError.badRequestError("Table should be in Celsius only")

    # tabular column max lengths are extracted into this list
    # NOTE(review): nothing in the visible code ever appends to this list; the branch that
    # registers a new column appears to be missing from the first loop below.
    columns = []
    # map of the columns to extract and the substitutions to apply to the column
    columnMap = ({'index': 0}, {'index': 1}, {'index': 2},
                 {'index': 3, 'substitute': [('COND', 'CONDITIONS'),
                                             ('pc', 'partly cloudy'), ('clr', 'clear'),
                                             ('cdy', 'cloudy'), ('rn', 'rain'),
                                             ('sn', 'snow')]})
    # story preamble
    # NOTE(review): `preamble` is assigned but never written to `output` in the visible code.
    preamble = 'Temperatures and conditions in world centres:\r\n'
    output = StringIO()

    # story is always datelined New York
    city = 'New York City'
    cities = app.locators.find_cities()
    located = [c for c in cities if c['city'].lower() == city.lower()]
    if 'dateline' not in item:
        item['dateline'] = {}
    # Fall back to a minimal UTC locator when the city is not among the known cities.
    item['dateline']['located'] = located[0] if len(located) > 0 else {'city_code': city, 'city': city,
                                                                       'tz': 'UTC', 'dateline': 'city'}
    # NOTE(review): the call below is unterminated - its second argument (presumably a time
    # zone) and the closing parenthesis are not visible.
    item['dateline']['date'] = datetime.fromtimestamp(get_date(item['firstcreated']).timestamp(),
    item['dateline']['source'] = 'AP'
    # NOTE(review): a positional argument may be missing between the two lines of this call -
    # confirm against the signature of format_dateline_to_locmmmddsrc.
    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                              source=item.get('original_source', 'AP'))

    # NOTE(review): '%-d' is a platform-specific strftime extension - confirm target platforms.
    item['headline'] = 'World Weather for ' + item['dateline']['date'].strftime('%b %-d')

    item['subject'] = [{"name": "weather", "qcode": "17000000"}]
    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
    item['place'] = [x for x in locator_map.get('items', []) if x['qcode'] == 'US']

    if lines:
        # scan all the lines in the file for potential collimated lines and calculate the length
        # of the column
        for line in lines:
            # NOTE(review): '\<' is not a recognised escape in a normal string literal; a raw
            # string would express the same pattern without a warning.
            row = re.split('[;\<]+', line)
            # only consider it if there are more than two fields in the row
            if len(row) > 2:
                index = 0
                for col in row:
                    # check if the column is mapped (note: `map` shadows the builtin here)
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map):
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                    # if it's a new column
                    if 0 <= index < len(columns):
                        # check the length
                        if len(col) > columns[index]:
                            columns[index] = len(col)
                    index += 1

        # second pass: apply the same substitutions and emit each mapped column
        for line in lines:
            row = re.split('[;\<]+', line)
            if len(row) > 2:
                index = 0
                for col in row:
                    map = [me for me in columnMap if me['index'] == index]
                    if len(map) > 0:
                        for sub in map[0].get('substitute', ()):
                            col = col.replace(sub[0], sub[1])
                            # NOTE(review): the next line is the tail of a call whose opening is
                            # not visible (unbalanced parentheses); presumably the padded column
                            # is written to `output` - confirm.
                            '{}'.format(col.lstrip('\t').ljust(columns[map[0].get('index')] + 2)).rstrip('\r\n'))
                    index += 1

        item['body_html'] = '<pre>' + output.getvalue() + '</pre>'
    return item
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        """Build the ninjs dictionary for ``article``.

        NOTE(review): several statements of this method are truncated in the lines visible
        here (see the inline notes), so the method does not parse as written.

        :param article: article to convert
        :param subscriber: passed on to the association helpers
        :param recursive: when true, composite items get their associations from
            ``_get_associations`` (merged with the related items)
        :return: the ninjs dictionary
        """
        # NOTE(review): the dict literal below is never closed - a `}` line is missing.
        ninjs = {
            'guid': article.get(GUID_FIELD, article.get('uri')),
            'version': str(article.get(config.VERSION, 1)),
            'type': self._get_type(article)

        if article.get('byline'):
            ninjs['byline'] = article['byline']

        located = article.get('dateline', {}).get('located', {})
        if located:
            ninjs['located'] = located.get('city', '')

        # Properties listed in direct_copy_properties are copied verbatim when not None.
        for copy_property in self.direct_copy_properties:
            if article.get(copy_property) is not None:
                ninjs[copy_property] = article[copy_property]

        if 'body_text' not in article and 'alt_text' in article:
            ninjs['body_text'] = article['alt_text']

        if 'title' in article:
            ninjs['headline'] = article['title']

        if article.get('body_html'):
            ninjs['body_html'] = self.append_body_footer(article)

        if article.get('description'):
            ninjs['description_html'] = self.append_body_footer(article)

        if article.get('place'):
            ninjs['place'] = self._format_qcodes(article['place'])

        if article.get('profile'):
            ninjs['profile'] = self._format_profile(article['profile'])

        if recursive:
            if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
                ninjs[ASSOCIATIONS] = self._get_associations(
                    article, subscriber)
                if article.get(ASSOCIATIONS):
                        # NOTE(review): the line below is the tail of a call whose opening is
                        # not visible; presumably the related items are merged into
                        # ninjs[ASSOCIATIONS] - confirm.
                        self._format_related(article, subscriber))
            elif article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

        if article.get(EMBARGO):
            ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

        if article.get('priority'):
            ninjs['priority'] = article['priority']
            # NOTE(review): as written the article priority is immediately overwritten with 5;
            # an `else:` line presumably preceded the next statement - confirm.
            ninjs['priority'] = 5

        if article.get('subject'):
            ninjs['subject'] = self._get_subject(article)

        if article.get('anpa_category'):
            ninjs['service'] = self._get_service(article)
        if article.get('renditions'):
            ninjs['renditions'] = self._get_renditions(article)
        elif 'url' in article:
            ninjs['renditions'] = self._generate_renditions(article)

        # SDPA-317
        if article.get('abstract'):
            abstract = article.get('abstract', '')
            ninjs['description_html'] = abstract
            ninjs['description_text'] = get_text(abstract)
        elif article.get('description_text'):
            ninjs['description_text'] = article.get('description_text')

        if article.get('company_codes'):
            # NOTE(review): the dict entries below are malformed - two values have lost their
            # keys and the closing brackets of 'symbols' are missing.
            ninjs['organisation'] = [{
                c.get('name', ''),
                'Securities Identifier',
                'symbols': [{
                    'ticker': c.get('qcode', ''),
                    'exchange': c.get('security_exchange', '')
            } for c in article['company_codes']]
        elif 'company' in article:
            ninjs['organisation'] = [{'name': article['company']}]

        if article.get('rewrite_of'):
            ninjs['evolvedfrom'] = article['rewrite_of']

        # NOTE(review): the `if` below has no body in the visible lines; whatever supplied the
        # missing rights information is not shown here.
        if not ninjs.get('copyrightholder') and not ninjs.get(
                'copyrightnotice') and not ninjs.get('usageterms'):

        if article.get('genre'):
            ninjs['genre'] = self._format_qcodes(article['genre'])

        if article.get('flags', {}).get('marked_for_legal'):
            ninjs['signal'] = self._format_signal_cwarn()

        return ninjs
 def get_value(self, article):
     # Returns a value taken from the article with newlines replaced by spaces, falling back
     # to the raw value when get_text raises XMLSyntaxError or ValueError.
     # NOTE(review): the `except` below has no visible `try:`, and the subscript in both
     # `article[]` expressions is missing; which key is read cannot be told from here.
         return get_text(article[]).replace('\n', ' ')
     except (etree.XMLSyntaxError, ValueError):
         return article[]
def plaintext_filter(value):
    """Filter out html from value.

    The value is passed through ``get_text``, newlines are replaced by spaces
    and surrounding whitespace is stripped.
    """
    text = get_text(value)
    single_line = text.replace('\n', ' ')
    return single_line.strip()
 def get_value(self, article):
     # Returns a value taken from the article with newlines replaced by spaces; the raw value
     # is returned instead when get_text raises XMLSyntaxError or ValueError.
     # NOTE(review): no `try:` is visible for the `except` below, and both `article[]`
     # subscripts are empty; the key being read cannot be determined from these lines.
         return get_text(article[]).replace('\n', ' ')
     except (etree.XMLSyntaxError, ValueError):
         return article[]
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        """Build the ninjs dictionary for ``article``.

        NOTE(review): a few statements of this method are truncated in the lines visible
        here (see the inline notes), so the method does not parse as written.

        :param article: article to convert
        :param subscriber: passed on to the association helpers
        :param recursive: when true, composite items get their associations from
            ``_get_associations``, updated with the related items
        :return: the ninjs dictionary
        """
        # NOTE(review): the dict literal below is never closed - a `}` line is missing.
        ninjs = {
            'guid': article.get(GUID_FIELD, article.get('uri')),
            'version': str(article.get(config.VERSION, 1)),
            'type': self._get_type(article)

        if article.get('byline'):
            ninjs['byline'] = article['byline']

        located = article.get('dateline', {}).get('located', {})
        if located:
            ninjs['located'] = located.get('city', '')

        # Properties listed in direct_copy_properties are copied verbatim when not None.
        for copy_property in self.direct_copy_properties:
            if article.get(copy_property) is not None:
                ninjs[copy_property] = article[copy_property]

        if 'body_text' not in article and 'alt_text' in article:
            ninjs['body_text'] = article['alt_text']

        if 'title' in article:
            ninjs['headline'] = article['title']

        if article.get('body_html'):
            ninjs['body_html'] = self.append_body_footer(article)

        if article.get('description'):
            ninjs['description_html'] = self.append_body_footer(article)

        if article.get('place'):
            ninjs['place'] = self._format_qcodes(article['place'])

        if article.get('profile'):
            ninjs['profile'] = self._format_profile(article['profile'])

        if recursive:
            if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
                ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
                if article.get(ASSOCIATIONS):
                    ninjs[ASSOCIATIONS].update(self._format_related(article, subscriber))
            elif article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

        if article.get(EMBARGO):
            ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

        if article.get('priority'):
            ninjs['priority'] = article['priority']
            # NOTE(review): as written the article priority is immediately overwritten with 5;
            # an `else:` line presumably preceded the next statement - confirm.
            ninjs['priority'] = 5

        if article.get('subject'):
            ninjs['subject'] = self._get_subject(article)

        if article.get('anpa_category'):
            ninjs['service'] = self._get_service(article)
        if article.get('renditions'):
            ninjs['renditions'] = self._get_renditions(article)
        elif 'url' in article:
            ninjs['renditions'] = self._generate_renditions(article)

        # SDPA-317
        if article.get('abstract'):
            abstract = article.get('abstract', '')
            ninjs['description_html'] = abstract
            ninjs['description_text'] = get_text(abstract)
        elif article.get('description_text'):
            ninjs['description_text'] = article.get('description_text')

        if article.get('company_codes'):
            # One organisation entry per company code, each with a single ticker/exchange symbol.
            ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                      'symbols': [{'ticker': c.get('qcode', ''),
                                                   'exchange': c.get('security_exchange', '')}]}
                                     for c in article['company_codes']]
        elif 'company' in article:
            ninjs['organisation'] = [{'name': article['company']}]

        if article.get('rewrite_of'):
            ninjs['evolvedfrom'] = article['rewrite_of']

        # NOTE(review): the `if` below has no body in the visible lines; whatever supplied the
        # missing rights information is not shown here.
        if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):

        if article.get('genre'):
            ninjs['genre'] = self._format_qcodes(article['genre'])

        if article.get('flags', {}).get('marked_for_legal'):
            ninjs['signal'] = self._format_signal_cwarn()

        if article.get('attachments'):
            ninjs['attachments'] = self._format_attachments(article)

        return ninjs
def plaintext_filter(value):
    """Filter out html from value.

    Runs ``get_text`` on the value, turns each newline into a space and
    trims leading and trailing whitespace.
    """
    without_newlines = get_text(value).replace('\n', ' ')
    return without_newlines.strip()
 def _format_body_content(self, article, body_content):
     """Fill ``body_content`` with the article body (including its footer).

     Preserved-format articles are written as the text of a single ``pre`` element; every
     other article is mapped to xml paragraphs by ``map_html_to_xml``.

     :param article: article whose body is rendered
     :param body_content: xml element receiving the rendered body
     """
     if article.get(FORMAT) == FORMATS.PRESERVED:
         pre = get_text(self.append_body_footer(article))
         SubElement(body_content, 'pre').text = pre
     else:
         # non-preserved content would otherwise produce no body at all
         self.map_html_to_xml(body_content, self.append_body_footer(article))