def test_get_word_count(self):
        """get_word_count handles plain text, simple HTML and nested markup."""
        self.assertEqual(2, get_word_count('plain text'), 'plain text')
        self.assertEqual(2, get_word_count('<p> html text </p>'), 'paragraph')

        # Tags and attribute values must not contribute to the word count.
        self.assertEqual(22, get_word_count(
            '<doc><p xml:lang="en-US">The weather was superb today in Norfolk, Virginia. Made me want to take\n'
            'out my boat, manufactured by the <org value="acm" idsrc="iptc.org">Acme Boat Company</org>.</p></doc>'))
Example #2
0
    def parse(self, file_path, provider=None):
        """Parse an AsiaNet media-release file into a superdesk item dict.

        :param file_path: path to the source file (windows-1252 encoded)
        :param provider: ingest provider (unused, kept for the parser interface)
        :return: item dict with metadata, paragraph-wrapped HTML body and word count
        :raises AAPParserError.AsiaNetParserError: on any parsing failure
        """
        try:
            item = {
                'guid': '{}-{}'.format(file_path, uuid.uuid4()),
                'pubstatus': 'usable',
                'versioncreated': utcnow(),
                ITEM_TYPE: CONTENT_TYPE.TEXT,
                FORMAT: FORMATS.HTML,
            }

            # Source files are windows-1252 encoded; normalise CRLF to LF.
            with open(file_path, 'r', encoding='windows-1252') as f:
                data = f.read().replace('\r', '')

            # The header is everything before the first blank line; the other
            # two parts of the 3-way split are unused (the unpacking is kept
            # so malformed input still raises ValueError as before).
            header, _, _ = data.split('\n\n', 2)

            self._process_header(item, header)

            # The body proper starts at the 'MEDIA RELEASE ' marker; drop the
            # preamble but keep the marker itself at the start of the body.
            start_of_body = 'MEDIA RELEASE '
            _, data = data.split(start_of_body, 1)
            data = start_of_body + data

            item['anpa_category'] = [{'qcode': 'j'}]
            item['original_source'] = 'AsiaNet'
            # Escape HTML, transliterate to ASCII, then map paragraph breaks
            # to <p> boundaries and single newlines to spaces.
            body_html = to_ascii(html.escape(data)).replace('\n\n', '</p><p>').replace('\n', ' ')
            item['body_html'] = '<p>' + body_html + '</p>'
            item['word_count'] = get_word_count(item['body_html'])

            return item
        except Exception as e:
            raise AAPParserError.AsiaNetParserError(file_path, e)
 def test_word_count_html(self):
     """Word counting of numbers, compounds and abbreviations matches the client."""
     # If you change the following text, please change it in client too at
     # superdesk-client-core/scripts/apps/authoring/authoring/tests/WordCount.spec.js
     text = """
     <p>This is a test text with numbers (1 000 000 and 1,000,000 and 1.000.000)
     and <strong>compound word (two-done)</strong> and <em>abbreviation (Washington D.C.)</p>
     <p>it should be the same word count as in client and backend</p>"""
     self.assertEqual(32, text_utils.get_word_count(text))
 def test_word_count_ul(self):
     """List items are counted one word each; an empty <li> adds nothing."""
     markup = """
         <ul>
             <li>foo</li>
             <li>bar</li>
             <li>baz</li>
             <li></li>
         </ul>
     """
     self.assertEqual(3, text_utils.get_word_count(markup))
    def parse(self, xml, provider=None):
        """Parse a NITF XML tree into a superdesk item dict.

        :param xml: parsed NITF element tree root
        :param provider: ingest provider, passed through to the raised error
        :return: item dict
        :raises ParserError.nitfParserError: on any parsing failure
        """
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT,  # set the default type.
                }
        try:
            self.do_mapping(item, xml, SETTINGS_MAPPING_PARAM)
            # Use the dateline city from the NITF header when present.
            elem = xml.find('body/body.head/dateline/location/city')
            if elem is not None:
                self.set_dateline(item, city=elem.text)

            # NOTE(review): assumes do_mapping populated 'body_html'; a
            # KeyError here is converted into nitfParserError below - confirm.
            item.setdefault('word_count', get_word_count(item['body_html'], no_html=True))
        except Exception as ex:
            raise ParserError.nitfParserError(ex, provider)
        return item
    def parse(self, xml, provider=None):
        """Parse a NewsML itemSet into a list of BusinessDesk item dicts.

        Applies NZ-specific defaults (category, subject, urgency, Wellington
        dateline, NZ place locator) and strips byline/dateline/signoff
        markers from the body paragraphs.

        :param xml: parsed NewsML element tree root
        :param provider: ingest provider, passed through to the raised error
        :return: list of item dicts
        :raises ParserError.newsmlTwoParserError: on any parsing failure
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if 'guid' in item_tree.attrib:
                        item = self.parse_item(item_tree)
                        item['priority'] = 6
                        item['anpa_category'] = [{'qcode': 'f'}]
                        item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                        item.setdefault('word_count', get_word_count(item['body_html']))
                        # Hard code the urgency
                        item['urgency'] = 3
                        # Dateline is always Wellington in NZ
                        located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2') if
                                   c.get('city', '').lower() == 'wellington']
                        if len(located) == 1:
                            item['dateline'] = dict()
                            item['dateline']['located'] = located[0]

                        # NOTE(review): item['dateline'] raises KeyError when
                        # the Wellington lookup did not match exactly one city;
                        # the broad except below converts it - confirm intended.
                        if item.get('body_html') and item['dateline']:
                            parsed = parse_html(item.get('body_html'), content='xml')
                            pars = parsed.xpath('//p')
                            for par in pars:
                                if not par.text:
                                    continue
                                # check the first par for a byline
                                if pars.index(par) == 0 and par.text.startswith('By '):
                                    item['byline'] = par.text.replace('By ', '')
                                    par.getparent().remove(par)
                                date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                                if source:
                                    item['dateline']['date'] = date_parser(date, fuzzy=True)
                                    par.text = the_rest
                                # remove the signoff if in the last par
                                if par.text == '(BusinessDesk)' and pars.index(par) + 1 == len(pars):
                                    par.getparent().remove(par)
                            item['body_html'] = to_string(parsed, remove_root_div=True)
                        locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                        if locator_map:
                            item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']

                        items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
 def test_word_count_nitf_2(self):
     """A long multi-paragraph news story yields the expected word count."""
     self.assertEqual(316, text_utils.get_word_count("""
     <p>Rio Tinto has kept intact its target for iron ore shipments in 2017 after hitting the mid-point
     of its revised guidance range for 2016. </p><p>The world's second largest iron ore exporter shipped
     327.6 million tonnes of iron ore from its Pilbara operations in 2016, in line with the slightly lowered
     full-year guidance of between 325 and 330 million tonnes.</p><p>It expects to ship between 330 to 340
     million tonnes in 2017 from its main mining hub in WA.</p><p>"We have delivered a strong operational
     performance in 2016, underpinned by our drive for efficiency and maximising cash flow," chief executive
     Jean Sebastien Jacques said in a statement.</p><p>"Our disciplined approach remains in place in 2017,
     with the continued focus on productivity, cost reduction and commercial excellence."</p><p>Rio shipped
     87.7 million tonnes of iron ore in the December quarter - up eight per cent from the preceding three
     months - mainly helped by minimal weather disruption.</p><p>Fourth-quarter production was also up four
     per cent from a year ago to 85.5 million tonnes.</p><p>Sales in the quarter exceeded production by 2.2
     million tonnes, primarily through a drawdown on inventories built at the ports in the third quarter,
     the company said.</p><p>The miner also looks to have capitalised on a strong rebound in iron ore prices
     in 2016, saying 80 per cent of its sales were either on the spot market or on current quarter or current
     month average.</p><p>Rio’s copper production rose four per cent from a year ago to 523,000 tonnes, but
     still came in below its guidance range of 535,000 to 565,000 tonnes due to lower-than-expected production
     at its Kennecott mine in the US and no supplies from the Grasberg joint venture in Indonesia.</p><p>It has
     forecast a wide guidance range of 525,000 to 665,000 tonnes for 2017.</p><p>The miner topped production
     forecasts for bauxite and coking coal, while aluminium output jumped 10 per cent in 2016.</p>"""))
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        """Convert a superdesk article dict into its NINJS representation.

        :param article: source article dict
        :param subscriber: subscriber the output is formatted for
        :param recursive: when True, composite items get their associations
            expanded; nested calls pass False to stop further recursion
        :return: dict of NINJS fields
        """
        ninjs = {
            'guid': article.get(GUID_FIELD, article.get('uri')),
            'version': str(article.get(config.VERSION, 1)),
            'type': self._get_type(article)
        }

        if article.get('byline'):
            ninjs['byline'] = article['byline']

        located = article.get('dateline', {}).get('located', {})
        if located:
            ninjs['located'] = located.get('city', '')

        # Fields that map 1:1 from article to ninjs.
        for copy_property in self.direct_copy_properties:
            if article.get(copy_property) is not None:
                ninjs[copy_property] = article[copy_property]

        if 'body_text' not in article and 'alt_text' in article:
            ninjs['body_text'] = article['alt_text']

        if 'title' in article:
            ninjs['headline'] = article['title']

        if article.get('body_html'):
            ninjs['body_html'] = self.append_body_footer(article)

        if article.get('description'):
            ninjs['description_html'] = self.append_body_footer(article)

        if article.get('place'):
            ninjs['place'] = self._format_place(article)

        if article.get('profile'):
            ninjs['profile'] = self._format_profile(article['profile'])

        # Associations: composites get the full expansion plus any explicit
        # related items; non-composites only format the related items.
        extra_items = None
        if recursive:
            if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
                ninjs[ASSOCIATIONS] = self._get_associations(article, subscriber)
                if article.get(ASSOCIATIONS):
                    associations, extra_items = self._format_related(article, subscriber)
                    ninjs[ASSOCIATIONS].update(associations)
            elif article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS], extra_items = self._format_related(article, subscriber)
        if extra_items:
            ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

        if article.get(EMBARGO):
            ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

        # Priority defaults to 5 when missing or falsy.
        if article.get('priority'):
            ninjs['priority'] = article['priority']
        else:
            ninjs['priority'] = 5

        if article.get('subject'):
            ninjs['subject'] = self._get_subject(article)

        if article.get('anpa_category'):
            ninjs['service'] = self._get_service(article)
        if article.get('renditions'):
            ninjs['renditions'] = self._get_renditions(article)
        elif 'url' in article:
            ninjs['renditions'] = self._generate_renditions(article)

        # SDPA-317
        # An 'abstract' key (even empty) overrides description_html/_text.
        if 'abstract' in article:
            abstract = article.get('abstract', '')
            ninjs['description_html'] = abstract
            ninjs['description_text'] = text_utils.get_text(abstract)
        elif article.get('description_text'):
            ninjs['description_text'] = article.get('description_text')

        if article.get('company_codes'):
            ninjs['organisation'] = [{'name': c.get('name', ''), 'rel': 'Securities Identifier',
                                      'symbols': [{'ticker': c.get('qcode', ''),
                                                   'exchange': c.get('security_exchange', '')}]}
                                     for c in article['company_codes']]
        elif 'company' in article:
            ninjs['organisation'] = [{'name': article['company']}]

        if article.get('rewrite_of'):
            ninjs['evolvedfrom'] = article['rewrite_of']

        # Fall back to vocabulary rights info when none was copied above.
        if not ninjs.get('copyrightholder') and not ninjs.get('copyrightnotice') and not ninjs.get('usageterms'):
            ninjs.update(superdesk.get_resource_service('vocabularies').get_rightsinfo(article))

        if 'genre' in article:
            ninjs['genre'] = self._get_genre(article)

        if article.get('flags', {}).get('marked_for_legal'):
            ninjs['signal'] = self._format_signal_cwarn()

        if article.get('attachments'):
            ninjs['attachments'] = self._format_attachments(article)

        # Derive counts/read time from HTML body when present, else plain text.
        if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs or 'body_text' in ninjs):
            if 'body_html' in ninjs:
                body_html = ninjs['body_html']
                word_count = text_utils.get_word_count(body_html)
                char_count = text_utils.get_char_count(body_html)
                readtime = text_utils.get_reading_time(body_html, word_count, article.get('language'))
            else:
                body_text = ninjs['body_text']
                word_count = text_utils.get_text_word_count(body_text)
                char_count = len(body_text)
                readtime = text_utils.get_reading_time(body_text, word_count, article.get('language'))
            ninjs['charcount'] = char_count
            ninjs['wordcount'] = word_count
            ninjs['readtime'] = readtime

        if article.get('authors'):
            ninjs['authors'] = self._format_authors(article)

        return ninjs
Example #9
0
 def test_word_count_p_tags(self):
     """Counting spans multiple <p> blocks and ignores inline tags."""
     inline = "<p>foo<strong>s</strong></p><p>bar</p>"
     self.assertEqual(2, text_utils.get_word_count(inline))
     repeated = "<p>word</p>" * 500
     self.assertEqual(500, text_utils.get_word_count(repeated))
 def test_word_count_hrs(self):
     """<br>/<hr> act as word separators, with or without self-closing slash."""
     for markup in ('<p>foo<br><hr>bar</p>', '<p>foo<br /><hr />bar</p>'):
         self.assertEqual(2, text_utils.get_word_count(markup))
Example #11
0
 def test_word_count_hrs(self):
     """<br>/<hr> act as word separators, with or without self-closing slash."""
     for markup in ("<p>foo<br><hr>bar</p>", "<p>foo<br /><hr />bar</p>"):
         self.assertEqual(2, text_utils.get_word_count(markup))
Example #12
0
 def test_word_count_whitespace_string(self):
     """A whitespace-only string contains zero words."""
     blank = "   "
     self.assertEqual(0, text_utils.get_word_count(blank))
def brief_internal_routing(item: dict, **kwargs):
    """Macro: duplicate a short text item as a scheduled brief and publish it.

    Gate conditions: the item must use the text profile, have a body under
    301 words and a headline not starting with 'CORRECTION'. On success the
    item is converted to the brief profile and auto-published with a
    computed publish schedule. Always raises StopDuplication at the end so
    no duplicate item is created by the caller.

    :param item: item being routed (mutated in place)
    :param kwargs: unused, kept for the macro interface
    :raises StopDuplication: always (both on rejection and on completion)
    """
    guid = item.get('guid', 'unknown')
    logger.info('macro started item=%s', guid)

    try:
        assert str(item['profile']) == str(
            _get_profile_id(TEXT_PROFILE)), 'profile is not text'
        assert get_word_count(item['body_html']) < 301, 'body is too long'
        # The title should not start with the word "CORRECTION"
        if item.get('headline'):
            title_start_with_correction = item['headline'].lstrip().startswith(
                'CORRECTION')
            assert not title_start_with_correction, 'The headline/title should not start with word CORRECTION'
    except AssertionError as err:
        logger.info('macro stop on assert item=%s error=%s', guid, err)
        raise StopDuplication()
    except KeyError as err:
        logger.error(err)
        raise StopDuplication()

    # NOTE(review): this setdefault is redundant - the .get(..., []) two
    # lines below already handles a missing 'subject'.
    item.setdefault('subject', [])
    item['urgency'] = 2
    item['profile'] = _get_profile_id(BRIEF_PROFILE)
    item['subject'] = _get_product_subject(
        _get_brief_subject(item.get('subject', [])))
    item['status'] = CONTENT_STATE.SCHEDULED
    item['operation'] = 'publish'

    _fix_headline(item)
    _fix_body_html(item)

    # Base the schedule on the existing UTC publish schedule when present,
    # otherwise on 'now'; the schedule settings are reset to Brussels time.
    UTC_FIELD = 'utc_{}'.format(PUBLISH_SCHEDULE)
    try:
        published_at = item[SCHEDULE_SETTINGS][UTC_FIELD]
    except KeyError:
        published_at = utcnow()
    item[SCHEDULE_SETTINGS] = {
        'time_zone': 'Europe/Brussels',
    }

    # Set item publish schedule to 7:30 am for autopublish between 4 to 7 am
    is_press_headline = item.get(
        'headline') and 'press' in item['headline'].lower()
    current_datetime = utc_to_local(superdesk.app.config['DEFAULT_TIMEZONE'],
                                    utcnow())
    if is_press_headline and time(4, 00) <= current_datetime.time() <= time(
            7, 00):
        item[PUBLISH_SCHEDULE] = current_datetime.replace(hour=7,
                                                          minute=30,
                                                          second=00)
        logger.info(
            'Set publish schedule to 7:30 am for autopublish between 4 to 7 am item=%s',
            item.get('guid', 'unknown'))
    else:
        # schedule +30m
        item[PUBLISH_SCHEDULE] = utc_to_local(
            item[SCHEDULE_SETTINGS]['time_zone'],
            published_at + timedelta(minutes=30))

    update_schedule_settings(item, PUBLISH_SCHEDULE, item[PUBLISH_SCHEDULE])
    item[PUBLISH_SCHEDULE] = item[PUBLISH_SCHEDULE].replace(tzinfo=None)

    # remove text in () brackets along with brackets
    if item.get("headline"):
        title = re.sub(r"\([^()]*\)", "", item['headline'])
        item['headline'] = " ".join(title.split())

    # publish
    try:
        internal_destination_auto_publish(item)
    except StopDuplication:
        logger.info('macro done item=%s', guid)
    except DocumentError as err:
        logger.error('validation error when creating brief item=%s error=%s',
                     guid, err)
    except Exception as err:
        logger.exception(err)

    # avoid another item to be created
    raise StopDuplication()
Example #14
0
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        """Convert a superdesk article dict into its NINJS representation.

        :param article: source article dict
        :param subscriber: subscriber the output is formatted for
        :param recursive: when True, composite items get their associations
            expanded; nested calls pass False to stop further recursion
        :return: dict of NINJS fields
        """
        ninjs = {
            'guid': article.get(GUID_FIELD, article.get('uri')),
            'version': str(article.get(config.VERSION, 1)),
            'type': self._get_type(article)
        }

        if article.get('editor_state'):
            self._parse_editor_state(article, ninjs)

        if article.get('byline'):
            ninjs['byline'] = article['byline']

        located = article.get('dateline', {}).get('located', {})
        if located:
            ninjs['located'] = located.get('city', '')

        # Fields that map 1:1 from article to ninjs.
        for copy_property in self.direct_copy_properties:
            if article.get(copy_property) is not None:
                ninjs[copy_property] = article[copy_property]

        if 'body_text' not in article and 'alt_text' in article:
            ninjs['body_text'] = article['alt_text']

        if 'title' in article:
            ninjs['headline'] = article['title']

        if article.get('body_html'):
            ninjs['body_html'] = self.append_body_footer(article)

        if article.get('description'):
            ninjs['description_html'] = self.append_body_footer(article)

        if article.get('place'):
            ninjs['place'] = self._format_place(article['place'])

        if article.get('profile'):
            ninjs['profile'] = self._format_profile(article['profile'])

        # Associations: composites get the full expansion plus any explicit
        # related items; non-composites only format the related items.
        if recursive:
            if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
                ninjs[ASSOCIATIONS] = self._get_associations(
                    article, subscriber)
                if article.get(ASSOCIATIONS):
                    ninjs[ASSOCIATIONS].update(
                        self._format_related(article, subscriber))
            elif article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)
        elif article.get(ASSOCIATIONS):
            ninjs[ASSOCIATIONS] = self._format_related(article, subscriber)

        if article.get(EMBARGO):
            ninjs['embargoed'] = get_utc_schedule(article, EMBARGO).isoformat()

        # Priority defaults to 5 when missing or falsy.
        if article.get('priority'):
            ninjs['priority'] = article['priority']
        else:
            ninjs['priority'] = 5

        if article.get('subject'):
            ninjs['subject'] = self._get_subject(article)

        if article.get('anpa_category'):
            ninjs['service'] = self._get_service(article)
        if article.get('renditions'):
            ninjs['renditions'] = self._get_renditions(article)
        elif 'url' in article:
            ninjs['renditions'] = self._generate_renditions(article)

        # SDPA-317
        if article.get('abstract'):
            abstract = article.get('abstract', '')
            ninjs['description_html'] = abstract
            ninjs['description_text'] = text_utils.get_text(abstract)
        elif article.get('description_text'):
            ninjs['description_text'] = article.get('description_text')

        if article.get('company_codes'):
            ninjs['organisation'] = [{
                'name':
                c.get('name', ''),
                'rel':
                'Securities Identifier',
                'symbols': [{
                    'ticker': c.get('qcode', ''),
                    'exchange': c.get('security_exchange', '')
                }]
            } for c in article['company_codes']]
        elif 'company' in article:
            ninjs['organisation'] = [{'name': article['company']}]

        if article.get('rewrite_of'):
            ninjs['evolvedfrom'] = article['rewrite_of']

        # Fall back to vocabulary rights info when none was copied above.
        if not ninjs.get('copyrightholder') and not ninjs.get(
                'copyrightnotice') and not ninjs.get('usageterms'):
            ninjs.update(
                superdesk.get_resource_service('vocabularies').get_rightsinfo(
                    article))

        if article.get('genre'):
            ninjs['genre'] = self._format_qcodes(article['genre'])

        if article.get('flags', {}).get('marked_for_legal'):
            ninjs['signal'] = self._format_signal_cwarn()

        if article.get('attachments'):
            ninjs['attachments'] = self._format_attachments(article)

        if ninjs['type'] == CONTENT_TYPE.TEXT and ('body_html' in ninjs
                                                   or 'body_text' in ninjs):
            if 'body_html' in ninjs:
                word_count = text_utils.get_word_count(ninjs.get('body_html'))
            else:
                word_count = text_utils.get_text_word_count(ninjs['body_text'])
            # NOTE(review): get_reading_time is called here with only the
            # word count, while sibling formatters pass (body, word_count,
            # language) - confirm which signature this module's text_utils has.
            ninjs['readtime'] = text_utils.get_reading_time(word_count)

        if article.get('authors'):
            ninjs['authors'] = self._format_authors(article)

        return ninjs
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        """Convert a superdesk article dict into its NINJS representation.

        :param article: source article dict
        :param subscriber: subscriber the output is formatted for
        :param recursive: when True, composite items get their associations
            expanded; nested calls pass False to stop further recursion
        :return: dict of NINJS fields
        """
        ninjs = {
            "guid": article.get(GUID_FIELD, article.get("uri")),
            "version": str(article.get(config.VERSION, 1)),
            "type": self._get_type(article),
        }

        if article.get("byline"):
            ninjs["byline"] = article["byline"]

        located = article.get("dateline", {}).get("located", {})
        if located:
            ninjs["located"] = located.get("city", "")

        # Fields that map 1:1 from article to ninjs.
        for copy_property in self.direct_copy_properties:
            if article.get(copy_property) is not None:
                ninjs[copy_property] = article[copy_property]

        if "body_text" not in article and "alt_text" in article:
            ninjs["body_text"] = article["alt_text"]

        if "title" in article:
            ninjs["headline"] = article["title"]

        if article.get("body_html"):
            ninjs["body_html"] = self.append_body_footer(article)

        if article.get("description"):
            ninjs["description_html"] = self.append_body_footer(article)

        if article.get("place"):
            ninjs["place"] = self._format_place(article)

        if article.get("profile"):
            ninjs["profile"] = self._format_profile(article["profile"])

        # Associations: composites get the full expansion plus any explicit
        # related items; non-composites only format the related items.
        extra_items = None
        if recursive:
            if article[ITEM_TYPE] == CONTENT_TYPE.COMPOSITE:
                ninjs[ASSOCIATIONS] = self._get_associations(
                    article, subscriber)
                if article.get(ASSOCIATIONS):
                    associations, extra_items = self._format_related(
                        article, subscriber)
                    ninjs[ASSOCIATIONS].update(associations)
            elif article.get(ASSOCIATIONS):
                ninjs[ASSOCIATIONS], extra_items = self._format_related(
                    article, subscriber)
        # NOTE(review): 'and recursive' below is always False in this elif
        # (the branch is only reached when recursive is falsy), so related
        # items are never formatted on non-recursive calls - sibling
        # formatters omit the extra condition. Confirm whether intended.
        elif article.get(ASSOCIATIONS) and recursive:
            ninjs[ASSOCIATIONS], extra_items = self._format_related(
                article, subscriber)
        if extra_items:
            ninjs.setdefault(EXTRA_ITEMS, {}).update(extra_items)

        if article.get("embargoed"):
            ninjs["embargoed"] = article["embargoed"].isoformat()

        if article.get(
                EMBARGO):  # embargo set in superdesk overrides ingested one
            ninjs["embargoed"] = get_utc_schedule(article, EMBARGO).isoformat()

        # Priority defaults to 5 when missing or falsy.
        if article.get("priority"):
            ninjs["priority"] = article["priority"]
        else:
            ninjs["priority"] = 5

        if article.get("subject"):
            ninjs["subject"] = self._get_subject(article)

        if article.get("anpa_category"):
            ninjs["service"] = self._get_service(article)
        if article.get("renditions"):
            ninjs["renditions"] = self._get_renditions(article)
        elif "url" in article:
            ninjs["renditions"] = self._generate_renditions(article)

        if "order" in article:
            ninjs["order"] = article["order"]

        # SDPA-317
        # An 'abstract' key (even empty) overrides description_html/_text.
        if "abstract" in article:
            abstract = article.get("abstract", "")
            ninjs["description_html"] = abstract
            ninjs["description_text"] = text_utils.get_text(abstract)
        elif article.get("description_text"):
            ninjs["description_text"] = article.get("description_text")

        if article.get("company_codes"):
            ninjs["organisation"] = [{
                "name":
                c.get("name", ""),
                "rel":
                "Securities Identifier",
                "symbols": [{
                    "ticker": c.get("qcode", ""),
                    "exchange": c.get("security_exchange", "")
                }],
            } for c in article["company_codes"]]
        elif "company" in article:
            ninjs["organisation"] = [{"name": article["company"]}]

        if article.get("rewrite_of"):
            ninjs["evolvedfrom"] = article["rewrite_of"]

        # Fall back to vocabulary rights info when none was copied above.
        if not ninjs.get("copyrightholder") and not ninjs.get(
                "copyrightnotice") and not ninjs.get("usageterms"):
            ninjs.update(
                superdesk.get_resource_service("vocabularies").get_rightsinfo(
                    article))

        if article.get("genre"):
            ninjs["genre"] = self._get_genre(article)

        if article.get("flags", {}).get("marked_for_legal"):
            ninjs["signal"] = self._format_signal_cwarn()

        if article.get("attachments"):
            ninjs["attachments"] = self._format_attachments(article)

        # Derive counts/read time from HTML body when present, else plain text.
        if ninjs["type"] == CONTENT_TYPE.TEXT and ("body_html" in ninjs
                                                   or "body_text" in ninjs):
            if "body_html" in ninjs:
                body_html = ninjs["body_html"]
                word_count = text_utils.get_word_count(body_html)
                char_count = text_utils.get_char_count(body_html)
                readtime = text_utils.get_reading_time(body_html, word_count,
                                                       article.get("language"))
            else:
                body_text = ninjs["body_text"]
                word_count = text_utils.get_text_word_count(body_text)
                char_count = len(body_text)
                readtime = text_utils.get_reading_time(body_text, word_count,
                                                       article.get("language"))
            ninjs["charcount"] = char_count
            ninjs["wordcount"] = word_count
            ninjs["readtime"] = readtime

        if article.get("authors"):
            ninjs["authors"] = self._format_authors(article)

        if (article.get("schedule_settings")
                or {}).get("utc_publish_schedule"):
            ninjs["publish_schedule"] = article["schedule_settings"][
                "utc_publish_schedule"]

        return ninjs
 def test_word_count_p_tags(self):
     """Words in adjacent <p> blocks are counted; inline tags are ignored."""
     markup = '<p>foo<strong>s</strong></p><p>bar</p>'
     self.assertEqual(2, text_utils.get_word_count(markup))
    def parse(self, xml, provider=None):
        """Parse a NewsML itemSet into a list of BusinessDesk item dicts.

        Applies NZ-specific defaults (category, subject, urgency, Wellington
        dateline, NZ place locator) and strips byline/dateline/signoff
        markers from the body paragraphs.

        :param xml: parsed NewsML element tree root
        :param provider: ingest provider, passed through to the raised error
        :return: list of item dicts
        :raises ParserError.newsmlTwoParserError: on any parsing failure
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if 'guid' in item_tree.attrib:
                        item = self.parse_item(item_tree)
                        item['priority'] = 6
                        item['anpa_category'] = [{'qcode': 'f'}]
                        item['subject'] = [{
                            'qcode': '04000000',
                            'name': subject_codes['04000000']
                        }]
                        item.setdefault('word_count',
                                        get_word_count(item['body_html']))
                        # Hard code the urgency
                        item['urgency'] = 3
                        # Dateline is always Wellington in NZ
                        located = [
                            c for c in app.locators.find_cities(
                                country_code='NZ', state_code='NZ.G2')
                            if c.get('city', '').lower() == 'wellington'
                        ]
                        if len(located) == 1:
                            item['dateline'] = dict()
                            item['dateline']['located'] = located[0]

                        # NOTE(review): item['dateline'] raises KeyError when
                        # the Wellington lookup did not match exactly one
                        # city; the broad except below converts it - confirm.
                        if item.get('body_html') and item['dateline']:
                            parsed = parse_html(item.get('body_html'),
                                                content='xml')
                            pars = parsed.xpath('//p')
                            for par in pars:
                                if not par.text:
                                    continue
                                # check the first par for a byline
                                if pars.index(
                                        par) == 0 and par.text.startswith(
                                            'By '):
                                    item['byline'] = par.text.replace(
                                        'By ', '')
                                    par.getparent().remove(par)
                                date, source, the_rest = par.text.partition(
                                    ' (BusinessDesk) - ')
                                if source:
                                    item['dateline']['date'] = date_parser(
                                        date, fuzzy=True)
                                    par.text = the_rest
                                # remove the signoff if in the last par
                                if par.text == '(BusinessDesk)' and pars.index(
                                        par) + 1 == len(pars):
                                    par.getparent().remove(par)
                            item['body_html'] = to_string(parsed,
                                                          remove_root_div=True)
                        locator_map = superdesk.get_resource_service(
                            'vocabularies').find_one(req=None, _id='locators')
                        if locator_map:
                            item['place'] = [
                                x for x in locator_map.get('items', [])
                                if x['qcode'].upper() == 'NZ'
                            ]

                        items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
 def test_word_count_nitf(self):
     """NITF entity tags (location/person/money/chron) do not affect the count."""
     self.assertEqual(40, text_utils.get_word_count("""
     <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location>
     with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> to boost their
     hopes of qualifying for <money>Euro 2016</money>. <person>Michael O'Neill's</person> side
     sealed their place at the finals in <chron>October 2015</chron>.</p>"""))
Example #19
0
def word_count(html):
    """Return the number of words in ``html``; any falsy value counts as empty markup."""
    markup = html if html else ''
    return get_word_count(markup)
Exemple #20
0
    def _format_item(self, root, item, pub_seq_num, service, services) -> None:
        """Append the ContentItem tree describing *item* under *root*.

        :param root: parent element that receives the routing fields and the
            ``ContentItem`` payload
        :param item: superdesk item dict being published
        :param pub_seq_num: publish sequence number; also used (mod 1e8) as
            the 8-digit ``ContentItemID``
        :param service: a single service code forced by the subscriber, or falsy
        :param services: all matching service codes, recorded in ``<Note>``

        NOTE(review): the output element order looks significant to the
        downstream consumer -- confirm before reordering SubElement calls.
        """
        if is_picture(item):
            # xsi:type marks the payload so the consumer deserializes it
            # as a PhotoContentItem.
            D2P1 = "http://www.w3.org/2001/XMLSchema-instance"
            content = etree.SubElement(
                root,
                "ContentItem",
                {"{%s}type" % D2P1: "PhotoContentItem"},
                nsmap={
                    "d2p1": D2P1,
                },
            )
        else:
            content = etree.SubElement(root, "ContentItem")
        extra = item.get("extra") or {}

        # root system fields (fixed routing defaults)
        etree.SubElement(root, "Reschedule").text = "false"
        etree.SubElement(root, "IsRegional").text = "false"
        etree.SubElement(root, "CanAutoRoute").text = "true"
        etree.SubElement(root, "PublishID").text = str(pub_seq_num)
        etree.SubElement(root, "Username")
        etree.SubElement(root, "UseLocalsOut").text = "false"
        etree.SubElement(root, "UserProfileID").text = "0"
        etree.SubElement(root, "PublishOrder").text = "0"
        etree.SubElement(root, "NewCycle").text = "false"
        etree.SubElement(root, "OnlineResend").text = "false"

        # item system fields
        etree.SubElement(content, "AutoSaveID").text = "0"
        etree.SubElement(content, "Type").text = "0"
        etree.SubElement(content, "MediaType").text = "0"
        etree.SubElement(content, "Status").text = "0"

        # Services/PscCodes: pictures get fixed values, a forced service wins
        # next, and otherwise codes are derived from the item's subject.
        if is_picture(item):
            etree.SubElement(root, "Services").text = "Pictures"
            self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
            if root.find("PscCodes") is None:
                etree.SubElement(root, "PscCodes").text = "Online"
        elif service:
            etree.SubElement(root, "Services").text = "Écrit" if is_french(item) else "Print"
            etree.SubElement(root, "PscCodes").text = service
        else:
            self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS)
            self._format_services(root, item)

        is_broadcast = cp.is_broadcast(item)

        # content system fields
        orig = self._get_original_item(item)
        # Both ids are clamped to 8 digits via mod 100000000.
        seq_id = "{:08d}".format(pub_seq_num % 100000000)
        item_id = "{:08d}".format(self.get_item_id(orig) % 100000000)
        etree.SubElement(content, "Name")
        etree.SubElement(content, "Cachable").text = "false"
        etree.SubElement(content, "FileName").text = filename(orig)
        etree.SubElement(content, "NewsCompID").text = item_id
        etree.SubElement(content, "SystemSlug").text = slug(orig)
        etree.SubElement(content, "ContentItemID").text = seq_id
        # presumably a fixed downstream profile id -- TODO confirm meaning of 204
        etree.SubElement(content, "ProfileID").text = "204"
        etree.SubElement(content, "SysContentType").text = "0"

        if is_picture(item):
            etree.SubElement(content, "PhotoContentItemID").text = item_id

        if extra.get(cp.FILENAME):
            etree.SubElement(content, "OrigTransRef").text = extra[cp.FILENAME]

        if service:
            etree.SubElement(content, "Note").text = ",".join(services)

        # timestamps
        firstpublished = item.get("firstpublished") or item["versioncreated"]
        etree.SubElement(root, "PublishDateTime").text = self._format_datetime(
            firstpublished
        )
        # Prefer the scheduled UTC embargo; fall back to the plain "embargoed"
        # value (possibly None) when no schedule settings are present.
        try:
            etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
                item[SCHEDULE_SETTINGS]["utc_embargo"],
                local=True,
            )
        except KeyError:
            etree.SubElement(content, "EmbargoTime").text = self._format_datetime(
                item.get("embargoed"), local=True
            )
        etree.SubElement(content, "CreatedDateTime").text = self._format_datetime(
            firstpublished
        )  # SDCP-380
        etree.SubElement(content, "UpdatedDateTime").text = self._format_datetime(
            item["versioncreated"], rel=True
        )

        # obvious
        etree.SubElement(content, "ContentType").text = (
            "Photo" if is_picture(item) else item["type"].capitalize()
        )

        # SDCP-309
        etree.SubElement(content, "Headline").text = format_maxlength(
            extra.get(cp.HEADLINE2) or item.get("headline"), OUTPUT_LENGTH_LIMIT
        )
        if not is_picture(item):
            etree.SubElement(content, "Headline2").text = format_maxlength(
                item.get("headline"), OUTPUT_LENGTH_LIMIT
            )

        etree.SubElement(content, "SlugProper").text = item.get("slugline")
        etree.SubElement(content, "Credit").text = self._format_credit(item)
        etree.SubElement(content, "Source").text = item.get("source")

        content_html = self._format_content(item, is_broadcast)
        etree.SubElement(content, "DirectoryText").text = self._format_text(
            item.get("abstract")
        )
        etree.SubElement(content, "ContentText").text = self._format_html(content_html)
        # Language code: "2" for French, "1" for everything else.
        etree.SubElement(content, "Language").text = (
            "2" if is_french(item) else "1"
        )

        if item["type"] == "text" and content_html:
            # For text items, override DirectoryText with a 200-char plain-text
            # preview of the body and emit the word count in all three fields.
            content.find("DirectoryText").text = format_maxlength(
                get_text(content_html, "html", lf_on_block=False).replace("\n", " "),
                200,
            )
            word_count = str(get_word_count(content_html))
            etree.SubElement(content, "Length").text = word_count
            etree.SubElement(content, "WordCount").text = word_count
            etree.SubElement(content, "BreakWordCount").text = word_count

        if item.get("keywords") and item.get("source") == globenewswire.SOURCE:
            # keywords appear to carry stock symbols for GlobeNewswire items
            # -- confirm against the ingest side.
            etree.SubElement(content, "Stocks").text = ",".join(item["keywords"])

        self._format_category_index(content, item)
        self._format_genre(content, item)
        self._format_urgency(content, item.get("urgency"), item["language"])
        self._format_keyword(
            content,
            item.get("keywords"),
            ", " if item.get("type") == "picture" else ",",
        )
        self._format_dateline(content, item.get("dateline"))
        self._format_writethru(content, item)

        if item.get("byline"):
            etree.SubElement(content, "Byline").text = item["byline"]

        if is_picture(item):
            self._format_picture_metadata(content, item)
        else:
            etree.SubElement(content, "EditorNote").text = item.get("ednote")
            if extra.get(cp.UPDATE):
                etree.SubElement(content, "UpdateNote").text = extra[cp.UPDATE]
            if extra.get(cp.CORRECTION):
                etree.SubElement(content, "Corrections").text = extra[cp.CORRECTION]

        if item.get("associations"):
            self._format_associations(content, item)
 def test_word_count_nitf(self):
     """Stripping NITF markup yields 40 words for this tagged sample."""
     sample = """
     <p>2014: Northern Ireland beat <location>Greece</location> 2-0 in <location>Athens</location>
     with goals from <person>Jamie Ward</person> and <person>Kyle Lafferty</person> to boost their
     hopes of qualifying for <money>Euro 2016</money>. <person>Michael O'Neill's</person> side
     sealed their place at the finals in <chron>October 2015</chron>.</p>"""
     self.assertEqual(40, text_utils.get_word_count(sample))
Exemple #22
0
    def _format_item(self, root, item, pub_seq_num, service, services) -> None:
        """Append the ContentItem tree describing *item* under *root*.

        :param root: parent element that receives the routing fields and the
            ``ContentItem`` payload
        :param item: superdesk item dict being published
        :param pub_seq_num: publish sequence number; also used (mod 1e8) as
            the 8-digit ``ContentItemID``
        :param service: a single service code forced by the subscriber, or falsy
        :param services: all matching service codes, recorded in ``<Note>``

        NOTE(review): the output element order looks significant to the
        downstream consumer -- confirm before reordering SubElement calls.
        """
        if is_picture(item):
            # xsi:type marks the payload so the consumer deserializes it
            # as a PhotoContentItem.
            D2P1 = 'http://www.w3.org/2001/XMLSchema-instance'
            content = etree.SubElement(root,
                                       'ContentItem',
                                       {'{%s}type' % D2P1: 'PhotoContentItem'},
                                       nsmap={
                                           'd2p1': D2P1,
                                       })
        else:
            content = etree.SubElement(root, 'ContentItem')
        extra = item.get('extra') or {}

        # root system fields (fixed routing defaults)
        etree.SubElement(root, 'Reschedule').text = 'false'
        etree.SubElement(root, 'IsRegional').text = 'false'
        etree.SubElement(root, 'CanAutoRoute').text = 'true'
        etree.SubElement(root, 'PublishID').text = str(pub_seq_num)
        etree.SubElement(root, 'Username')
        etree.SubElement(root, 'UseLocalsOut').text = 'false'
        etree.SubElement(root, 'UserProfileID').text = '0'
        etree.SubElement(root, 'PublishOrder').text = '0'
        etree.SubElement(root, 'NewCycle').text = 'false'
        etree.SubElement(root, 'OnlineResend').text = 'false'

        # item system fields
        etree.SubElement(content, 'AutoSaveID').text = '0'
        etree.SubElement(content, 'Type').text = '0'
        etree.SubElement(content, 'MediaType').text = '0'
        etree.SubElement(content, 'Status').text = '0'

        # Services/PscCodes: pictures get fixed values, a forced service wins
        # next, and otherwise codes are derived from the item's subject.
        if is_picture(item):
            etree.SubElement(root, 'Services').text = 'Pictures'
            self._format_subject_code(root, item, 'PscCodes', cp.DESTINATIONS)
            if root.find('PscCodes') is None:
                etree.SubElement(root, 'PscCodes').text = 'Online'
        elif service:
            etree.SubElement(root, 'Services').text = 'Print'
            etree.SubElement(root, 'PscCodes').text = service
        else:
            self._format_subject_code(root, item, 'PscCodes', cp.DESTINATIONS)
            self._format_services(root, item)

        is_broadcast = cp.is_broadcast(item)

        # content system fields
        orig = self._get_original_item(item)
        # Both ids are clamped to 8 digits via mod 100000000.
        seq_id = '{:08d}'.format(pub_seq_num % 100000000)
        item_id = '{:08d}'.format(orig['unique_id'] % 100000000)
        etree.SubElement(content, 'Name')
        etree.SubElement(content, 'Cachable').text = 'false'
        etree.SubElement(content, 'FileName').text = filename(orig)
        etree.SubElement(content, 'NewsCompID').text = item_id
        etree.SubElement(content, 'SystemSlug').text = slug(orig)
        etree.SubElement(content, 'ContentItemID').text = seq_id
        # presumably a fixed downstream profile id -- TODO confirm meaning of 204
        etree.SubElement(content, 'ProfileID').text = '204'
        etree.SubElement(content, 'SysContentType').text = '0'

        if is_picture(item):
            etree.SubElement(content, 'PhotoContentItemID').text = item_id

        if extra.get(cp.FILENAME):
            etree.SubElement(content, 'OrigTransRef').text = extra[cp.FILENAME]

        if service:
            etree.SubElement(content, 'Note').text = ','.join(services)

        # timestamps
        firstpublished = item.get('firstpublished') or item['versioncreated']
        etree.SubElement(
            root,
            'PublishDateTime').text = self._format_datetime(firstpublished)
        # Prefer the scheduled UTC embargo; fall back to the plain "embargoed"
        # value (possibly None) when no schedule settings are present.
        try:
            etree.SubElement(content,
                             'EmbargoTime').text = self._format_datetime(
                                 item[SCHEDULE_SETTINGS]['utc_embargo'],
                                 local=True,
                             )
        except KeyError:
            etree.SubElement(content,
                             'EmbargoTime').text = self._format_datetime(
                                 item.get('embargoed'), local=True)
        etree.SubElement(content,
                         'CreatedDateTime').text = self._format_datetime(
                             firstpublished)  # SDCP-380
        etree.SubElement(content,
                         'UpdatedDateTime').text = self._format_datetime(
                             item['versioncreated'], rel=True)

        # obvious
        etree.SubElement(content, 'ContentType').text = 'Photo' if is_picture(
            item) else item['type'].capitalize()

        # SDCP-309
        etree.SubElement(content, 'Headline').text = format_maxlength(
            extra.get(cp.HEADLINE2) or item.get('headline'),
            OUTPUT_LENGTH_LIMIT)
        if not is_picture(item):
            etree.SubElement(content, 'Headline2').text = format_maxlength(
                item.get('headline'), OUTPUT_LENGTH_LIMIT)

        etree.SubElement(content, 'SlugProper').text = item.get('slugline')
        etree.SubElement(content, 'Credit').text = self._format_credit(item)
        etree.SubElement(content, 'Source').text = item.get('source')

        content_html = self._format_content(item, is_broadcast)
        etree.SubElement(content, 'DirectoryText').text = self._format_text(
            item.get('abstract'))
        etree.SubElement(content,
                         'ContentText').text = self._format_html(content_html)
        # Language code: "2" when the language contains "fr", otherwise "1".
        etree.SubElement(
            content,
            'Language').text = '2' if 'fr' in item.get('language', '') else '1'

        if item['type'] == 'text' and content_html:
            # For text items, override DirectoryText with a 200-char plain-text
            # preview of the body and emit the word count in all three fields.
            content.find('DirectoryText').text = format_maxlength(
                get_text(content_html, 'html',
                         lf_on_block=False).replace('\n', ' '), 200)
            word_count = str(get_word_count(content_html))
            etree.SubElement(content, 'Length').text = word_count
            etree.SubElement(content, 'WordCount').text = word_count
            etree.SubElement(content, 'BreakWordCount').text = word_count

        if item.get('keywords') and item.get('source') == globenewswire.SOURCE:
            # keywords appear to carry stock symbols for GlobeNewswire items
            # -- confirm against the ingest side.
            etree.SubElement(content,
                             'Stocks').text = ','.join(item['keywords'])

        self._format_category_index(content, item)
        self._format_genre(content, item)
        self._format_urgency(content, item.get('urgency'), item['language'])
        self._format_keyword(content, item.get('keywords'),
                             ', ' if item.get('type') == 'picture' else ',')
        self._format_dateline(content, item.get('dateline'))
        self._format_writethru(content, item)

        if item.get('byline'):
            etree.SubElement(content, 'Byline').text = item['byline']

        if is_picture(item):
            self._format_picture_metadata(content, item)
        else:
            etree.SubElement(content, 'EditorNote').text = item.get('ednote')
            if extra.get(cp.UPDATE):
                etree.SubElement(content, 'UpdateNote').text = extra[cp.UPDATE]
            if extra.get(cp.CORRECTION):
                etree.SubElement(content,
                                 'Corrections').text = extra[cp.CORRECTION]

        if item.get('associations'):
            self._format_associations(content, item)
 def test_word_count_brs(self):
     """Both <br> and self-closing <br /> breaks must separate words."""
     for markup in ('<p>foo<br><br>bar</p>', '<p>foo<br /><br />bar</p>'):
         self.assertEqual(2, text_utils.get_word_count(markup))
 def test_word_count_p_tags(self):
     """Inline tags do not split a word, but a paragraph boundary does."""
     markup = '<p>foo<strong>s</strong></p><p>bar</p>'
     self.assertEqual(2, text_utils.get_word_count(markup))
    def parse(self, xml, provider=None):
        """Parse a NewsML-G2 ``itemSet`` tree into a list of superdesk items.

        Each non-package item gets fixed category/subject/urgency metadata,
        a Wellington dateline (BusinessDesk content is NZ-only), a byline
        lifted from a leading "By ..." paragraph, a dateline date parsed from
        the " (BusinessDesk) - " marker, and the trailing signoff removed.

        :param xml: parsed etree of the NewsML-G2 document
        :param provider: ingest provider config, used only for error reporting
        :return: list of item dicts
        :raises ParserError.newsmlTwoParserError: wrapping any failure
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname("itemSet")):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid.
                    if "guid" not in item_tree.attrib:
                        continue
                    item = self.parse_item(item_tree)
                    item["priority"] = 6
                    item["anpa_category"] = [{"qcode": "f"}]
                    item["subject"] = [{
                        "qcode": "04000000",
                        "name": subject_codes["04000000"]
                    }]
                    item.setdefault("word_count",
                                    get_word_count(item["body_html"]))
                    # Hard code the urgency.
                    item["urgency"] = 3
                    # Dateline is always Wellington in NZ.
                    located = [
                        c for c in app.locators.find_cities(
                            country_code="NZ", state_code="NZ.G2")
                        if c.get("city", "").lower() == "wellington"
                    ]
                    if len(located) == 1:
                        item["dateline"] = {"located": located[0]}

                    # BUGFIX: use .get() for "dateline" -- when the Wellington
                    # lookup above does not yield exactly one city, the key was
                    # never set and item["dateline"] raised KeyError.
                    if item.get("body_html") and item.get("dateline"):
                        parsed = parse_html(item.get("body_html"),
                                            content="xml")
                        pars = parsed.xpath("//p")
                        last_index = len(pars) - 1
                        for index, par in enumerate(pars):
                            if not par.text:
                                continue
                            # Check the first par for a byline.
                            if index == 0 and par.text.startswith("By "):
                                item["byline"] = par.text.replace("By ", "")
                                par.getparent().remove(par)
                            date, source, the_rest = par.text.partition(
                                " (BusinessDesk) - ")
                            if source:
                                item["dateline"]["date"] = date_parser(
                                    date, fuzzy=True)
                                par.text = the_rest
                            # Remove the signoff if in the last par.
                            if par.text == "(BusinessDesk)" and index == last_index:
                                par.getparent().remove(par)
                        item["body_html"] = to_string(parsed,
                                                      remove_root_div=True)
                    locator_map = superdesk.get_resource_service(
                        "vocabularies").find_one(req=None, _id="locators")
                    if locator_map:
                        item["place"] = [
                            x for x in locator_map.get("items", [])
                            if x["qcode"].upper() == "NZ"
                        ]

                    items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)