def format(self, article, subscriber, codes=None):
        """Render an article as a NewsML G2 ``newsMessage`` document.

        Depending on the article, the message carries a ``packageItem``
        (packages), a ``newsItem`` with a content set (picture, audio and
        video items) or a ``newsItem`` wrapping the NITF rendition of the
        article (everything else).

        :param dict article: article to format
        :param dict subscriber: subscriber the article is formatted for; it is
            also stored on ``self.subscriber``
        :param list codes: selector codes (not read by this method)
        :return [(int, str)]: a single-element list holding a tuple made of the
            publish sequence number and the serialised article
        :raises FormatterError: if anything fails while formatting the article
        """
        try:
            self.subscriber = subscriber
            subscribers_service = superdesk.get_resource_service('subscribers')
            pub_seq_num = subscribers_service.generate_sequence_number(subscriber)
            package = self._is_package(article)

            message = etree.Element(
                'newsMessage',
                attrib=self._debug_message_extra,
                nsmap=self._message_nsmap,
            )
            self._format_header(article, message, pub_seq_num)
            items = self._format_item(message)

            if package:
                package_item = self._format_item_set(article, items, 'packageItem')
                self._format_groupset(article, package_item)
            elif article[ITEM_TYPE] in {CONTENT_TYPE.PICTURE, CONTENT_TYPE.AUDIO, CONTENT_TYPE.VIDEO}:
                media_item = self._format_item_set(article, items, 'newsItem')
                self._format_contentset(article, media_item)
            else:
                # the NITF rendition is produced before the newsItem is created
                nitf = NITFFormatter().get_nitf(article, subscriber, pub_seq_num)
                text_item = self._format_item_set(article, items, 'newsItem')
                self._format_content(article, text_item, nitf)

            sd_etree.fix_html_void_elements(message)
            serialised = etree.tostring(message, pretty_print=True).decode('utf-8')
            return [(pub_seq_num, self.XML_ROOT + serialised)]
        except Exception as ex:
            raise FormatterError.newmsmlG2FormatterError(ex, subscriber)
    def format(self, article, subscriber, codes=None):
        """Build the NewsML G2 message for ``article``.

        A publish sequence number is generated for the subscriber, then a
        ``newsMessage`` element is filled with a header and one item: a
        ``packageItem`` for packages, otherwise a ``newsItem`` (content set for
        picture/audio/video items, NITF based content for anything else).

        :param dict article: article to format
        :param dict subscriber: target subscriber, kept on ``self.subscriber``
        :param list codes: selector codes (unused here)
        :return [(int, str)]: list with one ``(sequence number, xml string)`` tuple
        :raises FormatterError: wraps any exception raised while formatting
        """
        try:
            self.subscriber = subscriber
            pub_seq_num = superdesk.get_resource_service('subscribers') \
                .generate_sequence_number(subscriber)
            is_package = self._is_package(article)
            root = etree.Element('newsMessage', attrib=self._debug_message_extra, nsmap=self._message_nsmap)
            self._format_header(article, root, pub_seq_num)
            item_set = self._format_item(root)

            if is_package:
                self._format_groupset(
                    article,
                    self._format_item_set(article, item_set, 'packageItem'),
                )
            else:
                if article[ITEM_TYPE] in {CONTENT_TYPE.PICTURE, CONTENT_TYPE.AUDIO, CONTENT_TYPE.VIDEO}:
                    self._format_contentset(
                        article,
                        self._format_item_set(article, item_set, 'newsItem'),
                    )
                else:
                    formatter = NITFFormatter()
                    nitf_tree = formatter.get_nitf(article, subscriber, pub_seq_num)
                    news_item = self._format_item_set(article, item_set, 'newsItem')
                    self._format_content(article, news_item, nitf_tree)

            sd_etree.fix_html_void_elements(root)
            xml_text = etree.tostring(root, pretty_print=True).decode('utf-8')
            formatted = self.XML_ROOT + xml_text
            return [(pub_seq_num, formatted)]
        except Exception as ex:
            raise FormatterError.newmsmlG2FormatterError(ex, subscriber)
Example #3
0
    def body_hook(self, item, html):
        """Copy content to ``item["body_html"]``.

        Line breaks are first turned into ``<p>`` paragraphs. Then every
        ``<img>`` found in the content is validated with ``self.check_url``
        and handed to ``self._add_image``; its ``src`` is replaced by the
        ``original`` rendition href. An image stored under the
        ``featuremedia`` key is left as is, any other one is surrounded by
        START/END embed comments. Shortcodes are removed from the result.

        An image which can't be added is removed from the content, but the
        text following it is kept.

        :param dict item: item being built, ``body_html`` is set in place
        :param str html: raw content
        """
        # we need to convert CRLF to <p>
        # cf. SDTS-22
        html = html.replace("&#13;", "\r")
        splitted = html.split("\r\n")
        if len(splitted) == 1 and "<p>" not in html:
            splitted = html.split("\n")
        if len(splitted) > 1:
            html = "".join([
                "<p>{}</p>".format(s) if not is_block_elem(s) else s
                for s in splitted if s.strip()
            ])

        if "img" in html:
            content = sd_etree.parse_html(html, "html")
            for img in content.xpath("//img"):
                try:
                    src = self.check_url(img.get("src"))
                except ValueError:
                    logger.warning("Can't fetch image: {elt}".format(
                        elt=sd_etree.to_string(img)))
                    continue
                try:
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    # keep the traceback, the cause is otherwise hard to find
                    logger.exception(e)
                    parent = img.getparent()
                    if img.tail:
                        # lxml removes the tail text together with the element,
                        # so we re-attach it first to avoid losing content
                        previous = img.getprevious()
                        if previous is not None:
                            previous.tail = (previous.tail or "") + img.tail
                        else:
                            parent.text = (parent.text or "") + img.tail
                    parent.remove(img)
                    continue
                url = media_data["renditions"]["original"]["href"]
                img.set("src", url)
                if key == "featuremedia":
                    # no need to embed the image for featuremedia
                    continue
                embed_start = etree.Comment(embed_TPL.format("START", key))
                embed_end = etree.Comment(embed_TPL.format("END", key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            content = sd_etree.fix_html_void_elements(content)

            html = sd_etree.to_string(content,
                                      encoding="unicode",
                                      method="xml")

        html = remove_shortcodes(html)

        item["body_html"] = html
Example #4
0
    def body_hook(self, item, html):
        """Copy content to ``item['body_html']``.

        Line breaks are first turned into ``<p>`` paragraphs. Then every
        ``<img>`` found in the content is validated with ``self.check_url``
        and handed to ``self._add_image``; its ``src`` is replaced by the
        ``original`` rendition href. An image stored under the
        ``featuremedia`` key is left as is, any other one is surrounded by
        START/END embed comments.

        An image which can't be added is removed from the content, but the
        text following it is kept.

        :param dict item: item being built, ``body_html`` is set in place
        :param str html: raw content
        """
        # we need to convert CRLF to <p>
        # cf. SDTS-22
        html = html.replace('&#13;', '\r')
        splitted = html.split('\r\n')
        if len(splitted) == 1 and '<p>' not in html:
            splitted = html.split('\n')
        if len(splitted) > 1:
            html = ''.join([
                '<p>{}</p>'.format(s) if not is_block_elem(s) else s
                for s in splitted if s.strip()
            ])

        if "img" in html:
            content = sd_etree.parse_html(html, 'html')
            for img in content.xpath('//img'):
                try:
                    src = self.check_url(img.get('src'))
                except ValueError:
                    logger.warning("Can't fetch image: {elt}".format(
                        elt=sd_etree.to_string(img)))
                    continue
                try:
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    # keep the traceback, the cause is otherwise hard to find
                    logger.exception(e)
                    parent = img.getparent()
                    if img.tail:
                        # lxml removes the tail text together with the element,
                        # so we re-attach it first to avoid losing content
                        previous = img.getprevious()
                        if previous is not None:
                            previous.tail = (previous.tail or '') + img.tail
                        else:
                            parent.text = (parent.text or '') + img.tail
                    parent.remove(img)
                    continue
                url = media_data['renditions']['original']['href']
                img.set("src", url)
                if key == 'featuremedia':
                    # no need to embed the image for featuremedia
                    continue
                embed_start = etree.Comment(embed_TPL.format('START', key))
                embed_end = etree.Comment(embed_TPL.format('END', key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            content = sd_etree.fix_html_void_elements(content)
            html = sd_etree.to_string(content,
                                      encoding="unicode",
                                      method='xml')

        item['body_html'] = html
 def test_void_elements_fix(self):
     """Empty non-void elements get start/end tags, void ones stay self-closed."""
     source = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
     wanted = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'
     tree = sd_etree.parse_html(source)
     # the tree is fixed in place, the return value is not needed here
     sd_etree.fix_html_void_elements(tree)
     serialised = sd_etree.to_string(tree)
     self.assertEqual(serialised, wanted)
Example #6
0
 def test_void_elements_fix(self):
     """Check the serialisation produced after ``fix_html_void_elements``.

     ``<h3/>`` and ``<em/>`` are expected to be written as start/end pairs
     while ``<br/>`` is expected to stay self-closing.
     """
     markup = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
     root = sd_etree.parse_html(markup)
     sd_etree.fix_html_void_elements(root)
     self.assertEqual(
         sd_etree.to_string(root),
         '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>',
     )