Example #1
0
    def test_encode_carriage_return(self):
        """CR characters must be serialized as the &#13; character reference.

        The replacement string was previously split across two physical lines,
        leaving an unterminated string literal (SyntaxError); restored to
        '&#13;' to match the <pre> assertion below.
        """
        text = 'This is first line.\r\nThis is second line.\r\n'
        parsed = sd_etree.parse_html(text)
        self.assertEqual(text.replace('\r', '&#13;'), sd_etree.to_string(parsed))

        # same encoding must hold inside a <pre> block parsed as full html
        text = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
        parsed = sd_etree.parse_html(text, content='html')
        self.assertEqual(text.replace('\r', '&#13;'), sd_etree.to_string(parsed))
    def test_encode_carriage_return(self):
        """Serialized output encodes carriage returns as the &#13; entity."""
        plain = 'This is first line.\r\nThis is second line.\r\n'
        expected = plain.replace('\r', '&#13;')
        self.assertEqual(expected, sd_etree.to_string(sd_etree.parse_html(plain)))

        # the same holds for content inside a <pre> element parsed as html
        pre = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
        expected = pre.replace('\r', '&#13;')
        self.assertEqual(expected, sd_etree.to_string(sd_etree.parse_html(pre, content='html')))
Example #3
0
    def test_encode_carriage_return(self):
        """\r must survive a parse/serialize round trip as &#13;."""
        raw = 'This is first line.\r\nThis is second line.\r\n'
        self.assertEqual(raw.replace('\r', '&#13;'), to_string(parse_html(raw)))

        # html parsing wraps the fragment in <html><body>…</body></html>
        raw = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
        expected = '<html><body>{}</body></html>'.format(raw.replace('\r', '&#13;'))
        self.assertEqual(expected, to_string(parse_html(raw, content='html')))
Example #4
0
    def test_encode_carriage_return(self):
        """Parsing keeps CR characters, serialized as the &#13; reference."""
        sample = "This is first line.\r\nThis is second line.\r\n"
        expected = sample.replace("\r", "&#13;")
        self.assertEqual(expected, sd_etree.to_string(sd_etree.parse_html(sample)))

        # same expectation for a <pre> fragment parsed as full html
        sample = "<pre>This is first line.\r\nThis is second line.\r\n</pre>"
        expected = sample.replace("\r", "&#13;")
        self.assertEqual(expected, sd_etree.to_string(sd_etree.parse_html(sample, content="html")))
Example #5
0
    def body_hook(self, item, html):
        """Copy content to body_html

        if img are found in the content, they are uploaded.
        First image is used as feature media, then there are embeds
        """
        # we need to convert CRLF to <p>
        # cf. SDTS-22
        html = html.replace("&#13;", "\r")
        splitted = html.split("\r\n")
        # fall back to bare \n splitting only when the content is not already
        # paragraph-structured (no <p> present)
        if len(splitted) == 1 and "<p>" not in html:
            splitted = html.split("\n")
        if len(splitted) > 1:
            # wrap each non-empty line in <p>, unless it is already a block element
            html = "".join([
                "<p>{}</p>".format(s) if not is_block_elem(s) else s
                for s in splitted if s.strip()
            ])

        if "img" in html:
            content = sd_etree.parse_html(html, "html")
            for img in content.xpath("//img"):
                try:
                    src = self.check_url(img.get("src"))
                except ValueError:
                    logger.warning("Can't fetch image: {elt}".format(
                        elt=sd_etree.to_string(img)))
                    continue
                try:
                    # uploads the image; key is the association key it was stored under
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    logger.error(e)
                    # drop images that could not be uploaded
                    img.getparent().remove(img)
                    continue
                url = media_data["renditions"]["original"]["href"]
                img.set("src", url)
                if key == "featuremedia":
                    # no need to embed the image for featuremedia
                    continue
                # surround the image with editor embed marker comments so the
                # client can tie it back to its association key
                embed_start = etree.Comment(embed_TPL.format("START", key))
                embed_end = etree.Comment(embed_TPL.format("END", key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            content = sd_etree.fix_html_void_elements(content)

            html = sd_etree.to_string(content,
                                      encoding="unicode",
                                      method="xml")

        html = remove_shortcodes(html)

        item["body_html"] = html
Example #6
0
    def parse_thumbnail(self, item_elt, item):
        """Check for _thumbnail_id meta_key, and use its attachment as feature media

        If the key is found, the linked item is looked for, and its attachment_url is used as feature media
        """
        # find the <wp:postmeta> entry whose meta_key marks the post thumbnail
        thumbnail_elt = item_elt.xpath(
            'wp:postmeta/wp:meta_key[text()="_thumbnail_id"]',
            namespaces=nsmap)
        if not thumbnail_elt:
            return
        thumbnail_elt = thumbnail_elt[0]

        try:
            # the sibling meta_value holds the id of the attachment post
            post_id = thumbnail_elt.xpath("../wp:meta_value/text()",
                                          namespaces=nsmap)[0].strip()
            if not post_id:
                # treat an empty value the same as a missing one
                raise IndexError
        except IndexError:
            logger.warning("invalid post_id, ignoring: {elt}".format(
                elt=sd_etree.to_string(thumbnail_elt.xpath("..")[0])))
            return
        try:
            if '"' in post_id:
                # a double quote would break the xpath string literal below
                raise ValueError('post id should not contain " (double quote)')
            post_id_elt = item_elt.xpath(
                '/rss/channel/item/wp:post_id[text()="{}"]'.format(post_id),
                namespaces=nsmap)[0]
            att_item_elt = post_id_elt.getparent()
            url = att_item_elt.xpath("wp:attachment_url",
                                     namespaces=nsmap)[0].text.strip()
            url = self.check_url(url)
        except (IndexError, ValueError) as e:
            logger.warning(
                "Can't find attachement URL, ignoring: {e}\n{elt}".format(
                    e=e, elt=sd_etree.to_string(thumbnail_elt.getparent())))
            return
        try:
            key, media_data = self._add_image(item, url)
        except Exception as e:
            logger.error(e)
            return

        # copy description/alt text from the attachment item; the first
        # matching child element wins, a missing value defaults to ""
        # NOTE(review): this loop rebinds `key` from _add_image above — the
        # association key is unused afterwards, confirm that is intended
        for key, elt_names in (("description_text", ("description", "title")),
                               ("alt_text", ("title", ))):
            for elt_name in elt_names:
                elt = att_item_elt.find(elt_name)
                if elt is not None and elt.text:
                    media_data[key] = elt.text
                    break
            else:
                media_data[key] = ""
Example #7
0
    def body_hook(self, item, html):
        """Copy content to body_html

        if img are found in the content, they are uploaded.
        First image is used as feature media, then there are embeds
        """
        # we need to convert CRLF to <p>
        # cf. SDTS-22
        html = html.replace('&#13;', '\r')
        splitted = html.split('\r\n')
        # fall back to bare \n splitting only when the content is not already
        # paragraph-structured (no <p> present)
        if len(splitted) == 1 and '<p>' not in html:
            splitted = html.split('\n')
        if len(splitted) > 1:
            # wrap each non-empty line in <p>, unless it is already a block element
            html = ''.join([
                '<p>{}</p>'.format(s) if not is_block_elem(s) else s
                for s in splitted if s.strip()
            ])

        if "img" in html:
            content = sd_etree.parse_html(html, 'html')
            for img in content.xpath('//img'):
                try:
                    src = self.check_url(img.get('src'))
                except ValueError:
                    logger.warning("Can't fetch image: {elt}".format(
                        elt=sd_etree.to_string(img)))
                    continue
                try:
                    # uploads the image; key is the association key it was stored under
                    key, media_data = self._add_image(item, src)
                except Exception as e:
                    logger.error(e)
                    # drop images that could not be uploaded
                    img.getparent().remove(img)
                    continue
                url = media_data['renditions']['original']['href']
                img.set("src", url)
                if key == 'featuremedia':
                    # no need to embed the image for featuremedia
                    continue
                # surround the image with editor embed marker comments so the
                # client can tie it back to its association key
                embed_start = etree.Comment(embed_TPL.format('START', key))
                embed_end = etree.Comment(embed_TPL.format('END', key))
                img.addprevious(embed_start)
                img.addnext(embed_end)

            content = sd_etree.fix_html_void_elements(content)
            html = sd_etree.to_string(content,
                                      encoding="unicode",
                                      method='xml')

        item['body_html'] = html
    def _format_body_content(self, article, body_content):
        """Assemble the NITF body text and store it in a <pre> child of body_content.

        Appends, in order: ednote, byline, the body (preserved text or html
        with the dateline injected), body footer and the source/sign-off line.
        """
        nitf_body = []

        if article.get('ednote'):
            nitf_body.append(to_ascii(self._format_line(article.get('ednote'))))

        if article.get(BYLINE):
            nitf_body.append(to_ascii(self._format_line(get_text(article.get(BYLINE)))))

        if article.get(FORMAT) == FORMATS.PRESERVED:
            # preserved content goes through as plain text, footer included
            nitf_body.append(to_ascii(get_text(self.append_body_footer(article), content='html')))
        else:
            body = article.get('body_html', '')
            # we need to inject the dateline
            if article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                body_html_elem = parse_html(article.get('body_html'))
                ptag = body_html_elem.find('.//p')
                if ptag is not None:
                    # prepend the dateline text to the first paragraph
                    ptag.text = article['dateline']['text'] + ' ' + (ptag.text or '')
                    body = to_string(body_html_elem)

            nitf_body.append(self.get_text_content(body))
            if article.get('body_footer'):
                nitf_body.append(self.get_text_content(article.get('body_footer', '')))

        # trailing "source sign_off" line, omitted when both parts are empty
        sign_off = '{} {}'.format(article.get('source') or '', (article.get('sign_off') or '')).strip()
        if sign_off:
            nitf_body.append(to_ascii(self._format_line(sign_off)))

        SubElement(body_content, 'pre').text = ''.join(nitf_body)
Example #9
0
    def parse_inline_content(self, tree, item):
        """Extract the inline XHTML body of *tree* into a content dict."""
        body = sd_etree.clean_html(
            tree.find(self.qname('html')).find(self.qname('body')))

        # normalize markup: <pre> becomes <p>, links open in a new tab
        for node in body.iter():
            if node.tag == 'pre':
                node.tag = 'p'
            elif node.tag == 'a':
                node.attrib['target'] = '_blank'

        content = {'contenttype': tree.attrib['contenttype']}

        if len(body):
            # serialize each child of <body> and join them with newlines
            parts = [sd_etree.to_string(child, encoding='unicode', method="html")
                     for child in body]
            content['content'] = '\n'.join(parts)
        elif body.text:
            content['content'] = '<p>' + body.text + '</p>'
            content['format'] = 'xhtml/xml'

        if content.get('content'):
            # un-escape the provider's endash placeholder
            content['content'] = content['content'].replace('&lt;endash&gt;-&lt;/endash&gt;', '-')

        return content
Example #10
0
 def _inject_dateline(self, formatted_article):
     """Prepend the dateline text to the first paragraph of body_html."""
     tree = sd_etree.parse_html(formatted_article.get('body_html', '<p> </p>'))
     first_p = tree.find('.//p')
     if first_p is None:
         # nothing to inject into; leave the article untouched
         return
     dateline_text = formatted_article['dateline']['text']
     first_p.text = dateline_text + ' ' + (first_p.text or '')
     formatted_article['body_html'] = sd_etree.to_string(tree)
Example #11
0
def extract_kill_reason_from_html(html, is_kill):
    """Extract the reason from html for a kill/takedown

    Iterates over the xml nodes and find the node that contains the reason prefix.
    Once the reason prefix has been found add the proceeding nodes to our reason tree,
    until the kill/takedown suffix has been found.

    :param html: html string to extract the reason from
    :param is_kill: True for a kill (uses KILL_SUFFIX), False for a takedown
        (uses TAKEDOWN_SUFFIX)
    :return: html string of the extracted reason, or the original ``html``
        when no reason was found or an error occurred
    """
    try:
        # Create a new tree that we will use to construct the reason nodes
        root = etree.Element('div')

        # A flag to indicate if we're to add the current child node to our reason tree
        adding_nodes = False
        for child in parse_html(html, content='html'):
            # Obtain the text from our child nodes (including sub-child nodes)
            child_text = ''.join(child.itertext())

            if not adding_nodes and REASON_PREFIX in child_text:
                # This child node contains the reason prefix (and we haven't found it already)
                # Therefor set the flag to True indicating that the following child nodes
                # are to be added to our reason tree
                adding_nodes = True
                continue
            elif adding_nodes:
                # If the kill/takedown suffix has been found, then our reason tree is complete
                if is_kill and KILL_SUFFIX in child_text:
                    break
                elif not is_kill and TAKEDOWN_SUFFIX in child_text:
                    break

                # Otherwise continue adding the child nodes to our reason tree

                # Remove the last sub-child if it only contains a line break
                if len(child) > 0:
                    last_child = child[-1]
                    if etree.tostring(last_child) == b'<p><br/></p>':
                        child.remove(last_child)

                # Then add this child node to our reason tree
                root.append(child)

        num_children = len(list(root))

        # If the reason tree was not populated, then return the original html provided
        if num_children == 0:
            return html

        # Our reason tree was populated, convert the tree to a string and return it
        # (drop the wrapping <div> when it holds a single node)
        return to_string(root,
                         method='html',
                         remove_root_div=num_children == 1)
    except Exception as e:
        # best effort: on any failure fall back to the unmodified input
        logger.exception(e)
        return html
Example #12
0
def clean_html(body_html):
    """Re-parse and re-serialize html so it is safe to send over SMTP.

    CRLF line endings are injected after closing angle brackets in an
    attempt to keep individual lines short enough for SMTP transports.

    :param body_html: raw html string
    :return: parsed and re-written html
    """
    parsed = sd_etree.parse_html(body_html, content='html', lf_on_block=True)
    serialized = sd_etree.to_string(parsed, method='html', pretty_print=True)
    return serialized.replace('>\n', '>\r\n')
Example #13
0
 def _inject_dateline(self, formatted_article):
     """Inject dateline in article's body_html"""
     # fall back to a single empty paragraph so there is always a <p> to target
     body_html_elem = sd_etree.parse_html(
         formatted_article.get("body_html", "<p> </p>"))
     ptag = body_html_elem.find(".//p")
     if ptag is not None:
         # prepend the dateline text to the first paragraph
         ptag.text = formatted_article["dateline"]["text"] + " " + (
             ptag.text or "")
         formatted_article["body_html"] = sd_etree.to_string(body_html_elem)
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        """
        Re-wire that href's in the document to be relative to the destination FTP server root, it expects the
        destination to be an FTP server
        :param article:
        :param subscriber:
        :param recursive:
        :return:
        """

        include_original = subscriber.get("destinations")[0].get("config").get(
            "include_original", False)
        if include_original:
            self.internal_renditions = ["original"]

        ninjs = super()._transform_to_ninjs(article, subscriber, recursive)

        # Get the path that the renditions will be pushed to
        path = subscriber.get("destinations")[0].get("config").get(
            "associated_path")

        if path:
            renditions = ninjs.get("renditions")
            if renditions:
                for name, rendition in renditions.items():
                    rendition["href"] = (
                        "/" + path.lstrip("/") +
                        ("/" if not path.endswith("/") else "") +
                        get_rendition_file_name(rendition))

        if article.get("type", "") == "text":
            # Find any embeded image references in the body_html and re-wire the img src reference and insert an id
            html_updated = False
            root_elem = lxml_html.fromstring(ninjs.get("body_html"))
            # Scan any comments for embed markers
            comments = root_elem.xpath("//comment()")
            for comment in comments:
                if "EMBED START Image" in comment.text:
                    # fixed: the class was written [0:9], which matches the
                    # literal characters '0', ':' and '9' instead of digits
                    regex = r"<!-- EMBED START Image {id: \"editor_([0:9]+)".replace("[0:9]", "[0-9]")
                    m = re.search(regex, ninjs.get("body_html", ""))
                    # NOTE(review): the search runs over the whole body_html, so it
                    # always yields the FIRST embed id — confirm this is intended
                    # when several images are embedded
                    # Assumes the sibling of the Embed Image comment is the figure tag containing the image
                    figureElem = comment.getnext()
                    if figureElem is not None and figureElem.tag == "figure":
                        imgElem = figureElem.find("./img")
                        if imgElem is not None and m and m.group(1):
                            embed_id = "editor_" + m.group(1)
                            imgElem.attrib["id"] = embed_id
                            src = self._get_source_ref(embed_id, ninjs)
                            if src:
                                imgElem.attrib["src"] = src
                            html_updated = True
            if html_updated:
                ninjs["body_html"] = to_string(root_elem, method="html")
        return ninjs
Example #15
0
def clean_html(html):
    """Strip id/class attributes, flatten hl2/pre/note tags to <p>, and sanitize."""
    root = lxml.html.fromstring(html)

    # drop presentation attributes and normalize custom block tags to paragraphs
    for node in root.iter():
        node.attrib.pop("id", None)
        node.attrib.pop("class", None)
        if node.tag in ('hl2', 'pre', 'note'):
            node.tag = 'p'

    sanitized = lxml.html.clean.Cleaner().clean_html(root)
    return sd_etree.to_string(sanitized, method="html")
    def _transform_to_ninjs(self, article, subscriber, recursive=True):
        """
        Re-wire that href's in the document to be relative to the destination FTP server root, it expects the
        destination to be an FTP server
        :param article:
        :param subscriber:
        :param recursive:
        :return:
        """

        include_original = subscriber.get('destinations')[0].get('config').get(
            'include_original', False)
        if include_original:
            self.internal_renditions = ['original']

        ninjs = super()._transform_to_ninjs(article, subscriber, recursive)

        # Get the path that the renditions will be pushed to
        path = subscriber.get('destinations')[0].get('config').get(
            'associated_path')

        if path:
            renditions = ninjs.get('renditions')
            if renditions:
                for name, rendition in renditions.items():
                    rendition['href'] = '/' + path.lstrip('/') + (
                        '/' if not path.endswith('/') else
                        '') + get_rendition_file_name(rendition)

        if article.get('type', '') == 'text':
            # Find any embeded image references in the body_html and re-wire the img src reference and insert an id
            html_updated = False
            root_elem = lxml_html.fromstring(ninjs.get('body_html'))
            # Scan any comments for embed markers
            comments = root_elem.xpath('//comment()')
            for comment in comments:
                if 'EMBED START Image' in comment.text:
                    # fixed: the class was written [0:9], which matches the
                    # literal characters '0', ':' and '9' instead of digits
                    regex = r"<!-- EMBED START Image {id: \"editor_([0:9]+)".replace("[0:9]", "[0-9]")
                    m = re.search(regex, ninjs.get('body_html', ''))
                    # NOTE(review): the search runs over the whole body_html, so it
                    # always yields the FIRST embed id — confirm this is intended
                    # when several images are embedded
                    # Assumes the sibling of the Embed Image comment is the figure tag containing the image
                    figureElem = comment.getnext()
                    if figureElem is not None and figureElem.tag == 'figure':
                        imgElem = figureElem.find('./img')
                        if imgElem is not None and m and m.group(1):
                            embed_id = 'editor_' + m.group(1)
                            imgElem.attrib['id'] = embed_id
                            src = self._get_source_ref(embed_id, ninjs)
                            if src:
                                imgElem.attrib['src'] = src
                            html_updated = True
            if html_updated:
                ninjs['body_html'] = to_string(root_elem, method='html')
        return ninjs
    def parse_inline_content(self, tree, item):
        """Build a content dict from the inline XHTML payload of *tree*.

        :param tree: element holding an embedded <html> document
        :param item: parsed item (unused here)
        :return: dict with 'contenttype', plus 'content'/'format' when present
        """
        html_elt = tree.find(self.qname('html'))
        body_elt = html_elt.find(self.qname('body'))
        body_elt = sd_etree.clean_html(body_elt)

        content = dict()
        content['contenttype'] = tree.attrib['contenttype']
        if len(body_elt) > 0:
            # serialize each child of <body> and join with newlines
            contents = [sd_etree.to_string(e, encoding='unicode', method="html") for e in body_elt]
            content['content'] = '\n'.join(contents)
        elif body_elt.text:
            # plain text body: keep it preformatted
            content['content'] = '<pre>' + body_elt.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
    def parse_inline_content(self, tree, item):
        """Return a content dict built from the inline XHTML body of *tree*."""
        body = sd_etree.clean_html(
            tree.find(self.qname('html')).find(self.qname('body')))

        content = {'contenttype': tree.attrib['contenttype']}
        if len(body):
            # serialize every child element and join them line by line
            content['content'] = '\n'.join(
                sd_etree.to_string(child, encoding='unicode', method="html")
                for child in body)
        elif body.text:
            # bare text only: keep it preformatted
            content['content'] = '<pre>' + body.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
Example #19
0
 def parse_inline_content(self, tree, item):
     """Build a content dict from the inline XHTML body.

     Prefers the <section class="main"> element when present, otherwise
     falls back to the whole <body>.
     """
     try:
         body_elt = tree.xpath('//xhtml:body//xhtml:section[contains(@class,"main")]', namespaces=NS)[0]
     except IndexError:
         body_elt = tree.xpath('//xhtml:body', namespaces=NS)[0]
     body_elt = sd_etree.clean_html(body_elt)
     content = dict()
     content['contenttype'] = tree.attrib['contenttype']
     if len(body_elt) > 0:
         content['content'] = sd_etree.to_string(body_elt, method="html")
     elif body_elt.text:
         # text-only body: keep it preformatted
         content['content'] = '<pre>' + body_elt.text + '</pre>'
         content['format'] = CONTENT_TYPE.PREFORMATTED
     return content
def remove_breaks(item, **kwargs):
    """Replace <br> line breaks in the item's body_html with plain spaces.

    <br> elements are stripped while their tail text is preserved, separated
    by a single space, so each paragraph flows as one line.

    :param item: article dict; its ``body_html`` is rewritten in place
    :return: the updated item, or None when there is no body_html
    :raises Exception: re-raises any processing error after logging it
    """
    try:
        html = item.get('body_html')
        if html:
            # normalize void/closing <br> forms so the XML parser accepts them
            html = html.replace('<br>', '<br/>').replace('</br>', ' ')
            parsed = parse_html(html, content='xml')
            for br in parsed.xpath('//br'):
                # keep the text that followed the break, separated by a space
                br.tail = ' ' + br.tail if br.tail else ' '
            etree.strip_elements(parsed, 'br', with_tail=False)
            item['body_html'] = to_string(parsed)
            return item

    except Exception as ex:
        # was logging.exception('...: ', ex): ex was passed as a lazy-format
        # argument with no %s placeholder, which breaks the log call itself
        logging.exception('Exception in preserve format macro: %s', ex)
        raise ex
    def parse(self, xml, provider=None):
        """Parse a NewsML item set into a list of item dicts.

        Hard-codes priority/category/subject/urgency, sets a Wellington
        dateline, extracts a byline and the dateline date from the body
        paragraphs, and maps the NZ locator.

        :param xml: root element of the NewsML document
        :param provider: ingest provider (used only for error reporting)
        :return: list of parsed items
        :raises ParserError.newsmlTwoParserError: wraps any parsing failure
        """
        self.root = xml
        items = []
        try:
            for item_set in xml.findall(self.qname('itemSet')):
                for item_tree in item_set:
                    # Ignore the packageItem, it has no guid
                    if 'guid' in item_tree.attrib:
                        item = self.parse_item(item_tree)
                        item['priority'] = 6
                        item['anpa_category'] = [{'qcode': 'f'}]
                        item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                        item.setdefault('word_count', get_word_count(item['body_html']))
                        # Hard code the urgency
                        item['urgency'] = 3
                        # Dateline is always Wellington in NZ
                        located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2') if
                                   c.get('city', '').lower() == 'wellington']
                        if len(located) == 1:
                            item['dateline'] = dict()
                            item['dateline']['located'] = located[0]

                        # use .get(): item['dateline'] raised KeyError (failing
                        # the whole feed) when the city lookup above found no
                        # unique Wellington match
                        if item.get('body_html') and item.get('dateline'):
                            parsed = parse_html(item.get('body_html'), content='xml')
                            pars = parsed.xpath('//p')
                            for par in pars:
                                if not par.text:
                                    continue
                                # check the first par for a byline
                                if pars.index(par) == 0 and par.text.startswith('By '):
                                    item['byline'] = par.text.replace('By ', '')
                                    par.getparent().remove(par)
                                date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                                if source:
                                    item['dateline']['date'] = date_parser(date, fuzzy=True)
                                    par.text = the_rest
                                # remove the signoff if in the last par
                                if par.text == '(BusinessDesk)' and pars.index(par) + 1 == len(pars):
                                    par.getparent().remove(par)
                            item['body_html'] = to_string(parsed, remove_root_div=True)
                        locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                        if locator_map:
                            item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']

                        items.append(item)
            return items
        except Exception as ex:
            raise ParserError.newsmlTwoParserError(ex, provider)
Example #22
0
 def _format_content(self, item, is_broadcast):
     """Return item html with <b>/<i> normalized to <strong>/<em>.

     Broadcast items use the abstract (wrapped in <p> when needed);
     everything else uses body_html. Returns "" when there is no content.
     """
     if is_broadcast and item.get("abstract"):
         content = item["abstract"]
         if "<p>" not in content:
             content = "<p>{}</p>".format(content)
     else:
         content = item.get("body_html")
     if not content:
         return ""

     tree = lxml.html.fromstring(content)
     replacements = {"b": "strong", "i": "em"}
     for node in tree.iter():
         semantic_tag = replacements.get(node.tag)
         if semantic_tag:
             node.tag = semantic_tag
     return sd_etree.to_string(tree, encoding="unicode", method="html")
Example #23
0
 def _format_content(self, item, is_broadcast):
     """Return the item's html content with b/i tags normalized to strong/em.

     Broadcast items prefer the abstract (wrapped in a <p> when needed);
     otherwise body_html is used. Returns '' when no content is available.
     """
     if is_broadcast and item.get('abstract'):
         content = item['abstract']
         if '<p>' not in content:
             content = '<p>{}</p>'.format(content)
     else:
         content = item.get('body_html')
     if not content:
         return ''
     tree = lxml.html.fromstring(content)
     # normalize presentational tags to their semantic equivalents
     for elem in tree.iter():
         if elem.tag == 'b':
             elem.tag = 'strong'
         elif elem.tag == 'i':
             elem.tag = 'em'
     return sd_etree.to_string(tree, encoding='unicode', method='html')
 def map_html_to_xml(self, element, html):
     """
     Map the html text tags to xml
     :param element: The xml element to populate
     :param html: the html to parse the text from
     :return:
     """
     # normalize <br> forms, strip control characters, collapse whitespace
     html = html.replace('<br>', '<br/>').replace('</br>', '')
     html = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', html)
     html = html.replace('\n', ' ')
     html = re.sub(r'\s\s+', ' ', html)
     parsed = parse_html(html, content='html')
     # each direct child of <body> becomes a plain-text <p> element
     for tag in parsed.xpath('//*'):
         if tag.getparent() is not None and tag.getparent().tag == 'body':
             p = etree.Element('p')
             p.text = to_ascii(get_text(to_string(tag, method='html'), content='html'))
             element.append(p)
 def map_html_to_xml(self, element, html):
     """
     Map the html text tags to xml
     :param element: The xml element to populate
     :param html: the html to parse the text from
     :return:
     """
     # normalize <br> forms, strip control characters, collapse whitespace
     html = html.replace('<br>', '<br/>').replace('</br>', '')
     html = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', html)
     html = html.replace('\n', ' ')
     html = re.sub(r'\s\s+', ' ', html)
     parsed = parse_html(html, content='html')
     # each child of the wrapping div becomes a plain-text <p> element
     for tag in parsed.xpath('/html/div/child::*'):
         p = etree.Element('p')
         p.text = to_ascii(
             get_text(to_string(tag, method='html'), content='html'))
         element.append(p)
Example #26
0
def generate_embed_renditions(item):
    """Generate renditions for images embedded in the item's body.

    For every ``editor_N`` association the preview/details renditions are
    (re)generated, and the matching ``<img>`` inside the body_html embed is
    re-pointed at the ``_newsroom_custom`` rendition href.  When the item has
    no feature media renditions, the last embedded image processed is copied
    in as the feature media.
    """

    def _get_source_ref(marker, item):
        # Best-effort lookup of the custom rendition href; any missing level
        # in the associations structure yields None instead of raising.
        try:
            return item.get("associations").get(marker).get("renditions").get("_newsroom_custom").get("href")
        except Exception:
            return None

    has_editor_assoc = False
    # generate required watermarked renditions for any embedded renditions
    for name, association in ((item.get('associations') or {})).items():
        if name.startswith('editor_') and association:
            generate_preview_details_renditions(item.get('associations', {}).get(name), 'viewImage')
            has_editor_assoc = True

    if has_editor_assoc:
        # parse out any editor embeds in the item and re-point to the required rendition
        regex = r' EMBED START Image {id: \"editor_([0-9]+)'
        html_updated = False
        root_elem = lxml_html.fromstring(item.get('body_html', ''))
        comments = root_elem.xpath('//comment()')
        for comment in comments:
            if 'EMBED START Image' in comment.text:
                m = re.search(regex, comment.text)
                # Assumes the sibling of the Embed Image comment is the figure tag containing the image
                figure_elem = comment.getnext()
                if figure_elem is not None and figure_elem.tag == "figure":
                    imgElem = figure_elem.find("./img")
                    if imgElem is not None and m and m.group(1):
                        embed_id = "editor_" + m.group(1)
                        imgElem.attrib["id"] = embed_id
                        src = _get_source_ref(embed_id, item)
                        if src:
                            imgElem.attrib["src"] = src
                        html_updated = True
        if html_updated:
            item["body_html"] = to_string(root_elem, method="html")
            # If there is no feature media them copy the last embedded image to be the feature media
            # (embed_id is always bound when html_updated is True, so the
            # reference below is safe; it holds the id of the LAST embed seen)
            if not ((item.get('associations') or {}).get('featuremedia') or {}).get('renditions'):
                item['associations']['featuremedia'] = deepcopy(item.get('associations').get(embed_id))
                generate_renditions(item)
Example #27
0
    def _format_content(self, item, is_broadcast):
        """Return the item content as an HTML string.

        Broadcast items with an abstract use the abstract (wrapped in a
        ``<p>`` when it carries no markup); otherwise ``body_html`` is used.
        ``<b>``/``<i>`` tags are normalised to ``<strong>``/``<em>`` and
        whitespace-only inline elements are dropped.
        """
        if is_broadcast and item.get("abstract"):
            content = item["abstract"]
            if "<p>" not in content:
                content = "<p>{}</p>".format(content)
        else:
            content = item.get("body_html")
        if not content:
            return ""
        tree = lxml.html.fromstring(content)
        for elem in tree.iter():
            if elem.tag == "b":
                elem.tag = "strong"
            elif elem.tag == "i":
                elem.tag = "em"

            # Remove whitespace and empty tags
            # NOTE(review): drop_tree() mutates the tree while tree.iter() is
            # still walking it — presumably acceptable for the inputs seen in
            # practice, but confirm nested/adjacent empty inline elements are
            # all removed as intended.
            if elem.tag in INLINE_ELEMENTS and elem.text is not None and not elem.text.strip():
                elem.drop_tree()

        return sd_etree.to_string(tree, encoding="unicode", method="html")
Example #28
0
    def parse_inline_content(self, tree, item):
        """Extract the inline XHTML body of ``tree`` into a content dict."""
        body = tree.find(self.qname("html")).find(self.qname("body"))
        body = sd_etree.clean_html(body)

        content = {"contenttype": tree.attrib["contenttype"]}
        if len(body) > 0:
            # Serialise each child element and join them one per line.
            content["content"] = "\n".join(
                sd_etree.to_string(child, encoding="unicode", method="html")
                for child in body
            )
        elif body.text:
            # No child elements: treat the bare text as preformatted.
            content["content"] = "<pre>" + body.text + "</pre>"
            content["format"] = CONTENT_TYPE.PREFORMATTED

        if content.get("content"):
            content["content"] = content["content"].replace(
                "&lt;endash&gt;-&lt;/endash&gt;", "-")

        return content
Example #29
0
    def get_body(self, news_item):
        """Build the announcement body HTML from ``news_item``."""
        data_content = news_item.xpath(
            'NewsComponent/ContentItem[@Euid="announcement_html"]/DataContent/text()'
        )
        if not data_content:
            logger.warning("No content found in element: {xml}".format(
                xml=etree.tostring(news_item, encoding="unicode")))
            return ""

        content_elt = sd_etree.parse_html(data_content[0])

        # Drop the leading <h1>; the title is handled elsewhere.
        heading = content_elt.find('h1')
        if heading is not None:
            content_elt.remove(heading)

        # Prepend the message category as a leading paragraph, when present.
        categories = news_item.xpath(
            'NewsComponent/Metadata/Property[@FormalName="Message Category"]/@Value'
        )
        if categories:
            category_p = etree.Element('p')
            category_p.text = categories[0]
            content_elt.insert(0, category_p)

        # Append a link back to the original announcement, when present.
        ori_ann_urls = news_item.xpath(
            'NewsComponent/Metadata/Property[@FormalName="nordicAgencyWebsite"]/@Value'
        )
        if ori_ann_urls:
            url = ori_ann_urls[0]
            if not url.startswith('http'):
                raise ValueError("Invalid url: {url}".format(url=url))
            link_p = etree.SubElement(content_elt, "p")
            link_p.text = 'Se saken i sin helhet: '
            anchor = etree.SubElement(link_p, "a", attrib={'href': url})
            anchor.text = url

        return sd_etree.to_string(content_elt)
Example #30
0
    def parse(self, xml, provider=None):
        """Parse the feed item, then normalise subjects, urgency and headline."""
        item = super().parse(xml, provider)
        item['slugline'] = ''
        # check for sports using all ingested subjects
        category = ingest_category_from_subject(item.get('subject'))
        item['subject'] = filter_missing_subjects(item.get('subject'))
        item['subject'].append(category)

        # Remap the ingested urgency values 2 -> 3 and 4 -> 5.
        urgency_remap = {2: 3, 4: 5}
        urgency = item.get('urgency')
        if urgency in urgency_remap:
            item['urgency'] = urgency_remap[urgency]

        set_default_service(item)

        if not item.get('headline') and item.get('body_html'):
            # Fall back to the first line of the body text as the headline.
            first_line = item.get('body_html').strip().split('\n')[0]
            parsed_headline = etree.parse_html(first_line, 'html')
            item['headline'] = etree.to_string(
                parsed_headline, method="text").strip().split('\n')[0]

        return item
Example #31
0
    def parse_inline_content(self, tree, item):
        """Extract the main body section (plus any notepad) as a content dict."""
        # Prefer the section marked "main"; fall back to the whole body.
        main_sections = tree.xpath('//xhtml:body//xhtml:section[contains(@class,"main")]', namespaces=NS)
        if main_sections:
            body_elt = main_sections[0]
        else:
            body_elt = tree.xpath('//xhtml:body', namespaces=NS)[0]

        # Move the editorial notepad contents into the body, when present.
        notepads = self.item_tree.xpath('.//iptc:edNote[@role="dpaednoterole:notepad"]//xhtml:section',
                                        namespaces=NS)
        if notepads:
            for elem in notepads[0]:
                body_elt.append(elem)

        body_elt = sd_etree.clean_html(body_elt)

        content = {'contenttype': tree.attrib['contenttype']}
        if len(body_elt) > 0:
            content['content'] = sd_etree.to_string(body_elt, method="html")
        elif body_elt.text:
            content['content'] = '<pre>' + body_elt.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
 def test_void_elements_fix(self):
     """Empty non-void elements get explicit close tags; void <br/> stays self-closed."""
     markup = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
     want = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'
     tree = sd_etree.parse_html(markup)
     sd_etree.fix_html_void_elements(tree)
     self.assertEqual(sd_etree.to_string(tree), want)
    def _parse_content(self, article):
        """Parse body_html and mapping to fields required for apple news format

        The body is expected to follow the fact-check layout, with the
        headings "The Statement", "The Verdict", "The Analysis", a second
        "The Verdict" and "The References" each on their own line.  The text
        between headings is copied into ``_statement`` (plus
        ``_statement_attribution``), ``_verdict1``, ``_analysis`` (plus
        ``_analysis_first_line``), ``_verdict2`` and ``_references``.

        :param article:
        """
        statement_regex = re.compile(r'^The Statement$', re.IGNORECASE)
        analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE)
        verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE)
        references_regex = re.compile(r'^The References$', re.IGNORECASE)
        url_regex = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%.]+', re.IGNORECASE)
        abstract = get_text(article.get('abstract'), content='html').strip()

        article['_title'] = abstract
        body_html = article.get('body_html')
        # Reset all derived fields before (re)parsing.
        article['_analysis_first_line'] = ''
        article['_analysis'] = ''
        article['_statement'] = ''
        article['_statement_attribution'] = ''
        article['_verdict1'] = ''
        article['_verdict2'] = ''
        article['_references'] = ''
        article['_revision_history'] = ''

        # Killed/recalled stories publish a takedown notice in every field.
        if article.get(ITEM_STATE) == CONTENT_STATE.KILLED or article.get(ITEM_STATE) == CONTENT_STATE.RECALLED:
            article['_title'] = 'This article has been removed.'
            article['_analysis_first_line'] = 'This article has been removed.'
            article['_analysis'] = 'This article has been removed.'
            article['_statement'] = 'This article has been removed.'
            article['_statement_attribution'] = 'This article has been removed.'
            article['_verdict1'] = 'This article has been removed.'
            article['_verdict2'] = 'This article has been removed.'
            article['_references'] = 'This article has been removed.'
            self._set_revision_history(article)
            return

        parsed_content = parse_html(body_html, content='html')
        # State flags for the section currently being collected.
        statement_found = False
        analysis_found = False
        analysis_first_line = False
        verdict1_found = False
        verdict2_found = False
        references_found = False
        statement_elements = []

        for top_level_tag in parsed_content.xpath('/html/div/child::*'):
            tag_text = format_text_content(top_level_tag).strip()
            if not tag_text:
                continue

            if not verdict1_found:
                if not statement_found:
                    match = statement_regex.search(tag_text)
                    if match:
                        statement_found = True
                    continue
                else:
                    # statement found
                    match = verdict_regex.search(tag_text)
                    if match:
                        verdict1_found = True
                        # Everything collected between "The Statement" and the
                        # first "The Verdict" is the statement; its last
                        # element is treated as the attribution line.
                        if len(statement_elements) > 1:
                            statement_length = len(statement_elements) - 1
                            for i in range(statement_length):
                                article['_statement'] += get_text(
                                    to_string(statement_elements[i], remove_root_div=False),
                                    content='html'
                                ).strip()
                                if statement_length > 1 and i != statement_length - 1:
                                    article['_statement'] += '\r\n'

                            article['_statement_attribution'] = get_text(
                                to_string(statement_elements[-1:][0], remove_root_div=False),
                                content='html'
                            ).strip()
                        elif len(statement_elements) == 1:
                            article['_statement'] = to_string(
                                statement_elements[0],
                                remove_root_div=False
                            )
                        continue

                    statement_elements.append(top_level_tag)
                    continue

            # Between the first "The Verdict" and "The Analysis".
            if verdict1_found and not analysis_found:
                match = analysis_regex.search(tag_text)
                if match:
                    analysis_found = True
                else:
                    article['_verdict1'] += to_string(top_level_tag, remove_root_div=False)
                continue

            # Between "The Analysis" and the second "The Verdict".
            if analysis_found and not verdict2_found:
                if not analysis_first_line:
                    article['_analysis_first_line'] = tag_text
                    analysis_first_line = True

                match = verdict_regex.search(tag_text)
                if match:
                    verdict2_found = True
                else:
                    article['_analysis'] += to_string(top_level_tag, remove_root_div=False)
                continue

            # Between the second "The Verdict" and "The References".
            if verdict2_found and not references_found:
                match = references_regex.search(tag_text)
                if match:
                    references_found = True
                else:
                    article['_verdict2'] += to_string(top_level_tag, remove_root_div=False)
                continue

            if references_found:
                # Wrap any URL found in the reference line in an anchor tag.
                def replacement(match_object):
                    value = match_object.group(0)
                    if value:
                        return '<a href="{0}">{0}</a>'.format(value)
                    return ''

                # Strip a leading list number such as "1." / "2)" / "3:".
                tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip()

                article['_references'] += '<li>{}</li>'.format(
                    re.sub(url_regex, replacement, tag_text)
                )

        if len(article['_references']):
            article['_references'] = '<ol>{}</ol>'.format(article['_references'])

        if not article.get('_statement') and article.get('_statement_attribution'):
            # if statement is not as per the format
            article['_statement'] = article.get('_statement_attribution')
            article['_statement_attribution'] = ''

        self._set_revision_history(article)
Example #34
0
 def test_void_elements_fix(self):
     html = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
     expected = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'
     parsed = sd_etree.parse_html(html)
     sd_etree.fix_html_void_elements(parsed)
     self.assertEqual(sd_etree.to_string(parsed), expected)
Example #35
0
    def post_process_item(self, item, provider):
        """Apply media-release specific fix-ups to a parsed item.

        Sets the slugline/headline, truncates the take key, maps the genre,
        re-wraps long body lines, prepends the distribution notice, and maps
        the source place string onto locator and subject entries.

        :param item: the parsed item to update
        :param provider: the ingest provider (unused, kept for the hook signature)
        :return: the updated item
        """
        # An ANPA category qcode of 'k' marks an Investor Relations release.
        is_investor_release = (len(item.get('anpa_category', []))
                               and item['anpa_category'][0].get('qcode', '').lower() == 'k')

        if is_investor_release:
            # IRW News Release:
            item['slugline'] = 'IRW News Release'
            item['headline'] = 'IRW News Release: ' + item.get(self.ITEM_TAKE_KEY, '')
        else:
            item['slugline'] = 'Media Release'
            item['headline'] = 'Media Release: ' + item.get(self.ITEM_TAKE_KEY, '')

        # Truncate the take key if required
        if len(item.get(self.ITEM_TAKE_KEY, '')) > 24:
            item[self.ITEM_TAKE_KEY] = item.get(self.ITEM_TAKE_KEY, '')[0:24]

        genre_map = superdesk.get_resource_service('vocabularies').find_one(
            req=None, _id='genre')
        item['genre'] = [
            x for x in genre_map.get('items', [])
            if x['qcode'] == 'Press Release' and x['is_active']
        ]

        body_html_elem = parse_html(item.get('body_html', '<pre> </pre>'))
        ptag = body_html_elem.find('.//pre')
        if ptag is not None:
            # Re-wrap any line longer than 75 characters for transmission.
            body = ''
            for line in ptag.text.split('\n'):
                if len(line) > 75:
                    line = textwrap.fill(line, 75)
                body += '{}\n'.format(line)
            ptag.text = body
            if is_investor_release:
                ptag.text = '{} '.format('Investor Relations news release distributed by AAP Medianet. \r\n\r\n\r\n') \
                            + ptag.text
            else:
                ptag.text = '{} '.format(
                    'Media release distributed by AAP Medianet. \r\n\r\n\r\n'
                ) + ptag.text
            item['body_html'] = to_string(body_html_elem)

        locator_map = superdesk.get_resource_service('vocabularies').find_one(
            req=None, _id='locators')
        # Default to '' so items without a place do not raise KeyError.
        place_strs = item.pop('place', '').split(' ')
        for place in place_strs:
            if place in self.place_map:
                replace = [
                    x for x in locator_map.get('items', [])
                    if x['qcode'] == self.place_map.get(place, '').upper()
                ]
                # Only overwrite the place when a locator actually matched;
                # the previous `is not None` check was always true for a list
                # and could assign an empty locator list.
                if replace:
                    item[self.ITEM_PLACE] = replace

            if place in self.subject_map:
                if item.get(self.ITEM_SUBJECT) is None:
                    item[self.ITEM_SUBJECT] = []
                item['subject'].append({
                    'qcode':
                    self.subject_map.get(place),
                    'name':
                    subject_codes[self.subject_map.get(place)]
                })

        return item
Example #36
0
    def format(self, article, subscriber, codes=None):
        """Format the article as one ANPA byte stream per category.

        Builds the message header (SYN SYN SOH), header fields, STX,
        headline/slugline/byline, body and ETX/EOT trailer, in wire order.

        :param article: article to format
        :param subscriber: subscriber the output is generated for (used for
            the publish sequence number and error reporting)
        :param codes: optional selector codes prepended to the message
        :return: list of dicts with ``published_seq_num``, ``encoded_item``
            (bytes) and ``formatted_item`` (the same bytes decoded as ascii)
        :raises FormatterError.AnpaFormatterError: wraps any failure
        """
        try:
            docs = []
            formatted_article = deepcopy(article)
            # One output document is produced per ANPA category.
            for category in self._get_category_list(
                    formatted_article.get('anpa_category')):
                mapped_source = self._get_mapped_source(formatted_article)
                formatted_article[config.ID_FIELD] = formatted_article.get(
                    'item_id', formatted_article.get(config.ID_FIELD))
                pub_seq_num = superdesk.get_resource_service(
                    'subscribers').generate_sequence_number(subscriber)
                # Message fragments are collected as bytes and joined at the end.
                anpa = []

                if codes:
                    anpa.append(b'\x05')
                    anpa.append(' '.join(codes).encode('ascii'))
                    anpa.append(b'\x0D\x0A')

                # start of message header (syn syn soh)
                anpa.append(b'\x16\x16\x01')
                anpa.append(
                    get_service_level(category,
                                      formatted_article).encode('ascii'))

                # story number
                anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

                # field seperator
                anpa.append(b'\x0A')  # -LF
                anpa.append(
                    map_priority(
                        formatted_article.get('priority')).encode('ascii'))
                anpa.append(b'\x20')

                anpa.append(category['qcode'].lower().encode('ascii'))

                anpa.append(b'\x13')
                # format identifier
                if formatted_article.get(FORMAT,
                                         FORMATS.HTML) == FORMATS.PRESERVED:
                    anpa.append(b'\x12')
                else:
                    anpa.append(b'\x11')
                anpa.append(b'\x20')

                # keyword
                keyword = 'bc-{}'.format(
                    self.append_legal(article=formatted_article,
                                      truncate=True)).replace(' ', '-')
                keyword = keyword[:24] if len(keyword) > 24 else keyword
                anpa.append(keyword.encode('ascii'))
                anpa.append(b'\x20')

                # version field
                anpa.append(b'\x20')

                # reference field
                anpa.append(b'\x20')

                # filing date
                anpa.append('{}-{}'.format(
                    formatted_article['_updated'].strftime('%m'),
                    formatted_article['_updated'].strftime('%d')).encode(
                        'ascii'))
                anpa.append(b'\x20')

                # add the word count
                anpa.append(
                    str(formatted_article.get(
                        'word_count', '0000')).zfill(4).encode('ascii'))
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x02')  # STX

                self._process_headline(anpa, formatted_article,
                                       category['qcode'].encode('ascii'))

                keyword = SluglineMapper().map(
                    article=formatted_article,
                    category=category['qcode'].upper(),
                    truncate=True).encode('ascii', 'ignore')
                anpa.append(keyword)
                take_key = (formatted_article.get('anpa_take_key', '')
                            or '').encode('ascii', 'ignore')
                anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                if formatted_article.get('ednote', '') != '':
                    ednote = '{}\r\n'.format(
                        to_ascii(formatted_article.get('ednote')))
                    anpa.append(ednote.encode('ascii', 'replace'))

                if formatted_article.get(BYLINE):
                    anpa.append(
                        get_text(formatted_article.get(BYLINE)).encode(
                            'ascii', 'replace'))
                    anpa.append(b'\x0D\x0A')

                if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                    anpa.append(
                        get_text(self.append_body_footer(formatted_article),
                                 content='html').encode('ascii', 'replace'))
                else:
                    body = to_ascii(formatted_article.get('body_html', ''))
                    # we need to inject the dateline
                    if formatted_article.get(
                            'dateline', {}).get('text') and not article.get(
                                'auto_publish', False):
                        body_html_elem = parse_html(
                            formatted_article.get('body_html'))
                        ptag = body_html_elem.find('.//p')
                        if ptag is not None:
                            # Prefix the dateline to the first paragraph.
                            ptag.text = formatted_article['dateline'][
                                'text'] + ' ' + (ptag.text or '')
                            body = to_string(body_html_elem)
                    anpa.append(self.get_text_content(body))
                    if formatted_article.get('body_footer'):
                        anpa.append(
                            self.get_text_content(
                                to_ascii(
                                    formatted_article.get('body_footer', ''))))

                anpa.append(b'\x0D\x0A')
                anpa.append(mapped_source.encode('ascii'))
                sign_off = (formatted_article.get('sign_off', '')
                            or '').encode('ascii')
                anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x03')  # ETX

                # time and date
                anpa.append(datetime.datetime.now().strftime(
                    '%d-%m-%y %H-%M-%S').encode('ascii'))

                anpa.append(b'\x04')  # EOT
                anpa.append(
                    b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A'
                )

                docs.append({
                    'published_seq_num': pub_seq_num,
                    'encoded_item': b''.join(anpa),
                    'formatted_item': b''.join(anpa).decode('ascii')
                })

            return docs
        except Exception as ex:
            raise FormatterError.AnpaFormatterError(ex, subscriber)
Example #37
0
    def format(self, article, subscriber, codes=None):
        """Format the article as one ANPA byte stream per category.

        Builds the message header (SYN SYN SOH), header fields, STX,
        headline/slugline/byline, body and ETX/EOT trailer, in wire order.

        :param article: article to format
        :param subscriber: subscriber the output is generated for (used for
            the publish sequence number and error reporting)
        :param codes: optional selector codes prepended to the message
        :return: list of dicts with ``published_seq_num``, ``encoded_item``
            (bytes) and ``formatted_item`` (the same bytes decoded as ascii)
        :raises FormatterError.AnpaFormatterError: wraps any failure
        """
        try:
            docs = []
            formatted_article = deepcopy(article)
            # One output document is produced per ANPA category.
            for category in self._get_category_list(formatted_article.get('anpa_category')):
                mapped_source = self._get_mapped_source(formatted_article)
                formatted_article[config.ID_FIELD] = formatted_article.get('item_id',
                                                                           formatted_article.get(config.ID_FIELD))
                pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
                # Message fragments are collected as bytes and joined at the end.
                anpa = []

                if codes:
                    anpa.append(b'\x05')
                    anpa.append(' '.join(codes).encode('ascii'))
                    anpa.append(b'\x0D\x0A')

                # start of message header (syn syn soh)
                anpa.append(b'\x16\x16\x01')
                anpa.append(get_service_level(category, formatted_article).encode('ascii'))

                # story number
                anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

                # field seperator
                anpa.append(b'\x0A')  # -LF
                anpa.append(map_priority(formatted_article.get('priority')).encode('ascii'))
                anpa.append(b'\x20')

                anpa.append(category['qcode'].lower().encode('ascii'))

                anpa.append(b'\x13')
                # format identifier
                if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
                    anpa.append(b'\x12')
                else:
                    anpa.append(b'\x11')
                anpa.append(b'\x20')

                # keyword
                keyword = 'bc-{}'.format(self.append_legal(article=formatted_article, truncate=True)).replace(' ', '-')
                keyword = keyword[:24] if len(keyword) > 24 else keyword
                anpa.append(keyword.encode('ascii'))
                anpa.append(b'\x20')

                # version field
                anpa.append(b'\x20')

                # reference field
                anpa.append(b'\x20')

                # filing date
                anpa.append('{}-{}'.format(formatted_article['_updated'].strftime('%m'),
                                           formatted_article['_updated'].strftime('%d')).encode('ascii'))
                anpa.append(b'\x20')

                # add the word count
                anpa.append(str(formatted_article.get('word_count', '0000')).zfill(4).encode('ascii'))
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x02')  # STX

                self._process_headline(anpa, formatted_article, category['qcode'].encode('ascii'))

                keyword = SluglineMapper().map(article=formatted_article, category=category['qcode'].upper(),
                                               truncate=True).encode('ascii', 'ignore')
                anpa.append(keyword)
                take_key = (formatted_article.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
                anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                if formatted_article.get('ednote', '') != '':
                    ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote')))
                    anpa.append(ednote.encode('ascii', 'replace'))

                if formatted_article.get(BYLINE):
                    anpa.append(get_text(formatted_article.get(BYLINE)).encode('ascii', 'replace'))
                    anpa.append(b'\x0D\x0A')

                if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                    anpa.append(get_text(self.append_body_footer(formatted_article),
                                         content='html').encode('ascii', 'replace'))
                else:
                    body = to_ascii(formatted_article.get('body_html', ''))
                    # we need to inject the dateline
                    if formatted_article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                        body_html_elem = parse_html(formatted_article.get('body_html'))
                        ptag = body_html_elem.find('.//p')
                        if ptag is not None:
                            # Prefix the dateline to the first paragraph.
                            ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
                            body = to_string(body_html_elem)
                    anpa.append(self.get_text_content(body))
                    if formatted_article.get('body_footer'):
                        anpa.append(self.get_text_content(to_ascii(formatted_article.get('body_footer', ''))))

                anpa.append(b'\x0D\x0A')
                anpa.append(mapped_source.encode('ascii'))
                sign_off = (formatted_article.get('sign_off', '') or '').encode('ascii')
                anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
                anpa.append(b'\x0D\x0A')

                anpa.append(b'\x03')  # ETX

                # time and date
                anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

                anpa.append(b'\x04')  # EOT
                anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

                docs.append({'published_seq_num': pub_seq_num, 'encoded_item': b''.join(anpa),
                             'formatted_item': b''.join(anpa).decode('ascii')})

            return docs
        except Exception as ex:
            raise FormatterError.AnpaFormatterError(ex, subscriber)