def test_encode_carriage_return(self):
    """Serializing parsed HTML encodes each carriage return as a space."""
    plain = 'This is first line.\r\nThis is second line.\r\n'
    expected = plain.replace('\r', ' ')
    self.assertEqual(expected, sd_etree.to_string(sd_etree.parse_html(plain)))

    wrapped = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
    expected = wrapped.replace('\r', ' ')
    self.assertEqual(
        expected,
        sd_etree.to_string(sd_etree.parse_html(wrapped, content='html')),
    )
def test_encode_carriage_return(self):
    """CR characters must come back as plain spaces after a parse/serialize cycle."""
    for source, content_type in (
        ('This is first line.\r\nThis is second line.\r\n', None),
        ('<pre>This is first line.\r\nThis is second line.\r\n</pre>', 'html'),
    ):
        if content_type is None:
            tree = sd_etree.parse_html(source)
        else:
            tree = sd_etree.parse_html(source, content=content_type)
        self.assertEqual(source.replace('\r', ' '), sd_etree.to_string(tree))
def test_encode_carriage_return(self):
    """CR is serialized as a space; html content gets an html/body wrapper."""
    text = 'This is first line.\r\nThis is second line.\r\n'
    self.assertEqual(text.replace('\r', ' '), to_string(parse_html(text)))

    text = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
    tree = parse_html(text, content='html')
    expected = '<html><body>{}</body></html>'.format(text.replace('\r', ' '))
    self.assertEqual(expected, to_string(tree))
def test_encode_carriage_return(self):
    """Round-tripping through parse/serialize turns every CR into a space."""
    bare = "This is first line.\r\nThis is second line.\r\n"
    self.assertEqual(
        bare.replace("\r", " "),
        sd_etree.to_string(sd_etree.parse_html(bare)),
    )

    pre_wrapped = "<pre>This is first line.\r\nThis is second line.\r\n</pre>"
    parsed_pre = sd_etree.parse_html(pre_wrapped, content="html")
    self.assertEqual(pre_wrapped.replace("\r", " "), sd_etree.to_string(parsed_pre))
def body_hook(self, item, html): """Copy content to body_html if img are found in the content, they are uploaded. First image is used as feature media, then there are embeds """ # we need to convert CRLF to <p> # cf. SDTS-22 html = html.replace(" ", "\r") splitted = html.split("\r\n") if len(splitted) == 1 and "<p>" not in html: splitted = html.split("\n") if len(splitted) > 1: html = "".join([ "<p>{}</p>".format(s) if not is_block_elem(s) else s for s in splitted if s.strip() ]) if "img" in html: content = sd_etree.parse_html(html, "html") for img in content.xpath("//img"): try: src = self.check_url(img.get("src")) except ValueError: logger.warning("Can't fetch image: {elt}".format( elt=sd_etree.to_string(img))) continue try: key, media_data = self._add_image(item, src) except Exception as e: logger.error(e) img.getparent().remove(img) continue url = media_data["renditions"]["original"]["href"] img.set("src", url) if key == "featuremedia": # no need to embed the image for featuremedia continue embed_start = etree.Comment(embed_TPL.format("START", key)) embed_end = etree.Comment(embed_TPL.format("END", key)) img.addprevious(embed_start) img.addnext(embed_end) content = sd_etree.fix_html_void_elements(content) html = sd_etree.to_string(content, encoding="unicode", method="xml") html = remove_shortcodes(html) item["body_html"] = html
def parse_thumbnail(self, item_elt, item):
    """Check for _thumbnail_id meta_key, and use its attachment as feature media

    If the key is found, the linked item is looked for, and its attachment_url
    is used as feature media
    """
    thumbnail_elt = item_elt.xpath(
        'wp:postmeta/wp:meta_key[text()="_thumbnail_id"]', namespaces=nsmap)
    if not thumbnail_elt:
        # no thumbnail metadata on this item
        return
    thumbnail_elt = thumbnail_elt[0]
    try:
        post_id = thumbnail_elt.xpath("../wp:meta_value/text()",
                                      namespaces=nsmap)[0].strip()
        if not post_id:
            # empty value: treat the same as a missing one
            raise IndexError
    except IndexError:
        logger.warning("invalid post_id, ignoring: {elt}".format(
            elt=sd_etree.to_string(thumbnail_elt.xpath("..")[0])))
        return
    try:
        # post_id is interpolated into an XPath string literal below, so a
        # double quote would break out of the expression
        if '"' in post_id:
            raise ValueError('post id should not contain " (double quote)')
        post_id_elt = item_elt.xpath(
            '/rss/channel/item/wp:post_id[text()="{}"]'.format(post_id),
            namespaces=nsmap)[0]
        att_item_elt = post_id_elt.getparent()
        url = att_item_elt.xpath("wp:attachment_url",
                                 namespaces=nsmap)[0].text.strip()
        url = self.check_url(url)
    except (IndexError, ValueError) as e:
        logger.warning(
            "Can't find attachement URL, ignoring: {e}\n{elt}".format(
                e=e, elt=sd_etree.to_string(thumbnail_elt.getparent())))
        return
    try:
        key, media_data = self._add_image(item, url)
    except Exception as e:
        logger.error(e)
        return
    # fill description_text/alt_text from the first non-empty candidate element,
    # defaulting to "" when none of them has text (for/else)
    for key, elt_names in (("description_text", ("description", "title")),
                           ("alt_text", ("title", ))):
        for elt_name in elt_names:
            elt = att_item_elt.find(elt_name)
            if elt is not None and elt.text:
                media_data[key] = elt.text
                break
        else:
            media_data[key] = ""
def body_hook(self, item, html): """Copy content to body_html if img are found in the content, they are uploaded. First image is used as feature media, then there are embeds """ # we need to convert CRLF to <p> # cf. SDTS-22 html = html.replace(' ', '\r') splitted = html.split('\r\n') if len(splitted) == 1 and '<p>' not in html: splitted = html.split('\n') if len(splitted) > 1: html = ''.join([ '<p>{}</p>'.format(s) if not is_block_elem(s) else s for s in splitted if s.strip() ]) if "img" in html: content = sd_etree.parse_html(html, 'html') for img in content.xpath('//img'): try: src = self.check_url(img.get('src')) except ValueError: logger.warning("Can't fetch image: {elt}".format( elt=sd_etree.to_string(img))) continue try: key, media_data = self._add_image(item, src) except Exception as e: logger.error(e) img.getparent().remove(img) continue url = media_data['renditions']['original']['href'] img.set("src", url) if key == 'featuremedia': # no need to embed the image for featuremedia continue embed_start = etree.Comment(embed_TPL.format('START', key)) embed_end = etree.Comment(embed_TPL.format('END', key)) img.addprevious(embed_start) img.addnext(embed_end) content = sd_etree.fix_html_void_elements(content) html = sd_etree.to_string(content, encoding="unicode", method='xml') item['body_html'] = html
def _format_body_content(self, article, body_content):
    """Assemble the NITF body text (ednote, byline, body, sign-off) into a <pre>.

    :param article: article dict providing the text fields
    :param body_content: parent element that receives the <pre> child
    """
    nitf_body = []
    if article.get('ednote'):
        nitf_body.append(to_ascii(self._format_line(article.get('ednote'))))
    if article.get(BYLINE):
        nitf_body.append(to_ascii(self._format_line(get_text(article.get(BYLINE)))))
    if article.get(FORMAT) == FORMATS.PRESERVED:
        # preserved content is emitted as-is (plus footer), stripped to text
        nitf_body.append(to_ascii(get_text(self.append_body_footer(article),
                                           content='html')))
    else:
        body = article.get('body_html', '')
        # we need to inject the dateline
        if article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
            body_html_elem = parse_html(article.get('body_html'))
            ptag = body_html_elem.find('.//p')
            if ptag is not None:
                # prepend the dateline to the first paragraph
                ptag.text = article['dateline']['text'] + ' ' + (ptag.text or '')
                body = to_string(body_html_elem)
        nitf_body.append(self.get_text_content(body))
        if article.get('body_footer'):
            nitf_body.append(self.get_text_content(article.get('body_footer', '')))
    # sign-off line: "<source> <sign_off>", omitted when both are empty
    sign_off = '{} {}'.format(article.get('source') or '',
                              (article.get('sign_off') or '')).strip()
    if sign_off:
        nitf_body.append(to_ascii(self._format_line(sign_off)))
    SubElement(body_content, 'pre').text = ''.join(nitf_body)
def parse_inline_content(self, tree, item):
    """Extract the inline XHTML body of an item into a content dict.

    <pre> tags are downgraded to <p> and all links get target="_blank".
    :return: dict with contenttype, content and format keys
    """
    html_elt = tree.find(self.qname('html'))
    body_elt = html_elt.find(self.qname('body'))
    body_elt = sd_etree.clean_html(body_elt)
    # replace <pre> with <p>
    for pre in body_elt.findall('.//pre'):
        pre.tag = 'p'
    # add target blank for all links
    for a in body_elt.findall('.//a'):
        a.attrib['target'] = '_blank'
    content = dict()
    content['contenttype'] = tree.attrib['contenttype']
    if len(body_elt) > 0:
        # serialize each child element and join them line by line
        contents = [sd_etree.to_string(e, encoding='unicode', method="html")
                    for e in body_elt]
        content['content'] = '\n'.join(contents)
    elif body_elt.text:
        # bare text body: wrap it in a paragraph
        content['content'] = '<p>' + body_elt.text + '</p>'
    content['format'] = 'xhtml/xml'
    if content.get('content'):
        # normalise the provider's endash markup to a plain dash
        content['content'] = content['content'].replace('<endash>-</endash>', '-')
    return content
def _inject_dateline(self, formatted_article):
    """Inject dateline in article's body_html"""
    tree = sd_etree.parse_html(formatted_article.get('body_html', '<p> </p>'))
    first_par = tree.find('.//p')
    if first_par is not None:
        dateline_text = formatted_article['dateline']['text']
        existing = first_par.text or ''
        first_par.text = dateline_text + ' ' + existing
    formatted_article['body_html'] = sd_etree.to_string(tree)
def extract_kill_reason_from_html(html, is_kill):
    """Extract the reason from html for a kill/takedown

    Iterates over the xml nodes and find the node that contains the reason prefix.
    Once the reason prefix has been found add the proceeding nodes to our reason
    tree, until the kill/takedown suffix has been found.
    :param html: html body to scan
    :param is_kill: True for kill, False for takedown (selects the suffix marker)
    :return: the extracted reason html, or the original html when no reason found
    """
    try:
        # Create a new tree that we will use to construct the reason nodes
        root = etree.Element('div')
        # A flag to indicate if we're to add the current child node to our reason tree
        adding_nodes = False
        for child in parse_html(html, content='html'):
            # Obtain the text from our child nodes (including sub-child nodes)
            child_text = ''.join(child.itertext())
            if not adding_nodes and REASON_PREFIX in child_text:
                # This child node contains the reason prefix (and we haven't found it already)
                # Therefor set the flag to True indicating that the following child nodes
                # are to be added to our reason tree
                adding_nodes = True
                continue
            elif adding_nodes:
                # If the kill/takedown suffix has been found, then our reason tree is complete
                if is_kill and KILL_SUFFIX in child_text:
                    break
                elif not is_kill and TAKEDOWN_SUFFIX in child_text:
                    break
                # Otherwise continue adding the child nodes to our reason tree
                # Remove the last sub-child if it only contains a line break
                if len(child) > 0:
                    last_child = child[-1]
                    if etree.tostring(last_child) == b'<p><br/></p>':
                        child.remove(last_child)
                # Then add this child node to our reason tree
                root.append(child)
        num_children = len(list(root))
        # If the reason tree was not populated, then return the original html provided
        if num_children == 0:
            return html
        # Our reason tree was populated, convert the tree to a string and return it
        return to_string(root, method='html', remove_root_div=num_children == 1)
    except Exception as e:
        # best effort: fall back to the untouched html on any failure
        logger.exception(e)
        return html
def clean_html(body_html):
    '''
    Make sure the html will parse and inject \r\n in an attempt to avoid issues
    with lines being too long for SMTP

    :param body_html:
    :return: parsed and re-written html
    '''
    parsed = sd_etree.parse_html(body_html, content='html', lf_on_block=True)
    rendered = sd_etree.to_string(parsed, method='html', pretty_print=True)
    return rendered.replace('>\n', '>\r\n')
def _inject_dateline(self, formatted_article):
    """Inject dateline in article's body_html"""
    source_html = formatted_article.get("body_html", "<p> </p>")
    tree = sd_etree.parse_html(source_html)
    paragraph = tree.find(".//p")
    if paragraph is not None:
        prefix = formatted_article["dateline"]["text"] + " "
        paragraph.text = prefix + (paragraph.text or "")
    formatted_article["body_html"] = sd_etree.to_string(tree)
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """
    Re-wire that href's in the document to be relative to the destination FTP
    server root, it expects the destination to be an FTP server
    :param article:
    :param subscriber:
    :param recursive:
    :return: the ninjs dict with renditions/img hrefs rewritten
    """
    include_original = subscriber.get("destinations")[0].get("config").get(
        "include_original", False)
    if include_original:
        self.internal_renditions = ["original"]

    ninjs = super()._transform_to_ninjs(article, subscriber, recursive)

    # Get the path that the renditions will be pushed to
    path = subscriber.get("destinations")[0].get("config").get(
        "associated_path")

    if path:
        renditions = ninjs.get("renditions")
        if renditions:
            for name, rendition in renditions.items():
                rendition["href"] = (
                    "/" + path.lstrip("/")
                    + ("/" if not path.endswith("/") else "")
                    + get_rendition_file_name(rendition))

    if article.get("type", "") == "text":
        # Find any embeded image references in the body_html and re-wire the
        # img src reference and insert an id
        html_updated = False
        root_elem = lxml_html.fromstring(ninjs.get("body_html"))
        # Scan any comments for embed markers
        comments = root_elem.xpath("//comment()")
        # BUGFIX: the class used to be "[0:9]+" (matching only '0', ':' and
        # '9'), and re.search ran over the whole body_html so every embed was
        # assigned the id of the FIRST marker. Match digits and search the
        # current comment's text instead (same approach as the embed
        # renditions generator).
        regex = r"EMBED START Image {id: \"editor_([0-9]+)"
        for comment in comments:
            if "EMBED START Image" in comment.text:
                m = re.search(regex, comment.text)
                # Assumes the sibling of the Embed Image comment is the figure
                # tag containing the image
                figureElem = comment.getnext()
                if figureElem is not None and figureElem.tag == "figure":
                    imgElem = figureElem.find("./img")
                    if imgElem is not None and m and m.group(1):
                        embed_id = "editor_" + m.group(1)
                        imgElem.attrib["id"] = embed_id
                        src = self._get_source_ref(embed_id, ninjs)
                        if src:
                            imgElem.attrib["src"] = src
                        html_updated = True
        if html_updated:
            ninjs["body_html"] = to_string(root_elem, method="html")
    return ninjs
def clean_html(html):
    """Strip id/class attributes, normalise hl2/pre/note tags to <p>, and clean."""
    root = lxml.html.fromstring(html)
    for node in root.iter():
        node.attrib.pop("id", None)
        node.attrib.pop("class", None)
        if node.tag in ('hl2', 'pre', 'note'):
            node.tag = 'p'
    root = lxml.html.clean.Cleaner().clean_html(root)
    return sd_etree.to_string(root, method="html")
def _transform_to_ninjs(self, article, subscriber, recursive=True):
    """
    Re-wire that href's in the document to be relative to the destination FTP
    server root, it expects the destination to be an FTP server
    :param article:
    :param subscriber:
    :param recursive:
    :return: the ninjs dict with renditions/img hrefs rewritten
    """
    include_original = subscriber.get('destinations')[0].get('config').get(
        'include_original', False)
    if include_original:
        self.internal_renditions = ['original']

    ninjs = super()._transform_to_ninjs(article, subscriber, recursive)

    # Get the path that the renditions will be pushed to
    path = subscriber.get('destinations')[0].get('config').get(
        'associated_path')

    if path:
        renditions = ninjs.get('renditions')
        if renditions:
            for name, rendition in renditions.items():
                rendition['href'] = '/' + path.lstrip('/') + (
                    '/' if not path.endswith('/')
                    else '') + get_rendition_file_name(rendition)

    if article.get('type', '') == 'text':
        # Find any embeded image references in the body_html and re-wire the
        # img src reference and insert an id
        html_updated = False
        root_elem = lxml_html.fromstring(ninjs.get('body_html'))
        # Scan any comments for embed markers
        comments = root_elem.xpath('//comment()')
        # BUGFIX: the class used to be "[0:9]+" (matching only '0', ':' and
        # '9'), and re.search ran over the whole body_html so every embed was
        # assigned the id of the FIRST marker. Match digits and search the
        # current comment's text instead.
        regex = r"EMBED START Image {id: \"editor_([0-9]+)"
        for comment in comments:
            if 'EMBED START Image' in comment.text:
                m = re.search(regex, comment.text)
                # Assumes the sibling of the Embed Image comment is the figure
                # tag containing the image
                figureElem = comment.getnext()
                if figureElem is not None and figureElem.tag == 'figure':
                    imgElem = figureElem.find('./img')
                    if imgElem is not None and m and m.group(1):
                        embed_id = 'editor_' + m.group(1)
                        imgElem.attrib['id'] = embed_id
                        src = self._get_source_ref(embed_id, ninjs)
                        if src:
                            imgElem.attrib['src'] = src
                        html_updated = True
        if html_updated:
            ninjs['body_html'] = to_string(root_elem, method='html')
    return ninjs
def parse_inline_content(self, tree, item):
    """Extract the inline XHTML body of an item into a content dict."""
    body_elt = tree.find(self.qname('html')).find(self.qname('body'))
    body_elt = sd_etree.clean_html(body_elt)

    content = {'contenttype': tree.attrib['contenttype']}
    if len(body_elt) > 0:
        content['content'] = '\n'.join(
            sd_etree.to_string(child, encoding='unicode', method="html")
            for child in body_elt
        )
    elif body_elt.text:
        content['content'] = '<pre>' + body_elt.text + '</pre>'
        content['format'] = CONTENT_TYPE.PREFORMATTED
    return content
def parse_inline_content(self, tree, item):
    """Build the content dict (contenttype/content/format) from the inline body."""
    html_node = tree.find(self.qname('html'))
    body_node = sd_etree.clean_html(html_node.find(self.qname('body')))

    content = dict()
    content['contenttype'] = tree.attrib['contenttype']
    if len(body_node) > 0:
        rendered = [sd_etree.to_string(child, encoding='unicode', method="html")
                    for child in body_node]
        content['content'] = '\n'.join(rendered)
    elif body_node.text:
        content['content'] = '<pre>' + body_node.text + '</pre>'
        content['format'] = CONTENT_TYPE.PREFORMATTED
    return content
def parse_inline_content(self, tree, item):
    """Extract the main content section of the body into a content dict.

    Prefers the section with class "main"; falls back to the whole body.
    """
    try:
        body_elt = tree.xpath(
            '//xhtml:body//xhtml:section[contains(@class,"main")]',
            namespaces=NS)[0]
    except IndexError:
        # no "main" section: use the body itself
        body_elt = tree.xpath('//xhtml:body', namespaces=NS)[0]
    body_elt = sd_etree.clean_html(body_elt)
    content = dict()
    content['contenttype'] = tree.attrib['contenttype']
    if len(body_elt) > 0:
        content['content'] = sd_etree.to_string(body_elt, method="html")
    elif body_elt.text:
        # bare text body: preserve it as preformatted content
        content['content'] = '<pre>' + body_elt.text + '</pre>'
        content['format'] = CONTENT_TYPE.PREFORMATTED
    return content
def remove_breaks(item, **kwargs):
    """Replace <br> elements in the item's body_html with single spaces.

    :param item: item whose ``body_html`` is rewritten in place
    :return: the (possibly modified) item
    :raises Exception: re-raises anything that fails during parsing/serializing
    """
    try:
        html = item.get('body_html')
        if html:
            # normalise both br spellings so the XML parser accepts them
            html = html.replace('<br>', '<br/>').replace('</br>', ' ')
            parsed = parse_html(html, content='xml')
            for br in parsed.xpath('//br'):
                # keep the surrounding words separated once the tag is stripped
                br.tail = ' ' + br.tail if br.tail else ' '
            etree.strip_elements(parsed, 'br', with_tail=False)
            item['body_html'] = to_string(parsed)
        return item
    except Exception as ex:
        # BUGFIX: logging.exception was called with the exception as a stray
        # positional argument but no format placeholder in the message, which
        # triggers a logging formatting error; use a %s placeholder.
        logging.exception('Exception in preserve format macro: %s', ex)
        raise ex
def parse(self, xml, provider=None):
    """Parse a NewsML item set, applying BusinessDesk-specific defaults.

    :param xml: root element of the NewsML document
    :param provider: ingest provider (passed through to error reporting)
    :return: list of parsed item dicts
    :raises ParserError: wraps any failure during parsing
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' in item_tree.attrib:
                    item = self.parse_item(item_tree)
                    item['priority'] = 6
                    item['anpa_category'] = [{'qcode': 'f'}]
                    item['subject'] = [{'qcode': '04000000',
                                        'name': subject_codes['04000000']}]
                    item.setdefault('word_count',
                                    get_word_count(item['body_html']))
                    # Hard code the urgency
                    item['urgency'] = 3
                    # Dateline is always Wellington in NZ
                    located = [c for c in app.locators.find_cities(
                        country_code='NZ', state_code='NZ.G2')
                        if c.get('city', '').lower() == 'wellington']
                    if len(located) == 1:
                        item['dateline'] = dict()
                        item['dateline']['located'] = located[0]
                    # BUGFIX: use .get() here — when no Wellington locator was
                    # found the 'dateline' key does not exist and the previous
                    # item['dateline'] lookup raised KeyError for every item.
                    if item.get('body_html') and item.get('dateline'):
                        parsed = parse_html(item.get('body_html'),
                                            content='xml')
                        pars = parsed.xpath('//p')
                        for par in pars:
                            if not par.text:
                                continue
                            # check the first par for a byline
                            if pars.index(par) == 0 and par.text.startswith('By '):
                                item['byline'] = par.text.replace('By ', '')
                                par.getparent().remove(par)
                            date, source, the_rest = par.text.partition(
                                ' (BusinessDesk) - ')
                            if source:
                                item['dateline']['date'] = date_parser(
                                    date, fuzzy=True)
                                par.text = the_rest
                            # remove the signoff if in the last par
                            if par.text == '(BusinessDesk)' and \
                                    pars.index(par) + 1 == len(pars):
                                par.getparent().remove(par)
                        item['body_html'] = to_string(parsed,
                                                      remove_root_div=True)
                    locator_map = superdesk.get_resource_service(
                        'vocabularies').find_one(req=None, _id='locators')
                    if locator_map:
                        item['place'] = [
                            x for x in locator_map.get('items', [])
                            if x['qcode'].upper() == 'NZ']
                    items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def _format_content(self, item, is_broadcast):
    """Return the item content as html with b/i tags normalised to strong/em."""
    if is_broadcast and item.get("abstract"):
        content = item["abstract"]
        if "<p>" not in content:
            content = "<p>{}</p>".format(content)
    else:
        content = item.get("body_html")

    if not content:
        return ""

    tree = lxml.html.fromstring(content)
    tag_map = {"b": "strong", "i": "em"}
    for node in tree.iter():
        if node.tag in tag_map:
            node.tag = tag_map[node.tag]
    return sd_etree.to_string(tree, encoding="unicode", method="html")
def _format_content(self, item, is_broadcast):
    """Pick abstract (broadcast) or body_html and normalise b/i to strong/em."""
    if is_broadcast and item.get('abstract'):
        content = item['abstract']
        if '<p>' not in content:
            content = '<p>{}</p>'.format(content)
    else:
        content = item.get('body_html')

    if not content:
        return ''

    tree = lxml.html.fromstring(content)
    for node in tree.iter():
        if node.tag == 'b':
            node.tag = 'strong'
        else:
            if node.tag == 'i':
                node.tag = 'em'
    return sd_etree.to_string(tree, encoding='unicode', method='html')
def map_html_to_xml(self, element, html):
    """
    Map the html text tags to xml

    :param element: The xml element to populate
    :param html: the html to parse the text from
    :return:
    """
    html = html.replace('<br>', '<br/>').replace('</br>', '')
    # strip control characters except LF (0x0a) and CR (0x0d)
    html = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', html)
    html = html.replace('\n', ' ')
    # collapse runs of whitespace to a single space
    html = re.sub(r'\s\s+', ' ', html)
    parsed = parse_html(html, content='html')
    # one <p> of plain ascii text per direct child of <body>
    for tag in parsed.xpath('//*'):
        if tag.getparent() is not None and tag.getparent().tag == 'body':
            p = etree.Element('p')
            p.text = to_ascii(get_text(to_string(tag, method='html'),
                                       content='html'))
            element.append(p)
def map_html_to_xml(self, element, html):
    """
    Map the html text tags to xml

    :param element: The xml element to populate
    :param html: the html to parse the text from
    :return:
    """
    cleaned = html.replace('<br>', '<br/>').replace('</br>', '')
    cleaned = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', cleaned)
    cleaned = re.sub(r'\s\s+', ' ', cleaned.replace('\n', ' '))
    parsed = parse_html(cleaned, content='html')
    for child in parsed.xpath('/html/div/child::*'):
        paragraph = etree.Element('p')
        plain = get_text(to_string(child, method='html'), content='html')
        paragraph.text = to_ascii(plain)
        element.append(paragraph)
def generate_embed_renditions(item):
    """Generate watermarked renditions for editor embeds and re-point img srcs.

    Also promotes the last embedded image to featuremedia when none is set.
    """
    def _get_source_ref(marker, item):
        # href of the custom newsroom rendition for the given association key,
        # or None when any level of the chain is missing
        try:
            return item.get("associations").get(marker).get("renditions").get(
                "_newsroom_custom").get("href")
        except Exception:
            return None

    has_editor_assoc = False
    # generate required watermarked renditions for any embedded renditions
    for name, association in ((item.get('associations') or {})).items():
        if name.startswith('editor_') and association:
            generate_preview_details_renditions(
                item.get('associations', {}).get(name), 'viewImage')
            has_editor_assoc = True

    if has_editor_assoc:
        # parse out any editor embeds in the item and re-point to the required rendition
        regex = r' EMBED START Image {id: \"editor_([0-9]+)'
        html_updated = False
        root_elem = lxml_html.fromstring(item.get('body_html', ''))
        comments = root_elem.xpath('//comment()')
        for comment in comments:
            if 'EMBED START Image' in comment.text:
                m = re.search(regex, comment.text)
                # Assumes the sibling of the Embed Image comment is the figure
                # tag containing the image
                figure_elem = comment.getnext()
                if figure_elem is not None and figure_elem.tag == "figure":
                    imgElem = figure_elem.find("./img")
                    if imgElem is not None and m and m.group(1):
                        embed_id = "editor_" + m.group(1)
                        imgElem.attrib["id"] = embed_id
                        src = _get_source_ref(embed_id, item)
                        if src:
                            imgElem.attrib["src"] = src
                        html_updated = True
        if html_updated:
            item["body_html"] = to_string(root_elem, method="html")
        # If there is no feature media them copy the last embedded image to be
        # the feature media
        # NOTE(review): embed_id is only bound inside the loop above — if no
        # embed marker matched, this raises NameError; confirm callers
        # guarantee at least one marker when editor associations exist.
        if not ((item.get('associations') or {}).get('featuremedia') or {}).get('renditions'):
            item['associations']['featuremedia'] = deepcopy(
                item.get('associations').get(embed_id))
    generate_renditions(item)
def _format_content(self, item, is_broadcast):
    """Return item content as html: b/i normalised to strong/em, empty inline
    elements dropped.

    :param item: article dict
    :param is_broadcast: when True, prefer the abstract over body_html
    :return: html string (empty string when there is no content)
    """
    if is_broadcast and item.get("abstract"):
        content = item["abstract"]
        if "<p>" not in content:
            content = "<p>{}</p>".format(content)
    else:
        content = item.get("body_html")

    if not content:
        return ""

    tree = lxml.html.fromstring(content)
    for elem in tree.iter():
        if elem.tag == "b":
            elem.tag = "strong"
        elif elem.tag == "i":
            elem.tag = "em"
        # Remove whitespace and empty tags
        # NOTE(review): drop_tree() mutates the tree while tree.iter() is
        # running — lxml tolerates this here, but the iteration order after a
        # drop is subtle; keep statement order as-is.
        if elem.tag in INLINE_ELEMENTS and elem.text is not None and not elem.text.strip():
            elem.drop_tree()
    return sd_etree.to_string(tree, encoding="unicode", method="html")
def parse_inline_content(self, tree, item):
    """Build the content dict from the inline body, normalising endash markup."""
    html_node = tree.find(self.qname("html"))
    body_node = sd_etree.clean_html(html_node.find(self.qname("body")))

    content = {"contenttype": tree.attrib["contenttype"]}
    if len(body_node) > 0:
        content["content"] = "\n".join(
            sd_etree.to_string(child, encoding="unicode", method="html")
            for child in body_node
        )
    elif body_node.text:
        content["content"] = "<pre>" + body_node.text + "</pre>"
        content["format"] = CONTENT_TYPE.PREFORMATTED

    if content.get("content"):
        content["content"] = content["content"].replace("<endash>-</endash>", "-")
    return content
def get_body(self, news_item):
    """Build the body html from the announcement content of a news item.

    Drops the leading <h1>, prepends the message category as a paragraph and
    appends a link to the original announcement when available.
    :return: serialized html, or "" when the item has no announcement content
    """
    try:
        raw_content = news_item.xpath(
            'NewsComponent/ContentItem[@Euid="announcement_html"]/DataContent/text()'
        )[0]
    except IndexError:
        logger.warning("No content found in element: {xml}".format(
            xml=etree.tostring(news_item, encoding="unicode")))
        return ""
    content_elt = sd_etree.parse_html(raw_content)
    # the headline is carried elsewhere; drop the duplicate <h1>
    h1 = content_elt.find('h1')
    if h1 is not None:
        content_elt.remove(h1)
    categories = news_item.xpath(
        'NewsComponent/Metadata/Property[@FormalName="Message Category"]/@Value'
    )
    if categories:
        # prepend the category as the first paragraph
        category = categories[0]
        p_elt = etree.Element('p')
        p_elt.text = category
        content_elt.insert(0, p_elt)
    ori_ann_urls = news_item.xpath(
        'NewsComponent/Metadata/Property[@FormalName="nordicAgencyWebsite"]/@Value'
    )
    if ori_ann_urls:
        url = ori_ann_urls[0]
        if not url.startswith('http'):
            raise ValueError("Invalid url: {url}".format(url=url))
        # append "see the full announcement" link (text is Norwegian)
        p_elt = etree.SubElement(content_elt, "p")
        p_elt.text = 'Se saken i sin helhet: '
        a_elt = etree.SubElement(p_elt, "a", attrib={'href': url})
        a_elt.text = url
    ret = sd_etree.to_string(content_elt)
    return ret
def parse(self, xml, provider=None):
    """Parse the item and apply local defaults (category, urgency, headline).

    :param xml: source document
    :param provider: ingest provider
    :return: the parsed item dict
    """
    item = super().parse(xml, provider)
    item['slugline'] = ''
    category = ingest_category_from_subject(item.get(
        'subject'))  # check for sports using all ingested subjects
    item['subject'] = filter_missing_subjects(item.get('subject'))
    item['subject'].append(category)
    # remap mid-range urgencies one step down
    urgency = item.get('urgency', None)
    if urgency == 2:
        item['urgency'] = 3
    elif urgency == 4:
        item['urgency'] = 5
    set_default_service(item)
    if not item.get('headline') and item.get('body_html'):
        # fall back to the first line of the body as the headline
        # NOTE(review): `etree` here exposes parse_html/to_string — presumably
        # the superdesk etree helper module imported under that name; confirm.
        first_line = item.get('body_html').strip().split('\n')[0]
        parsed_headline = etree.parse_html(first_line, 'html')
        item['headline'] = etree.to_string(
            parsed_headline, method="text").strip().split('\n')[0]
    return item
def parse_inline_content(self, tree, item):
    """Extract the main content section, appending any notepad sections.

    Prefers the section with class "main"; falls back to the whole body.
    """
    try:
        body_elt = tree.xpath(
            '//xhtml:body//xhtml:section[contains(@class,"main")]',
            namespaces=NS)[0]
    except IndexError:
        # no "main" section: use the body itself
        body_elt = tree.xpath('//xhtml:body', namespaces=NS)[0]
    try:
        # append the dpa "notepad" editorial note content, when present
        notepad = self.item_tree.xpath(
            './/iptc:edNote[@role="dpaednoterole:notepad"]//xhtml:section',
            namespaces=NS)[0]
        for elem in notepad:
            body_elt.append(elem)
    except IndexError:
        pass
    body_elt = sd_etree.clean_html(body_elt)
    content = dict()
    content['contenttype'] = tree.attrib['contenttype']
    if len(body_elt) > 0:
        content['content'] = sd_etree.to_string(body_elt, method="html")
    elif body_elt.text:
        # bare text body: preserve it as preformatted content
        content['content'] = '<pre>' + body_elt.text + '</pre>'
        content['format'] = CONTENT_TYPE.PREFORMATTED
    return content
def test_void_elements_fix(self):
    """Empty non-void elements get explicit close tags; void elements keep the short form."""
    html_raw = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
    expected = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'
    tree = sd_etree.parse_html(html_raw)
    sd_etree.fix_html_void_elements(tree)
    self.assertEqual(expected, sd_etree.to_string(tree))
def _parse_content(self, article):
    """Parse body_html and mapping to fields required for apple news format

    Walks the body paragraphs as a small state machine driven by the
    "The Statement" / "The Verdict" / "The Analysis" / "The References"
    heading markers, filling the article's _statement, _verdict1, _analysis,
    _verdict2 and _references fields.
    :param article:
    """
    statement_regex = re.compile(r'^The Statement$', re.IGNORECASE)
    analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE)
    verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE)
    references_regex = re.compile(r'^The References$', re.IGNORECASE)
    url_regex = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%.]+',
                           re.IGNORECASE)
    abstract = get_text(article.get('abstract'), content='html').strip()
    article['_title'] = abstract
    body_html = article.get('body_html')
    # initialise every derived field so downstream templates always find them
    article['_analysis_first_line'] = ''
    article['_analysis'] = ''
    article['_statement'] = ''
    article['_statement_attribution'] = ''
    article['_verdict1'] = ''
    article['_verdict2'] = ''
    article['_references'] = ''
    article['_revision_history'] = ''

    if article.get(ITEM_STATE) == CONTENT_STATE.KILLED or \
            article.get(ITEM_STATE) == CONTENT_STATE.RECALLED:
        # killed/recalled stories publish a tombstone message in every field
        article['_title'] = 'This article has been removed.'
        article['_analysis_first_line'] = 'This article has been removed.'
        article['_analysis'] = 'This article has been removed.'
        article['_statement'] = 'This article has been removed.'
        article['_statement_attribution'] = 'This article has been removed.'
        article['_verdict1'] = 'This article has been removed.'
        article['_verdict2'] = 'This article has been removed.'
        article['_references'] = 'This article has been removed.'
        self._set_revision_history(article)
        return

    parsed_content = parse_html(body_html, content='html')
    statement_found = False
    analysis_found = False
    analysis_first_line = False
    verdict1_found = False
    verdict2_found = False
    references_found = False
    statement_elements = []
    for top_level_tag in parsed_content.xpath('/html/div/child::*'):
        tag_text = format_text_content(top_level_tag).strip()
        if not tag_text:
            continue

        if not verdict1_found:
            if not statement_found:
                match = statement_regex.search(tag_text)
                if match:
                    statement_found = True
                continue
            else:
                # statement found
                match = verdict_regex.search(tag_text)
                if match:
                    verdict1_found = True
                    # flush collected statement paragraphs: all but the last
                    # become the statement, the last one is the attribution
                    if len(statement_elements) > 1:
                        statement_length = len(statement_elements) - 1
                        for i in range(statement_length):
                            article['_statement'] += get_text(
                                to_string(statement_elements[i],
                                          remove_root_div=False),
                                content='html'
                            ).strip()
                            if statement_length > 1 and i != statement_length - 1:
                                article['_statement'] += '\r\n'
                        article['_statement_attribution'] = get_text(
                            to_string(statement_elements[-1:][0],
                                      remove_root_div=False),
                            content='html'
                        ).strip()
                    elif len(statement_elements) == 1:
                        article['_statement'] = to_string(
                            statement_elements[0],
                            remove_root_div=False
                        )
                    continue
                statement_elements.append(top_level_tag)
                continue

        if verdict1_found and not analysis_found:
            match = analysis_regex.search(tag_text)
            if match:
                analysis_found = True
            else:
                article['_verdict1'] += to_string(top_level_tag,
                                                  remove_root_div=False)
            continue

        if analysis_found and not verdict2_found:
            if not analysis_first_line:
                # remember the first analysis paragraph separately
                article['_analysis_first_line'] = tag_text
                analysis_first_line = True
            match = verdict_regex.search(tag_text)
            if match:
                verdict2_found = True
            else:
                article['_analysis'] += to_string(top_level_tag,
                                                  remove_root_div=False)
            continue

        if verdict2_found and not references_found:
            match = references_regex.search(tag_text)
            if match:
                references_found = True
            else:
                article['_verdict2'] += to_string(top_level_tag,
                                                  remove_root_div=False)
            continue

        if references_found:
            def replacement(match_object):
                # wrap each matched url in an anchor tag
                value = match_object.group(0)
                if value:
                    return '<a href="{0}">{0}</a>'.format(value)
                return ''

            # strip any leading list numbering ("1.", "2)", "3:") from the line
            tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip()
            article['_references'] += '<li>{}</li>'.format(
                re.sub(url_regex, replacement, tag_text)
            )

    if len(article['_references']):
        article['_references'] = '<ol>{}</ol>'.format(article['_references'])

    if not article.get('_statement') and article.get('_statement_attribution'):
        # if statement is not as per the format
        article['_statement'] = article.get('_statement_attribution')
        article['_statement_attribution'] = ''

    self._set_revision_history(article)
def test_void_elements_fix(self):
    """fix_html_void_elements expands empty non-void tags but leaves <br/> alone."""
    html = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
    expected = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'
    parsed = sd_etree.parse_html(html)
    sd_etree.fix_html_void_elements(parsed)
    actual = sd_etree.to_string(parsed)
    self.assertEqual(actual, expected)
def post_process_item(self, item, provider):
    """Apply Medianet press-release defaults: slugline/headline, genre,
    wrapped body text with distribution banner, and place/subject mapping.

    :param item: parsed item, modified in place
    :param provider: ingest provider (unused here)
    :return: the processed item
    """
    # category qcode 'k' marks an Investor Relations release
    InvestorRelease = (len(item.get('anpa_category', []))
                       and item['anpa_category'][0].get('qcode', '').lower() == 'k')
    if InvestorRelease:
        # IRW News Release:
        item['slugline'] = 'IRW News Release'
        item['headline'] = 'IRW News Release: ' + item.get(
            self.ITEM_TAKE_KEY, '')
    else:
        item['slugline'] = 'Media Release'
        item['headline'] = 'Media Release: ' + item.get(
            self.ITEM_TAKE_KEY, '')
    # Truncate the take key if required
    if len(item.get(self.ITEM_TAKE_KEY, '')) > 24:
        item[self.ITEM_TAKE_KEY] = item.get(self.ITEM_TAKE_KEY, '')[0:24]
    genre_map = superdesk.get_resource_service('vocabularies').find_one(
        req=None, _id='genre')
    item['genre'] = [
        x for x in genre_map.get('items', [])
        if x['qcode'] == 'Press Release' and x['is_active']
    ]
    body_html_elem = parse_html(item.get('body_html', '<pre> </pre>'))
    ptag = body_html_elem.find('.//pre')
    if ptag is not None:
        # hard-wrap long lines at 75 chars for downstream delivery
        body = ''
        lines = ptag.text.split('\n')
        for line in lines:
            if len(line) > 75:
                line = textwrap.fill(line, 75)
            body += '{}\n'.format(line)
        ptag.text = body
        # prepend the distribution banner
        if InvestorRelease:
            ptag.text = '{} '.format('Investor Relations news release distributed by AAP Medianet. \r\n\r\n\r\n') \
                + ptag.text
        else:
            ptag.text = '{} '.format(
                'Media release distributed by AAP Medianet. \r\n\r\n\r\n'
            ) + ptag.text
    item['body_html'] = to_string(body_html_elem)
    locator_map = superdesk.get_resource_service('vocabularies').find_one(
        req=None, _id='locators')
    place_strs = item.pop('place').split(' ')
    for place in place_strs:
        if place in self.place_map:
            replace = [
                x for x in locator_map.get('items', [])
                if x['qcode'] == self.place_map.get(place, '').upper()
            ]
            # NOTE(review): `replace` is a list comprehension and is never
            # None, so this condition is always true — likely meant `if replace:`
            if replace is not None:
                item[self.ITEM_PLACE] = replace
        if place in self.subject_map:
            if item.get(self.ITEM_SUBJECT) is None:
                item[self.ITEM_SUBJECT] = []
            item['subject'].append({
                'qcode': self.subject_map.get(place),
                'name': subject_codes[self.subject_map.get(place)]
            })
    return item
def format(self, article, subscriber, codes=None):
    """Format the article as one ANPA-1312 message per category.

    :param article: article to format (a deep copy is used; not mutated)
    :param subscriber: subscriber the output is generated for
    :param codes: optional selector codes prepended to the message
    :return: list of dicts with 'published_seq_num', 'encoded_item' (bytes)
             and 'formatted_item' (str)
    :raises FormatterError.AnpaFormatterError: on any failure
    """
    try:
        docs = []
        formatted_article = deepcopy(article)
        for category in self._get_category_list(formatted_article.get('anpa_category')):
            mapped_source = self._get_mapped_source(formatted_article)
            formatted_article[config.ID_FIELD] = formatted_article.get(
                'item_id', formatted_article.get(config.ID_FIELD))
            pub_seq_num = superdesk.get_resource_service(
                'subscribers').generate_sequence_number(subscriber)
            anpa = []
            if codes:
                anpa.append(b'\x05')
                anpa.append(' '.join(codes).encode('ascii'))
                anpa.append(b'\x0D\x0A')
            # start of message header (SYN SYN SOH)
            anpa.append(b'\x16\x16\x01')
            anpa.append(get_service_level(category, formatted_article).encode('ascii'))
            # story number, zero padded to four digits
            anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))
            # field separator (LF)
            anpa.append(b'\x0A')
            anpa.append(map_priority(formatted_article.get('priority')).encode('ascii'))
            anpa.append(b'\x20')
            anpa.append(category['qcode'].lower().encode('ascii'))
            anpa.append(b'\x13')
            # format identifier: DC2 for preserved text, DC1 otherwise
            if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
                anpa.append(b'\x12')
            else:
                anpa.append(b'\x11')
            anpa.append(b'\x20')
            # keyword, truncated to at most 24 characters
            keyword = 'bc-{}'.format(
                self.append_legal(article=formatted_article, truncate=True)).replace(' ', '-')
            anpa.append(keyword[:24].encode('ascii'))
            anpa.append(b'\x20')
            # version field
            anpa.append(b'\x20')
            # reference field
            anpa.append(b'\x20')
            # filing date as MM-DD
            updated = formatted_article['_updated']
            anpa.append('{}-{}'.format(updated.strftime('%m'),
                                       updated.strftime('%d')).encode('ascii'))
            anpa.append(b'\x20')
            # word count, zero padded to four digits
            anpa.append(str(formatted_article.get('word_count', '0000')).zfill(4).encode('ascii'))
            anpa.append(b'\x0D\x0A')
            anpa.append(b'\x02')  # STX
            self._process_headline(anpa, formatted_article, category['qcode'].encode('ascii'))
            slug = SluglineMapper().map(article=formatted_article,
                                        category=category['qcode'].upper(),
                                        truncate=True).encode('ascii', 'ignore')
            anpa.append(slug)
            take_key = (formatted_article.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
            if take_key:
                anpa.append(b'\x20' + take_key)
            else:
                anpa.append(b'')
            anpa.append(b'\x0D\x0A')
            if formatted_article.get('ednote', '') != '':
                ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote')))
                anpa.append(ednote.encode('ascii', 'replace'))
            if formatted_article.get(BYLINE):
                anpa.append(get_text(formatted_article.get(BYLINE)).encode('ascii', 'replace'))
                anpa.append(b'\x0D\x0A')
            if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                anpa.append(get_text(self.append_body_footer(formatted_article),
                                     content='html').encode('ascii', 'replace'))
            else:
                body = to_ascii(formatted_article.get('body_html', ''))
                # inject the dateline into the first paragraph (not for auto-published items)
                if formatted_article.get('dateline', {}).get('text') \
                        and not article.get('auto_publish', False):
                    body_html_elem = parse_html(formatted_article.get('body_html'))
                    ptag = body_html_elem.find('.//p')
                    if ptag is not None:
                        ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
                        body = to_string(body_html_elem)
                anpa.append(self.get_text_content(body))
                if formatted_article.get('body_footer'):
                    anpa.append(self.get_text_content(
                        to_ascii(formatted_article.get('body_footer', ''))))
            anpa.append(b'\x0D\x0A')
            anpa.append(mapped_source.encode('ascii'))
            sign_off = (formatted_article.get('sign_off', '') or '').encode('ascii')
            if sign_off:
                anpa.append(b'\x20' + sign_off)
            else:
                anpa.append(b'')
            anpa.append(b'\x0D\x0A')
            anpa.append(b'\x03')  # ETX
            # time and date of transmission
            anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))
            anpa.append(b'\x04')  # EOT
            anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')
            encoded = b''.join(anpa)
            docs.append({'published_seq_num': pub_seq_num,
                         'encoded_item': encoded,
                         'formatted_item': encoded.decode('ascii')})
        return docs
    except Exception as ex:
        raise FormatterError.AnpaFormatterError(ex, subscriber)
def format(self, article, subscriber, codes=None):
    """Produce ANPA-1312 output for the article, one document per category.

    :param article: article to format (a deep copy is taken first)
    :param subscriber: target subscriber, used for sequence numbering
    :param codes: optional selector codes to lead the transmission
    :return: list of dicts carrying 'published_seq_num', 'encoded_item'
             (bytes) and 'formatted_item' (ascii str)
    :raises FormatterError.AnpaFormatterError: wraps any underlying error
    """
    try:
        docs = []
        formatted = deepcopy(article)
        for category in self._get_category_list(formatted.get('anpa_category')):
            source = self._get_mapped_source(formatted)
            formatted[config.ID_FIELD] = formatted.get('item_id', formatted.get(config.ID_FIELD))
            seq = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            buf = []
            if codes:
                buf.extend([b'\x05', ' '.join(codes).encode('ascii'), b'\x0D\x0A'])
            # header: SYN SYN SOH, service level, 4-digit story number, LF,
            # priority, category, DC3
            buf.extend([
                b'\x16\x16\x01',
                get_service_level(category, formatted).encode('ascii'),
                str(seq).zfill(4).encode('ascii'),
                b'\x0A',
                map_priority(formatted.get('priority')).encode('ascii'),
                b'\x20',
                category['qcode'].lower().encode('ascii'),
                b'\x13',
            ])
            # format identifier: DC2 for preserved text, DC1 otherwise
            preserved = formatted.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED
            buf.append(b'\x12' if preserved else b'\x11')
            buf.append(b'\x20')
            # keyword (max 24 chars), version, reference, MM-DD filing date, word count
            keyword = 'bc-{}'.format(
                self.append_legal(article=formatted, truncate=True)).replace(' ', '-')[:24]
            buf.extend([
                keyword.encode('ascii'),
                b'\x20',
                b'\x20',  # version field
                b'\x20',  # reference field
                '{}-{}'.format(formatted['_updated'].strftime('%m'),
                               formatted['_updated'].strftime('%d')).encode('ascii'),
                b'\x20',
                str(formatted.get('word_count', '0000')).zfill(4).encode('ascii'),
                b'\x0D\x0A',
                b'\x02',  # STX
            ])
            self._process_headline(buf, formatted, category['qcode'].encode('ascii'))
            buf.append(SluglineMapper().map(article=formatted,
                                            category=category['qcode'].upper(),
                                            truncate=True).encode('ascii', 'ignore'))
            take_key = (formatted.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
            buf.append(b'\x20' + take_key if take_key else b'')
            buf.append(b'\x0D\x0A')
            if formatted.get('ednote', '') != '':
                buf.append('{}\r\n'.format(to_ascii(formatted.get('ednote'))).encode('ascii', 'replace'))
            if formatted.get(BYLINE):
                buf.append(get_text(formatted.get(BYLINE)).encode('ascii', 'replace'))
                buf.append(b'\x0D\x0A')
            if preserved:
                buf.append(get_text(self.append_body_footer(formatted),
                                    content='html').encode('ascii', 'replace'))
            else:
                body = to_ascii(formatted.get('body_html', ''))
                # prefix the dateline onto the first paragraph unless auto-published
                dateline = formatted.get('dateline', {}).get('text')
                if dateline and not article.get('auto_publish', False):
                    tree = parse_html(formatted.get('body_html'))
                    first_p = tree.find('.//p')
                    if first_p is not None:
                        first_p.text = dateline + ' ' + (first_p.text or '')
                        body = to_string(tree)
                buf.append(self.get_text_content(body))
                if formatted.get('body_footer'):
                    buf.append(self.get_text_content(to_ascii(formatted.get('body_footer', ''))))
            buf.append(b'\x0D\x0A')
            buf.append(source.encode('ascii'))
            sign_off = (formatted.get('sign_off', '') or '').encode('ascii')
            buf.append(b'\x20' + sign_off if sign_off else b'')
            buf.extend([
                b'\x0D\x0A',
                b'\x03',  # ETX
                # transmission timestamp
                datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'),
                b'\x04',  # EOT
                b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A',
            ])
            payload = b''.join(buf)
            docs.append({'published_seq_num': seq,
                         'encoded_item': payload,
                         'formatted_item': payload.decode('ascii')})
        return docs
    except Exception as ex:
        raise FormatterError.AnpaFormatterError(ex, subscriber)