def test_encode_carriage_return(self):
    """Carriage returns are encoded as spaces when markup is round-tripped."""
    samples = [
        ('This is first line.\r\nThis is second line.\r\n', {}),
        ('<pre>This is first line.\r\nThis is second line.\r\n</pre>', {'content': 'html'}),
    ]
    for markup, parse_kwargs in samples:
        parsed = sd_etree.parse_html(markup, **parse_kwargs)
        expected = markup.replace('\r', ' ')
        self.assertEqual(expected, sd_etree.to_string(parsed))
def test_encode_carriage_return(self):
    """Each \r should come back as a plain space after parse/serialise."""
    plain = 'This is first line.\r\nThis is second line.\r\n'
    tree = sd_etree.parse_html(plain)
    self.assertEqual(plain.replace('\r', ' '), sd_etree.to_string(tree))

    wrapped = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
    tree = sd_etree.parse_html(wrapped, content='html')
    self.assertEqual(wrapped.replace('\r', ' '), sd_etree.to_string(tree))
def test_encode_carriage_return(self):
    """Parsing then serialising should turn every \r into a space."""
    for markup, parse_kwargs in (
        ("This is first line.\r\nThis is second line.\r\n", {}),
        ("<pre>This is first line.\r\nThis is second line.\r\n</pre>", {"content": "html"}),
    ):
        parsed = sd_etree.parse_html(markup, **parse_kwargs)
        self.assertEqual(markup.replace("\r", " "), sd_etree.to_string(parsed))
def test_encode_carriage_return(self):
    """CRs become spaces; html-mode parsing adds the html/body wrapper."""
    text = 'This is first line.\r\nThis is second line.\r\n'
    self.assertEqual(text.replace('\r', ' '), to_string(parse_html(text)))

    text = '<pre>This is first line.\r\nThis is second line.\r\n</pre>'
    parsed = parse_html(text, content='html')
    expected = '<html><body>{}</body></html>'.format(text.replace('\r', ' '))
    self.assertEqual(expected, to_string(parsed))
def _format_body(self, formatted_article, main_news_component):
    """ Create an body text NewsComponent element
    :param dict formatted_article:
    :param Element main_news_component:
    """
    # ContentItem wrapper declaring XHTML text content.
    content_item = SubElement(main_news_component, "ContentItem", attrib={'Duid': 'CI00001'})
    SubElement(content_item, 'MediaType', {'FormalName': 'Text'})
    SubElement(content_item, 'Format', {'FormalName': 'XHTML'})
    data_content = SubElement(content_item, 'DataContent')
    html = SubElement(data_content, 'html',
                      attrib={'xmlns': 'http://www.w3.org/1999/xhtml', 'lang': 'en',
                              XML_LANG: formatted_article.get('language', 'en')})
    head = SubElement(html, 'head')
    SubElement(head, 'title')
    # Title has been removed to match the existing feed
    # title.text = formatted_article.get('headline', '')
    body = SubElement(html, 'body')
    if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
        # Preserved content is already markup; embed it verbatim.
        body.append(etree.fromstring(formatted_article.get('body_html')))
    else:
        if formatted_article.get('byline'):
            body.append(etree.fromstring('<p>' + formatted_article.get('byline', '') + '</p>'))
        root = sd_etree.parse_html(self.append_body_footer(formatted_article), content='html')
        if formatted_article.get('dateline', {}).get('text') and not formatted_article.get('auto_publish', False):
            # Prefix the dateline text onto the first paragraph, if any.
            ptag = root.find('.//p')
            if ptag is not None:
                ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
        body_html = etree.tostring(root, encoding="unicode")
        # Normalise paragraph and break markup to a common sentinel ...
        body_html = body_html.replace('<p>', '__##br##__')
        body_html = body_html.replace('</p>', '__##br##__')
        body_html = body_html.replace('<br/>', '__##br##__')
        root = sd_etree.parse_html(body_html, content='html')
        body_html = etree.tostring(root, encoding="unicode", method="text")
        body_html = body_html.replace('\n', '__##br##__')
        # ... then rebuild one escaped <p> per non-empty segment.
        list_paragraph = body_html.split('__##br##__')
        for p in list_paragraph:
            if p and p.strip():
                body.append(etree.fromstring('<p>' + escape(p) + '</p>'))
    # Sign-off paragraph: "<source> <sign off>".
    if SIGN_OFF in formatted_article:
        body.append(etree.fromstring(
            '<p>' + formatted_article.get('source', '') + ' ' + formatted_article.get(SIGN_OFF, '') + '</p>'))
def map_html_to_xml(self, element, html):
    """Populate *element* with <p>/<br> structure extracted from *html*.

    :param etree.Element element: The xml element to populate
    :param str html: the html to parse the text from
    :return:
    """
    root = parse_html(html, content="html")
    paragraphs = root.xpath("//p")
    breaks = root.xpath("//br")

    # No paragraphs at all but some <br>s: collect them under one <p>.
    if not paragraphs and breaks:
        holder = etree.SubElement(element, "p")
        for br in breaks:
            etree.SubElement(holder, "br").text = br.text

    # One output <p> per source paragraph, carrying any nested breaks.
    for source_p in paragraphs:
        target_p = etree.SubElement(element, "p")
        for br in source_p.xpath(".//br"):
            etree.SubElement(target_p, "br").text = br.text
        target_p.text = etree.tostring(source_p, encoding="unicode", method="text")

    # Neither <p> nor <br> produced anything: fall back to the plain text.
    if len(list(element)) == 0:
        etree.SubElement(element, "p").text = etree.tostring(root, encoding="unicode", method="text")
def _yonhap_derive_dateline(item, **kwargs):
    """Derive a dateline from the article body.

    Most locations injected into the item by the parser are Bangalore, so this
    looks for a '(Yonhap) --' dateline in the first populated paragraph of the
    body and uses that city instead.

    :param dict item: item being processed
    :return: the (possibly modified) item, or None when the city looks bogus
        or an error occurred
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = sd_etree.parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Yonhap) -- ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    # a digit in the "city" means we mis-split; bail out
                    if any(char.isdigit() for char in city):
                        return
                    set_dateline(item, city, 'Yonhap')
                    break
        return item
    except Exception:
        # Fixed: was a bare ``except:`` which also trapped SystemExit and
        # KeyboardInterrupt.
        logging.exception('Yonhap dateline macro exception')
def reuters_derive_dateline(item, **kwargs):
    """Derive the dateline from the article body.

    Most locations injected into the item by the parser are Bangalore, so this
    looks for a '(Reuters) -' dateline in the body text and uses that city,
    unless a non-Bangalore/Bengaluru dateline is already set.

    :param dict item: item being processed
    :return: the (possibly modified) item, or None when the city looks bogus,
        a foreign dateline already exists, or an error occurred
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    # there is already a dateline that is not Bangalore/BENGALURU don't do anything just return
                    if 'located' in (item.get('dateline') or {}) and \
                            item['dateline']['located'].get('city').upper() not in ['BANGALORE', 'BENGALURU']:
                        return
                    set_dateline(item, city, 'Reuters')
                    break
        return item
    except Exception:
        # Fixed: was a bare ``except:`` which also trapped SystemExit and
        # KeyboardInterrupt.
        logging.exception('Reuters dateline macro exception')
def racing_reformat_macro(item, **kwargs):
    """Convert pre-tagged (preserved) content into paragraph HTML.

    :param item: article being processed
    :param kwargs: unused, macro signature requirement
    :return: the modified item, or None when there is nothing to convert
    """
    # Only preserved-format stories need reformatting.
    if item[FORMAT] != FORMATS.PRESERVED:
        return
    if 'body_html' not in item:
        return None

    tree = sd_etree.parse_html(item['body_html'], content='html')
    text = etree.tostring(tree, encoding="unicode", method="text")

    # Each original line becomes its own paragraph.
    chunks = text.replace('\n', '__##br##__').split('__##br##__')
    item['body_html'] = ''.join('<p>' + chunk + '</p>' for chunk in chunks if chunk and chunk.strip())

    # The story is plain HTML from here on.
    item[FORMAT] = FORMATS.HTML
    return item
def append_body_footer(self, article):
    """Return the article body with any public service announcements appended.

    :return: body with public service announcements
    """
    if 'body_html' in article:
        article['body_html'] = article['body_html'].replace('<br>', '<br/>')

    item_type = article[ITEM_TYPE]
    if item_type in (CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED):
        body = article.get('body_html', '')
    elif item_type in (CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO):
        body = article.get('description', '')
    else:
        body = ''

    preserved = article.get(FORMAT, '') == FORMATS.PRESERVED
    if body and preserved:
        # Preserved copy keeps hard line breaks: turn <br> into \r\n.
        body = body.replace('\n', '\r\n').replace('\r\r', '\r')
        parsed = parse_html(body, content='html')
        for br in parsed.xpath('//br'):
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(parsed, 'br', with_tail=False)
        body = etree.tostring(parsed, encoding="unicode")

    footer = article.get('body_footer')
    if body and footer:
        if preserved:
            body = '{}\r\n{}'.format(body, get_text(footer))
        else:
            body = '{}{}'.format(body, footer)
    return body
def body_hook(self, item, html):
    """Copy content to body_html

    if img are found in the content, they are uploaded.
    First image is used as feature media, then there are embeds
    """
    if "img" in html:
        content = sd_etree.parse_html(html, 'html')
        for img in content.xpath('//img'):
            src = img.get('src')
            try:
                # uploads the image and associates it with the item
                key, media_data = self._add_image(item, src)
            except Exception as e:
                # best effort: a failed upload drops the image from the body
                logger.error(e)
                img.getparent().remove(img)
                continue
            _id = media_data['_id']
            url = url_for_media(_id)
            img.set("src", url)
            if key == 'featuremedia':
                # no need to embed the image for featuremedia
                continue
            # surround embedded images with START/END marker comments
            embed_start = etree.Comment(embed_TPL.format('START', key))
            embed_end = etree.Comment(embed_TPL.format('END', key))
            img.addprevious(embed_start)
            img.addnext(embed_end)
        html = etree.tostring(content, encoding="unicode")
    item['body_html'] = html
def _format_body_content(self, article, body_content):
    """Assemble the NITF body.content text and attach it as a <pre> element.

    Concatenates (in order): ednote, byline, the body text with the dateline
    injected into the first paragraph, the body footer, and the sign-off line.

    :param dict article: article being formatted
    :param Element body_content: NITF body.content element to append to
    """
    nitf_body = []
    if article.get('ednote'):
        nitf_body.append(to_ascii(self._format_line(article.get('ednote'))))
    if article.get(BYLINE):
        nitf_body.append(to_ascii(self._format_line(get_text(article.get(BYLINE)))))
    if article.get(FORMAT) == FORMATS.PRESERVED:
        # preserved copy is flattened to text as-is
        nitf_body.append(to_ascii(get_text(self.append_body_footer(article), content='html')))
    else:
        body = article.get('body_html', '')
        # we need to inject the dateline
        if article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
            body_html_elem = parse_html(article.get('body_html'))
            ptag = body_html_elem.find('.//p')
            if ptag is not None:
                ptag.text = article['dateline']['text'] + ' ' + (ptag.text or '')
                body = to_string(body_html_elem)
        nitf_body.append(self.get_text_content(body))
        if article.get('body_footer'):
            nitf_body.append(self.get_text_content(article.get('body_footer', '')))
    # sign-off is "<source> <sign_off>", skipped when both are empty
    sign_off = '{} {}'.format(article.get('source') or '', (article.get('sign_off') or '')).strip()
    if sign_off:
        nitf_body.append(to_ascii(self._format_line(sign_off)))
    SubElement(body_content, 'pre').text = ''.join(nitf_body)
def get_text(markup, content="xml", lf_on_block=False, space_on_elements=False, space=" "):
    """Get plain text version of (X)HTML or other XML element

    if the markup can't be parsed, it will be returned unchanged

    :param str markup: string to convert to plain text
    :param str content: 'xml' or 'html', as in parse_html
    :param bool lf_on_block: if True, add a line feed on block elements' tail
    :param bool space_on_elements: if True, add a space on each element's tail
        mainly used to count words with non HTML markup
    :param str space: space string which is used when `space_on_elements` is enabled
    :return str: plain text version of markup
    """
    try:
        parsed = sd_etree.parse_html(
            markup,
            content=content,
            lf_on_block=lf_on_block,
            space_on_elements=space_on_elements,
            space=space,
        )
        return etree.tostring(parsed, encoding="unicode", method="text")
    except etree.ParseError:
        # unparseable markup is handed back untouched
        return markup
def get_wrapped_text_content(self, content):
    """Get a version of the body text that is wrapped

    :param content:
    :return:
    """
    markup = content.replace('<br>', '<br/>').replace('</br>', '')
    # remove control chars except \r and \n
    markup = re.sub('[\x00-\x09\x0b\x0c\x0f-\x1f]', '', markup)
    # Special case x0e denotes a line break
    markup = re.sub('\x0e', '\r\n', markup)
    # remove runs of spaces and stray line feeds
    markup = re.sub(r'(?<!\r)\n+', ' ', markup).strip()
    markup = re.sub(r' +', ' ', markup)

    parsed = parse_html(markup, content='html')
    # breaks become hard CRLFs
    for br in parsed.xpath('//br'):
        br.tail = '\r\n' + br.tail if br.tail else '\r\n'
    etree.strip_elements(parsed, 'br', with_tail=False)

    pieces = []
    for node in parsed.xpath('/html/div/child::*'):
        node_text = ''.join(node.itertext())
        pieces.append(self.format_wrapped_text_content(node_text))
    return ''.join(pieces)
def append_body_footer(self, article):
    """Return the article body, appending public service announcements if any.

    :return: body with public service announcements
    """
    try:
        article['body_html'] = article['body_html'].replace('<br>', '<br/>')
    except KeyError:
        pass

    body = ''
    kind = article[ITEM_TYPE]
    if kind in {CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED}:
        body = article.get('body_html', '')
    elif kind in {CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO}:
        body = article.get('description', '')

    if body and article.get(FORMAT, '') == FORMATS.PRESERVED:
        # keep explicit line breaks for preserved copy
        body = body.replace('\n', '\r\n').replace('\r\r', '\r')
        tree = parse_html(body, content='html')
        for br in tree.xpath('//br'):
            br.tail = '\r\n' + br.tail if br.tail else '\r\n'
        etree.strip_elements(tree, 'br', with_tail=False)
        body = etree.tostring(tree, encoding="unicode")

    if body and article.get('body_footer'):
        footer = article.get('body_footer')
        if article.get(FORMAT, '') == FORMATS.PRESERVED:
            body = '{}\r\n{}'.format(body, get_text(footer))
        else:
            body = '{}{}'.format(body, footer)
    return body
def map_html_to_xml(self, element, html):
    """Map the html text tags to xml

    :param etree.Element element: The xml element to populate
    :param str html: the html to parse the text from
    :return:
    """
    root = parse_html(html, content='html')

    # When there are no <p> tags but there are <br> tags, gather the
    # breaks under a single paragraph.
    if not root.xpath('//p') and root.xpath('//br'):
        wrapper = etree.SubElement(element, 'p')
        for br in root.xpath('//br'):
            etree.SubElement(wrapper, 'br').text = br.text

    # One output <p> per source paragraph, preserving nested breaks.
    for src in root.xpath('//p'):
        dest = etree.SubElement(element, 'p')
        for br in src.xpath('.//br'):
            etree.SubElement(dest, 'br').text = br.text
        dest.text = etree.tostring(src, encoding="unicode", method="text")

    # Neither paragraphs nor breaks: dump the whole text in one <p>.
    if len(list(element)) == 0:
        etree.SubElement(element, 'p').text = etree.tostring(root, encoding="unicode", method="text")
def append_body_footer(self, article):
    """Return the article body with any public service announcements appended.

    :return: body with public service announcements
    """
    if "body_html" in article:
        article["body_html"] = article["body_html"].replace("<br>", "<br/>")

    content_type = article[ITEM_TYPE]
    body = ""
    if content_type in (CONTENT_TYPE.TEXT, CONTENT_TYPE.PREFORMATTED):
        body = article.get("body_html", "")
    elif content_type in (CONTENT_TYPE.AUDIO, CONTENT_TYPE.PICTURE, CONTENT_TYPE.VIDEO):
        body = article.get("description", "")

    is_preserved = article.get(FORMAT, "") == FORMATS.PRESERVED
    if body and is_preserved:
        # preserved copy keeps its hard line breaks
        body = body.replace("\n", "\r\n").replace("\r\r", "\r")
        parsed = parse_html(body, content="html")
        for br in parsed.xpath("//br"):
            br.tail = "\r\n" + br.tail if br.tail else "\r\n"
        etree.strip_elements(parsed, "br", with_tail=False)
        body = etree.tostring(parsed, encoding="unicode")

    if body and article.get("body_footer"):
        footer = article.get("body_footer")
        if is_preserved:
            body = "{}\r\n{}".format(body, get_text(footer))
        else:
            body = "{}{}".format(body, footer)
    return body
def _inject_dateline(self, formatted_article):
    """Inject dateline in article's body_html"""
    body_tree = sd_etree.parse_html(formatted_article.get('body_html', '<p> </p>'))
    first_par = body_tree.find('.//p')
    if first_par is not None:
        dateline_text = formatted_article['dateline']['text']
        first_par.text = dateline_text + ' ' + (first_par.text or '')
        formatted_article['body_html'] = sd_etree.to_string(body_tree)
def racing_reformat_macro(item, **kwargs):
    """Convert preserved (pre-tagged) content into paragraph HTML.

    :param item: article being processed
    :param kwargs: unused, required by the macro signature
    :return: the modified item, or None when there is no body
    """
    if item[FORMAT] != FORMATS.PRESERVED:
        # Only preserved content needs converting.
        return
    if 'body_html' not in item:
        return None

    parsed = sd_etree.parse_html(item['body_html'], content='html')
    plain = etree.tostring(parsed, encoding="unicode", method="text")

    # New lines delimit paragraphs.
    paragraphs = plain.replace('\n', '__##br##__').split('__##br##__')
    item['body_html'] = ''.join('<p>' + part + '</p>' for part in paragraphs if part and part.strip())

    item[FORMAT] = FORMATS.HTML
    return item
def get_first_paragraph_text(input_string):
    """Return text of the first non-empty paragraph; None when parsing fails."""
    try:
        elem = parse_html(input_string, content='html')
    except ValueError as e:
        logger.warning(e)
        return None
    # all non-empty paragraphs: ignores <p><br></p> sections
    return get_text_from_elem(elem) or get_text_from_elem(elem, tag=None)
def extract_kill_reason_from_html(html, is_kill):
    """Extract the reason from html for a kill/takedown

    Iterates over the xml nodes and find the node that contains the reason prefix.
    Once the reason prefix has been found add the proceeding nodes to our reason tree,
    until the kill/takedown suffix has been found.

    :param html: source markup to scan
    :param is_kill: True selects the kill suffix, False the takedown suffix
    :return: the reason markup, or the original html when no reason was found
        or an error occurred
    """
    try:
        # Create a new tree that we will use to construct the reason nodes
        root = etree.Element('div')
        # A flag to indicate if we're to add the current child node to our reason tree
        adding_nodes = False
        for child in parse_html(html, content='html'):
            # Obtain the text from our child nodes (including sub-child nodes)
            child_text = ''.join(child.itertext())
            if not adding_nodes and REASON_PREFIX in child_text:
                # This child node contains the reason prefix (and we haven't found it already)
                # Therefor set the flag to True indicating that the following child nodes
                # are to be added to our reason tree
                adding_nodes = True
                continue
            elif adding_nodes:
                # If the kill/takedown suffix has been found, then our reason tree is complete
                if is_kill and KILL_SUFFIX in child_text:
                    break
                elif not is_kill and TAKEDOWN_SUFFIX in child_text:
                    break
                # Otherwise continue adding the child nodes to our reason tree
                # Remove the last sub-child if it only contains a line break
                if len(child) > 0:
                    last_child = child[-1]
                    if etree.tostring(last_child) == b'<p><br/></p>':
                        child.remove(last_child)
                # Then add this child node to our reason tree
                root.append(child)
        num_children = len(list(root))
        # If the reason tree was not populated, then return the original html provided
        if num_children == 0:
            return html
        # Our reason tree was populated, convert the tree to a string and return it
        return to_string(root, method='html', remove_root_div=num_children == 1)
    except Exception as e:
        logger.exception(e)
        return html
def _inject_dateline(self, formatted_article):
    """Inject dateline in article's body_html"""
    tree = sd_etree.parse_html(formatted_article.get("body_html", "<p> </p>"))
    paragraph = tree.find(".//p")
    if paragraph is not None:
        prefix = formatted_article["dateline"]["text"]
        paragraph.text = prefix + " " + (paragraph.text or "")
        formatted_article["body_html"] = sd_etree.to_string(tree)
def clean_html(body_html):
    """Make sure the html will parse, injecting \\r\\n to keep SMTP lines short.

    :param body_html:
    :return: parsed and re-written html
    """
    parsed = sd_etree.parse_html(body_html, content='html', lf_on_block=True)
    pretty = sd_etree.to_string(parsed, method='html', pretty_print=True)
    return pretty.replace('>\n', '>\r\n')
def extract_html_macro(item, **kwargs):
    """ Delete from body_html all html tags except links

    :param item: article being processed
    :return: the modified item, or None when there is no body
    """
    if 'body_html' not in item:
        return None
    root = sd_etree.parse_html(item['body_html'], content='html')
    links = {}
    count = 0
    # extract all links and add them to a dictionary with a unique
    # generated key for every link
    for a in root.findall('.//a'):
        links['__##link' + str(count) + '##__'] = etree.tostring(
            a, encoding="unicode")
        count = count + 1
    # replace all text links with the generated keys
    # regenerate html back from root in order to avoid issues
    # on link replacements where are used text links generated from root
    body_html = etree.tostring(root, encoding="unicode")
    for link in links:
        body_html = body_html.replace(links[link], link)
    # normalise paragraph/break markup to a common sentinel
    body_html = body_html.replace('<p>', '__##br##__')
    body_html = body_html.replace('</p>', '__##br##__')
    body_html = body_html.replace('<br/>', '__##br##__')
    # extract text from the html that don't contains any link,
    # it just contains link keys that are not affected by text extraction
    # because they are already text
    root = sd_etree.parse_html(body_html, content='html')
    body_html = etree.tostring(root, encoding="unicode", method="text")
    # in extracted text replace the link keys with links
    for link in links:
        body_html = body_html.replace(link, links[link])
    body_html = body_html.replace('\n', '__##br##__')
    # rebuild one <p> per non-empty segment
    list_paragraph = body_html.split('__##br##__')
    item['body_html'] = ''.join('<p>' + p + '</p>' for p in list_paragraph if p and p.strip())
    return item
def get_text_content(self, content):
    """Flatten *content* to the single-line plain text needed for the ticker."""
    # It's only a one line ticker so new line and carriage return become spaces
    flattened = re.sub('[\n]', ' ', content)
    flattened = re.sub('[\r]', ' ', flattened)
    # remove control chars as these will upset the ticker
    flattened = re.sub(r'[\x00-\x1f]', '', flattened)
    if not flattened:
        return ''
    tree = parse_html(flattened, content='html')
    return etree.tostring(tree, encoding="unicode", method="text")
def get_par_count(html):
    """Count the paragraphs in *html* that contain non-blank text."""
    try:
        tree = sd_etree.parse_html(html, content='html')
    except ValueError as e:
        logger.warning(e)
    else:
        # only paragraphs whose own text is non-blank are counted
        return sum(1 for p in tree.iterfind('.//p') if p.text and p.text.strip())
    logger.warning('Failed to determine paragraph count from html: {}.'.format(html))
    return 0
def body_hook(self, item, html): """Copy content to body_html if img are found in the content, they are uploaded. First image is used as feature media, then there are embeds """ # we need to convert CRLF to <p> # cf. SDTS-22 html = html.replace(" ", "\r") splitted = html.split("\r\n") if len(splitted) == 1 and "<p>" not in html: splitted = html.split("\n") if len(splitted) > 1: html = "".join([ "<p>{}</p>".format(s) if not is_block_elem(s) else s for s in splitted if s.strip() ]) if "img" in html: content = sd_etree.parse_html(html, "html") for img in content.xpath("//img"): try: src = self.check_url(img.get("src")) except ValueError: logger.warning("Can't fetch image: {elt}".format( elt=sd_etree.to_string(img))) continue try: key, media_data = self._add_image(item, src) except Exception as e: logger.error(e) img.getparent().remove(img) continue url = media_data["renditions"]["original"]["href"] img.set("src", url) if key == "featuremedia": # no need to embed the image for featuremedia continue embed_start = etree.Comment(embed_TPL.format("START", key)) embed_end = etree.Comment(embed_TPL.format("END", key)) img.addprevious(embed_start) img.addnext(embed_end) content = sd_etree.fix_html_void_elements(content) html = sd_etree.to_string(content, encoding="unicode", method="xml") html = remove_shortcodes(html) item["body_html"] = html
def first_paragraph_filter(input_string):
    """Return the markup of the first paragraph with text; '' if none found."""
    try:
        tree = parse_html(input_string, content='html')
    except ValueError as e:
        logger.warning(e)
    else:
        # all non-empty paragraphs: ignores <p><br></p> sections
        for par in tree.iterfind('.//p'):
            if par.text:
                return etree.tostring(par, encoding="unicode")
    logger.warning('Failed to locate the first paragraph from input_string: {}.'.format(input_string))
    return ''
def extract_html_macro(item, **kwargs):
    """Delete from body_html all html tags except links

    :param item: article being processed
    :return: the modified item, or None when there is no body
    """
    if 'body_html' not in item:
        return None
    root = sd_etree.parse_html(item['body_html'], content='html')
    links = {}
    count = 0
    # extract all links and add them to a dictionary with a unique
    # generated key for every link
    for a in root.findall('.//a'):
        links['__##link' + str(count) + '##__'] = etree.tostring(a, encoding="unicode")
        count = count + 1
    # replace all text links with the generated keys
    # regenerate html back from root in order to avoid issues
    # on link replacements where are used text links generated from root
    body_html = etree.tostring(root, encoding="unicode")
    for link in links:
        body_html = body_html.replace(links[link], link)
    # normalise paragraph/break markup to a common sentinel
    body_html = body_html.replace('<p>', '__##br##__')
    body_html = body_html.replace('</p>', '__##br##__')
    body_html = body_html.replace('<br/>', '__##br##__')
    # extract text from the html that don't contains any link,
    # it just contains link keys that are not affected by text extraction
    # because they are already text
    root = sd_etree.parse_html(body_html, content='html')
    body_html = etree.tostring(root, encoding="unicode", method="text")
    # in extracted text replace the link keys with links
    for link in links:
        body_html = body_html.replace(link, links[link])
    body_html = body_html.replace('\n', '__##br##__')
    # rebuild one <p> per non-empty segment
    list_paragraph = body_html.split('__##br##__')
    item['body_html'] = ''.join('<p>' + p + '</p>' for p in list_paragraph if p and p.strip())
    return item
def reuters_derive_dateline(item, **kwargs):
    """Derive the dateline from the article body.

    Most locations injected into the item by the parser are Bangalore, so this
    looks for a '(Reuters) -' dateline in the body text and uses that city,
    unless a non-Bangalore dateline is already set.

    :param dict item: item being processed
    :return: the (possibly modified) item, or None when the city looks bogus,
        a foreign dateline already exists, or an error occurred
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            for par in pars:
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item['dateline']['located'].get('city').upper():
                        return
                    item['dateline']['located'] = located[0] if len(located) == 1 else {
                        'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'],
                        get_date(item['firstcreated']),
                        source=item.get('original_source', 'Reuters'))
                    break
        return item
    except Exception:
        # Fixed: was a bare ``except:`` which also trapped SystemExit and
        # KeyboardInterrupt.
        logging.exception('Reuters dateline macro exception')
def _format_abstract(self, article, main_news_component):
    """ Create an abstract NewsComponent element

    :param dict article:
    :param Element main_news_component:
    """
    component = SubElement(main_news_component, "NewsComponent")
    SubElement(component, 'Role', {'FormalName': 'Abstract'})
    content_item = SubElement(component, "ContentItem")
    SubElement(content_item, 'MediaType', {'FormalName': 'Text'})
    SubElement(content_item, 'Format', {'FormalName': 'Text'})
    # plain-text rendering of the abstract markup
    abstract_tree = parse_html(article.get('abstract', ''))
    abstract_text = etree.tostring(abstract_tree, encoding="unicode", method="text")
    SubElement(content_item, 'DataContent').text = abstract_text
def body_hook(self, item, html): """Copy content to body_html if img are found in the content, they are uploaded. First image is used as feature media, then there are embeds """ # we need to convert CRLF to <p> # cf. SDTS-22 html = html.replace(' ', '\r') splitted = html.split('\r\n') if len(splitted) == 1 and '<p>' not in html: splitted = html.split('\n') if len(splitted) > 1: html = ''.join([ '<p>{}</p>'.format(s) if not is_block_elem(s) else s for s in splitted if s.strip() ]) if "img" in html: content = sd_etree.parse_html(html, 'html') for img in content.xpath('//img'): try: src = self.check_url(img.get('src')) except ValueError: logger.warning("Can't fetch image: {elt}".format( elt=sd_etree.to_string(img))) continue try: key, media_data = self._add_image(item, src) except Exception as e: logger.error(e) img.getparent().remove(img) continue url = media_data['renditions']['original']['href'] img.set("src", url) if key == 'featuremedia': # no need to embed the image for featuremedia continue embed_start = etree.Comment(embed_TPL.format('START', key)) embed_end = etree.Comment(embed_TPL.format('END', key)) img.addprevious(embed_start) img.addnext(embed_end) content = sd_etree.fix_html_void_elements(content) html = sd_etree.to_string(content, encoding="unicode", method='xml') item['body_html'] = html
def first_paragraph_filter(input_string):
    """Return the markup of the first paragraph containing text, else ''."""
    try:
        parsed = parse_html(input_string, content='html')
    except ValueError as exc:
        logger.warning(exc)
    else:
        # all non-empty paragraphs: ignores <p><br></p> sections
        for paragraph in parsed.iterfind('.//p'):
            if paragraph.text:
                return etree.tostring(paragraph, encoding="unicode")
    logger.warning(
        'Failed to locate the first paragraph from input_string: {}.'.format(
            input_string))
    return ''
def remove_breaks(item, **kwargs):
    """Replace <br> elements in body_html with single spaces.

    :param dict item: item being processed
    :return: the modified item
    :raises Exception: re-raises whatever went wrong after logging it
    """
    try:
        html = item.get('body_html')
        if html:
            # normalise malformed break tags before parsing
            html = html.replace('<br>', '<br/>').replace('</br>', ' ')
            parsed = parse_html(html, content='xml')
            for br in parsed.xpath('//br'):
                br.tail = ' ' + br.tail if br.tail else ' '
            etree.strip_elements(parsed, 'br', with_tail=False)
            item['body_html'] = to_string(parsed)
        return item
    except Exception:
        # Fixed: logging.exception was called with a stray positional arg and
        # no placeholder, which itself triggered a formatting error inside
        # logging; bare ``raise`` also preserves the original traceback.
        logging.exception('Exception in remove breaks macro')
        raise
def parse(self, xml, provider=None):
    """Parse a NewsML-2 itemSet into a list of superdesk items.

    :param xml: root element of the NewsML document
    :param provider: ingest provider (used only for error reporting)
    :return: list of parsed items
    :raises ParserError: wraps any exception raised while parsing
    """
    self.root = xml
    items = []
    try:
        for item_set in xml.findall(self.qname('itemSet')):
            for item_tree in item_set:
                # Ignore the packageItem, it has no guid
                if 'guid' in item_tree.attrib:
                    item = self.parse_item(item_tree)
                    # fixed feed-wide defaults for this provider
                    item['priority'] = 6
                    item['anpa_category'] = [{'qcode': 'f'}]
                    item['subject'] = [{'qcode': '04000000', 'name': subject_codes['04000000']}]
                    item.setdefault('word_count', get_word_count(item['body_html']))
                    # Hard code the urgency
                    item['urgency'] = 3
                    # Dateline is always Wellington in NZ
                    located = [c for c in app.locators.find_cities(country_code='NZ', state_code='NZ.G2')
                               if c.get('city', '').lower() == 'wellington']
                    if len(located) == 1:
                        item['dateline'] = dict()
                        item['dateline']['located'] = located[0]
                    if item.get('body_html') and item['dateline']:
                        parsed = parse_html(item.get('body_html'), content='xml')
                        pars = parsed.xpath('//p')
                        for par in pars:
                            if not par.text:
                                continue
                            # check the first par for a byline
                            if pars.index(par) == 0 and par.text.startswith('By '):
                                item['byline'] = par.text.replace('By ', '')
                                par.getparent().remove(par)
                            # a '(BusinessDesk) - ' marker carries the story date
                            date, source, the_rest = par.text.partition(' (BusinessDesk) - ')
                            if source:
                                item['dateline']['date'] = date_parser(date, fuzzy=True)
                                par.text = the_rest
                            # remove the signoff if in the last par
                            if par.text == '(BusinessDesk)' and pars.index(par) + 1 == len(pars):
                                par.getparent().remove(par)
                        item['body_html'] = to_string(parsed, remove_root_div=True)
                    # map the item to the NZ locator from the vocabulary
                    locator_map = superdesk.get_resource_service('vocabularies').find_one(req=None, _id='locators')
                    if locator_map:
                        item['place'] = [x for x in locator_map.get('items', []) if x['qcode'].upper() == 'NZ']
                    items.append(item)
        return items
    except Exception as ex:
        raise ParserError.newsmlTwoParserError(ex, provider)
def map_html_to_xml(self, element, html):
    """Map the html text tags to xml

    :param element: The xml element to populate
    :param html: the html to parse the text from
    :return:
    """
    cleaned = html.replace('<br>', '<br/>').replace('</br>', '')
    # strip control characters (\n and \r are kept for the next steps)
    cleaned = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', cleaned)
    cleaned = cleaned.replace('\n', ' ')
    cleaned = re.sub(r'\s\s+', ' ', cleaned)

    parsed = parse_html(cleaned, content='html')
    # one ascii-safe <p> per top-level child of the parsed wrapper div
    for child in parsed.xpath('/html/div/child::*'):
        paragraph = etree.Element('p')
        paragraph.text = to_ascii(get_text(to_string(child, method='html'), content='html'))
        element.append(paragraph)
def reuters_derive_dateline(item, **kwargs):
    """Derive the dateline from the article body.

    Most locations injected into the item by the parser are Bangalore, so this
    looks for a '(Reuters) -' dateline in the first body paragraph (skipping
    the byline paragraph when present) and uses that city instead.

    :param dict item: item being processed
    :return: the (possibly modified) item, or None when the city looks bogus,
        a foreign dateline already exists, or an error occurred
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='xml')
            pars = parsed.xpath('//p')
            if len(pars) >= 2:
                # skip the first paragraph when it carries the byline
                if BYLINE in item and item.get(BYLINE) in ''.join(pars[0].itertext()):
                    first = ''.join(pars[1].itertext())
                else:
                    first = ''.join(pars[0].itertext())
                city, source, the_rest = first.partition(' (Reuters) - ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    # if not dateline we create one
                    if 'dateline' not in item:
                        item['dateline'] = {}
                    # there is already a dateline that is not Bangalore don't do anything just return
                    elif 'located' in item['dateline'] and 'BANGALORE' != item['dateline']['located'].get(
                            'city').upper():
                        return
                    item['dateline']['located'] = located[0] if len(located) > 0 else {
                        'city_code': city, 'city': city, 'tz': 'UTC', 'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'Reuters')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(
                        item['dateline']['located'],
                        get_date(item['firstcreated']),
                        source=item.get('original_source', 'Reuters'))
        return item
    except Exception:
        # Fixed: was a bare ``except:`` which also trapped SystemExit and
        # KeyboardInterrupt.
        logging.exception('Reuters dateline macro exception')
def map_html_to_xml(self, element, html):
    """Append one <p> child to ``element`` for each direct child of <body>.

    :param element: The xml element to populate
    :param html: the html to parse the text from
    :return:
    """
    sanitized = html.replace('<br>', '<br/>').replace('</br>', '')
    sanitized = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', sanitized)
    sanitized = re.sub(r'\s\s+', ' ', sanitized.replace('\n', ' '))
    tree = parse_html(sanitized, content='html')
    for node in tree.xpath('//*'):
        parent = node.getparent()
        # only direct children of <body> become paragraphs
        if parent is None or parent.tag != 'body':
            continue
        paragraph = etree.Element('p')
        paragraph.text = to_ascii(get_text(to_string(node, method='html'), content='html'))
        element.append(paragraph)
def _format_abstract(self, article, main_news_component):
    """Create an abstract NewsComponent element

    :param dict article:
    :param Element main_news_component:
    """
    component = SubElement(main_news_component, "NewsComponent")
    SubElement(component, "Role", {"FormalName": "Abstract"})
    item = SubElement(component, "ContentItem")
    SubElement(item, "MediaType", {"FormalName": "Text"})
    SubElement(item, "Format", {"FormalName": "Text"})
    # strip any markup from the abstract: serialise the parsed tree as text
    parsed_abstract = parse_html(article.get("abstract", ""))
    abstract_text = etree.tostring(parsed_abstract, encoding="unicode", method="text")
    SubElement(item, "DataContent").text = abstract_text
def sanitize_tags(item):
    """Flatten ``item['body_html']`` into an escaped, preformatted <pre> block.

    <br> elements become line feeds, remaining markup is reduced to its text
    content, and the item format is switched to PRESERVED.

    :param dict item: item whose ``body_html`` is rewritten in place
    :return dict: the same item
    """
    content = item.get('body_html', '')
    content = content.replace('<br>', '<br/>').replace('</br>', '')
    # NOTE(review): the first literal below renders as a blank character —
    # presumably it is U+00A0 (non-breaking space) normalised to a plain
    # space, as the sibling formatter does with '\xA0'; confirm against repo.
    content = content.replace(' ', ' ')
    parsed = parse_html(content, content='html')
    # breaks are replaced with line feeds
    for br in parsed.xpath('//br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(parsed, 'br', with_tail=False)
    for tag in parsed.xpath('/html/div/child::*'):
        format_text_content(tag)
    item['body_html'] = '<pre>{}</pre>'.format(html.escape(''.join(parsed.itertext())))
    item[FORMAT] = FORMATS.PRESERVED
    return item
def get_text_content(self, content):
    """Convert HTML ``content`` to plain text with CRLF paragraph breaks.

    :param str content: HTML markup
    :return bytes: ascii-encoded text, unencodable characters replaced with '?'
    """
    content = content.replace('<br>', '<br/>').replace('</br>', '')
    # strip control characters, keeping \r and \n for the line handling below
    content = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', content)
    content = content.replace('\xA0', ' ')
    parsed = parse_html(content, content='html')
    # turn <br> into CRLF, then drop the <br> elements themselves
    for br in parsed.xpath('//br'):
        br.tail = '\r\n' + br.tail if br.tail else '\r\n'
    etree.strip_elements(parsed, 'br', with_tail=False)
    for tag in parsed.xpath('/html/div/child::*'):
        # BUG FIX: original used `tag.tag not in ('br')` which is substring
        # membership in the *string* 'br' — it also skipped <b> and <r> tags.
        if tag.tag != 'br' and tag.text is not None and tag.text.strip() != '':
            # collapse newlines (not preceded by \r) and runs of spaces;
            # the guard above guarantees tag.text is truthy, so the original
            # trailing `if tag.text else ''` ternary was redundant and removed
            tag.text = ' ' + re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.text))
            tag.tail = '\r\n' + tag.tail if tag.tail else '\r\n'
    para_text = "".join(x for x in parsed.itertext())
    para_text = para_text.replace('\xA0', ' ')
    return para_text.encode('ascii', 'replace')
def _fix_headline(self, item):
    """Derive a headline from the first body paragraph of an AP alert.

    AP Alerts do not get a headline parsed out, so pick up the first par
    of the content and put it in the headline.

    :param dict item: item whose ``headline`` may be set in place
    :return: None
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='html')
            pars = parsed.xpath('/html/div/child::*')
            if pars:
                # use the text after the AP dateline marker when present,
                # otherwise the whole first paragraph
                city, source, the_rest = pars[0].text.partition(' (AP) _ ')
                if the_rest:
                    item['headline'] = the_rest
                else:
                    item['headline'] = pars[0].text
    # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
    # keep the deliberate best-effort behaviour but narrow the catch
    # (pars[0].text may be None, making partition() raise AttributeError).
    except Exception:
        pass
def get_text_content(self, content):
    """Convert HTML ``content`` to prefixed plain text with CRLF breaks.

    :param str content: HTML markup
    :return: ascii-safe text (via ``to_ascii``)
    """
    content = content.replace('<br>', '<br/>').replace('</br>', '')
    # strip control characters, keeping \r and \n for the line handling below
    content = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', content)
    # non-breaking spaces become plain spaces
    content = content.replace('\xA0', ' ')
    parsed = parse_html(content, content='html')
    # turn <br> into CRLF, then drop the <br> elements themselves
    for br in parsed.xpath('//br'):
        br.tail = '\r\n' + br.tail if br.tail else '\r\n'
    etree.strip_elements(parsed, 'br', with_tail=False)
    for tag in parsed.xpath('/html/div/child::*'):
        if tag.tag != 'br' and tag.text is not None and tag.text.strip() != '':
            # collapse bare newlines and runs of spaces, prefix each paragraph
            tag.text = self.line_prefix + re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', tag.text))
            tag.tail = '\r\n' + tag.tail if tag.tail else '\r\n'
    para_text = "".join(x for x in parsed.itertext())
    # multiple line breaks to one line break
    # NOTE(review): self.line_feed is interpolated into a regex character
    # class — assumes it contains no regex metacharacters (e.g. '\r\n'); confirm.
    para_text = re.sub('[{}]+'.format(self.line_feed), self.line_feed, para_text)
    return to_ascii(para_text)
def get_text_content(self, content):
    """Extract formatted plain text from the <body> children of ``content``.

    :param str content: HTML markup
    :return str: concatenation of ``format_text_content`` over each paragraph
    """
    cleaned = content.replace('<br>', '<br/>').replace('</br>', '')
    # collapse bare newlines to spaces, squeeze space runs, trim the ends
    cleaned = re.sub(' +', ' ', re.sub('(?<!\r)\n+', ' ', cleaned).strip())
    cleaned = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', cleaned)
    parsed = parse_html(cleaned, content='html')
    # turn <br> into CRLF, then drop the <br> elements themselves
    for br in parsed.xpath('//br'):
        br.tail = '\r\n' + br.tail if br.tail else '\r\n'
    etree.strip_elements(parsed, 'br', with_tail=False)
    pieces = []
    for node in parsed.xpath('//*'):
        parent = node.getparent()
        if parent is None or parent.tag != 'body':
            continue
        pieces.append(self.format_text_content(''.join(node.itertext())))
    return ''.join(pieces)
def get_text(markup, content='xml', lf_on_block=False, space_on_elements=False):
    """Get plain text version of (X)HTML or other XML element

    if the markup can't be parsed, it will be returned unchanged

    :param str markup: string to convert to plain text
    :param str content: 'xml' or 'html', as in parse_html
    :param bool lf_on_block: if True, add a line feed on block elements' tail
    :param bool space_on_elements: if True, add a space on each element's tail
        mainly used to count words with non HTML markup
    :return str: plain text version of markup
    """
    try:
        parsed = sd_etree.parse_html(
            markup,
            content=content,
            lf_on_block=lf_on_block,
            space_on_elements=space_on_elements,
        )
        return etree.tostring(parsed, encoding='unicode', method='text')
    except etree.ParseError:
        # unparseable input is passed through untouched
        return markup
def get_text_content(self, content):
    """Return space-normalised plain text extracted from HTML ``content``.

    :param str content: HTML markup
    :return str: formatted text with runs of spaces collapsed
    """
    cleaned = content.replace('<br>', '<br/>').replace('</br>', '')
    # remove control chars except \n
    cleaned = re.sub('[\x00-\x09\x0b-\x1f]', '', cleaned)
    # new lines are spaces
    cleaned = re.sub('[\n]', ' ', cleaned)
    if cleaned == '':
        return ''
    parsed = parse_html(cleaned, content='html', space_on_elements=True)
    # breaks are replaced with spaces
    for br in parsed.xpath('//br'):
        br.tail = ' ' + br.tail if br.tail else ' '
    etree.strip_elements(parsed, 'br', with_tail=False)
    chunks = [self.format_text_content(tag) for tag in parsed.xpath('/html/div/child::*')]
    return re.sub(' +', ' ', ''.join(chunks))
def yonhap_format(item, **kwargs):
    """Reformat a Yonhap ingest item: set the source and rebuild paragraphs.

    :param dict item: ingest item, ``body_html`` and ``source`` updated in place
    :return dict: the same item
    :raises Exception: re-raises any processing failure after logging it
    """
    try:
        html = item.get('body_html')
        # FIX: guard against a missing body first — the original evaluated
        # `'(Yonhap)' not in html` before checking html, raising TypeError
        # whenever body_html was None.
        if not html or '(Yonhap)' not in html:
            # Article must be from Yonhap
            return item
        item['source'] = 'Yonhap'
        parsed = sd_etree.parse_html(html, content='xml')
        pars = parsed.xpath('//body')
        if len(pars) == 1:
            pars[0].tag = 'p'
            content = etree.tostring(pars[0], encoding="unicode")
            item['body_html'] = content.replace(' \n ', '</p><p>').replace(' \n', '').replace('<br/>', ' ')
        _yonhap_derive_dateline(item)
    except Exception as ex:
        # FIX: logging.exception uses %-style lazy formatting; the original
        # passed `ex` positionally with no placeholder, which itself raised
        # a string-formatting error inside the logging machinery.
        logging.exception('Exception in yonhap format macro: %s', ex)
        # bare raise preserves the original traceback
        raise
    return item
def remove_dateline(item):
    """Remove the dateline from item"""
    html = item.get('body_html')
    if not html:
        return
    match = re.search(DATELINE_REGEX, html, re.IGNORECASE)
    if not match:
        return
    # the exact text of the dateline marker found in the body
    matched_string = match.group(0)
    parsed = parse_html(html, content='xml')
    for par in parsed.xpath('//p'):
        text = par.text
        if not text or matched_string not in text:
            continue
        # drop everything up to and including the matched marker
        prefix, marker, _ = text.partition(matched_string)
        item['body_html'] = html.replace(prefix + marker, '')
        break
def ap_derive_dateline(self, item):
    """This function looks for a dateline in the article body and uses that.

    :param dict item: item whose ``dateline`` is populated in place
    :return: item populated with a dateline, or None when the city is rejected
    """
    try:
        html = item.get('body_html')
        if html:
            parsed = parse_html(html, content='html')
            for par in parsed.xpath('/html/div/child::*'):
                if not par.text:
                    continue
                city, source, the_rest = par.text.partition(' (AP) _ ')
                if source:
                    # sometimes the city is followed by a comma and either a date or a state
                    city = city.split(',')[0]
                    if any(char.isdigit() for char in city):
                        return
                    cities = app.locators.find_cities()
                    located = [c for c in cities if c['city'].lower() == city.lower()]
                    item.setdefault('dateline', {})
                    item['dateline']['located'] = located[0] if len(located) == 1 else {'city_code': city,
                                                                                       'city': city,
                                                                                       'tz': 'UTC',
                                                                                       'dateline': 'city'}
                    item['dateline']['source'] = item.get('original_source', 'AP')
                    item['dateline']['text'] = format_dateline_to_locmmmddsrc(item['dateline']['located'],
                                                                              get_date(item['firstcreated']),
                                                                              source=item.get('original_source',
                                                                                              'AP'))
                    break
        return item
    # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt;
    # narrow to Exception while keeping the best-effort logging behaviour.
    except Exception:
        logging.exception('AP dateline extraction exception')
def body_hook(self, item, html): """Copy content to body_html if img are found in the content, they are uploaded. First image is used as feature media, then there are embeds """ # we need to convert CRLF to <p> # cf. SDTS-22 html = html.replace(' ', '\r') splitted = html.split('\r\n') if len(splitted) > 1: html = ''.join(['<p>{}</p>'.format(s) if not s.startswith('<hr') else s for s in splitted if s]) if "img" in html: content = sd_etree.parse_html(html, 'html') for img in content.xpath('//img'): src = img.get('src') try: key, media_data = self._add_image(item, src) except Exception as e: logger.error(e) img.getparent().remove(img) continue url = media_data['renditions']['original']['href'] img.set("src", url) if key == 'featuremedia': # no need to embed the image for featuremedia continue embed_start = etree.Comment(embed_TPL.format('START', key)) embed_end = etree.Comment(embed_TPL.format('END', key)) img.addprevious(embed_start) img.addnext(embed_end) html = etree.tostring(content, encoding="unicode") item['body_html'] = html
def format(self, article, subscriber, codes=None):
    """Serialise ``article`` into ANPA wire format, once per category.

    :param dict article: article to format
    :param dict subscriber: subscriber the output is generated for
    :param list codes: optional selector codes prepended to the message
    :return list: dicts with published_seq_num, encoded_item, formatted_item
    :raises FormatterError.AnpaFormatterError: wraps any processing failure
    """
    try:
        docs = []
        formatted_article = deepcopy(article)
        for category in self._get_category_list(formatted_article.get('anpa_category')):
            mapped_source = self._get_mapped_source(formatted_article)
            formatted_article[config.ID_FIELD] = formatted_article.get('item_id',
                                                                      formatted_article.get(config.ID_FIELD))
            pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
            # the message is assembled as a list of byte fragments
            anpa = []

            if codes:
                anpa.append(b'\x05')
                anpa.append(' '.join(codes).encode('ascii'))
                anpa.append(b'\x0D\x0A')

            # start of message header (syn syn soh)
            anpa.append(b'\x16\x16\x01')
            anpa.append(get_service_level(category, formatted_article).encode('ascii'))

            # story number
            anpa.append(str(pub_seq_num).zfill(4).encode('ascii'))

            # field seperator
            anpa.append(b'\x0A')  # -LF
            anpa.append(map_priority(formatted_article.get('priority')).encode('ascii'))
            anpa.append(b'\x20')
            anpa.append(category['qcode'].lower().encode('ascii'))
            anpa.append(b'\x13')

            # format identifier
            if formatted_article.get(FORMAT, FORMATS.HTML) == FORMATS.PRESERVED:
                anpa.append(b'\x12')
            else:
                anpa.append(b'\x11')
            anpa.append(b'\x20')

            # keyword (truncated to 24 characters)
            keyword = 'bc-{}'.format(self.append_legal(article=formatted_article, truncate=True)).replace(' ', '-')
            keyword = keyword[:24] if len(keyword) > 24 else keyword
            anpa.append(keyword.encode('ascii'))
            anpa.append(b'\x20')

            # version field
            anpa.append(b'\x20')

            # reference field
            anpa.append(b'\x20')

            # filing date
            anpa.append('{}-{}'.format(formatted_article['_updated'].strftime('%m'),
                                       formatted_article['_updated'].strftime('%d')).encode('ascii'))
            anpa.append(b'\x20')

            # add the word count
            anpa.append(str(formatted_article.get('word_count', '0000')).zfill(4).encode('ascii'))
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x02')  # STX

            self._process_headline(anpa, formatted_article, category['qcode'].encode('ascii'))

            keyword = SluglineMapper().map(article=formatted_article,
                                           category=category['qcode'].upper(),
                                           truncate=True).encode('ascii', 'ignore')
            anpa.append(keyword)
            take_key = (formatted_article.get('anpa_take_key', '') or '').encode('ascii', 'ignore')
            anpa.append((b'\x20' + take_key) if len(take_key) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            if formatted_article.get('ednote', '') != '':
                ednote = '{}\r\n'.format(to_ascii(formatted_article.get('ednote')))
                anpa.append(ednote.encode('ascii', 'replace'))

            if formatted_article.get(BYLINE):
                anpa.append(get_text(formatted_article.get(BYLINE)).encode('ascii', 'replace'))
                anpa.append(b'\x0D\x0A')

            if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
                anpa.append(get_text(self.append_body_footer(formatted_article),
                                     content='html').encode('ascii', 'replace'))
            else:
                body = to_ascii(formatted_article.get('body_html', ''))
                # we need to inject the dateline
                if formatted_article.get('dateline', {}).get('text') and not article.get('auto_publish', False):
                    body_html_elem = parse_html(formatted_article.get('body_html'))
                    ptag = body_html_elem.find('.//p')
                    if ptag is not None:
                        ptag.text = formatted_article['dateline']['text'] + ' ' + (ptag.text or '')
                        body = to_string(body_html_elem)
                anpa.append(self.get_text_content(body))
                if formatted_article.get('body_footer'):
                    anpa.append(self.get_text_content(to_ascii(formatted_article.get('body_footer', ''))))

            anpa.append(b'\x0D\x0A')
            anpa.append(mapped_source.encode('ascii'))
            sign_off = (formatted_article.get('sign_off', '') or '').encode('ascii')
            anpa.append((b'\x20' + sign_off) if len(sign_off) > 0 else b'')
            anpa.append(b'\x0D\x0A')

            anpa.append(b'\x03')  # ETX

            # time and date
            anpa.append(datetime.datetime.now().strftime('%d-%m-%y %H-%M-%S').encode('ascii'))

            anpa.append(b'\x04')  # EOT
            anpa.append(b'\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A\x0D\x0A')

            docs.append({'published_seq_num': pub_seq_num,
                         'encoded_item': b''.join(anpa),
                         'formatted_item': b''.join(anpa).decode('ascii')})

        return docs
    except Exception as ex:
        raise FormatterError.AnpaFormatterError(ex, subscriber)
def test_void_elements_fix(self):
    """Empty non-void elements get explicit close tags; void ones stay self-closed."""
    markup = '<p>this is a test with empty <h3/> non-void <em/> elements and a void <br/> one</p>'
    fixed = '<p>this is a test with empty <h3></h3> non-void <em></em> elements and a void <br/> one</p>'
    tree = sd_etree.parse_html(markup)
    sd_etree.fix_html_void_elements(tree)
    self.assertEqual(sd_etree.to_string(tree), fixed)
def _parse_content(self, article):
    """Parse body_html and mapping to fields required for apple news format

    :param article:
    """
    # section heading markers used by the fact-check article layout
    statement_regex = re.compile(r'^The Statement$', re.IGNORECASE)
    analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE)
    verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE)
    references_regex = re.compile(r'^The References$', re.IGNORECASE)
    url_regex = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%.]+', re.IGNORECASE)

    abstract = get_text(article.get('abstract'), content='html').strip()
    article['_title'] = abstract
    body_html = article.get('body_html')
    # reset every derived field before scanning the body
    article['_analysis_first_line'] = ''
    article['_analysis'] = ''
    article['_statement'] = ''
    article['_statement_attribution'] = ''
    article['_verdict1'] = ''
    article['_verdict2'] = ''
    article['_references'] = ''
    article['_revision_history'] = ''

    if article.get(ITEM_STATE) == CONTENT_STATE.KILLED or article.get(ITEM_STATE) == CONTENT_STATE.RECALLED:
        # takedown: every derived field carries the removal notice
        article['_title'] = 'This article has been removed.'
        article['_analysis_first_line'] = 'This article has been removed.'
        article['_analysis'] = 'This article has been removed.'
        article['_statement'] = 'This article has been removed.'
        article['_statement_attribution'] = 'This article has been removed.'
        article['_verdict1'] = 'This article has been removed.'
        article['_verdict2'] = 'This article has been removed.'
        article['_references'] = 'This article has been removed.'
        self._set_revision_history(article)
        return

    parsed_content = parse_html(body_html, content='html')
    # state flags for the sequential section scan below; sections are
    # expected in order: Statement, Verdict, Analysis, Verdict, References
    statement_found = False
    analysis_found = False
    analysis_first_line = False
    verdict1_found = False
    verdict2_found = False
    references_found = False
    statement_elements = []

    for top_level_tag in parsed_content.xpath('/html/div/child::*'):
        tag_text = format_text_content(top_level_tag).strip()
        if not tag_text:
            continue

        if not verdict1_found:
            if not statement_found:
                match = statement_regex.search(tag_text)
                if match:
                    statement_found = True
                continue
            else:
                # statement found
                match = verdict_regex.search(tag_text)
                if match:
                    verdict1_found = True
                    if len(statement_elements) > 1:
                        # all elements but the last form the statement;
                        # the final element is the attribution
                        statement_length = len(statement_elements) - 1
                        for i in range(statement_length):
                            article['_statement'] += get_text(
                                to_string(statement_elements[i], remove_root_div=False),
                                content='html'
                            ).strip()
                            if statement_length > 1 and i != statement_length - 1:
                                article['_statement'] += '\r\n'
                        article['_statement_attribution'] = get_text(
                            to_string(statement_elements[-1:][0], remove_root_div=False),
                            content='html'
                        ).strip()
                    elif len(statement_elements) == 1:
                        article['_statement'] = to_string(
                            statement_elements[0],
                            remove_root_div=False
                        )
                    continue
                statement_elements.append(top_level_tag)
                continue

        if verdict1_found and not analysis_found:
            match = analysis_regex.search(tag_text)
            if match:
                analysis_found = True
            else:
                article['_verdict1'] += to_string(top_level_tag, remove_root_div=False)
            continue

        if analysis_found and not verdict2_found:
            if not analysis_first_line:
                # first paragraph after "The Analysis" is kept separately
                article['_analysis_first_line'] = tag_text
                analysis_first_line = True
            match = verdict_regex.search(tag_text)
            if match:
                verdict2_found = True
            else:
                article['_analysis'] += to_string(top_level_tag, remove_root_div=False)
            continue

        if verdict2_found and not references_found:
            match = references_regex.search(tag_text)
            if match:
                references_found = True
            else:
                article['_verdict2'] += to_string(top_level_tag, remove_root_div=False)
            continue

        if references_found:
            def replacement(match_object):
                # wrap each detected URL in an anchor tag
                value = match_object.group(0)
                if value:
                    return '<a href="{0}">{0}</a>'.format(value)
                return ''

            # strip a leading list number such as "1." or "2)" before linking
            tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip()
            article['_references'] += '<li>{}</li>'.format(
                re.sub(url_regex, replacement, tag_text)
            )

    if len(article['_references']):
        article['_references'] = '<ol>{}</ol>'.format(article['_references'])

    if not article.get('_statement') and article.get('_statement_attribution'):
        # if statement is not as per the format
        article['_statement'] = article.get('_statement_attribution')
        article['_statement_attribution'] = ''

    self._set_revision_history(article)