def read_gold_standard_file(data_dir, fileroot, encoding=None, cetr=False, format='1.0'):
    """
    Load the gold standard file identified by ``fileroot`` and return its
    text as a ``[content, comments]`` pair.

    The file is looked up below the root ``data_dir``, in the gold standard
    directory and with the gold standard extension of the dataset layout
    that ``format`` selects from ``FORMATS``.

    Args:
        data_dir (str): root directory of the dataset
        fileroot (str): identifier of the file (name without extension)
        encoding (str): encoding used to read the file; when not given,
            'utf-8', 'utf-16' and 'iso-8859-1' are tried in that order
        cetr (bool): only consulted for format '1.0'; if True, assume no
            comments and parse the gold standard as HTML to remove tags
        format (str): dataset format version used as key into ``FORMATS``;
            '1.0' and '2.0' are handled here

    Returns:
        List[str]: contents string and comments string, respectively;
        both are empty when the file is empty or could not be decoded

    Raises:
        NotImplementedError: if ``format`` is neither '1.0' nor '2.0'
            (only reached once a non-empty file has been read)
    """
    layout: DatasetFormat = FORMATS[format]
    gold_path = os.path.join(
        data_dir,
        layout.gold_standard_dirname,
        fileroot + layout.gold_standard_ext,
    )

    if encoding:
        candidates = (encoding,)
    else:
        candidates = ('utf-8', 'utf-16', 'iso-8859-1')

    # Keep the text from the first encoding that decodes the whole file.
    raw_text = None
    for candidate in candidates:
        try:
            with io.open(gold_path, mode='rt', encoding=candidate) as handle:
                raw_text = handle.read()
        except UnicodeError:  # UnicodeDecodeError is a subclass
            raw_text = None
        else:
            break

    if not raw_text:
        return [u'', u'']

    # Work out which markup (if any) has to be stripped of its tags;
    # ``None`` means the plain-text content/comments layout is used instead.
    if format == '1.0':
        markup = raw_text if cetr else None
    elif format == '2.0':
        # The TOML parser has an issue with multiline text strings, so the
        # text is cut out from between the first pair of ''' delimiters.
        markup = raw_text.split("'''")[1]
    else:
        raise NotImplementedError(f'Format version {format} is not implemented')

    if markup is None:
        pieces = RE_COMMENTS_DELIM.split(raw_text, maxsplit=1)
        content = pieces[0]
        # no comments delimiter found -> empty comments string
        comments = pieces[1] if len(pieces) > 1 else u''
    else:
        root = etree.fromstring(markup, parser=etree.HTMLParser())
        content = u' '.join(text_from_subtree(root))
        comments = u''

    # fix text in case of mangled encodings
    return [ftfy.fix_encoding(part).strip() for part in (content, comments)]
def test_text_from_subtree(self):
    """Text spread over nested tags is collected into the expected sentence."""
    markup = '<a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a>'
    root = etree.fromstring(markup, etree.HTMLParser(recover=True))
    # Trim every extracted fragment and discard the ones that end up empty.
    trimmed = (fragment.strip() for fragment in blocks.text_from_subtree(root))
    text_str = " ".join(fragment for fragment in trimmed if fragment != "")
    assert text_str == "WILL THIS PASS THE TEST ??"
def test_text_from_subtree_decode_error(self):
    """No text is extracted from markup containing an invalid UTF-8 byte."""
    from lxml import etree

    # 0x92 on its own is not valid UTF-8. The markup has to be a bytes
    # literal: inside a Python 3 ``str`` the same escape is the code point
    # U+0092, not a raw byte, so nothing undecodable would reach the parser.
    s = b'<div>\x92</div>'
    tree = etree.fromstring(s, etree.HTMLParser(recover=True, encoding='utf-8'))
    text_list = blocks.text_from_subtree(tree)
    # Ignore fragments that are empty once surrounding whitespace is removed.
    text_str = ' '.join([ele.strip() for ele in text_list if ele.strip() != ''])
    self.assertEqual(text_str, '')
def test_text_from_subtree(self):
    """Text spread over nested tags is collected into the expected sentence."""
    from lxml import etree

    html = """<a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a>"""
    parser = etree.HTMLParser(recover=True)
    root = etree.fromstring(html, parser)

    # Keep only the fragments that still contain something after trimming.
    kept = []
    for fragment in blocks.text_from_subtree(root):
        trimmed = fragment.strip()
        if trimmed != '':
            kept.append(trimmed)

    text_str = ' '.join(kept)
    self.assertEqual(text_str, 'WILL THIS PASS THE TEST ??')
def test_text_from_subtree(self):
    """Extracted text fragments, trimmed and joined, match the expected sentence."""
    from lxml import etree

    source = """<a href=".">WILL <img src="."> THIS PASS <b>THE TEST</b> ??</a>"""
    document = etree.fromstring(source, etree.HTMLParser(recover=True))
    extracted = blocks.text_from_subtree(document)
    # Strip each fragment first, then drop whatever became an empty string.
    stripped = (piece.strip() for piece in extracted)
    text_str = ' '.join(piece for piece in stripped if piece != '')
    self.assertEqual(text_str, 'WILL THIS PASS THE TEST ??')
def test_text_from_subtree_decode_error(self):
    """No text is extracted from markup containing an invalid UTF-8 byte."""
    # 0x92 on its own is not a valid UTF-8 sequence
    payload = b"<div>\x92</div>"
    parser = etree.HTMLParser(recover=True, encoding="utf-8")
    root = etree.fromstring(payload, parser)

    # Keep only the fragments that still contain something after trimming.
    kept = []
    for fragment in blocks.text_from_subtree(root):
        trimmed = fragment.strip()
        if trimmed != "":
            kept.append(trimmed)

    text_str = " ".join(kept)
    assert text_str == ""
def test_text_from_subtree_decode_error(self):
    """No text is extracted from markup containing an invalid UTF-8 byte."""
    from lxml import etree

    # 0x92 on its own is not valid UTF-8. A bytes literal is required here:
    # written as a Python 3 ``str`` the escape would denote the code point
    # U+0092 instead of a raw byte, and the parser would never be handed
    # undecodable input.
    s = b'<div>\x92</div>'
    tree = etree.fromstring(
        s, etree.HTMLParser(recover=True, encoding='utf-8'))
    text_list = blocks.text_from_subtree(tree)
    # Ignore fragments that are empty once surrounding whitespace is removed.
    text_str = ' '.join(
        [ele.strip() for ele in text_list if ele.strip() != ''])
    self.assertEqual(text_str, '')
def read_gold_standard_file(data_dir, fileroot, encoding=None, cetr=False):
    """
    Load the gold standard file identified by ``fileroot`` from the gold
    standard directory below the root ``data_dir`` and return its text as a
    ``[content, comments]`` pair.

    Args:
        data_dir (str): root directory of the dataset
        fileroot (str): identifier of the file (name without extension)
        encoding (str): encoding used to read the file; when not given,
            'utf-8', 'utf-16' and 'iso-8859-1' are tried in that order
        cetr (bool): if True, assume no comments and parse the gold
            standard as HTML to remove tags

    Returns:
        List[str]: contents string and comments string, respectively;
        both are empty when the file is empty or could not be decoded
    """
    gold_path = os.path.join(
        data_dir, GOLD_STANDARD_DIRNAME, fileroot + GOLD_STANDARD_EXT)

    if encoding:
        candidates = (encoding, )
    else:
        candidates = ('utf-8', 'utf-16', 'iso-8859-1')

    # Keep the text from the first encoding that decodes the whole file.
    raw_text = None
    for candidate in candidates:
        try:
            with io.open(gold_path, mode='rt', encoding=candidate) as handle:
                raw_text = handle.read()
        except UnicodeError:  # UnicodeDecodeError is a subclass
            raw_text = None
        else:
            break

    if not raw_text:
        return [u'', u'']

    if cetr:
        # CETR gold standards carry no comments; strip the tags instead.
        root = etree.fromstring(raw_text, parser=etree.HTMLParser())
        content = u' '.join(text_from_subtree(root))
        comments = u''
    else:
        pieces = RE_COMMENTS_DELIM.split(raw_text, maxsplit=1)
        content = pieces[0]
        # no comments delimiter found -> empty comments string
        comments = pieces[1] if len(pieces) > 1 else u''

    # fix text in case of mangled encodings
    return [ftfy.fix_encoding(part).strip() for part in (content, comments)]