def tokenize_sgml_test_1_no_normalize():
    """Parentheses and brackets must pass through unnormalized.

    With normalize_parentheses/normalize_brackets turned off, "(" , ")",
    "[" and "]" should appear literally in the token stream instead of
    PTB escapes.
    """
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_parentheses = False
    cfg.normalize_brackets = False
    sent1 = (
        "Significant improvements in peak FEV1 were demonstrated "
        "with tiotropium/olodaterol 5/2 \u03BCg (p = 0.008), 5/5 \u03BCg "
        "(p = 0.012), and 5/10 \u03BCg (p < 0.0001) versus tiotropium "
        "monotherapy [51]."
    )
    gold = [
        "Significant", "improvements", "in", "peak", "FEV1", "were",
        "demonstrated", "with", "tiotropium/olodaterol", "5/2", "\u03BCg",
        "(", "p", "=", "0.008", ")", ",", "5/5", "\u03BCg",
        "(", "p", "=", "0.012", ")", ",", "and", "5/10", "\u03BCg",
        "(", "p", "<", "0.0001", ")", "versus", "tiotropium",
        "monotherapy", "[", "51", "]", ".",
    ]
    tokens = nlp.tokenize(sent1)
    # assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        print(produced, expected.encode("utf-8"))
        assert bytes(produced) == expected.encode("utf-8")
def tokenize_sgml_test_2_no_normalize():
    """Abbreviation-final periods with strict_ptb3 disabled.

    Expects "Co." / "Ltd." kept with their periods, a separate
    sentence-final ".", and the "are"/"n't" contraction split.
    """
    cfg = nlp.get_global_PTB_config()
    cfg.strict_ptb3 = False
    sent2 = (
        "Panasonic brand products are produced by Samsung Electronics "
        "Co. Ltd. Sanyo products aren't."
    )
    gold = [
        "Panasonic", "brand", "products", "are", "produced", "by",
        "Samsung", "Electronics", "Co.", "Ltd.", ".",
        "Sanyo", "products", "are", "n't", ".",
    ]
    tokens = nlp.tokenize(sent2)
    print(tokens)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_4_no_normalize():
    """A full SGML DOCTYPE declaration should survive as one token.

    normalize_spaces is off, so interior spaces stay as-is rather than
    being mapped to non-breaking spaces.
    """
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_spaces = False
    sent4 = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" '
        '"http://www.w3.org/TR/html4/strict.dtd">'
    )
    # spaces go to \u00A0
    gold = [
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" '
        '"http://www.w3.org/TR/html4/strict.dtd">'
    ]
    tokens = nlp.tokenize(sent4)
    assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
def tokenize_sgml_test_1_no_normalize():
    """Duplicate variant: literal "(" ")" "[" "]" with normalization off."""
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_parentheses = False
    cfg.normalize_brackets = False
    sent1 = (
        "Significant improvements in peak FEV1 were demonstrated "
        "with tiotropium/olodaterol 5/2 \u03BCg (p = 0.008), 5/5 \u03BCg "
        "(p = 0.012), and 5/10 \u03BCg (p < 0.0001) versus tiotropium "
        "monotherapy [51]."
    )
    gold = [
        "Significant", "improvements", "in", "peak", "FEV1", "were",
        "demonstrated", "with", "tiotropium/olodaterol", "5/2", "\u03BCg",
        "(", "p", "=", "0.008", ")", ",", "5/5", "\u03BCg",
        "(", "p", "=", "0.012", ")", ",", "and", "5/10", "\u03BCg",
        "(", "p", "<", "0.0001", ")", "versus", "tiotropium",
        "monotherapy", "[", "51", "]", ".",
    ]
    tokens = nlp.tokenize(sent1)
    #assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        print(produced, expected.encode("utf-8"))
        assert bytes(produced) == expected.encode("utf-8")
def tokenize_sgml_test_8_no_normalize():
    """SGML tags with embedded quotes/backslashes stay single tokens.

    escape_forward_slash_asterisk is off; a bare "<not sgmltag" fragment
    is NOT a tag and must be split into "<", "not", "sgmltag".
    """
    cfg = nlp.get_global_PTB_config()
    cfg.escape_forward_slash_asterisk = False
    sent8 = (
        "<a href=\"http:\\\\it's\\here\"> <quote orig_author='some "
        "\"dude'/> <not sgmltag"
    )
    gold = [
        '<a href="http:\\\\it\'s\\here">',
        "<quote orig_author='some \"dude'/>",
        "<",
        "not",
        "sgmltag",
    ]
    tokens = nlp.tokenize(sent8)
    print(tokens)
    print(gold)
    assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
def tokenize_sgml_test_4_no_normalize():
    """Duplicate variant: DOCTYPE declaration kept as one token."""
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_spaces = False
    sent4 = (
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" "
        "\"http://www.w3.org/TR/html4/strict.dtd\">"
    )
    # spaces go to \u00A0
    gold = [
        "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Strict//EN\" "
        "\"http://www.w3.org/TR/html4/strict.dtd\">"
    ]
    tokens = nlp.tokenize(sent4)
    assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
def tokenize_sgml_test_2_no_normalize():
    """Duplicate variant: abbreviation periods with strict_ptb3 off."""
    cfg = nlp.get_global_PTB_config()
    cfg.strict_ptb3 = False
    sent2 = (
        "Panasonic brand products are produced by Samsung Electronics "
        "Co. Ltd. Sanyo products aren't."
    )
    gold = [
        "Panasonic", "brand", "products", "are", "produced", "by",
        "Samsung", "Electronics", "Co.", "Ltd.", ".",
        "Sanyo", "products", "are", "n't", ".",
    ]
    tokens = nlp.tokenize(sent2)
    print(tokens)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
    assert len(tokens) == len(gold)
def tokenize_sgml_test_8_no_normalize():
    """Duplicate variant: quoted/backslashed SGML tags stay whole tokens."""
    cfg = nlp.get_global_PTB_config()
    cfg.escape_forward_slash_asterisk = False
    sent8 = (
        "<a href=\"http:\\\\it's\\here\"> <quote orig_author='some "
        "\"dude'/> <not sgmltag"
    )
    gold = [
        "<a href=\"http:\\\\it's\\here\">",
        "<quote orig_author='some \"dude'/>",
        "<",
        "not",
        "sgmltag",
    ]
    tokens = nlp.tokenize(sent8)
    print(tokens)
    print(gold)
    assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
def tokenize_sgml_test_3_no_normalize():
    """Angle brackets used as comparison operators, not SGML markup.

    "<pH" and "p>0.05" should split into operator + operand tokens, and
    "(" / ")" stay literal with normalize_parentheses off.
    """
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_parentheses = False
    sent3 = (
        "Oesophageal acid exposure (% time <pH 4) was similar in "
        "patients with or without complications (19.2% v 19.3% p>0.05)."
    )
    gold = [
        "Oesophageal", "acid", "exposure", "(", "%", "time", "<", "pH",
        "4", ")", "was", "similar", "in", "patients", "with", "or",
        "without", "complications", "(", "19.2", "%", "v", "19.3", "%",
        "p", ">", "0.05", ")", ".",
    ]
    tokens = nlp.tokenize(sent3)
    assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
def tokenize_sgml_test_3_no_normalize():
    """Duplicate variant: comparison "<" / ">" split as operators."""
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_parentheses = False
    sent3 = (
        "Oesophageal acid exposure (% time <pH 4) was similar in "
        "patients with or without complications (19.2% v 19.3% p>0.05)."
    )
    gold = [
        "Oesophageal", "acid", "exposure", "(", "%", "time", "<", "pH",
        "4", ")", "was", "similar", "in", "patients", "with", "or",
        "without", "complications", "(", "19.2", "%", "v", "19.3", "%",
        "p", ">", "0.05", ")", ".",
    ]
    tokens = nlp.tokenize(sent3)
    assert len(tokens) == len(gold)
    for produced, expected in zip(tokens, gold):
        assert bytes(produced) == expected.encode("utf-8")
def setUp(self):
    """Reset the shared global PTB tokenizer config before each test.

    Required because the tests above mutate the singleton config
    (normalize_parentheses, strict_ptb3, ...) in place.
    """
    cfg = nlp.get_global_PTB_config()
    cfg.default()
def test_ptb_tok_config1():
    """normalize_quotes accepts "unicode" and reads it back."""
    config = nlp.get_global_PTB_config()
    config.normalize_quotes = u"unicode"
    assert config.normalize_quotes == u"unicode"
def test_ptb_tok_config_exception():
    """Assign an unsupported normalize_quotes mode.

    NOTE(review): no assertion here — presumably the setter raises and
    the test harness marks this as an expected-exception test; confirm
    against the runner configuration.
    """
    config = nlp.get_global_PTB_config()
    config.normalize_quotes = u"not a valid option"
def test_ptb_tok_config4():
    """normalize_quotes accepts None (quote normalization disabled).

    Fix: compare to None with ``is`` rather than ``==`` (PEP 8 E711);
    identity is the correct check for the None singleton.
    """
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_quotes = None
    assert cfg.normalize_quotes is None
def test_ptb_tok_config3():
    """normalize_quotes accepts "ascii" and reads it back."""
    config = nlp.get_global_PTB_config()
    config.normalize_quotes = u"ascii"
    assert config.normalize_quotes == u"ascii"
import os import re import nlp import nltk import nltk.tree nlp.get_global_PTB_config().normalize_quotes = None nlp.get_global_PTB_config().normalize_brackets = False nlp.get_global_PTB_config().normalize_dashes = False nlp.get_global_PTB_config().strict_ptb3 = True def collect_paths(ontonotes_base_dir, language="english"): on_bdir_lang = os.path.join( ontonotes_base_dir, "data", "files", "data", language, "annotations") onf_paths = [] for path, dirs, files in os.walk(on_bdir_lang): for file in files: if file.endswith(".onf"): onf_paths.append(os.path.join(path, file)) return onf_paths def traverse(tree): tokens = [] for index, subtree in enumerate(tree): if subtree.height() == 2: tok = subtree[0] pos = subtree._label if pos != "-NONE-": tokens.append([tok, pos]) elif type(subtree) == nltk.tree.Tree:
def test_ptb_tok_config2():
    """normalize_quotes accepts "latex" and reads it back."""
    config = nlp.get_global_PTB_config()
    config.normalize_quotes = u"latex"
    assert config.normalize_quotes == u"latex"
import os import re import nlp import nltk import nltk.tree nlp.get_global_PTB_config().normalize_quotes = None nlp.get_global_PTB_config().normalize_brackets = False nlp.get_global_PTB_config().normalize_dashes = False nlp.get_global_PTB_config().strict_ptb3 = True def collect_paths(ontonotes_base_dir, language="english"): on_bdir_lang = os.path.join(ontonotes_base_dir, "data", "files", "data", language, "annotations") onf_paths = [] for path, dirs, files in os.walk(on_bdir_lang): for file in files: if file.endswith(".onf"): onf_paths.append(os.path.join(path, file)) return onf_paths def traverse(tree): tokens = [] for index, subtree in enumerate(tree): if subtree.height() == 2: tok = subtree[0] pos = subtree._label if pos != "-NONE-": tokens.append([tok, pos])