Example #1
def tokenize_sgml_test_1_no_normalize():
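    # Parenthesis/bracket normalization is disabled, so "(", ")", "[", "]"
    # are expected verbatim rather than as -LRB-/-RRB-/-LSB-/-RSB-.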
    nlp.get_global_PTB_config().normalize_parentheses = False
    nlp.get_global_PTB_config().normalize_brackets = False
    sent1 = (
        "Significant improvements in peak FEV1 were demonstrated "
        "with tiotropium/olodaterol 5/2 \u03BCg (p = 0.008), 5/5 \u03BCg "
        "(p = 0.012), and 5/10 \u03BCg (p < 0.0001) versus tiotropium "
        "monotherapy [51]."
    )
    gold = [
        "Significant",
        "improvements",
        "in",
        "peak",
        "FEV1",
        "were",
        "demonstrated",
        "with",
        "tiotropium/olodaterol",
        "5/2",
        "\u03BCg",
        "(",
        "p",
        "=",
        "0.008",
        ")",
        ",",
        "5/5",
        "\u03BCg",
        "(",
        "p",
        "=",
        "0.012",
        ")",
        ",",
        "and",
        "5/10",
        "\u03BCg",
        "(",
        "p",
        "<",
        "0.0001",
        ")",
        "versus",
        "tiotropium",
        "monotherapy",
        "[",
        "51",
        "]",
        ".",
    ]

    tokens = nlp.tokenize(sent1)
    # assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        print(token, gold_token.encode("utf-8"))
        assert bytes(token) == gold_token.encode("utf-8")
Example #2
def tokenize_sgml_test_2_no_normalize():
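    # Expected behavior: abbreviation periods stay attached ("Co.", "Ltd."),
    # a separate sentence-final period is still emitted, and "aren't"
    # splits into "are" + "n't".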
    nlp.get_global_PTB_config().strict_ptb3 = False
    sent2 = "Panasonic brand products are produced by Samsung Electronics " "Co. Ltd. Sanyo products aren't."
    gold = [
        "Panasonic",
        "brand",
        "products",
        "are",
        "produced",
        "by",
        "Samsung",
        "Electronics",
        "Co.",
        "Ltd.",
        ".",
        "Sanyo",
        "products",
        "are",
        "n't",
        ".",
    ]

    tokens = nlp.tokenize(sent2)
    print(tokens)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
    assert len(tokens) == len(gold)
Example #3
def tokenize_sgml_test_4_no_normalize():
    nlp.get_global_PTB_config().normalize_spaces = False
    sent4 = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" '
        '"http://www.w3.org/TR/html4/strict.dtd">'
    )

    gold = [
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Strict//EN" '
        '"http://www.w3.org/TR/html4/strict.dtd">'
    ]
    # With normalize_spaces enabled, spaces inside a token would become
    # &nbsp; (\u00A0); disabled here, the whole tag is one token with
    # literal spaces.

    tokens = nlp.tokenize(sent4)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #4
def tokenize_sgml_test_8_no_normalize():
    nlp.get_global_PTB_config().escape_forward_slash_asterisk = False
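    # Well-formed SGML tags should survive as single tokens; the unterminated
    # "<not sgmltag" is not a tag, so it splits into "<", "not", "sgmltag".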
    sent8 = "<a href=\"http:\\\\it's\\here\"> <quote orig_author='some " "\"dude'/> <not sgmltag"

    gold = ['<a href="http:\\\\it\'s\\here">', "<quote orig_author='some \"dude'/>", "<", "not", "sgmltag"]

    tokens = nlp.tokenize(sent8)
    print(tokens)
    print(gold)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #5
def tokenize_sgml_test_3_no_normalize():
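    # With parenthesis normalization off, "(" and ")" are expected verbatim
    # instead of -LRB-/-RRB-.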
    nlp.get_global_PTB_config().normalize_parentheses = False
    sent3 = "Oesophageal acid exposure (% time <pH 4) was similar in " \
            "patients with or without complications (19.2% v 19.3% p>0.05)."

    gold = [
        "Oesophageal", "acid", "exposure", "(", "%", "time", "<", "pH", "4",
        ")", "was", "similar", "in", "patients", "with", "or", "without",
        "complications", "(", "19.2", "%", "v", "19.3", "%", "p", ">", "0.05",
        ")", "."
    ]

    tokens = nlp.tokenize(sent3)
    assert len(tokens) == len(gold)
    for token, gold_token in zip(tokens, gold):
        assert bytes(token) == gold_token.encode("utf-8")
Example #6
def setUp(self):
    # Reset the shared global tokenizer config to its defaults before each test.
    nlp.get_global_PTB_config().default()
Example #7
def test_ptb_tok_config1():
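    # "unicode" selects Unicode curly-quote characters as the quote
    # normalization target.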
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_quotes = u"unicode"
    assert cfg.normalize_quotes == u"unicode"
Example #8
def test_ptb_tok_config_exception():
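    # Assigning an unsupported scheme should be rejected by the config;
    # the test harness is expected to catch the resulting exception.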
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_quotes = u"not a valid option"
Example #9
def test_ptb_tok_config4():
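    # None disables quote normalization entirely, leaving quotes untouched.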
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_quotes = None
    assert cfg.normalize_quotes is None
Example #10
def test_ptb_tok_config3():
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_quotes = u"ascii"
    assert cfg.normalize_quotes == u"ascii"
Example #11
import os
import re
import nlp
import nltk
import nltk.tree
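# Configure the global tokenizer to reproduce OntoNotes/PTB gold tokens:
# leave quotes, brackets, and dashes unnormalized, and split strictly per PTB3.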
nlp.get_global_PTB_config().normalize_quotes = None
nlp.get_global_PTB_config().normalize_brackets = False
nlp.get_global_PTB_config().normalize_dashes = False
nlp.get_global_PTB_config().strict_ptb3 = True


def collect_paths(ontonotes_base_dir, language="english"):
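    """Walk the OntoNotes annotation tree and collect every .onf file path."""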
    on_bdir_lang = os.path.join(
        ontonotes_base_dir, "data", "files", "data", language, "annotations")
    onf_paths = []
    for path, dirs, files in os.walk(on_bdir_lang):
        for file in files:
            if file.endswith(".onf"):
                onf_paths.append(os.path.join(path, file))
    return onf_paths

def traverse(tree):
    """Collect [token, POS] pairs from a parse tree, skipping empty categories."""
    tokens = []
    for subtree in tree:
        if subtree.height() == 2:
            # Preterminal node: its single child is the surface token.
            tok = subtree[0]
            pos = subtree.label()
            if pos != "-NONE-":  # skip traces and other empty elements
                tokens.append([tok, pos])
        elif isinstance(subtree, nltk.tree.Tree):
            # Recurse into nested constituents.
            tokens.extend(traverse(subtree))
    return tokens
Example #12
def test_ptb_tok_config2():
    cfg = nlp.get_global_PTB_config()
    cfg.normalize_quotes = u"latex"
    assert cfg.normalize_quotes == u"latex"