class Parser:
    def __init__(self, pcfg=None):
        # Fall back to the default grammar when no PCFG is supplied.
        if pcfg is None:
            pcfg = build_model()

        self.pcfg = pcfg
        self.tokenizer = PennTreebankTokenizer()

        # Return NLTK trees when NLTK is installed, raw trees otherwise.
        if nltk_is_available:
            self.parse = self.nltk_parse
        else:
            self.parse = self.raw_parse

    def norm_parse(self, sentence):
        words = self.tokenizer.tokenize(sentence)
        # Lowercase a capitalized sentence-initial word so it matches the
        # mostly lowercase lexicon.
        if is_cap_word(words[0]):
            words[0] = words[0].lower()

        norm_words = []
        for word in words:
            if isinstance(word, tuple):
                # This word is already normalized to the Treebank conventions.
                norm_words.append(word)
            else:
                # Rare-word normalization: keep (normalized form, original word).
                norm_words.append((self.pcfg.norm_word(word), word))
        return CKY(self.pcfg, norm_words)

    def raw_parse(self, sentence):
        tree = self.norm_parse(sentence)
        # Undo the Chomsky normal form that CKY uses internally.
        un_chomsky_normal_form(tree)
        return tree

    def nltk_parse(self, sentence):
        return nltk_tree(self.raw_parse(sentence))
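
For context, here is a minimal usage sketch of the class above. The module name pcfg_parser is an assumption; the snippet does not show where Parser and its helpers actually live.

# Hypothetical usage sketch: "pcfg_parser" stands in for whichever module
# in this project defines Parser, build_model, CKY and the other helpers.
from pcfg_parser import Parser

parser = Parser()   # with no argument, build_model() supplies the grammar
tree = parser.parse("The cat sat on the mat .")
print(tree)         # an NLTK Tree if NLTK is installed, otherwise the raw tree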
Example #3
class Parser:
    def __init__(self, pcfg):
        self.pcfg = pcfg
        self.tokenizer = PennTreebankTokenizer()

    def parse(self, sentence):
        words = self.tokenizer.tokenize(sentence)
        norm_words = []
        for word in words:
            # Rare-word normalization, keeping the original word alongside it.
            norm_words.append((self.pcfg.norm_word(word), word))
        tree = CKY(self.pcfg, norm_words)
        # Strip the binarization marker from the root label (e.g. "S|..." -> "S").
        tree[0] = tree[0].split("|")[0]
        return tree
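
For illustration, the final cleanup line behaves like this; the label "S|<NP-VP>" is a made-up example of a binarized node name:

label = "S|<NP-VP>"           # hypothetical root label left over from binarization
print(label.split("|")[0])    # prints "S", the original nonterminal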
Example #4
class Parser:
    def __init__(self, pcfg):
        self.pcfg = pcfg
        self.tokenizer = PennTreebankTokenizer()