def build_parser(brill_dir):
    lexicon = Lexicon(
            path = os.path.join(brill_dir, "es-lexicon.txt"),
      morphology = os.path.join(brill_dir, "es-morphology.txt"),
         context = os.path.join(brill_dir, "es-context.txt"),
        language = "es"
    )
    parser = SpanishParser(
         lexicon = lexicon,
         default = ("NCS", "NP", "Z"),
        language = "es"
    )
    return parser
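# Usage sketch (illustrative, not part of the original source): build_parser() can be
# pointed at a directory that holds the Brill data files. The directory name and the
# example sentence below are assumptions.
#
#parser = build_parser("es-brill/")
#print(parser.parse(u"El gato negro duerme."))  # => tagged Unicode string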
    def find_chunks(self, tokens, **kwargs):
        return _Parser.find_chunks(self, tokens, **kwargs)

    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)

# The parser's part-of-speech tagger requires a lexicon of tagged known words,
# and rules for unknown words. See pattern.text.Morphology and pattern.text.Context
# for further details. A tutorial on how to acquire data for the lexicon is here:
# http://www.clips.ua.ac.be/pages/using-wiktionary-to-build-an-italian-part-of-speech-tagger

lexicon = Lexicon(
        path = os.path.join(MODULE, "xx-lexicon.txt"),
  morphology = os.path.join(MODULE, "xx-morphology.txt"),
     context = os.path.join(MODULE, "xx-context.txt"),
    entities = os.path.join(MODULE, "xx-entities.txt"),
    language = "xx"
)

# Create the parser with default tags for unknown words:
# (noun, proper noun, numeric).

lexicon.load()

parser = Parser(
     lexicon = lexicon,
     default = ("NN", "NNP", "CD"),
    language = "xx"
)

# Create the sentiment lexicon,
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        if kwargs.get("tagset") in (PENN, None):
            kwargs.setdefault("map", lambda token, tag: parole2penntreebank(token, tag))
        if kwargs.get("tagset") == UNIVERSAL:
            kwargs.setdefault("map", lambda token, tag: parole2universal(token, tag))
        if kwargs.get("tagset") is PAROLE:
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)

lexicon = Lexicon(
        path = os.path.join(MODULE, "es-lexicon.txt"),
  morphology = os.path.join(MODULE, "es-morphology.txt"),
     context = os.path.join(MODULE, "es-context.txt"),
    language = "es")

parser = Parser(
     lexicon = lexicon,
     default = ("NCS", "NP", "Z"),
    language = "es")

def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)

def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)
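# Usage sketch (illustrative, not part of the module): the optional tagset keyword
# switches between the default Penn Treebank mapping, universal tags and the original
# PAROLE tags. The example sentence is an assumption; exact tags depend on the lexicon.
#
#print(parse(u"El gato negro duerme."))                    # Penn Treebank tags (default)
#print(parse(u"El gato negro duerme.", tagset=UNIVERSAL))  # universal part-of-speech tags
#print(parse(u"El gato negro duerme.", tagset=PAROLE))     # original PAROLE tags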
            lemma = singularize(word)
        # sat => sit
        if pos.startswith(("VB", "MD")):
            lemma = conjugate(word, INFINITIVE) or word
        token.append(lemma.lower())
    return tokens

class Parser(_Parser):

    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)

lexicon = Lexicon(
        path = os.path.join(MODULE, "en-lexicon.txt"),
  morphology = os.path.join(MODULE, "en-morphology.txt"),
     context = os.path.join(MODULE, "en-context.txt"),
    entities = os.path.join(MODULE, "en-entities.txt"),
    language = "en")

parser = Parser(
     lexicon = lexicon,
     default = ("NN", "NNP", "CD"),
    language = "en")

sentiment = Sentiment(
        path = os.path.join(MODULE, "en-sentiment.xml"),
      synset = "wordnet_id",
    language = "en")

spelling = Spelling(
        path = os.path.join(MODULE, "en-spelling.txt"))

def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)
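# Usage sketch (illustrative, not part of the module): the objects above back the public
# pattern.en helpers. The example sentences and the misspelled word are assumptions.
#
#print(tokenize("The black cat sat on the mat."))      # list of sentence strings
#print(parser.parse("The black cat sat on the mat."))  # tagged Unicode string
#print(sentiment("A truly wonderful book!"))           # (polarity, subjectivity)
#print(spelling.suggest("amazzing"))                   # spelling suggestions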
# Do not overwrite words already in the lexicon (which appears to result in <10% accuracy).
top = [(v, k) for k, v in entities.items() if " " not in k]
top = sorted(top, reverse=True)[:int(len(lexicon) * 0.4)] # percentage
top = [k for v, k in top]
for ne in top:
    if ne not in seen:
        lexicon.append(ne + " " + NE)

lexicon = sorted(lexicon)

open("brill-lexicon.txt", "w").write("\n".join(lexicon))

#### TEST ##########################################################################################
# Create a Pattern Brill tagger and evaluate accuracy on the test data.

# 11) Load lexicon data (it is a lazy-loading object).
lexicon = Lexicon()
lexicon.path = "brill-lexicon.txt"
lexicon.lexical_rules.path = "brill-lexical.txt"
lexicon.contextual_rules.path = "brill-contextual.txt"
lexicon.named_entities.tag = "NP"
lexicon.load()
lexicon.lexical_rules.load()
lexicon.contextual_rules.load()
lexicon.named_entities.load()

# For testing with or without lexical and contextual rules:
#for i in reversed(range(len(lexicon.lexical_rules)-1)):
#    del lexicon.lexical_rules[i]
#for i in reversed(range(len(lexicon.contextual_rules)-1)):
#    del lexicon.contextual_rules[i]
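# Sketch (not from the original script): one way to measure tagging accuracy once the
# lexicon and rules are loaded. Parser and Parser.find_tags() are part of pattern.text;
# the `test` variable stands in for the held-out treebank sentences, and its format
# ([(word, tag), ...] per sentence) is an assumption.
#
#from pattern.text import Parser
#parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"))
#correct, total = 0, 0
#for sentence in test:
#    words = [word for word, tag in sentence]
#    for (_, predicted), (_, gold) in zip(parser.find_tags(words), sentence):
#        correct += predicted == gold
#        total += 1
#print(float(correct) / total)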
    if int(frequency) >= 1: # Adjust to tweak file size.
        for tag in tags.split(", "):
            if tag:
                w.append("%s %s" % (word, tag)); break

open("it-lexicon.txt", "w", encoding="utf-8").write("\n".join(w))

# Load the lexicon and the rules in an instance of ItalianParser:

from pattern.text import Lexicon

lexicon = Lexicon(
        path = "it-lexicon.txt",
  morphology = "it-morphology.txt",
     context = "it-context.txt",
    language = "it"
)

parser = ItalianParser(
     lexicon = lexicon,
     default = ("NN", "NNP", "CD"),
    language = "it"
)

def parse(s, *args, **kwargs):
    return parser.parse(s, *args, **kwargs)

# It is still missing features (notably lemmatization),
# but our Italian parser is essentially ready for use:

print(parse("Il gatto nero faceva le fusa."))
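# Sanity check (illustrative, not from the tutorial): the generated it-lexicon.txt holds
# one "word TAG" entry per line, so printing the first few lines shows what the tagger
# will work with. The file is assumed to be in the working directory.
#
#for line in open("it-lexicon.txt", encoding="utf-8").readlines()[:5]:
#    print(line.strip())  # e.g. "gatto NN" (actual entries depend on the Wiktionary dump)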
    def find_tags(self, tokens, **kwargs):
        # Parser.find_tags() can take an optional map(token, tag) function,
        # which returns an updated (token, tag)-tuple for each token.
        kwargs.setdefault("map", parole2penntreebank)
        return Parser.find_tags(self, tokens, **kwargs)

# Load the lexicon and the rules in an instance of SpanishParser:

from pattern.text import Lexicon

lexicon = Lexicon(
        path = "es-lexicon.txt",
  morphology = "es-morphology.txt",
     context = "es-context.txt",
    language = "es"
)

parser = SpanishParser(
     lexicon = lexicon,
     default = ("NCS", "NP", "Z"),
    language = "es"
)

def parse(s, *args, **kwargs):
    return parser.parse(s, *args, **kwargs)

# It is still missing features (notably lemmatization),
# but our Spanish parser is essentially ready for use:
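# Usage sketch (the example sentence is an assumption, not taken from the original text):
print(parse(u"El gato negro duerme."))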