Example 1
import os

from pattern.text import Lexicon

# SpanishParser is the pattern.text.Parser subclass shown in Example 7.

def build_parser(brill_dir):
    lexicon = Lexicon(
            path = os.path.join(brill_dir, "es-lexicon.txt"),
      morphology = os.path.join(brill_dir, "es-morphology.txt"),
         context = os.path.join(brill_dir, "es-context.txt"),
        language = "es"
    )

    parser = SpanishParser(
         lexicon = lexicon,
         default = ("NCS", "NP", "Z"),
        language = "es"
    )
    return parser
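
# A minimal usage sketch, assuming the Brill data files (es-lexicon.txt, etc.)
# live in a local "brill/" directory (the path and sentence are hypothetical):

parser = build_parser("brill/")
print(parser.parse("El gato negro duerme."))
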
Example 2
        
    def find_chunks(self, tokens, **kwargs):
        return _Parser.find_chunks(self, tokens, **kwargs)

    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)

# The parser's part-of-speech tagger requires a lexicon of tagged known words,
# and rules for unknown words. See pattern.text.Morphology and pattern.text.Context
# for further details. A tutorial on how to acquire data for the lexicon is here:
# http://www.clips.ua.ac.be/pages/using-wiktionary-to-build-an-italian-part-of-speech-tagger

lexicon = Lexicon(
        path = os.path.join(MODULE, "xx-lexicon.txt"), 
  morphology = os.path.join(MODULE, "xx-morphology.txt"), 
     context = os.path.join(MODULE, "xx-context.txt"),
    entities = os.path.join(MODULE, "xx-entities.txt"),
    language = "xx"
)
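
# A rough sketch of the four data files (entries are illustrative, not taken
# from the actual files; the rule syntax follows the Brill-tagger formats):
#
#   xx-lexicon.txt     one "word tag" pair per line, e.g. "cat NN"
#   xx-morphology.txt  lexical rules for unknown words, e.g. "NN s fhassuf 1 NNS x"
#                      (if a word tagged NN ends in "s", retag it NNS)
#   xx-context.txt     contextual rules, e.g. "VBD VBN PREVTAG VBZ"
#                      (retag VBD as VBN when the preceding tag is VBZ)
#   xx-entities.txt    known named entities, one per line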

# Create the parser with default tags for unknown words
# (noun, proper noun, numeric): an unknown word is tagged NN by default,
# NNP if it is capitalized, and CD if it is a number.

lexicon.load()
parser = Parser(
     lexicon = lexicon,
     default = ("NN", "NNP", "CD"),
    language = "xx"
)
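
# A hedged sketch of what those defaults do for tokens not in the lexicon
# (the tokens and resulting tags below are illustrative, and morphology or
# context rules may retag them):
#
#     parser.find_tags(["blarg", "Blarg", "42"])
#     # => [["blarg", "NN"], ["Blarg", "NNP"], ["42", "CD"]]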

# Create the sentiment lexicon,
Example 3
        return find_lemmata(tokens)

    def find_tags(self, tokens, **kwargs):
        if kwargs.get("tagset") in (PENN, None):
            kwargs.setdefault(
                "map", lambda token, tag: parole2penntreebank(token, tag))
        if kwargs.get("tagset") == UNIVERSAL:
            kwargs.setdefault("map",
                              lambda token, tag: parole2universal(token, tag))
        if kwargs.get("tagset") is PAROLE:
            kwargs.setdefault("map", lambda token, tag: (token, tag))
        return _Parser.find_tags(self, tokens, **kwargs)


lexicon = Lexicon(path=os.path.join(MODULE, "es-lexicon.txt"),
                  morphology=os.path.join(MODULE, "es-morphology.txt"),
                  context=os.path.join(MODULE, "es-context.txt"),
                  language="es")

parser = Parser(lexicon=lexicon, default=("NCS", "NP", "Z"), language="es")


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Returns a tagged Unicode string.
    """
    return parser.parse(s, *args, **kwargs)
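
# A hedged usage sketch of the tagset mapping above (the sentence is
# illustrative; output tags are not verified):

print(parse("El gato negro duerme."))                    # PAROLE tags mapped to Penn Treebank (default)
print(parse("El gato negro duerme.", tagset=UNIVERSAL))  # mapped to universal tags via parole2universal
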
Example 4
            lemma = singularize(word)
        # sat => sit
        if pos.startswith(("VB", "MD")):
            lemma = conjugate(word, INFINITIVE) or word
        token.append(lemma.lower())
    return tokens


class Parser(_Parser):
    def find_lemmata(self, tokens, **kwargs):
        return find_lemmata(tokens)


lexicon = Lexicon(path=os.path.join(MODULE, "en-lexicon.txt"),
                  morphology=os.path.join(MODULE, "en-morphology.txt"),
                  context=os.path.join(MODULE, "en-context.txt"),
                  entities=os.path.join(MODULE, "en-entities.txt"),
                  language="en")
parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"), language="en")
sentiment = Sentiment(path=os.path.join(MODULE, "en-sentiment.xml"),
                      synset="wordnet_id",
                      language="en")
spelling = Spelling(path=os.path.join(MODULE, "en-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Returns a list of sentences, where punctuation marks have been split from words.
    """
    return parser.find_tokens(s, *args, **kwargs)
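
# A hedged usage sketch for the objects above (return values are paraphrased
# from pattern.text, not verified output):

print(tokenize("The black cat purrs. It is happy."))  # ['The black cat purrs .', 'It is happy .']
print(sentiment("A wonderful day!"))                  # (polarity, subjectivity) tuple
print(spelling.suggest("wonderfull"))                 # list of (candidate, confidence) pairs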

Example 5
#     Do not overwrite words already in the lexicon (overwriting them appears to drop accuracy below 10%).
top = [(v, k) for k, v in entities.items() if " " not in k]
top = sorted(top, reverse=True)[:int(len(lexicon) * 0.4)]  # cap at 40% of the lexicon size
top = [k for v, k in top]
for ne in top:
    if ne not in seen:
        lexicon.append(ne + " " + NE)
lexicon = sorted(lexicon)

open("brill-lexicon.txt", "w").write("\n".join(lexicon))

#### TEST ##########################################################################################
# Create a Pattern Brill tagger and evaluate accuracy on the test data.

# 11) Load lexicon data (it is a lazy-loading object).
lexicon = Lexicon()
lexicon.path = "brill-lexicon.txt"
lexicon.lexical_rules.path = "brill-lexical.txt"
lexicon.contextual_rules.path = "brill-contextual.txt"
lexicon.named_entities.tag = "NP"
lexicon.load()
lexicon.lexical_rules.load()
lexicon.contextual_rules.load()
lexicon.named_entities.load()

# For testing with or without lexical and contextual rules:
#for i in reversed(range(len(lexicon.lexical_rules)-1)):
#    del lexicon.lexical_rules[i]
#for i in reversed(range(len(lexicon.contextual_rules)-1)):
#    del lexicon.contextual_rules[i]
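
# A minimal accuracy sketch, assuming `test` holds tagged sentences as lists
# of (word, tag) pairs, the same format as the training data (the name `test`
# and the Parser import are assumptions, not part of the script above):

from pattern.text import Parser

parser = Parser(lexicon=lexicon, default=("NN", "NNP", "CD"))
i = n = 0
for sentence in test:
    words = [word for word, tag in sentence]
    for (word, predicted), (_, gold) in zip(parser.find_tags(words), sentence):
        i += int(predicted == gold)
        n += 1
print(float(i) / n)  # tagging accuracy on the test data
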
Example 6
    if int(frequency) >= 1:  # Adjust to tweak file size.
        for tag in tags.split(", "):
            if tag:
                w.append("%s %s" % (word, tag))
                break

with open("it-lexicon.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(w))

# Load the lexicon and the rules in an instance of ItalianParser:

from pattern.text import Lexicon
 
lexicon = Lexicon(
        path = "it-lexicon.txt", 
  morphology = "it-morphology.txt", 
     context = "it-context.txt", 
    language = "it"
)
 
parser = ItalianParser(
     lexicon = lexicon,
     default = ("NN", "NNP", "CD"),
    language = "it"
)
 
def parse(s, *args, **kwargs):
    return parser.parse(s, *args, **kwargs)

# It still lacks some features (notably lemmatization), but our Italian parser is essentially ready for use:

print(parse("Il gatto nero faceva le fusa."))
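
# Lemmatization could be added along the lines of the English find_lemmata()
# in Example 4, assuming Italian singularize() and conjugate() helpers are
# available (a sketch, not a tested implementation):

def find_lemmata(tokens):
    for token in tokens:
        word, pos, lemma = token[0], token[1], token[0]
        if pos.startswith("NN"):                          # gatti => gatto
            lemma = singularize(word)
        if pos.startswith("VB"):                          # faceva => fare
            lemma = conjugate(word, INFINITIVE) or word
        token.append(lemma.lower())
    return tokens
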
Example 7
     
    def find_tags(self, tokens, **kwargs):

        # Parser.find_tags() can take an optional map(token, tag) function,
        # which returns an updated (token, tag)-tuple for each token. 

        kwargs.setdefault("map", parole2penntreebank)
        return Parser.find_tags(self, tokens, **kwargs)

# Load the lexicon and the rules in an instance of SpanishParser:

from pattern.text import Lexicon
 
lexicon = Lexicon(
        path = "es-lexicon.txt", 
  morphology = "es-morphology.txt", 
     context = "es-context.txt", 
    language = "es"
)
 
parser = SpanishParser(
     lexicon = lexicon,
     default = ("NCS", "NP", "Z"),
    language = "es"
)
 
def parse(s, *args, **kwargs):
    return parser.parse(s, *args, **kwargs)
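
# Because find_tags() uses kwargs.setdefault(), a map() passed through parse()
# overrides the default PAROLE-to-Penn-Treebank mapping. A hedged sketch: the
# identity map below keeps the original PAROLE tags.

print(parse("El gato negro duerme.", map=lambda token, tag: (token, tag)))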

# It still lacks some features (notably lemmatization), but our Spanish parser is essentially
# ready for use: