Esempio n. 1
0
        # We restore the "ß" after parsing.
        tokens_ss = [t.replace(u"ß", "ss") for t in tokens]
        tokens_ss = _Parser.find_tags(self, tokens_ss, **kwargs)
        return [[w] + tokens_ss[i][1:] for i, w in enumerate(tokens)]


# German ("de") parser instance; every resource file is resolved relative
# to MODULE.
parser = Parser(
    lexicon=os.path.join(MODULE, "de-lexicon.txt"),
    frequency=os.path.join(MODULE, "de-frequency.txt"),
    morphology=os.path.join(MODULE, "de-morphology.txt"),
    context=os.path.join(MODULE, "de-context.txt"),
    # NOTE(review): presumably the fallback tags for unknown words, proper
    # nouns and numbers -- confirm against Parser.
    default=("NN", "NE", "CARDNUM"),
    language="de",
)

# Module-level alias that exposes the parser's lexicon directly.
lexicon = parser.lexicon

# Spelling resource built from the German spelling file next to the module.
spelling = Spelling(path=os.path.join(MODULE, "de-spelling.txt"))


def tokenize(s, *args, **kwargs):
    """ Tokenizes the string s with the module-level parser.
        Extra positional and keyword arguments are forwarded unchanged to
        parser.find_tokens(), whose result is returned as-is
        (a list of sentences with punctuation marks split from words).
    """
    sentences = parser.find_tokens(s, *args, **kwargs)
    return sentences


def parse(s, *args, **kwargs):
    """ Parses the string s with the module-level parser.
        Extra positional and keyword arguments are forwarded unchanged to
        parser.parse(), whose result (a tagged Unicode string) is returned.
    """
    tagged = parser.parse(s, *args, **kwargs)
    return tagged


def parsetree(s, *args, **kwargs):
Esempio n. 2
0
        return _Parser.find_tags(self, tokens, **kwargs)

# Russian ("ru") parser instance; every resource file is resolved relative
# to MODULE.
parser = Parser(
    # A dict of known words => most frequent tag.
    lexicon=os.path.join(MODULE, "ru-lexicon.txt"),
    # A dict of word frequency.
    frequency=os.path.join(MODULE, "ru-frequency.txt"),
    # A SLP classifier model.
    # NOTE(review): the previous comment said "trained on WSJ (01-07)", which
    # looks inherited from an English module -- confirm the training corpus.
    model=os.path.join(MODULE, "ru-model.slp"),
    # Disabled options below still point at English ("en-*") resources.
    # NOTE(review): confirm whether Russian equivalents exist before enabling.
    #morphology=os.path.join(MODULE, "en-morphology.txt"),  # A set of suffix rules
    #context=os.path.join(MODULE, "en-context.txt"),  # A set of contextual rules.
    #entities=os.path.join(MODULE, "en-entities.txt"),  # A dict of named entities: John = NNP-PERS.
    #default=("NN", "NNP", "CD"),
    language="ru",
)


# Spelling resource built from the Russian spelling file, using the
# 'CYRILLIC' alphabet option.
spelling = Spelling(path=os.path.join(MODULE, "ru-spelling.txt"), alphabet='CYRILLIC')


def tokenize(s, *args, **kwargs):
    """ Hands the string s (plus any extra arguments) to the module-level
        parser's find_tokens() and returns its result:
        a list of sentences in which punctuation is split from words.
    """
    find_tokens = parser.find_tokens
    return find_tokens(s, *args, **kwargs)


def parse(s, *args, **kwargs):
    """ Hands the string s (plus any extra arguments) to the module-level
        parser's parse() and returns its result: a tagged Unicode string.
    """
    run_parser = parser.parse
    return run_parser(s, *args, **kwargs)