Example #1
def _read_line(text):
    """
    Parses a line of the FreeLing command-line output.
    """

    assert_valid_encoding(text)
    assert u"#" in text

    start, text = text.split(u"#", 1)

    start = start.strip().rsplit(u" ", 1)[0]
    text = text.strip()
    token_has_spaces = False

    if start.count(u" ") > 2:
        token = FREELING_FUNCTION_OUTPUT_REGEX.match(start)
        assert token is not None
        token = token.group()
        token_has_spaces = True
    else:
        token = start.split(u" ")[0]

    if token_has_spaces:
        text = text.replace(token, u"<token>")

    text = text.split(u" ")
    assert len(text) % 4 == 0

    best_word = None
    # Each analysis is a group of "lemma pos prob sense"; pop the fields from
    # the end of the list and keep the most probable reading.
    while text:
        word = Word(token)
        word.sense = text.pop()
        try:
            word.prob = float(text.pop())
        except ValueError:
            raise TaggingError(u"The probability field of a"
                               u" word was non-numerical")
        if word.prob < 0 or word.prob > 1:
            raise TaggingError(u"The probability field of a"
                               u" word was not a probability")

        word.pos = text.pop()
        word.lemma = text.pop()

        if word.pos in (u"NNP", u"MR"):
            word.token = word.token.replace(u"_", u" ")

        if word.token == u"?" and word.pos == u"Fit":
            word.pos = u"."

        if not best_word or word.prob > best_word.prob:
            best_word = word

    return best_word
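A minimal usage sketch for the parser above. It relies on the module-level helpers visible in the code (Word, TaggingError, FREELING_FUNCTION_OUTPUT_REGEX, assert_valid_encoding from quepy), and the input line is hypothetical: it only mirrors the layout _read_line consumes (token fields before "#", then groups of "lemma pos prob sense"), not necessarily real FreeLing output.

# Hypothetical FreeLing-style line (layout assumed from the parser above).
line = u"cats cat NNS 1 # cat NNS 0.9 02121620-n"

best = _read_line(line)
# best is the analysis with the highest probability.
print(best.token, best.lemma, best.pos, best.prob, best.sense)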
Example #2
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs the NLTK tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)

    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # The recommended tokenizer (word_tokenize) doesn't handle non-ASCII characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Keep only the first tag of compound tags like JJ|CC.
        # On Python 2 the tag is a bytestring; decoding as ASCII is safe
        # because Penn-style POS tags are plain ASCII.
        if sys.version_info[0] == 3:
            word.pos = pos.split("|")[0]
        else:
            word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Look up the lemma with WordNet's morphy, using the mapped POS tag.
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case the lemma is example-based: a rule-based result
            # would already be unicode (the input was unicode). English is
            # ASCII, so decoding is safe.
            if sys.version_info[0] == 2:
                lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
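A short usage sketch for the tagger above, assuming nltk and sys are imported at module level together with Word, assert_valid_encoding, penn_to_morphy_tag and the _penn_to_morphy_tag global, and that the required NLTK data (the POS tagger model and the wordnet corpus) is already downloaded.

words = run_nltktagger(u"The cats were sleeping on the mat")
for word in words:
    # Each Word carries the token, its Penn-style POS tag and a lemma.
    print(word.token, word.pos, word.lemma)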
Example #3
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs the NLTK tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)

    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # The recommended tokenizer (word_tokenize) doesn't handle non-ASCII characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Keep only the first tag of compound tags like JJ|CC.
        # This variant assumes Python 2: the tag is a bytestring, and decoding
        # as ASCII is safe because Penn-style POS tags are plain ASCII.
        word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Look up the lemma with WordNet's morphy, using the mapped POS tag.
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case the lemma is example-based: a rule-based result
            # would already be unicode (the input was unicode). English is
            # ASCII, so decoding the Python 2 bytestring is safe.
            lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
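Both tagger variants call a penn_to_morphy_tag helper that is not shown here. Below is a minimal sketch of what it might look like, assuming it simply matches Penn tag prefixes against the _penn_to_morphy_tag dictionary; the actual quepy implementation may differ.

def penn_to_morphy_tag(tag):
    # Map a Penn-style tag such as "NNS" or "VBD" to the WordNet POS constant
    # registered under its prefix, or return None when there is no mapping.
    for penn, morphy in _penn_to_morphy_tag.items():
        if tag.startswith(penn):
            return morphy
    return None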