Example 1
    def test_nounlike_noun(self):
        noun_word = Word(u"dog", u"dog", u"NN")
        output = semantic_utils.handle_nounlike(noun_word)
        self.assertIsInstance(output, HasKeyword)
        noun_word = Word(u"dog", u"dog", u"NNP")
        output = semantic_utils.handle_nounlike(noun_word)
        self.assertIsInstance(output, HasKeyword)
Example 2
    def test_match_words(self):
        class SomeRegex(QuestionTemplate):
            def interpret(self, match):
                return match

        words = [Word(u"|@€đ€łł@ð«|µnþ", u"hello"), Word(u"a", u"b", u"c")]
        match, _ = SomeRegex().get_interpretation(words)
        self.assertEqual(words, match.words)
Example 3
    def test_match_words(self):
        class SomeRegex(RegexTemplate):
            def semantics(self, match):
                return match

        words = [Word(u"|@€đ€łł@ð«|µnþ", u"hello"), Word(u"a", u"b", u"c")]
        match, _ = SomeRegex().get_semantics(words)
        self.assertEqual(words, match.words)
Example 4
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs nltk tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)

    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # Recommended tokenizer doesn't handle non-ascii characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Discard composite tags like JJ|CC, keeping the first alternative.
        # Decode as ASCII: Penn-style POS tags are plain ASCII.
        if sys.version_info[0] == 3:
            word.pos = pos.split("|")[0]
        else:
            word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case the lemma is example-based: had it been
            # rule-based, the result would already be unicode (the input
            # was unicode). English is ASCII, so decoding is safe.
            if sys.version_info[0] == 2:
                lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
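A minimal usage sketch for the tagger above (assuming quepy is installed and the NLTK data it needs, a POS-tagger model plus WordNet, has been downloaded; the sentence is only illustrative):

from quepy.nltktagger import run_nltktagger

words = run_nltktagger(u"dogs were running")
for word in words:
    # Each Word carries the surface token, its lemma, and a Penn-style
    # POS tag, e.g. (u"dogs", u"dog", u"NNS").
    print(word.token, word.lemma, word.pos)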
Example 5
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs nltk tagger on `string` and returns a list of
    :class:`quepy.freeling.Word` objects.
    """
    assert_valid_encoding(string)

    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # Recommended tokenizer doesn't handle non-ascii characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Discard composite tags like JJ|CC, keeping the first alternative.
        # Decode as ASCII: Penn-style POS tags are plain ASCII.
        word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case the lemma is example-based: had it been
            # rule-based, the result would already be unicode (the input
            # was unicode). English is ASCII, so decoding is safe.
            lemma = lemma.decode("ascii")
        word.lemma = lemma
        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
Example 6
    def test_no_ir(self):
        class SomeRegex(QuestionTemplate):
            regex = Lemma(u"hello")

        regexinstance = SomeRegex()
        words = [Word(u"hi", u"hello")]
        self.assertRaises(NotImplementedError,
                          regexinstance.get_interpretation, words)
Example 7
    def test_no_semantics(self):
        class SomeRegex(RegexTemplate):
            regex = Lemma(u"hello")

        regexinstance = SomeRegex()
        words = [Word(u"hi", u"hello")]
        self.assertRaises(NotImplementedError, regexinstance.get_semantics,
                          words)
Example 8
def _read_line(text):
    """
    Parses one line of the FreeLing command-line output.
    """

    assert_valid_encoding(text)
    assert u"#" in text

    start, text = text.split(u"#", 1)

    start = start.strip().rsplit(u" ", 1)[0]
    text = text.strip()
    token_has_spaces = False

    if start.count(u" ") > 2:
        token = FREELING_FUNCTION_OUTPUT_REGEX.match(start)
        assert token is not None
        token = token.group()
        token_has_spaces = True
    else:
        token = start.split(u" ")[0]

    if token_has_spaces:
        text = text.replace(token, u"<token>")

    text = text.split(u" ")
    assert len(text) % 4 == 0

    best_word = None
    while text:
        word = Word(token)
        word.sense = text.pop()
        try:
            word.prob = float(text.pop())
        except ValueError:
            raise TaggingError(u"The probability field of a"
                               u" word was non-numerical")
        if word.prob < 0 or word.prob > 1:
            raise TaggingError(u"The probability field of a"
                               u" word was not a probability")

        word.pos = text.pop()
        word.lemma = text.pop()

        if word.pos in (u"NNP", u"MR"):
            word.token = word.token.replace(u"_", u" ")

        if word.token == u"?" and word.pos == u"Fit":
            word.pos = u"."

        if not best_word or word.prob > best_word.prob:
            best_word = word

    return best_word
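A sketch of the line format this parser expects, inferred from the code above (the exact FreeLing output layout is an assumption): the token leads the part before "#", and each analysis after "#" is a "lemma pos prob sense" group; the highest-probability analysis wins.

# Hypothetical input line; the trailing field before "#" is discarded
# by the rsplit above.
line = u"dogs x # dog NNS 0.7 sense-a dogs NN 0.3 sense-b"
word = _read_line(line)
# word.token == u"dogs"; the 0.7 analysis wins, so:
# word.lemma == u"dog", word.pos == u"NNS", word.prob == 0.7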
Example 9
    def test_regex_empty(self):
        class SomeRegex(QuestionTemplate):
            def interpret(self, match):
                return Mockrule, "YES!"

        regexinstance = SomeRegex()
        words = [Word(u"hi", u"hello")]
        ir, userdata = regexinstance.get_interpretation(words)
        self.assertTrue(ir is Mockrule)
        self.assertEqual(userdata, "YES!")
Example 10
    def test_regex_empty(self):
        class SomeRegex(RegexTemplate):
            def semantics(self, match):
                return Mockrule, "YES!"

        regexinstance = SomeRegex()
        words = [Word(u"hi", u"hello")]
        semantics, userdata = regexinstance.get_semantics(words)
        self.assertTrue(semantics is Mockrule)
        self.assertEqual(userdata, "YES!")
Example 11
    def test_nounlike_handler(self):
        from quepy import handlers

        class DogType(FixedType):
            fixedtype = "dog"

        class MyHandler(handlers.Handler):
            def check(self, word):
                return word.lemma == "special_dog"

            def handler(self, word):
                return DogType()

        handlers.register(MyHandler)
        noun_word = Word(u"lazzy", u"special_dog", u"NN")
        output = semantic_utils.handle_nounlike(noun_word)
        self.assertIsInstance(output, DogType)
Example 12
def run_spacytagger(string):
    """
    Runs spacy on `string` and returns a list of
    :class:`quepy.tagger.Word` objects.
    """
    assert_valid_encoding(string)

    # For now, at least, perform our own pre-processing,
    # to ensure terms like "presynaptic" are easily found later.
    string = ' '.join(string.split())
    string = collapse(string)

    doc = nlp(string)  # NOTE: spaCy expects and returns unicode

    spans = [(ent_id, nlp.vocab.strings[ent_id], doc[start:end])
             for ent_id, start, end in matcher(doc)]
    for ent_id, label_id, span in spans:
        span.merge(label=label_id, tag='NNP' if label_id else span.root.tag_)

    # tag_ is the "fine-grained" POS
    words = [Word(x.text, x.lemma_, x.tag_) for x in doc]

    # The following is only for logging purposes; if necessary, it could be removed for production
    log.info(' '.join([t.text + '[' + str(t.i) + ']' for t in doc]))
    indent = "  "
    longest = max(len(t.text) for t in doc)
    column = (len(doc) - 1) * len(indent) + longest + 2
    wout = '{:' + str(column) + '}| '

    def trav_tree(indents, node):
        log.info(wout.format((indent * indents) + node.text) + ', '.join(
            str(x) for x in [node.i, node.is_oov, node.lemma_, node.tag_,
                             "<-" + str(node.left_edge),
                             str(node.right_edge) + "->"]))
        for el in node.children:
            # NOTE: Could also change display based on node.lefts and node.rights
            trav_tree(indents + 1, el)

    for sent in doc.sents:
        trav_tree(0, sent.root)
    log.info('Ents:  ' + str(doc.ents))
    log.info('NPs:   ' + str(list(doc.noun_chunks)))

    return words
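A usage sketch (assuming the module above, hypothetically named spacytagger, loads its spaCy model, matcher, and logger at import time, as its top-level names nlp, matcher, and log suggest):

from spacytagger import run_spacytagger  # hypothetical module name

words = run_spacytagger(u"Where is the hippocampus located?")
for w in words:
    # w.pos holds spaCy's fine-grained tag, e.g. u"WRB", u"VBZ", u"NN".
    print(w.token, w.lemma, w.pos)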
Example 13
    def test_match(self):
        words = [Word(u"hi", u"hello")]
        ir, userdata = self.regexinstance.get_interpretation(words)
        self.assertTrue(ir is self.mockrule)
        self.assertEqual(userdata, None)
Example 14
    def test_user_data(self):
        words = [Word(u"hi", u"hello")]
        _, userdata = self.regex_with_data.get_interpretation(words)
        self.assertEqual(userdata, 42)
Example 15
    def test_no_match(self):
        words = [Word(u"hi", u"hello"), Word(u"girl", u"girl")]
        ir, userdata = self.regexinstance.get_interpretation(words)
        self.assertEqual(ir, None)
        self.assertEqual(userdata, None)
Example 16
    def test_user_data(self):
        words = [Word(u"hi", u"hello")]
        _, userdata = self.regex_with_data.get_semantics(words)
        self.assertEqual(userdata, 42)
Example 17
    def test_no_match(self):
        words = [Word(u"hi", u"hello"), Word(u"girl", u"girl")]
        semantics, userdata = self.regexinstance.get_semantics(words)
        self.assertEqual(semantics, None)
        self.assertEqual(userdata, None)
Example 18
    def test_match(self):
        words = [Word(u"hi", u"hello")]
        semantics, userdata = self.regexinstance.get_semantics(words)
        self.assertTrue(semantics is self.mockrule)
        self.assertEqual(userdata, None)
Example 19
    def test_nested_particle(self):
        words = [Word(x, x) for x in u"Jim 's car be Tonny".split()]
        match, _ = self.nestedregex.get_semantics(words)
        self.assertEqual(match.personasset.words[0], words[0])
        self.assertRaises(AttributeError, lambda: match.personasset.another)
Example 20
    def test_handle_noun_phrase(self):
        noun_phrase = [Word(u"cool", u"cool", u"JJ"),
                       Word(u"dogs", u"dog", u"NNS")]
        output = semantic_utils.handle_noun_phrase(noun_phrase)
        self.assertIsInstance(output, HasKeyword)
Example 21
    def test_attrs(self):
        words = [Word(x, x) for x in u"Jim be Tonny".split()]
        match, _ = self.personregex.get_semantics(words)
        self.assertEqual(match.another.words[0], words[-1])
        self.assertEqual(match.person.words[0], words[0])
        self.assertRaises(AttributeError, lambda: match.pirulo)
Example 22
    def test_nounlike_unhandled(self):
        non_noun_word = Word(u"ran", u"run", u"VB")
        self.assertRaises(semantic_utils.UnhandledWord,
                          semantic_utils.handle_nounlike,
                          non_noun_word)