Example #1
0
    def annotate(self):
        """Suggest easier replacements for the difficult words of a sentence.

        The sentence is read from the "s" field of the JSON request body
        (``cherrypy.request.json``; defaults to an empty string) and parsed
        with ``self.nlp_parse_request``.  A token is considered when its
        ``entity_position`` is 'O', it is not punctuation, and its difficulty
        is above 7.  Difficulty is ``-int(log_prob)``, plus 3 when the
        lower-cased token is in ``self.hard_words``.

        For such a token, WordNet lemma names of its synsets are tried first,
        then the lemma names of their hypernyms.  A candidate must be in
        ``self.easy_words`` (as-is or inflected) and differ from the token.

        Returns:
            A list of ``((start_idx, end_idx), info)`` pairs, one per token
            for which a replacement was found.  ``info`` is a dict with the
            keys "raw", "difficulty", "replacement", "definition" and "url".
        """
        data = cherrypy.request.json
        sentence = data.get("s", u"")

        parse_result = self.nlp_parse_request(sentence)
        print >> sys.stderr, parse_result

        # Placeholder values: this method does no real lookup for these fields.
        url = "http://blah"
        definition = "blah blah"

        response = []
        for token in parse_result:
            difficulty = -int(token["log_prob"])
            raw = token["as_is"]
            if raw.lower() in self.hard_words:
                difficulty += 3

            # Skip named-entity tokens, punctuation and words that are easy enough.
            if token["entity_position"] != 'O' or token["is_punct"] or difficulty <= 7:
                continue

            synset_words = []
            hypernym_synset_words = []
            for synset in wn.synsets(raw.lower()):
                synset_words += synset.lemma_names()
                for hypernym in synset.hypernyms():
                    hypernym_synset_words += hypernym.lemma_names()

            print >> sys.stderr, "synset_words:", synset_words
            print >> sys.stderr, "hypernym_synset_words:", hypernym_synset_words
            print >> sys.stderr, "part_of_speech:", token["pos"]

            original_logprob = token["log_prob"]
            replacement = ""

            # Direct synonyms are tried before hypernym words.  The two lists
            # are simply concatenated: a string marker between them used to be
            # fed to find_form()/easy_words like a real candidate and could be
            # returned as a (bogus) replacement.
            for w in synset_words + hypernym_synset_words:
                inflected_form = conjugate.find_form(raw, token["lemma"], w, token["pos"])
                print >> sys.stderr, "ALTERNATE:", w, inflected_form

                is_easy_word = (w in self.easy_words or inflected_form in self.easy_words)
                if w == raw or inflected_form == raw or not is_easy_word:
                    continue

                logprob = self.nlp_wordprob_request(w)
                print >> sys.stderr, "LOGPROB", logprob
                if logprob is not None and logprob > original_logprob:
                    replacement = inflected_form
                # NOTE(review): only the first easy candidate is ever scored;
                # if it is not more probable than the original word, no
                # replacement is offered.  Confirm this is intended (it keeps
                # it to one nlp_wordprob_request call per token) rather than
                # picking the most probable candidate.
                break

            if not replacement:
                continue

            result_obj = {"raw": raw,
                          "difficulty": difficulty,
                          "replacement": replacement,
                          "definition": definition,
                          "url": url}
            response.append(((token["start_idx"], token["end_idx"]), result_obj))

        return response
def test_verb_form():
    """Check find_form against the expected inflections for VERB inputs."""
    # Each entry: (find_form arguments, expected result).
    cases = [
        ((u'singing', u'sing', u'ring', u'VERB'), u'ringing'),
        ((u'drank', u'drink', u'think', u'VERB'), u'thought'),
        ((u'drink', u'drink', u'think', u'VERB'), u'think'),
    ]
    for args, expected in cases:
        assert find_form(*args) == expected
def test_noun_form():
    """Check find_form against the expected inflections for NOUN inputs."""
    # NB pattern.en pluralizes 'hippopotamus' as 'hippopotamuss' :( :( :(
    # Each entry: (find_form arguments, expected result).
    cases = [
        ((u'cats', u'cat', u'child', u'NOUN'), u'children'),
        ((u'children', u'child', u'cat', u'NOUN'), u'cats'),
        ((u'cat', u'cat', u'hippopotamus', u'NOUN'), u'hippopotamus'),
    ]
    for args, expected in cases:
        assert find_form(*args) == expected
def test_adjective_form():
    """Check find_form against the expected inflections for ADJ inputs."""
    # Each entry: (find_form arguments, expected result).
    cases = [
        ((u'bigger', u'big', u'absolute', u'ADJ'), u'more absolute'),
        ((u'biggest', u'big', u'small', u'ADJ'), u'smallest'),
    ]
    for args, expected in cases:
        assert find_form(*args) == expected