def test_nounlike_noun(self):
    noun_word = Word(u"dog", u"dog", u"NN")
    output = semantic_utils.handle_nounlike(noun_word)
    self.assertIsInstance(output, HasKeyword)

    noun_word = Word(u"dog", u"dog", u"NNP")
    output = semantic_utils.handle_nounlike(noun_word)
    self.assertIsInstance(output, HasKeyword)
def test_match_words(self):
    class SomeRegex(QuestionTemplate):
        def interpret(self, match):
            return match

    words = [Word(u"|@€đ€łł@ð«|µnþ", u"hello"), Word(u"a", u"b", u"c")]
    match, _ = SomeRegex().get_interpretation(words)
    self.assertEqual(words, match.words)
def test_match_words(self):
    class SomeRegex(RegexTemplate):
        def semantics(self, match):
            return match

    words = [Word(u"|@€đ€łł@ð«|µnþ", u"hello"), Word(u"a", u"b", u"c")]
    match, _ = SomeRegex().get_semantics(words)
    self.assertEqual(words, match.words)
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs the nltk tagger on `string` and returns a list of
    :class:`quepy.tagger.Word` objects.
    """
    assert_valid_encoding(string)
    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # The recommended tokenizer doesn't handle non-ascii characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Eliminates stuff like JJ|CC.
        # On Python 2, decode as ascii because these are the penn-like
        # POS tags (which are ascii).
        if sys.version_info[0] == 3:
            word.pos = pos.split("|")[0]
        else:
            word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case lemma is example-based, because if it's rule based
            # the result should be unicode (input was unicode).
            # Since english is ascii the decoding is ok.
            if sys.version_info[0] == 2:
                lemma = lemma.decode("ascii")
        word.lemma = lemma

        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
def run_nltktagger(string, nltk_data_path=None):
    """
    Runs the nltk tagger on `string` and returns a list of
    :class:`quepy.tagger.Word` objects.
    """
    assert_valid_encoding(string)
    global _penn_to_morphy_tag

    if nltk_data_path:
        nltk.data.path = nltk_data_path

    from nltk.corpus import wordnet

    if not _penn_to_morphy_tag:
        _penn_to_morphy_tag = {
            u'NN': wordnet.NOUN,
            u'JJ': wordnet.ADJ,
            u'VB': wordnet.VERB,
            u'RB': wordnet.ADV,
        }

    # The recommended tokenizer doesn't handle non-ascii characters very well
    #tokens = nltk.word_tokenize(string)
    tokens = nltk.wordpunct_tokenize(string)
    tags = nltk.pos_tag(tokens)

    words = []
    for token, pos in tags:
        word = Word(token)
        # Eliminates stuff like JJ|CC.
        # Decode as ascii because these are the penn-like POS tags
        # (which are ascii).
        word.pos = pos.split("|")[0].decode("ascii")

        mtag = penn_to_morphy_tag(word.pos)
        # Nice shooting, son. What's your name?
        lemma = wordnet.morphy(word.token, pos=mtag)
        if isinstance(lemma, str):
            # In this case lemma is example-based, because if it's rule based
            # the result should be unicode (input was unicode).
            # Since english is ascii the decoding is ok.
            lemma = lemma.decode("ascii")
        word.lemma = lemma

        if word.lemma is None:
            word.lemma = word.token.lower()

        words.append(word)

    return words
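# A minimal usage sketch for `run_nltktagger` (not part of the original
# module): it assumes the NLTK 'punkt', 'averaged_perceptron_tagger' and
# 'wordnet' data packages have already been downloaded, and that the
# module-level imports above (nltk, Word, etc.) are in place.
if __name__ == "__main__":
    for word in run_nltktagger(u"What is the capital of France?"):
        print("%s\t%s\t%s" % (word.token, word.lemma, word.pos))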
def test_no_ir(self):
    class SomeRegex(QuestionTemplate):
        regex = Lemma(u"hello")

    regexinstance = SomeRegex()
    words = [Word(u"hi", u"hello")]
    self.assertRaises(NotImplementedError,
                      regexinstance.get_interpretation, words)
def test_no_semantics(self):
    class SomeRegex(RegexTemplate):
        regex = Lemma(u"hello")

    regexinstance = SomeRegex()
    words = [Word(u"hi", u"hello")]
    self.assertRaises(NotImplementedError,
                      regexinstance.get_semantics, words)
def _read_line(text):
    """
    Parses a line of the freeling command line output.
    """
    assert_valid_encoding(text)
    assert u"#" in text

    start, text = text.split(u"#", 1)
    start = start.strip().rsplit(u" ", 1)[0]
    text = text.strip()

    token_has_spaces = False
    if start.count(u" ") > 2:
        token = FREELING_FUNCTION_OUTPUT_REGEX.match(start)
        assert token is not None
        token = token.group()
        token_has_spaces = True
    else:
        token = start.split(u" ")[0]

    if token_has_spaces:
        text = text.replace(token, u"<token>")

    text = text.split(u" ")
    assert len(text) % 4 == 0

    best_word = None
    while text:
        word = Word(token)
        word.sense = text.pop()
        try:
            word.prob = float(text.pop())
        except ValueError:
            raise TaggingError(u"The probability field of a"
                               u" word was non-numerical")
        if word.prob < 0 or word.prob > 1:
            raise TaggingError(u"The probability field of a"
                               u" word was not a probability")

        word.pos = text.pop()
        word.lemma = text.pop()

        if word.pos in (u"NNP", u"MR"):
            word.token = word.token.replace(u"_", u" ")
        if word.token == u"?" and word.pos == u"Fit":
            word.pos = u"."

        if not best_word or word.prob > best_word.prob:
            best_word = word

    return best_word
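# A hedged illustration of the line format `_read_line` appears to expect,
# reverse-engineered from the parsing logic above rather than taken from
# real FreeLing output: the token comes before the '#', and each candidate
# analysis after it is a (lemma, POS, probability, sense) group of four
# fields; the highest-probability candidate wins.
if __name__ == "__main__":
    word = _read_line(u"dogs dogs # dog NNS 0.9 02084071-n")
    assert word.token == u"dogs"
    assert word.lemma == u"dog" and word.pos == u"NNS" and word.prob == 0.9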
def test_regex_empty(self):
    class SomeRegex(QuestionTemplate):
        def interpret(self, match):
            return Mockrule, "YES!"

    regexinstance = SomeRegex()
    words = [Word(u"hi", u"hello")]
    ir, userdata = regexinstance.get_interpretation(words)
    self.assertTrue(ir is Mockrule)
    self.assertEqual(userdata, "YES!")
def test_regex_empty(self):
    class SomeRegex(RegexTemplate):
        def semantics(self, match):
            return Mockrule, "YES!"

    regexinstance = SomeRegex()
    words = [Word(u"hi", u"hello")]
    semantics, userdata = regexinstance.get_semantics(words)
    self.assertTrue(semantics is Mockrule)
    self.assertEqual(userdata, "YES!")
def test_nounlike_handler(self):
    from quepy import handlers

    class DogType(FixedType):
        fixedtype = "dog"

    class MyHandler(handlers.Handler):
        def check(self, word):
            return word.lemma == "special_dog"

        def handler(self, word):
            return DogType()

    handlers.register(MyHandler)
    noun_word = Word(u"lazzy", u"special_dog", u"NN")
    output = semantic_utils.handle_nounlike(noun_word)
    self.assertIsInstance(output, DogType)
def run_spacytagger(string):
    """
    Runs spacy on `string` and returns a list of
    :class:`quepy.tagger.Word` objects.
    """
    assert_valid_encoding(string)

    # For now, at least, perform our own pre-processing
    # to ensure terms like "presynaptic" are easily found later.
    string = ' '.join(string.split())
    string = collapse(string)

    doc = nlp(string)  # NOTE: spaCy expects and returns unicode

    spans = [(ent_id, nlp.vocab.strings[ent_id], doc[start:end])
             for ent_id, start, end in matcher(doc)]
    for ent_id, label_id, span in spans:
        # tag_ is the "fine-grained" POS
        span.merge(label=label_id, tag='NNP' if label_id else span.root.tag_)

    words = [Word(x.text, x.lemma_, x.tag_) for x in doc]

    # The following is only for logging purposes; if necessary,
    # it could be removed for production.
    log.info(' '.join([t.text + '[' + str(t.i) + ']' for t in doc]))

    indent = " "
    longest = max(len(t.text) for t in doc)
    column = (len(doc) - 1) * len(indent) + longest + 2
    wout = '{:' + str(column) + '}| '

    def trav_tree(indents, node):
        log.info(wout.format((indent * indents) + node.text) + ', '.join(
            [str(x) for x in [node.i, node.is_oov, node.lemma_, node.tag_,
                              "<-" + str(node.left_edge),
                              str(node.right_edge) + "->"]]))
        for el in node.children:
            # NOTE: Could also change display based on node.lefts and node.rights
            trav_tree(indents + 1, el)

    for sent in doc.sents:
        trav_tree(0, sent.root)

    log.info('Ents: ' + str(doc.ents))
    log.info('NPs: ' + str(list(doc.noun_chunks)))

    return words
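# A minimal usage sketch for `run_spacytagger` (not part of the original
# module): it assumes the module-level `nlp`, `matcher`, `collapse` and
# `log` objects that the function relies on have been initialised elsewhere
# in this module, e.g. via spacy.load() and a configured Matcher.
if __name__ == "__main__":
    for word in run_spacytagger(u"Which proteins are found presynaptically?"):
        print("%s\t%s\t%s" % (word.token, word.lemma, word.pos))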
def test_match(self):
    words = [Word(u"hi", u"hello")]
    ir, userdata = self.regexinstance.get_interpretation(words)
    self.assertTrue(ir is self.mockrule)
    self.assertEqual(userdata, None)
def test_user_data(self):
    words = [Word(u"hi", u"hello")]
    _, userdata = self.regex_with_data.get_interpretation(words)
    self.assertEqual(userdata, 42)
def test_no_match(self):
    words = [Word(u"hi", u"hello"), Word(u"girl", u"girl")]
    ir, userdata = self.regexinstance.get_interpretation(words)
    self.assertEqual(ir, None)
    self.assertEqual(userdata, None)
def test_user_data(self):
    words = [Word(u"hi", u"hello")]
    _, userdata = self.regex_with_data.get_semantics(words)
    self.assertEqual(userdata, 42)
def test_no_match(self):
    words = [Word(u"hi", u"hello"), Word(u"girl", u"girl")]
    semantics, userdata = self.regexinstance.get_semantics(words)
    self.assertEqual(semantics, None)
    self.assertEqual(userdata, None)
def test_match(self):
    words = [Word(u"hi", u"hello")]
    semantics, userdata = self.regexinstance.get_semantics(words)
    self.assertTrue(semantics is self.mockrule)
    self.assertEqual(userdata, None)
def test_nested_particle(self):
    words = [Word(x, x) for x in u"Jim 's car be Tonny".split()]
    match, _ = self.nestedregex.get_semantics(words)
    self.assertEqual(match.personasset.words[0], words[0])
    self.assertRaises(AttributeError, lambda: match.personasset.another)
def test_handle_noun_phrase(self):
    noun_phrase = [Word(u"cool", u"cool", u"JJ"),
                   Word(u"dogs", u"dog", u"NNS")]
    output = semantic_utils.handle_noun_phrase(noun_phrase)
    self.assertIsInstance(output, HasKeyword)
def test_attrs(self):
    words = [Word(x, x) for x in u"Jim be Tonny".split()]
    match, _ = self.personregex.get_semantics(words)
    self.assertEqual(match.another.words[0], words[-1])
    self.assertEqual(match.person.words[0], words[0])
    self.assertRaises(AttributeError, lambda: match.pirulo)
def test_nounlike_unhandled(self):
    non_noun_word = Word(u"ran", u"run", u"VB")
    self.assertRaises(semantic_utils.UnhandledWord,
                      semantic_utils.handle_nounlike, non_noun_word)