def test_three_dots_process(self):
    """Check the phrases reported for sentences when phrases contain "...".

    Each sample sentence is passed to ``recognizer.process`` and the second
    element of the returned value is compared with the expected phrase list
    (presumably the phrases the recognizer matched -- confirm against
    ``PhrasalRecognizer.process``).
    """
    phrases = [
        u"tie...to...",
        u"used to...",
        u"wash...face",
        u"...weeks old",
        u"I/He/She... was(not) going to…",
        u"or...or...a...a",
    ]
    # Name each phrase so the expectations below read without index slicing.
    tie_to, used_to, wash_face, weeks_old, going_to, or_or_a_a = phrases

    recognizer = PhrasalRecognizer(phrases)
    recognizer.inspect = True

    expected_by_sentence = {
        # This sentence is expected to yield two phrases; the expectation is
        # kept in sorted order.
        u"To fasten or secure with or as if with a cord, rope, or strap: tied the kite to a post; tie up a bundle.":
            sorted([tie_to, or_or_a_a]),
        u"I am used to hitchhiking": [used_to],
        u"There are specific things to keep in mind when washing your face":
            [wash_face],
        u"The Best Foods for 6 Week Old Puppies | Dog Care - The Daily":
            [weeks_old],
        u"He was(not) going to say hello.": [going_to],
        u"To fasten or secure with or as if with a cord a cake": [or_or_a_a],
    }

    # Walk the sentences in descending order so the run is deterministic
    # regardless of dict ordering.
    for sentence in sorted(expected_by_sentence, reverse=True):
        outcome = recognizer.process(sentence)
        self.assertEqual(outcome[1], expected_by_sentence[sentence])
def test_basic_process(self):
    """Check ``process(..., inspect=True, replace=True)`` on plain phrases.

    Every expectation is a two-item list: a string and a list of phrases.
    Judging by the fixtures, the string is the input with the recognised
    phrases removed and the list holds those phrases -- confirm against
    ``PhrasalRecognizer.process``.
    """
    phrases = [
        u"ruby python",
        u"have lunch",
        u"a lot of",
        u"Don't",
        u"Don't have to",
    ]
    recognizer = PhrasalRecognizer(phrases)
    recognizer.inspect = True

    expected_by_sentence = {
        u"ruby python which one": [u"which one", [u"ruby python"]],
        u"It’s 12:00 now. Let’s have lunch together.": [
            u"It’s 12:00 now. Let’s together.",
            [u"have lunch"],
        ],
        u"There are a lot of signs the grass.": [
            u"There are signs the grass.",
            [u"a lot of"],
        ],
        u"Don't ": [u"", [u"Don't"]],
        # A phrase must not be replaced twice.
        u"Don't have to Don't have to ": [
            u"Don't have to",
            [u"Don't", u"Don't have to"],
        ],
        # TODO: disabled case, possibly exposing an extraction bug:
        # u"Don't talk in class, Don't read in bed, Don't spill the sugar on the table":
        #     [u"Don't talk in class, Don't read in bed,", [u"Don't"]],
    }

    # Descending key order keeps the run deterministic regardless of dict
    # ordering.
    for sentence in sorted(expected_by_sentence, reverse=True):
        outcome = recognizer.process(sentence, inspect=True, replace=True)
        # Debug aid for the disabled case above:
        # if 'talk' in outcome[0]: import pdb; pdb.set_trace()
        self.assertEqual(outcome, expected_by_sentence[sentence])