class PosTaggerTestCase(unittest.TestCase):
    """Checks PosTagger.pos_as_iter against fixed (word, tag) sequences."""

    def setUp(self):
        """Build a PosTagger from a SegTagger and the model file under ./model/."""
        base_dir = os.path.dirname(__file__)
        self.pos_tagger = PosTagger(
            SegTagger(), os.path.join(base_dir, "./model/pos.crf.model"))

    def tearDown(self):
        """Nothing to release after a test."""
        pass

    def testPosTagger(self):
        """A single sentence yields the expected (word, tag) pairs."""
        res = self.pos_tagger.pos_as_iter(u"据以色列电台昨天报道")
        # assertEqual is the supported name; assertEquals is a deprecated alias.
        self.assertEqual(
            list(res),
            [
                (u"据", "p"),
                (u"以色列", "nr"),
                (u"电台", "nn"),
                (u"昨天", "nt"),
                (u"报道", "vv"),
            ])

    def testPosTaggerWithMultipleLines(self):
        """Newlines and the leading space are expected back as "pu" tokens."""
        res = self.pos_tagger.pos_as_iter(u"据以色列电台昨天报道\n\n 巴勒斯坦和以色列")
        self.assertEqual(
            list(res),
            [
                (u"据", "p"),
                (u"以色列", "nr"),
                (u"电台", "nn"),
                (u"昨天", "nt"),
                (u"报道", "vv"),
                ("\n", "pu"),
                ("\n", "pu"),
                (" ", "pu"),
                (u"巴勒斯坦", "nr"),
                (u"和", "cc"),
                (u"以色列", "nr"),
            ])
class PosTaggerTestCase(unittest.TestCase):
    """Checks PosTagger.pos_as_iter against fixed (word, tag) sequences."""

    def setUp(self):
        """Build a PosTagger from a SegTagger and the model file under ./model/."""
        base_dir = os.path.dirname(__file__)
        self.pos_tagger = PosTagger(
            SegTagger(), os.path.join(base_dir, "./model/pos.crf.model"))

    def tearDown(self):
        """Nothing to release after a test."""
        pass

    def testPosTagger(self):
        """A single sentence yields the expected (word, tag) pairs."""
        res = self.pos_tagger.pos_as_iter(u"据以色列电台昨天报道")
        # assertEqual is the supported name; assertEquals is a deprecated alias.
        self.assertEqual(
            list(res),
            [
                (u"据", "p"),
                (u"以色列", "nr"),
                (u"电台", "nn"),
                (u"昨天", "nt"),
                (u"报道", "vv"),
            ])

    def testPosTaggerWithMultipleLines(self):
        """Newlines and the leading space are expected back as "pu" tokens."""
        res = self.pos_tagger.pos_as_iter(u"据以色列电台昨天报道\n\n 巴勒斯坦和以色列")
        self.assertEqual(
            list(res),
            [
                (u"据", "p"),
                (u"以色列", "nr"),
                (u"电台", "nn"),
                (u"昨天", "nt"),
                (u"报道", "vv"),
                ("\n", "pu"),
                ("\n", "pu"),
                (" ", "pu"),
                (u"巴勒斯坦", "nr"),
                (u"和", "cc"),
                (u"以色列", "nr"),
            ])
def setUp(self):
    """Create self.pos_tagger from a SegTagger and the model file beside this module."""
    here = os.path.dirname(__file__)
    segmenter = SegTagger()
    # The model path is resolved relative to this file's directory.
    model_path = os.path.join(here, "./model/pos.crf.model")
    self.pos_tagger = PosTagger(segmenter, model_path)
def setUp(self):
    """Create self.extractor: an ImpressionExtractor wrapping PosTagger(SegTagger())."""
    segmenter = SegTagger()
    # No model path is passed here; PosTagger receives only the segmenter.
    tagger = PosTagger(segmenter)
    self.extractor = ImpressionExtractor(tagger)
| select(lambda nums: [words[num] for num in nums]) \ | select(lambda words: ''.join(words)) \ | as_list short_phrase = ''.join(short_matches) first_index = matches | select(extract_nums) | chain | min last_index = matches | select(extract_nums) | chain | max complete_phrase = ''.join(words[first_index:last_index + 1]) index = cur_index + ( [len(words[i]) for i in range(first_index)] | add) yield (short_phrase, complete_phrase, index) cur_index += len(line) + 1 if __name__ == '__main__': from pynlpini import PosTagger from pynlpini import SegTagger with open("../../data/app/travel_comments/mafengwo_comments_raw.txt" ) as comment_file: index = 0 ie = ImpressionExtractor(PosTagger(SegTagger())) for line in comment_file: line = line.decode("utf-8") for item in ie.extract(line): print item index += 1 if index > 10: exit()
def pos(txt):
    """Return the POS tagging of ``txt`` serialized as a JSON string.

    The module-level ``pos_tagger`` is built from ``seg_tagger`` the first
    time it is found to be None and is reused on later calls. The output is
    dumped with ``ensure_ascii=False``, so non-ASCII text is not escaped.
    """
    global pos_tagger
    if pos_tagger is None:
        pos_tagger = PosTagger(seg_tagger)
    # Materialize the tagger's iterator through the `as_list` pipe before dumping.
    tagged = pos_tagger.pos_as_iter(txt) | as_list
    return json.dumps(tagged, ensure_ascii=False)
def setUp(self):
    """Create self.pos_tagger from a SegTagger and the model file beside this module."""
    module_dir = os.path.dirname(__file__)
    seg = SegTagger()
    # The model path is resolved relative to this file's directory.
    crf_model = os.path.join(module_dir, "./model/pos.crf.model")
    self.pos_tagger = PosTagger(seg, crf_model)