class SegTaggerTestCase(unittest.TestCase):
    """Checks SegTagger word segmentation against fixed expected outputs."""

    def setUp(self):
        """Build the tagger from the model file located next to this module."""
        # Resolve the model relative to this file so the tests do not depend
        # on the current working directory.
        base_dir = os.path.dirname(__file__)
        self.tagger = SegTagger(os.path.join(base_dir, "./model/seg.crf.model"))

    def testSegTagger(self):
        """The same input is checked through the iterator and the text API."""
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列")
        self.assertEqual(list(res), [u"巴勒斯坦", u"和", u"以色列"])
        res = self.tagger.seg_as_txt(u"巴勒斯坦和以色列")
        self.assertEqual(res, u"巴勒斯坦 和 以色列")

    def testSegTaggerWithMultipleLines(self):
        """Newlines and spaces in the input are expected back as separate tokens."""
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列\n\n 吴邦国副总理5号傍晚")
        self.assertEqual(list(res), [u"巴勒斯坦", u"和", u"以色列", "\n", "\n", " ", u"吴邦国", u"副总理", u"5号", u"傍晚"])
class SegTaggerTestCase(unittest.TestCase):
    """Exercises SegTagger segmentation on single-line and multi-line input."""

    def setUp(self):
        """Load the segmentation model stored under this module's directory."""
        # The path is anchored at this file rather than the working directory.
        base_dir = os.path.dirname(__file__)
        self.tagger = SegTagger(os.path.join(base_dir, "./model/seg.crf.model"))

    def testSegTagger(self):
        """Iterator output and space-joined text output are both verified."""
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列")
        self.assertEqual(list(res), [u"巴勒斯坦", u"和", u"以色列"])
        res = self.tagger.seg_as_txt(u"巴勒斯坦和以色列")
        self.assertEqual(res, u"巴勒斯坦 和 以色列")

    def testSegTaggerWithMultipleLines(self):
        """Line breaks and the leading space must come back as their own items."""
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列\n\n 吴邦国副总理5号傍晚")
        self.assertEqual(list(res), [
            u"巴勒斯坦", u"和", u"以色列", "\n", "\n", " ", u"吴邦国", u"副总理",
            u"5号", u"傍晚"
        ])
def setUp(self):
    """Create the SegTagger fixture from the model file beside this module."""
    # Anchor the model path at this file's directory, not the working directory.
    model_path = os.path.join(os.path.dirname(__file__), "./model/seg.crf.model")
    self.tagger = SegTagger(model_path)
def setUp(self):
    """Assemble the extractor fixture: segmenter -> POS tagger -> extractor."""
    # Each stage is constructed with its defaults and wraps the previous one.
    segmenter = SegTagger()
    pos_tagger = PosTagger(segmenter)
    self.extractor = ImpressionExtractor(pos_tagger)
| select(lambda nums: [words[num] for num in nums]) \ | select(lambda words: ''.join(words)) \ | as_list short_phrase = ''.join(short_matches) first_index = matches | select(extract_nums) | chain | min last_index = matches | select(extract_nums) | chain | max complete_phrase = ''.join(words[first_index:last_index + 1]) index = cur_index + ( [len(words[i]) for i in range(first_index)] | add) yield (short_phrase, complete_phrase, index) cur_index += len(line) + 1 if __name__ == '__main__': from pynlpini import PosTagger from pynlpini import SegTagger with open("../../data/app/travel_comments/mafengwo_comments_raw.txt" ) as comment_file: index = 0 ie = ImpressionExtractor(PosTagger(SegTagger())) for line in comment_file: line = line.decode("utf-8") for item in ie.extract(line): print item index += 1 if index > 10: exit()
def seg(txt):
    """Segment ``txt`` and return the resulting tokens as a JSON array string.

    The module-level ``seg_tagger`` is created on first use and reused by
    later calls. Non-ASCII characters are written to the JSON output as-is
    rather than as ``\\uXXXX`` escapes.
    """
    global seg_tagger
    if seg_tagger is None:
        # First call: build the shared tagger with its default arguments.
        seg_tagger = SegTagger()
    tokens = seg_tagger.seg_as_iter(txt) | as_list
    return json.dumps(tokens, ensure_ascii=False)
def setUp(self):
    """Create the PosTagger fixture from the POS model beside this module."""
    here = os.path.dirname(__file__)
    segmenter = SegTagger()
    # The POS model path is anchored at this file's directory.
    pos_model = os.path.join(here, "./model/pos.crf.model")
    self.pos_tagger = PosTagger(segmenter, pos_model)
def setUp(self):
    """Create the SentimentClassifier fixture from files beside this module."""
    here = os.path.dirname(__file__)
    segmenter = SegTagger()
    # Both the model and its feature index are resolved relative to this file.
    model_path = os.path.join(here, "./model/sentiment.svm.model")
    feature_index_path = os.path.join(here, "./model/sentiment_feature_index.json")
    self.classifier = SentimentClassifier(segmenter, model_path, feature_index_path)