Exemple #1
0
class SegTaggerTestCase(unittest.TestCase):
    """Exercises SegTagger using the CRF model file located relative to this test."""

    def setUp(self):
        # Build the model path from this file's directory so the test does not
        # depend on the current working directory.
        base_dir = os.path.dirname(__file__)
        self.tagger = SegTagger(os.path.join(base_dir, "./model/seg.crf.model"))

    def tearDown(self):
        pass

    def testSegTagger(self):
        # assertEqual is used throughout: assertEquals is a deprecated alias
        # that newer unittest versions no longer provide.
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列")
        self.assertEqual(list(res), [u"巴勒斯坦", u"和", u"以色列"])
        # The text form is expected to be the same words joined by spaces.
        res = self.tagger.seg_as_txt(u"巴勒斯坦和以色列")
        self.assertEqual(res, u"巴勒斯坦 和 以色列")

    def testSegTaggerWithMultipleLines(self):
        # Newlines and the leading space are expected to come back as
        # separate tokens instead of being dropped.
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列\n\n 吴邦国副总理5号傍晚")
        expected = [
            u"巴勒斯坦", u"和", u"以色列", u"\n", u"\n", u" ",
            u"吴邦国", u"副总理", u"5号", u"傍晚",
        ]
        self.assertEqual(list(res), expected)
Exemple #2
0
class SegTaggerTestCase(unittest.TestCase):
    """Exercises SegTagger using the CRF model file located relative to this test."""

    def setUp(self):
        # Build the model path from this file's directory so the test does not
        # depend on the current working directory.
        base_dir = os.path.dirname(__file__)
        self.tagger = SegTagger(os.path.join(base_dir,
                                             "./model/seg.crf.model"))

    def tearDown(self):
        pass

    def testSegTagger(self):
        # assertEqual is used throughout: assertEquals is a deprecated alias
        # that newer unittest versions no longer provide.
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列")
        self.assertEqual(list(res), [u"巴勒斯坦", u"和", u"以色列"])
        # The text form is expected to be the same words joined by spaces.
        res = self.tagger.seg_as_txt(u"巴勒斯坦和以色列")
        self.assertEqual(res, u"巴勒斯坦 和 以色列")

    def testSegTaggerWithMultipleLines(self):
        # Newlines and the leading space are expected to come back as
        # separate tokens instead of being dropped.
        res = self.tagger.seg_as_iter(u"巴勒斯坦和以色列\n\n 吴邦国副总理5号傍晚")
        self.assertEqual(list(res), [
            u"巴勒斯坦", u"和", u"以色列", u"\n", u"\n", u" ", u"吴邦国", u"副总理",
            u"5号", u"傍晚"
        ])
Exemple #3
0
 def setUp(self):
     """Create the SegTagger under test from a model path relative to this file."""
     model_path = os.path.join(os.path.dirname(__file__),
                               "./model/seg.crf.model")
     self.tagger = SegTagger(model_path)
Exemple #4
0
 def setUp(self):
     """Build the extractor under test: SegTagger -> PosTagger -> ImpressionExtractor."""
     # Both taggers are constructed without arguments here.
     seg_tagger = SegTagger()
     pos_tagger = PosTagger(seg_tagger)
     self.extractor = ImpressionExtractor(pos_tagger)
                                    | select(lambda nums: [words[num] for num in nums]) \
                                    | select(lambda words: ''.join(words)) \
                                    | as_list
                    short_phrase = ''.join(short_matches)
                    first_index = matches | select(extract_nums) | chain | min
                    last_index = matches | select(extract_nums) | chain | max
                    complete_phrase = ''.join(words[first_index:last_index +
                                                    1])
                    index = cur_index + (
                        [len(words[i]) for i in range(first_index)] | add)
                    yield (short_phrase, complete_phrase, index)

            cur_index += len(line) + 1


if __name__ == '__main__':
    # Manual smoke run: print the impressions extracted from the first lines
    # of the sample comment file.
    from pynlpini import PosTagger
    from pynlpini import SegTagger

    with open("../../data/app/travel_comments/mafengwo_comments_raw.txt"
              ) as comment_file:
        index = 0
        ie = ImpressionExtractor(PosTagger(SegTagger()))
        for line in comment_file:
            line = line.decode("utf-8")
            # Extraction and the line counter belong inside the per-line loop.
            # Previously they ran once after the loop, so only the last line
            # was processed and an empty file raised NameError on `line`.
            for item in ie.extract(line):
                # Parenthesised form prints the same under the Python 2 print
                # statement and the print function.
                print(item)
            index += 1
            if index > 10:
                # Stop once the counter passes 10; leaving the loop closes
                # the file via the `with` block and ends the script.
                break
Exemple #6
0
def seg(txt):
    """Segment ``txt`` and return the segments serialized as a JSON string.

    The module-level ``seg_tagger`` is created on the first call (when it is
    still ``None``) and reused afterwards. Non-ASCII characters are emitted
    as-is rather than escaped (``ensure_ascii=False``).
    """
    global seg_tagger
    if seg_tagger is None:
        # First use: construct the shared tagger.
        seg_tagger = SegTagger()
    segments = seg_tagger.seg_as_iter(txt) | as_list
    return json.dumps(segments, ensure_ascii=False)
Exemple #7
0
 def setUp(self):
     """Create the PosTagger under test from a model path relative to this file."""
     here = os.path.dirname(__file__)
     segmenter = SegTagger()
     pos_model_path = os.path.join(here, "./model/pos.crf.model")
     self.pos_tagger = PosTagger(segmenter, pos_model_path)
Exemple #8
0
 def setUp(self):
     """Load the segmentation model that sits under this file's directory."""
     test_dir = os.path.dirname(__file__)
     seg_model_path = os.path.join(test_dir, "./model/seg.crf.model")
     self.tagger = SegTagger(seg_model_path)
Exemple #9
0
 def setUp(self):
     """Create the SentimentClassifier under test from files relative to this file."""
     here = os.path.dirname(__file__)
     segmenter = SegTagger()
     # Two inputs: the SVM model and the feature-index JSON.
     svm_model_path = os.path.join(here, "./model/sentiment.svm.model")
     feature_index_path = os.path.join(
         here, "./model/sentiment_feature_index.json")
     self.classifier = SentimentClassifier(segmenter, svm_model_path,
                                           feature_index_path)