Example #1
    def __init__(self, threshold, stop_words='', words_bag_root='', mode='c'):
        if stop_words:
            self.stop_words_file = stop_words
        else:
            self.stop_words_file = stop_words_file

        if words_bag_root:
            self.words_bag_root = words_bag_root
        else:
            self.words_bag_root = words_bag_path

        self.threshold = threshold
        self.jt = JiebaTokenizer(self.stop_words_file, mode=mode)
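When `stop_words` or `words_bag_root` is empty, the constructor falls back to `stop_words_file` and `words_bag_path`, which are module-level constants not shown in this excerpt. A minimal sketch of what they might look like; the paths are assumptions, and only the stop-word path appears elsewhere in these examples:

# Assumed module-level defaults referenced by the fallback branches above.
# The stop-word path matches the test cases below; the words-bag path is a placeholder.
stop_words_file = '../data/stopwords.txt'
words_bag_path = './words_bag/'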
Example #2
class JiebaTokenizerTestCase(unittest.TestCase):

    def setUp(self):
        self.jt = JiebaTokenizer("../data/stopwords.txt")

    def testTokens(self):
        in_text = u"完整的单元测试很少只执行一个测试用例," \
                  u"开发人员通常都需要编写多个测试用例才能" \
                  u"对某一软件功能进行比较完整的测试,这些" \
                  u"相关的测试用例称为一个测试用例集,在" \
                  u"PyUnit中是用TestSuite类来表示的。"
        tokens_text = u"完整/单元/测试/单元测试/只/执行/" \
                      u"一个/测试/试用/测试用例/开发/发人/" \
                      u"人员/开发人员/通常/需要/编写/多个/" \
                      u"测试/试用/测试用例/软件/功能/进行/" \
                      u"比较/完整/测试/相关/测试/试用/测试用例/" \
                      u"称为/一个/测试/试用/测试用例/集/PyUnit/" \
                      u"中是/TestSuite/类来/表示"
        self.assertEqual(tokens_text, u'/'.join(self.jt.tokens(in_text)),
                         "Tokenization Results differ")
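The expected token string above contains overlapping sub-words (测试 / 试用 / 测试用例), which is characteristic of jieba's search-engine segmentation mode with stop words filtered out. Below is a minimal sketch of a tokenizer that would behave this way, assuming it wraps `jieba.cut_for_search` and a plain-text stop-word list (with punctuation included in that list); the project's actual `JiebaTokenizer` may differ:

# -*- coding: utf-8 -*-
# Hedged sketch of a JiebaTokenizer like the one under test; not the project's code.
import codecs
import jieba

class JiebaTokenizer(object):
    def __init__(self, stop_words_file, mode='c'):
        # Load the stop-word list, one entry per line
        with codecs.open(stop_words_file, 'r', 'utf-8') as f:
            self.stop_words = set(line.strip() for line in f if line.strip())
        self.mode = mode

    def tokens(self, text):
        # cut_for_search yields the overlapping sub-words seen in the
        # expected output, e.g. 测试 / 试用 / 测试用例
        words = jieba.cut_for_search(text)
        return [w for w in words if w.strip() and w not in self.stop_words]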
Example #4
    def setUp(self):
        self.jt = JiebaTokenizer("../data/stopwords.txt")
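The fixture above, together with the test case from Example #2, can be collected into a test suite and run with the standard unittest runner, much as the sample text itself describes PyUnit's `TestSuite`. A hypothetical runner, assuming `JiebaTokenizerTestCase` is importable in the same module:

# Hypothetical runner for the test case shown above.
import unittest

if __name__ == '__main__':
    suite = unittest.TestSuite()
    suite.addTest(JiebaTokenizerTestCase('testTokens'))
    unittest.TextTestRunner(verbosity=2).run(suite)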
Example #6
class TextHelper:
    def __init__(self, threshold, stop_words='', words_bag_root='', mode='c'):
        # Fall back to the module-level default stop-word file when none is given
        if stop_words:
            self.stop_words_file = stop_words
        else:
            self.stop_words_file = stop_words_file

        # Likewise fall back to the module-level default words-bag path
        if words_bag_root:
            self.words_bag_root = words_bag_root
        else:
            self.words_bag_root = words_bag_path

        self.threshold = threshold
        self.jt = JiebaTokenizer(self.stop_words_file, mode=mode)

    def compare_similarity(self, input_tpl, compare_tpl, way=2):
        # Make sure both inputs are unicode (they may arrive as utf-8 byte strings)
        if not isinstance(input_tpl, unicode):
            input_tpl = input_tpl.decode('utf8')
        if not isinstance(compare_tpl, unicode):
            compare_tpl = compare_tpl.decode('utf8')

        doc_token_1 = self.jt.tokens(input_tpl)
        doc_token_2 = self.jt.tokens(compare_tpl)

        word_list = list(set(doc_token_1 + doc_token_2))

        # Build a word -> index dictionary over the combined vocabulary
        word_dict = {}
        for idx, ascword in enumerate(word_list):
            word_dict[ascword] = idx
        # Build nonzero (sparse) feature vectors for each document
        fb = FeatureBuilder(word_dict)
        doc_feat_1 = fb.compute(doc_token_1)
        doc_feat_2 = fb.compute(doc_token_2)

        # Init simhash_builder
        smb = SimhashBuilder(word_list)

        doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
        doc_fl_2 = DocFeatLoader(smb, doc_feat_2)

        if way == 1:
            # print 'Matching by Simhash + hamming distance'
            dist = hamming_distance(doc_fl_1.fingerprint, doc_fl_2.fingerprint)
            if dist < float(self.threshold):
                return True, dist
            else:
                return False, dist
        elif way == 2:
            # print 'Matching by VSM + cosine distance'
            dist = cosine_distance_nonzero(doc_fl_1.feat_vec, doc_fl_2.feat_vec, norm=False)
            if dist > float(self.threshold):
                return True, dist
            else:
                return False, dist

    # Initialize the bag-of-words dictionary from a dataset (a collection whose documents have a 'content' field)
    def init_bag(self, coll, del_old=True):
        self.words_bag = BagOfWords(self.jt, self.words_bag_root)
        if del_old:
            self.words_bag.del_old()
        # rebuild dict
        dict_set = set()
        for data in coll.find():
            words = self.jt.tokens(data['content'])
            dict_set |= set(words)
        self.words_bag.build_dictionary(dict_set)

        train_feature, train_target = self.words_bag.transform_data(coll)

        logreg = linear_model.LogisticRegression(C=1e5)
        logreg.fit(train_feature, train_target)

        self.words_bag.save_model(logreg)

    def classify(self, text):
        # Load the trained logistic-regression model from disk
        lr = joblib.load('lr.model')
        # Load the bag-of-words dictionary
        BOW = self.words_bag.load_dictionary()

        # Predict the class of the input text
        pred = TextClassify.find_classify(text, BOW, lr)
        return pred[0]
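A short usage sketch for the class above. The threshold and the input texts are placeholders: with the default `way=2`, `compare_similarity` returns True when the cosine score between the term-frequency vectors exceeds the threshold, while `way=1` returns True when the Hamming distance between the SimHash fingerprints falls below it.

# -*- coding: utf-8 -*-
# Hypothetical usage; the threshold and the texts are illustrative only.
helper = TextHelper(threshold=0.8)
similar, score = helper.compare_similarity(u"今天天气不错", u"今天的天气很好")
print similar, score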