Example #1
    def __init__(self, threshold, stop_words='', words_bag_root='', mode='c'):
        if stop_words:
            self.stop_words_file = stop_words
        else:
            self.stop_words_file = stop_words_file

        if words_bag_root:
            self.words_bag_root = words_bag_root
        else:
            self.words_bag_root = words_bag_path

        self.threshold = threshold
        self.jt = JiebaTokenizer(self.stop_words_file, mode=mode)
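When `stop_words` or `words_bag_root` is empty, the constructor falls back to `stop_words_file` and `words_bag_path`, which are module-level constants not shown in this excerpt. A minimal sketch of what they might look like; the paths are assumptions, and only the stop-word path appears elsewhere in these examples:

# Assumed module-level defaults referenced by the fallback branches above.
# The stop-word path matches the test cases below; the words-bag path is a placeholder.
stop_words_file = '../data/stopwords.txt'
words_bag_path = './words_bag/'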
Example #2
class JiebaTokenizerTestCase(unittest.TestCase):

    def setUp(self):
        self.jt = JiebaTokenizer("../data/stopwords.txt")

    def testTokens(self):
        in_text = u"完整的单元测试很少只执行一个测试用例," \
                  u"开发人员通常都需要编写多个测试用例才能" \
                  u"对某一软件功能进行比较完整的测试,这些" \
                  u"相关的测试用例称为一个测试用例集,在" \
                  u"PyUnit中是用TestSuite类来表示的。"
        tokens_text = u"完整/单元/测试/单元测试/只/执行/" \
                      u"一个/测试/试用/测试用例/开发/发人/" \
                      u"人员/开发人员/通常/需要/编写/多个/" \
                      u"测试/试用/测试用例/软件/功能/进行/" \
                      u"比较/完整/测试/相关/测试/试用/测试用例/" \
                      u"称为/一个/测试/试用/测试用例/集/PyUnit/" \
                      u"中是/TestSuite/类来/表示"
        self.assertEqual(tokens_text, u'/'.join(self.jt.tokens(in_text)),
                         "Tokenization Results differ")
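The expected token string above contains overlapping sub-words (测试 / 试用 / 测试用例), which is characteristic of jieba's search-engine segmentation mode with stop words filtered out. Below is a minimal sketch of a tokenizer that would behave this way, assuming it wraps `jieba.cut_for_search` and a plain-text stop-word list (with punctuation included in that list); the project's actual `JiebaTokenizer` may differ:

# -*- coding: utf-8 -*-
# Hedged sketch of a JiebaTokenizer like the one under test; not the project's code.
import codecs
import jieba

class JiebaTokenizer(object):
    def __init__(self, stop_words_file, mode='c'):
        # Load the stop-word list, one entry per line
        with codecs.open(stop_words_file, 'r', 'utf-8') as f:
            self.stop_words = set(line.strip() for line in f if line.strip())
        self.mode = mode

    def tokens(self, text):
        # cut_for_search yields the overlapping sub-words seen in the
        # expected output, e.g. 测试 / 试用 / 测试用例
        words = jieba.cut_for_search(text)
        return [w for w in words if w.strip() and w not in self.stop_words]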
Example #4
    def setUp(self):
        self.jt = JiebaTokenizer("../data/stopwords.txt")
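The fixture above, together with the test case from Example #2, can be collected into a test suite and run with the standard unittest runner, much as the sample text itself describes PyUnit's `TestSuite`. A hypothetical runner, assuming `JiebaTokenizerTestCase` is importable in the same module:

# Hypothetical runner for the test case shown above.
import unittest

if __name__ == '__main__':
    suite = unittest.TestSuite()
    suite.addTest(JiebaTokenizerTestCase('testTokens'))
    unittest.TextTestRunner(verbosity=2).run(suite)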
Example #6
class TextHelper:
    def __init__(self, threshold, stop_words='', words_bag_root='', mode='c'):
        # Fall back to the module-level default stop-word file when none is given
        if stop_words:
            self.stop_words_file = stop_words
        else:
            self.stop_words_file = stop_words_file

        # Likewise fall back to the module-level default words-bag path
        if words_bag_root:
            self.words_bag_root = words_bag_root
        else:
            self.words_bag_root = words_bag_path

        self.threshold = threshold
        self.jt = JiebaTokenizer(self.stop_words_file, mode=mode)

    def compare_similarity(self, input_tpl, compare_tpl, way=2):
        # Make sure both inputs are unicode (they may arrive as utf-8 byte strings)
        if not isinstance(input_tpl, unicode):
            input_tpl = input_tpl.decode('utf8')
        if not isinstance(compare_tpl, unicode):
            compare_tpl = compare_tpl.decode('utf8')

        doc_token_1 = self.jt.tokens(input_tpl)
        doc_token_2 = self.jt.tokens(compare_tpl)

        word_list = list(set(doc_token_1 + doc_token_2))

        # Build a word -> index dictionary over the combined vocabulary
        word_dict = {}
        for idx, ascword in enumerate(word_list):
            word_dict[ascword] = idx
        # Build nonzero (sparse) feature vectors for each document
        fb = FeatureBuilder(word_dict)
        doc_feat_1 = fb.compute(doc_token_1)
        doc_feat_2 = fb.compute(doc_token_2)

        # Init simhash_builder
        smb = SimhashBuilder(word_list)

        doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
        doc_fl_2 = DocFeatLoader(smb, doc_feat_2)

        if way == 1:
            # print 'Matching by Simhash + hamming distance'
            dist = hamming_distance(doc_fl_1.fingerprint, doc_fl_2.fingerprint)
            if dist < float(self.threshold):
                return True, dist
            else:
                return False, dist
        elif way == 2:
            # print 'Matching by VSM + cosine distance'
            dist = cosine_distance_nonzero(doc_fl_1.feat_vec, doc_fl_2.feat_vec, norm=False)
            if dist > float(self.threshold):
                return True, dist
            else:
                return False, dist

    # Initialize the bag-of-words dictionary from a dataset (a collection whose documents have a 'content' field)
    def init_bag(self, coll, del_old=True):
        self.words_bag = BagOfWords(self.jt, self.words_bag_root)
        if del_old:
            self.words_bag.del_old()
        # rebuild dict
        dict_set = set()
        for data in coll.find():
            words = self.jt.tokens(data['content'])
            dict_set |= set(words)
        self.words_bag.build_dictionary(dict_set)

        train_feature, train_target = self.words_bag.transform_data(coll)

        logreg = linear_model.LogisticRegression(C=1e5)
        logreg.fit(train_feature, train_target)

        self.words_bag.save_model(logreg)

    def classify(self, text):
        # Load the trained logistic-regression model from disk
        lr = joblib.load('lr.model')
        # Load the bag-of-words dictionary
        BOW = self.words_bag.load_dictionary()

        # Predict the class of the input text
        pred = TextClassify.find_classify(text, BOW, lr)
        return pred[0]
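A short usage sketch for the class above. The threshold and the input texts are placeholders: with the default `way=2`, `compare_similarity` returns True when the cosine score between the term-frequency vectors exceeds the threshold, while `way=1` returns True when the Hamming distance between the SimHash fingerprints falls below it.

# -*- coding: utf-8 -*-
# Hypothetical usage; the threshold and the texts are illustrative only.
helper = TextHelper(threshold=0.8)
similar, score = helper.compare_similarity(u"今天天气不错", u"今天的天气很好")
print similar, score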