def main():
    jieba_instance = Tokenizer()
    seg_list = jieba_instance.cut("我来到北京清华大学", cut_all=True)
    print(type(seg_list))
    print("Full Mode: " + "/ ".join(seg_list))  # full mode
    seg_list = jieba_instance.cut("他来到了网易杭研大厦")  # accurate mode is the default
    print(", ".join(seg_list))
    seg_list = jieba_instance.cut_for_search(
        "小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search engine mode
    print(", ".join(seg_list))

    # module-level functions
    t1 = datetime.datetime.now()
    initialize()
    t2 = datetime.datetime.now()
    print("initialize costs:%s" % (t2 - t1))
    print(lcut("我来到北京清华大学"))
    print(list(cut("我来到北京清华大学")))
    print(cut("我来到北京清华大学", cut_all=True))
    print(lcut_for_search("我来到北京清华大学"))
    print(list(cut_for_search("我来到北京清华大学")))
    print(pseg.lcut("我来到北京清华大学"))
    print(list(pseg.cut("我来到北京清华大学")))

    # keyword extraction: TF-IDF and TextRank
    s = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    r = analyse.extract_tags(s)
    print(r)
    r = analyse.textrank(s, withWeight=True)
    print(r)
    tr = TextRank(jieba_instance)
    print(tr.textrank(s, topK=2, withWeight=True))
    tf = TFIDF(jieba_instance)
    print(tf.extract_tags(s, topK=10))

    # tokenize yields (word, start, end) tuples
    result = jieba_instance.tokenize('永和服装饰品有限公司')
    for tk in result:
        print("word %s\t\t start: %d \t\t end:%d" % (tk[0], tk[1], tk[2]))
    print(tokenize('永和服装饰品有限公司', mode="search"))

    # user dictionaries
    jieba_instance.load_userdict(["卧槽"])
    load_userdict(set(["卧槽"]))
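# Hypothetical entry point, assuming the demo above lives in a standalone example
# script; the guard itself is not part of the original snippet.
if __name__ == "__main__":
    main()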
def setUpClass(cls):
    cls.dt = Tokenizer(DICT)
    cls.sentence = "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。"
    cls.extractor = TextRankExtractor(cls.dt)
def takes_arg3_as_stopword_path(self):
    Tokenizer(DICT, USER_DICT, STOP_WORD)
def takes_arg2_as_user_dict_path(self):
    Tokenizer(DICT, USER_DICT)
def setUpClass(cls):
    cls.dt = Tokenizer(DICT)
    cls.dt.add_word("区块链", 10, "nz")
# -*- coding: utf-8 -*-
import re
import os
import math
import pickle
from collections import Counter

from cppjieba_py import Tokenizer

big_dict = os.path.join(os.path.dirname(__file__), "data", "dict.txt.big")
tokenizer = Tokenizer(big_dict)

DATA_PATH = os.path.join(os.path.dirname(__file__), "data", "dict.pkl")
FIXED_PA = 1.6


def load_data():
    # load the pickled dictionary data bundled under data/
    with open(DATA_PATH, "rb") as f:
        return pickle.load(f)


class Classifier():

    def __init__(self, *args):
        self.initialized = False
        if len(args):
            self._initialize(*args)

    def _initialize(self, pos_emotion, pos_evaluation, neg_emotion, neg_evaluation,
                    degrees, negations):
        self.pos_emotion = pos_emotion
        self.neg_emotion = neg_emotion
import os
import sys

from cppjieba_py import Tokenizer

big_dict = os.path.join(os.path.dirname(__file__), "..", "bixin", "data", "dict.txt.big")
tokenizer = Tokenizer(big_dict)

DIR = os.path.join(os.path.dirname(__file__), "..", "data")
TMP = os.path.join(os.path.dirname(__file__), "..", "tmp")

if __name__ == "__main__":
    name = sys.argv[1]
    filename = os.path.join(DIR, name + ".txt")
    with open(filename) as f, open(os.path.join(TMP, name + "_missed.txt"), "w") as tmp:
        hits = 0
        miss = 0
        for l in f:
            line = l.strip()
            if tokenizer.find(line):
                # word is covered by the tokenizer's dictionary
                hits = hits + 1
            else:
                # record words the dictionary misses
                miss = miss + 1
                tmp.write("%s\n" % line)
        print("%f" % (hits / (hits + miss)))
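# Assumed invocation (the script name here is hypothetical): the word-list name is read
# from sys.argv[1] and resolved to ../data/<name>.txt; the printed number is the fraction
# of words found in the dictionary, and misses are written to ../tmp/<name>_missed.txt.
#
#   python check_dict_coverage.py <name>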