def jieba_tokenize(text, external_wordlist=False):
    """
    Split `text` into tokens with Jieba, a word-frequency-based tokenizer.

    When `external_wordlist` is False (the default), Jieba is given
    wordfreq's own Chinese wordlist and its hidden Markov model is turned
    off, so every multi-character token it emits is one whose frequency can
    be looked up.

    When `external_wordlist` is True, the largest version of Jieba's original
    dictionary is used with Jieba's default settings, so the result does not
    depend on wordfreq's data. That is the better choice for general-purpose
    tokenization or for collecting word frequencies in the first place.

    Both tokenizers are created lazily on first use and cached in module
    globals.
    """
    global jieba_tokenizer, jieba_orig_tokenizer

    if not external_wordlist:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)

        # Segment the Simplified Chinese form of the text, but slice the
        # resulting spans out of the *original* text so Traditional Chinese
        # input comes back unchanged.
        spans = jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False)
        return [text[begin:stop] for _word, begin, stop in spans]

    if jieba_orig_tokenizer is None:
        jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
    return jieba_orig_tokenizer.lcut(text)
def __init__(self, ckpt_name=None, pbmodel_dir=None):
    """Create the TF session, vocabularies and tokenizer, then load a model.

    At least one of `ckpt_name` / `pbmodel_dir` must be truthy. When
    `ckpt_name` is given the model is restored through
    `load_from_ckpt_meta`; otherwise `load_from_pbmodel(pbmodel_dir)` is
    used.
    """
    assert ckpt_name or pbmodel_dir, 'ues at least one way'

    self.graph = tf.Graph()
    self.config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True),
    )
    self.sess = tf.Session(config=self.config, graph=self.graph)

    # Vocabulary files live next to the package, under ../data.
    read_vocab = utils.Any2Id.from_file
    self.token2id_dct = {
        'word2id': read_vocab(
            f'{curr_dir}/../data/toutiao_cls_word2id.dct', use_line_no=True),
        'label2id': read_vocab(
            f'{curr_dir}/../data/toutiao_cls_label2id.dct', use_line_no=True),
    }

    self.jieba = jieba.Tokenizer()

    def tokenize(t):
        # Whitespace runs are turned into commas before segmentation.
        return self.jieba.lcut(re.sub(r'\s+', ',', t))

    def cut(t):
        return ' '.join(self.tokenize(t))

    self.tokenize = tokenize
    self.cut = cut

    if not ckpt_name:
        self.load_from_pbmodel(pbmodel_dir)
    else:
        self.load_from_ckpt_meta(ckpt_name)

    self.id2label = self.token2id_dct['label2id'].get_reverse()
def __setstate__(self, state):
    """Rebuild the object from pickled state.

    `state` is the saved vocabulary (a falsy value means "no words"). The
    jieba tokenizer itself is recreated here and every vocabulary word is
    registered with it again.
    """
    import jieba

    self.vocabulary = state or []
    self.tokenizer = jieba.Tokenizer()
    register = self.tokenizer.add_word
    for entry in self.vocabulary:
        register(entry)
def __init__(self, doc_path, true_path, predict_path):
    """Set up paths, the jieba tokenizers and the term matcher.

    Args:
        doc_path: stored as ``self.doc_path``.
        true_path: stored as ``self.true_path``.
        predict_path: stored as ``self.predict_path``.

    A jieba tokenizer is built from ``data/our_dict1.txt`` (relative to this
    file) and wrapped in a POS tokenizer. For every entity type, the
    ``term`` column of ``analysis/res/term_frequency/<type>_term_frequency.csv``
    (relative to the parent of this file's directory) is loaded; all terms
    are added to ``self.aca`` and mapped to their type in
    ``self.term_label_dict``.
    """
    self.dir_path = os.path.dirname(os.path.realpath(__file__))
    self.doc_path = doc_path
    self.true_path = true_path
    self.predict_path = predict_path
    self.label_list = ['疾病和诊断', '解剖部位', '影像检查', '实验室检验',
                       '药物', '手术', '@']

    # Start without a dictionary, then point jieba at the project one.
    self._jieba = jieba.Tokenizer(dictionary=None)
    self._jieba.set_dictionary(
        os.path.join(self.dir_path, 'data/our_dict1.txt'))
    self._jieba.initialize()
    self._jieba_posseg = jieba.posseg.POSTokenizer(tokenizer=self._jieba)

    self.aca: ACA = ACA()
    type_list = ['疾病和诊断', '解剖部位', '影像检查', '实验室检验',
                 '药物', '手术']
    self.term_list = []
    self.term_label_dict = dict()
    project_root = os.path.dirname(self.dir_path)
    for typee in type_list:
        csv_path = os.path.join(
            project_root,
            'analysis/res/term_frequency/' + typee + '_term_frequency.csv')
        # Open the file in a ``with`` block so the handle is closed once
        # pandas has read it (it was previously left open).
        with open(csv_path, encoding='utf-8-sig') as csv_file:
            file_i = pd.read_csv(csv_file, header=0)
        self.term_list.extend(file_i['term'])
        for term_i in file_i['term']:
            self.term_label_dict[term_i] = typee
    self.aca.add_words(self.term_list)
def __init__(self, entity_list):
    """Index entities by name and register every name with jieba.

    `entity_list` looks like
    [{"@id":"1","name":"张三"},{"@id":"2","name":"李四"}].
    All input text is assumed to be (or is converted into) unicode.
    """
    # Build the name -> [entity, ...] index.
    self.entities = collections.defaultdict(list)
    converted = [any2unicode(item) for item in entity_list]

    # Primary names first, for every entity ...
    for record in converted:
        self.entities[record["name"]].append(record)

    # ... then the optional alternate names.
    for record in converted:
        for alias in record.get("alternateName", []):
            self.entities[alias].append(record)

    stat(converted, ["name"])

    # Teach a private jieba tokenizer every indexed name.
    self.tokenizer = jieba.Tokenizer()
    for known_name in self.entities:
        self.tokenizer.add_word(known_name)
def init_jieba_dict(word_tuple, reload=False):
    """Return a jieba tokenizer on the default dictionary plus custom words.

    Args:
        word_tuple: iterable of sequences whose item 0 is the word and
            item 1 its frequency.
        reload: when truthy, `remove_jieba_cache()` is called first.
    """
    if reload:
        remove_jieba_cache()

    tokenizer = jieba.Tokenizer(dictionary=jieba.DEFAULT_DICT)
    for entry in word_tuple:
        tokenizer.add_word(entry[0], freq=entry[1])
    return tokenizer
def get_freq_general(lang='zh'):
    """Return jieba's word-frequency table as an int-defaulting mapping.

    Only ``lang='zh'`` is supported; any other value prints a notice and
    returns an empty mapping. The table is the first element of what
    `Tokenizer.gen_pfdict` returns for jieba's dictionary file.
    NOTE(review): `dd` is presumably `collections.defaultdict` - confirm.
    """
    if lang == 'zh':
        tokenizer = jieba.Tokenizer()
        freq_table = tokenizer.gen_pfdict(tokenizer.get_dict_file())[0]
        return dd(int, freq_table)

    print('get_freq_general not implemented for {}'.format(lang))
    return dd(int)
def __init__(self, pretrained_path, raw_data_path, output_dir, max_len=512,
             is_test=True, test_data_path=None, split_dic=None,
             voc_type_path=None):
    """Extend the base initialiser with word segmentation and a type vocab.

    Args:
        pretrained_path, raw_data_path, output_dir, max_len, is_test,
            test_data_path: forwarded unchanged to the parent ``__init__``.
        split_dic: optional jieba user-dictionary path. When given,
            ``self.cut_tokenizer`` is a jieba tokenizer with that
            dictionary loaded; otherwise it is ``None``.
        voc_type_path: optional UTF-8 JSON file mapping a term to its
            types. When given it is stored in ``self.voc_type`` and every
            non-empty term whose types contain "疾病" is collected in
            ``self.disease_list`` (a set).
    """
    super().__init__(pretrained_path, raw_data_path, output_dir,
                     max_len=max_len, is_test=is_test,
                     test_data_path=test_data_path)
    self.bert_tokenizer.add_special_tokens(self.special_tokens)

    self.cut_tokenizer = None
    if split_dic is not None:
        self.cut_tokenizer = jieba.Tokenizer()
        self.cut_tokenizer.load_userdict(split_dic)

    self.voc_type = None
    self.disease_list = set()
    if voc_type_path is not None:
        with codecs.open(voc_type_path, encoding='utf-8') as f:
            # The stream is already decoded; ``json.load`` no longer accepts
            # an ``encoding`` keyword (it raises TypeError on Python 3.9+).
            self.voc_type = json.load(f)
        self.disease_list.update(
            term for term, types in self.voc_type.items()
            if "疾病" in types and len(term) >= 1
        )
def __init__(self, dict_path: str = None):
    """Create a private jieba tokenizer with its own cache file.

    Args:
        dict_path: optional user dictionary to load into the tokenizer.
    """
    import jieba

    tokenizer = jieba.Tokenizer()
    tokenizer.cache_file = "gnes.jieba_wrapper.cache"
    self._jieba = tokenizer

    if dict_path is None:
        return
    self._jieba.load_userdict(dict_path)
def taste_dict(self):
    """Build a POS tokenizer that knows every taste record.

    Each record from `model.Taste.get_all()` is added to a fresh jieba
    tokenizer with frequency 2000 and its ``type`` as the tag.
    """
    records = model.Taste.get_all()
    tokenizer = jieba.Tokenizer()
    for record in records:
        tokenizer.add_word(record['name'], 2000, record['type'])

    pos_tokenizer = pseg.POSTokenizer(tokenizer)
    print('taste_pseg:success init')
    return pos_tokenizer
def foods_dict(self):
    """Build a POS tokenizer that knows every food record.

    Each record from `model.Foods.get_all()` is added to a fresh jieba
    tokenizer with frequency 2000 and its ``type`` as the tag.
    """
    records = model.Foods.get_all()
    tokenizer = jieba.Tokenizer()
    for record in records:
        tokenizer.add_word(record['name'], 2000, record['type'])

    pos_tokenizer = pseg.POSTokenizer(tokenizer)
    print('foods_pseg:success init')
    return pos_tokenizer
def perform_word_segment(cls, corpus):
    """Segment ``corpus.content`` with jieba into a new ``tokens`` column.

    A fresh `jieba.Tokenizer` is created per call. `corpus` is modified in
    place and also returned.
    """
    tokenizer = jieba.Tokenizer()

    def segment(text):
        return list(tokenizer.cut(text))

    corpus['tokens'] = corpus.content.apply(segment)
    return corpus
def material_dict(self):
    """Build a POS tokenizer that knows every material record.

    Each record from `model.Material.get_all()` is added to a fresh jieba
    tokenizer with frequency 2000 and its ``parent_code`` as the tag.
    """
    records = model.Material.get_all()
    tokenizer = jieba.Tokenizer()
    for record in records:
        tokenizer.add_word(record['name'], 2000, record['parent_code'])

    pos_tokenizer = pseg.POSTokenizer(tokenizer)
    print('material_pseg:success init')
    return pos_tokenizer
def __init__(self, show_reason=False, user_dict=None, stop_words=None):
    '''
    Load the tokenizer, stop words and the pickled models.

    :param show_reason: whether the reason should be shown
    :param user_dict: custom user dictionary location; the bundled one is
        used by default
    :param stop_words: custom stop words
    '''
    data_dir = os.path.dirname(os.path.abspath(__file__)) + '/Data/'

    self.show_reason = show_reason
    self._user_dict_path = data_dir
    self._model_path = data_dir

    if self.show_reason:
        self.INIT_REASON = {
            0: "逻辑拼接",
            1: "命中敏感词",
            2: "疑似电话数字",
            3: "数字过长",
            4: "涉及微信号码敏感"
        }

    if user_dict:
        self._user_dict_path = user_dict
    self._jieba_phone_identification = jieba.Tokenizer(
        dictionary=self._user_dict_path + "user_dict.txt")

    self.rule = re.compile("[^\u4e00-\u9fa50-9.]")

    # Feature 12: mobile numbers have relatively fixed leading digits.
    #   China Unicom prefixes: 130, 131, 132, 155, 156, 186, 185
    #     (186 and 185 are 3G-only); wireless data cards use 145.
    #   China Mobile prefixes: 134, 135, 136, 137, 138, 139, 150, 151, 152,
    #     157, 158, 159, 182, 183, 188, 187.
    #   China Telecom prefixes: 133, 153, 180, 181, 189.
    # Numbers starting with 14 are mostly data cards; 99.99% of people do
    # not make calls with them.
    self._phone_start_position_number = ('13', '15', '17', '18')

    # Fall back to the built-in list of meaningless words.
    self.stop_words = stop_words or [
        '你', '我', '的', '啊', '嗯', '是', '吧', '对', '了', '那个',
        '那', '就', '好', '到', '给', '噢', '这个', '他', '说', '在',
        '不', '什么', '唉', '要', '也', '吗', '都', '现在', '一下', '这',
        '有', '就是', '不是', '呢', '好好', '能', '装', '看', '喂', '嘛',
        '知道', '你好', '可以', '没有', '多少', '多', '那边', '去', '没',
        '怎么', '常州', '哪里', '跟', '呀', '把', '我们', '的话', '货',
        '地方', '明天', '还', '行', '车', '不能', '问', '走', '等', '来',
        '给我', '这边', '再', '这样', '过去', '今天', '然后', '不知道',
        '上', '因为', '是不是', '得', '不了', '叫', '哦', '不要', '无锡',
        '上面', '反正', '南京', '讲', '搞', '还是', '过来', '看看', '拉',
        '应该', '东西', '它', '进去', '托盘', '车子', '还有', '可能', '又',
        '从', '哪', '时候', '拿', '啦', '肯定', '大概', '你们', '差不多',
        '写', '跑', '不行', '不到', '位置'
    ]

    def load_artifact(file_name):
        # All model artefacts live in the model directory.
        return _readbunchobj(self._model_path + file_name)

    self._tfidf_model = load_artifact('train_data_tfidf_model.tfidf')
    self._model_0 = load_artifact('train_data_mnb_tri_0.nb')
    self._quantile = load_artifact('quantile.dat')
    self._last_model = load_artifact('last_model.gbm')
    self._km_model = load_artifact('kmeans.m')
def __init__(self):
    """Create a jieba tokenizer loaded with every user dictionary.

    Every entry of `config.JIEBA_DICT_PATH` whose path ends with "txt" is
    loaded as a user dictionary; the tokenizer is then wrapped in a
    `POSTokenizer`.
    """
    self.token = jieba.Tokenizer()

    dict_files = [
        entry.path
        for entry in os.scandir(config.JIEBA_DICT_PATH)
        if entry.path.endswith("txt")
    ]
    for dict_file in dict_files:
        self.token.load_userdict(dict_file)

    self.pos_token = POSTokenizer(self.token)
def __init__(self, tokenizer=None):
    """Bind a jieba tokenizer and load the word -> tag table from its dict.

    Args:
        tokenizer: tokenizer to use. When omitted (or falsy) a new
            `jieba.Tokenizer()` is created. Previously this argument was
            accepted but ignored.

    Loading of a custom "people.dict" user dictionary used to be attempted
    here; that code was already disabled and has been removed.
    """
    self.tokenizer = tokenizer or jieba.Tokenizer()
    self.load_word_tag(self.tokenizer.get_abs_path_dict())
def __init__(self, model_name, tokenize=None, pbmodel_dir=None, use_hvd=False):
    """Own the session, graph, config and saver around an MMCH model.

    Args:
        model_name: name handed to `MMCH_Model`.
        tokenize: optional callable used for tokenisation. When omitted, a
            jieba tokenizer is used after whitespace runs are replaced by
            commas.
        pbmodel_dir: when given, the model is loaded from this pb model
            directory (predict only); otherwise a new model is built and
            its variables are initialised.
        use_hvd: request Horovod; only honoured when `HVD_ENABLE` is truthy.
    """
    self.model_name = model_name

    if tokenize is not None:
        self.tokenize = tokenize
    else:
        self.jieba = jieba.Tokenizer()
        # self.jieba.load_userdict(f'{curr_dir}/../data/segword.dct')

        def default_tokenize(t):
            return self.jieba.lcut(re.sub(r'\s+', ',', t))

        self.tokenize = default_tokenize

    def cut(t):
        return ' '.join(self.tokenize(t))

    self.cut = cut

    read_vocab = utils.Any2Id.from_file
    self.token2id_dct = {
        # 'word2id': utils.Any2Id.from_file(f'{curr_dir}/../data/mmch_word2id.dct', use_line_no=True),  # in-house data
        # 'word2id': utils.Any2Id.from_file(f'{curr_dir}/../data/mmch_char2id.dct', use_line_no=True),  # in-house data
        # Douban multi-turn corpus
        'word2id': read_vocab(
            f'{curr_dir}/../data/DB_mmch_word2id.dct', use_line_no=True),
        # Douban multi-turn corpus
        'char2id': read_vocab(
            f'{curr_dir}/../data/DB_mmch_char2id.dct', use_line_no=True),
    }

    self.config = tf.ConfigProto(
        allow_soft_placement=True,
        gpu_options=tf.GPUOptions(allow_growth=True),
    )

    if HVD_ENABLE:
        self.use_hvd = use_hvd
    else:
        self.use_hvd = False

    if self.use_hvd:
        hvd.init()
        self.hvd_rank = hvd.rank()
        self.hvd_size = hvd.size()
        self.config.gpu_options.visible_device_list = str(hvd.local_rank())

    self.graph = tf.Graph()
    self.sess = tf.Session(config=self.config, graph=self.graph)

    if pbmodel_dir is None:
        with self.graph.as_default():
            self.model = MMCH_Model(model_name=self.model_name, run_model=self)
            if self.use_hvd:
                # Distributed training means a larger effective batch, so
                # the learning rate is scaled up by the worker count.
                optimizer = self.model.optimizer
                optimizer._lr = optimizer._lr * self.hvd_size
                self.model.hvd_optimizer = hvd.DistributedOptimizer(
                    self.model.optimizer)
                self.model.train_op = self.model.hvd_optimizer.minimize(
                    self.model.loss, global_step=self.model.global_step)
            self.sess.run(tf.global_variables_initializer())
            if self.use_hvd:
                self.sess.run(hvd.broadcast_global_variables(0))
    else:
        # A pb model can only be used for prediction.
        self.model = MMCH_Model.from_pbmodel(pbmodel_dir, self.sess)

    with self.graph.as_default():
        # The saver must be created inside the graph context.
        self.saver = tf.train.Saver(max_to_keep=100)
def __init__(self, user_dict_path='', entity_dict_path='', stop_words_path='',
             user_dict=(), entity_dict=(), stop_words=(),
             use_single_char=False):
    """
    Initialise the tokenizer from dictionaries.

    :param user_dict_path: path of the user dictionary
    :param entity_dict_path: path of the entity dictionary
    :param stop_words_path: path of the stop-word file
    :param user_dict: tuple of user dictionary words
    :param entity_dict: tuple of entity dictionary words
    :param stop_words: tuple of stop words
    :param use_single_char: stored as ``self.use_single_char``
    """
    for path_arg in (user_dict_path, entity_dict_path, stop_words_path):
        assert isinstance(path_arg, str)
    for words_arg in (user_dict, entity_dict, stop_words):
        assert isinstance(words_arg, tuple)

    self.use_single_char = use_single_char

    # Set up the jieba tokenizer; failures are printed, not raised.
    self.tokenizer = jieba.Tokenizer()
    try:
        for dict_path in (user_dict_path, entity_dict_path):
            if os.path.exists(dict_path):
                self.tokenizer.load_userdict(dict_path)
        for extra_words in (user_dict, entity_dict):
            for word in extra_words:
                self.tokenizer.add_word(word)
    except Exception as e:
        print(e)
    self.pos_tokenizer = jieba.posseg.POSTokenizer(
        tokenizer=self.tokenizer)

    # Set up the stop-word list; failures are printed, not raised.
    self.stop_words = []
    try:
        if os.path.exists(stop_words_path):
            # Drop CR, LF, TAB and spaces from every line.
            strip_blank = str.maketrans('', '', '\r\n\t ')
            with open(stop_words_path, 'r', encoding='utf-8') as f:
                for line in f.readlines():
                    self.stop_words.append(line.translate(strip_blank))
        self.stop_words.extend(list(stop_words))
        self.stop_words = list(set(self.stop_words))
    except Exception as e:
        print(e)
def __init__(self, model_path, userdict_path, stopword_path):
    """Record the resource paths, create a jieba tokenizer and initialise.

    Args:
        model_path: stored as ``self.model_path``.
        userdict_path: stored as ``self.userdict_path``.
        stopword_path: stored as ``self.stopword_path``.

    The classifier, vectorizer and tf-idf transformer start as ``None`` and
    the stop-word list starts empty; ``self.initialize()`` runs last.
    """
    # Resource locations.
    self.model_path = model_path
    self.userdict_path = userdict_path
    self.stopword_path = stopword_path

    # Components not populated by this constructor itself.
    self.clf = None
    self.vectorizer = None
    self.tfidftransformer = None
    self.stop_words = []

    self.tokenizer = jieba.Tokenizer()
    self.initialize()
def technics_dict(self):
    """Build a POS tokenizer that knows every technics record.

    For each record from `model.Technics.get_all()` the name is first
    deleted from a fresh jieba tokenizer and then added back with
    frequency 2000 and its ``type`` as the tag.
    """
    records = model.Technics.get_all()
    tokenizer = jieba.Tokenizer()
    for record in records:
        word = record['name']
        tokenizer.del_word(word)
        # tokenizer.add_word('是',2000,'ttt')
        tokenizer.add_word(word, 2000, record['type'])

    pos_tokenizer = pseg.POSTokenizer(tokenizer)
    print('technics_pseg:success init')
    return pos_tokenizer
def __init__(self, modelPath, stopwordPath, userDictPath):
    """Record the resource paths, create a jieba tokenizer and initialise.

    Args:
        modelPath: stored as ``self.modelPath``.
        stopwordPath: stored as ``self.stopwordPath``.
        userDictPath: stored as ``self.userDictPath``.

    The classifier, vectorizer and tf-idf transformer start as ``None`` and
    the stop-word list starts empty.
    """
    # Resource locations.
    self.modelPath = modelPath
    self.stopwordPath = stopwordPath
    self.userDictPath = userDictPath

    # Components not populated by this constructor itself.
    self.clt = None
    self.vectorizer = None
    self.tfidftransformer = None
    self.stopWords = []

    self.tokenizer = jieba.Tokenizer()
    # NOTE(review): `initalize` is spelled this way at the call site;
    # confirm it matches the method's actual name.
    self.initalize()
def __init__(self, vocab):
    """Create a jieba tokenizer whose dictionary is exactly `vocab`.

    Every token in ``self.vocab.token_to_idx`` gets frequency 1. The
    tokenizer is then flagged as initialised, presumably so that jieba does
    not load its own dictionary on first use.
    """
    super(JiebaTokenizer, self).__init__(vocab)

    tokenizer = jieba.Tokenizer()
    self.tokenizer = tokenizer

    # Fill the tokenizer's frequency table by hand.
    tokenizer.FREQ = dict.fromkeys(self.vocab.token_to_idx.keys(), 1)
    tokenizer.total = len(tokenizer.FREQ)
    tokenizer.initialized = True
def __init__(self, doc_path, true_path, predict_path):
    """Store the paths and build the jieba tokenizers.

    Args:
        doc_path: stored as ``self.doc_path``.
        true_path: stored as ``self.true_path``.
        predict_path: stored as ``self.predict_path``.

    The jieba tokenizer is built from ``data/our_dict1.txt`` next to this
    file and wrapped in a POS tokenizer.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    self.dir_path = here
    self.doc_path = doc_path
    self.label_list = ['疾病和诊断', '解剖部位', '影像检查', '实验室检验',
                       '药物', '手术', ' ']

    # Start without a dictionary, then point jieba at the project one.
    tokenizer = jieba.Tokenizer(dictionary=None)
    self._jieba = tokenizer
    tokenizer.set_dictionary(os.path.join(here, 'data/our_dict1.txt'))
    tokenizer.initialize()
    self._jieba_posseg = jieba.posseg.POSTokenizer(tokenizer=tokenizer)

    self.true_path = true_path
    self.predict_path = predict_path
def __init__(self, ei_file):
    """Load (or start) the frequency counter and create a jieba tokenizer.

    Args:
        ei_file: stored as ``self.ei_file``.

    If ``tf_counter.json`` exists in the working directory it is loaded
    into ``self.df``; otherwise ``self.df`` starts as an empty `Counter`
    and ``self._read_ei_file()`` is called. Either way ``self.df`` is a
    `Counter`, so unseen keys count as 0.
    """
    self.ei_file = ei_file
    self.df_file = 'tf_counter.json'

    if os.path.exists(self.df_file):
        print('df_file exists, loading the df_file ...')
        # Close the file after reading, and wrap the plain dict so both
        # branches yield the same type.
        with open(self.df_file) as df_handle:
            self.df = Counter(json.load(df_handle))
    else:
        self.df = Counter()
        self._read_ei_file()

    self.jieba_tokenizer = jieba.Tokenizer()
    self.jieba_tokenizer.tmp_dir = '.'
def _load_user_dict(self, user_dict_path):
    """Return a new jieba tokenizer with the given user dictionaries.

    Args:
        user_dict_path: ``None``, a dictionary file, or a directory whose
            regular files are all loaded (sub-directories are skipped).
            A path that is neither file nor directory loads nothing.
    """
    tokenizer = jieba.Tokenizer()
    if user_dict_path is None:
        return tokenizer

    if os.path.isfile(user_dict_path):
        tokenizer.load_userdict(user_dict_path)
        return tokenizer

    if os.path.isdir(user_dict_path):
        for entry_name in os.listdir(user_dict_path):
            candidate = os.path.join(user_dict_path, entry_name)
            if not os.path.isfile(candidate):
                continue
            tokenizer.load_userdict(candidate)
    return tokenizer
def __init__(self):
    """Constructor.

    Sets up the Chinese sentence tokenizer, an English language helper and
    a jieba tokenizer backed by the bundled dictionaries, then runs a short
    self-test. Raises McLanguageException when a dictionary is missing,
    jieba cannot be configured, or the self-test fails.
    """
    super().__init__()

    # Text -> sentence tokenizer for Chinese text
    self.__chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    self.__english_language = EnglishLanguage()

    self.__jieba = jieba.Tokenizer()
    self.__jieba.cache_file = self.__CACHE_PATH

    # Every required path is checked in turn; each message reports the
    # dictionary directory.
    required = (
        (os.path.isdir, self.__DICT_PATH,
         "Jieba dictionary directory was not found: %s"),
        (os.path.isfile, self.__JIEBA_DICT_PATH,
         "Default dictionary not found in Jieba dictionary directory: %s"),
        (os.path.isfile, self.__JIEBA_USERDICT_PATH,
         "User dictionary not found in Jieba dictionary directory: %s"),
    )
    for path_ok, path, message in required:
        if not path_ok(path):
            raise McLanguageException(message % self.__DICT_PATH)

    try:
        self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
        self.__jieba.load_userdict(os.path.join(self.__JIEBA_USERDICT_PATH))
    except Exception as ex:
        raise McLanguageException("Unable to initialize Jieba: %s" % str(ex))

    # Quick self-test to make sure that Jieba, its dictionaries and Python
    # class are installed and working
    jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
    try:
        test_words = self.split_sentence_to_words('python課程')
    except Exception:
        raise McLanguageException(jieba_exc_message)

    if not (len(test_words) >= 2 and test_words[1] == '課程'):
        raise McLanguageException(jieba_exc_message)
def __init__(self, title):
    """Load the tokenizer, stop words, fastText model and label names.

    Args:
        title: name of the sub-directory of ``ai/`` holding this model's
            resources (``stopwords.txt``, ``classifier.model.bin``,
            ``labels.ini``).
    """
    super(DefaultModelServer, self).__init__(title)

    # Loading of the sensitive-word pickle ('ai/{}/fenlei_mingan') and of
    # the supplementary jieba dictionary ('ai/{}/jieba_buchong.txt') is
    # disabled.
    self.jieba_fnlp = jieba.Tokenizer()

    # Stop-word list.
    stopword_frame = pd.read_csv(
        "ai/{}/stopwords.txt".format(title),
        index_col=False,
        quoting=3,
        sep="\t",
        names=['stopword'],
        encoding='utf-8',
    )
    self.stopwords = stopword_frame.values

    # fastText classifier.
    self.ft_model = fasttext.load_model(
        'ai/{}/classifier.model.bin'.format(title),
        label_prefix='__label__')

    # Label names come from the "labels" and "labels_cn" sections.
    parser = configparser.ConfigParser()
    parser.read('./ai/{}/labels.ini'.format(title), encoding='utf-8')
    label_items = parser.items("labels")
    label_items_cn = parser.items('labels_cn')
    # self.label_to_cate = {3: 'violation_politics', 2: 'normal_politics', 1: 'normal'}
    self.kind_book = [value for _key, value in label_items]
    self.kind_book_cn = [value for _key, value in label_items_cn]

    self.ok = True
    self.title = title
def __init__(self, normalizer='basic_with_num', term_file=None):
    '''
    Initialise the tokenizer and string normalizer, and load the synonym
    dictionary from term_file.

    Args:
        normalizer: 'basic' applies `normalize`, 'basic_with_num' applies
            `normalize_with_num`; any other value leaves text unchanged.
        term_file: optional CSV with `word` and `synonym` columns. Every
            non-empty word (and synonym) is added to the tokenizer, and
            each synonym that differs from its word is mapped to the word
            in `self.synonym`.
    Returns:
        None

    Rows whose `word` cell is missing are skipped, and a missing `synonym`
    cell is treated as "no synonym". pandas represents missing cells as
    NaN, which the earlier `is not None` test let through as the literal
    word "nan". This code relies on the Python 2 `unicode` builtin.
    '''
    # Set up the string preprocessor and the synonym dictionary.
    self.normalizer = normalizer
    self.tokenizer = jieba.Tokenizer()
    self.synonym = {}

    def _clean(cell):
        # Decode a CSV cell and apply the configured normalizer.
        text = unicode(str(cell), 'utf8')
        if self.normalizer == 'basic':
            return normalize(text)
        if self.normalizer == 'basic_with_num':
            return normalize_with_num(text)
        return text

    if term_file is not None:
        df = pd.read_csv(term_file)
        for _, row in df.iterrows():
            if pd.isnull(row['word']):
                continue
            word = _clean(row['word'])
            if len(word) == 0:
                continue
            self.tokenizer.add_word(word)

            # Register the synonym and map it back to the word.
            if pd.notnull(row['synonym']):
                synonym = _clean(row['synonym'])
                if len(synonym) == 0:
                    continue
                self.tokenizer.add_word(synonym)
                if word != synonym:
                    self.synonym[synonym] = word
    LOGGER.debug('init JiebaProcessor success')
def manage_word(self, todo, word, kindnum):
    """Dispatch a sensitive-word management command.

    Args:
        todo: one of 'increase', 'change', 'delete', 'search', 'notice'.
        word: the word to act on; for 'notice' the literal '1' triggers a
            reload.
        kindnum: category number passed to `increase` / `change`.

    Returns:
        '1' on success ('increase', 'change', 'delete', 'notice');
        the result of ``self.search(word)`` for 'search';
        '-1' when the word is unknown (or 'notice' got a word other than
        '1'); '-2' for an unrecognised `todo`.
    """
    ret = '1'
    if todo == 'increase':
        self.increase(word, kindnum)
        return ret
    elif todo == 'change':
        if word in self.mingan_dict:
            self.change(word, kindnum)
            return ret
        return '-1'
    elif todo == 'delete':
        if word in self.mingan_dict:
            self.delete(word)
            # A successful delete used to fall through and return None.
            return ret
        return '-1'
    elif todo == 'search':
        if word in self.mingan_dict:
            return self.search(word)
        return '-1'
    elif todo == 'notice':
        if word == '1':
            # NOTE(review): pickle.load executes arbitrary code if this file
            # is ever attacker-controlled; keep it on trusted storage.
            with open('/data/ai_g7/sensitive_words', 'rb') as f:
                self.mingan_dict = pickle.load(f)
            self.jieba_kw = jieba.Tokenizer(
                dictionary="/data/ai_g7/jieba_kwdict.txt")
            print('update successful')
            return '1'
        return '-1'
    return '-2'
def perform_word_segment(cls, corpus):
    """Segment ``corpus.content`` with jieba into a new ``tokens`` column.

    A set of domain terms is forced to stay whole via `suggest_freq`. The
    content is cast to ``str`` before segmentation. `corpus` is modified in
    place and also returned.
    """
    domain_terms = (
        '小艾', '艾佳', '艾佳家居', '米兰星光', '诗意新居',
        '雅君壹格', '以爱之名', '艾师傅', '地暖',
    )
    tokenizer = jieba.Tokenizer()
    for term in domain_terms:
        # The module-level call only tunes jieba's global default tokenizer;
        # it is kept so other users of that tokenizer see no change.
        jieba.suggest_freq(term, True)
        # The tokenizer used below is a separate instance, so the tuning has
        # to be applied to it as well or it has no effect here.
        tokenizer.suggest_freq(term, True)

    corpus['tokens'] = corpus.content.astype('str').apply(
        lambda x: list(tokenizer.cut(x)))
    return corpus