def __init__(self):
    self.word_to_pinyins = defaultdict(list)
    f = open(FILE_WORDS, 'rb')
    for line in f:
        pinyin, words = line.strip().decode("utf-8").split()
        for item in words:
            self.word_to_pinyins[item].append(pinyin)
    f.close()

    self.word_to_pinyin = {}
    f = open(FILE_WORD, 'rb')
    for line in f:
        word, pinyin = line.strip().decode("utf-8").split(",")
        self.word_to_pinyin[word] = pinyin
    f.close()

    self.term_to_pinyin = {}
    f = open(FILE_TERM, 'rb')
    for line in f:
        term, pinyin = line.strip().decode("utf-8").split("#")
        self.term_to_pinyin[term] = pinyin.split("@")
    f.close()

    f = open(FILE_USER_DICT, 'rb')
    jieba.setLogLevel(logging.INFO)
    jieba.initialize()
    jieba.load_userdict(f)
    f.close()
def init(jieba_parallel=False):
    # Load English/Chinese stop words, from nltk and zhon respectively
    global english_stopwords, chinese_stopwords
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    chinese_stopwords = {word[:-1] for word in codecs.open("stopwords.txt", "r", encoding="utf-8")}
    # Set jieba's log level
    jieba.setLogLevel("INFO")
    # Set jieba's dictionary file
    jieba.set_dictionary("./jieba_dict.txt")
    # Change jieba's temporary working directory
    jieba.tmp_dir = os.getcwd()
    # Enable parallel segmentation; the number of processes defaults to the CPU core count
    if jieba_parallel:
        jieba.enable_parallel()
    config.log.info("module algorithm has initialized successfully.")
import pickle
import jieba
import json
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

jieba.setLogLevel('WARN')

num_words = 80000
maxlen = 400
tokenizer_fact = Tokenizer(num_words=num_words)

# train tokenizer
# for i in range(18):
#     print('start big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
#     with open('./data_deal/data_cut/big_fact_cut_%d_%d_new.pkl' % (i * 100000, i * 100000 + 100000), mode='rb') as f:
#         big_fact_cut = pickle.load(f)
#     texts_cut_len = len(big_fact_cut)
#     n = 0
#     # fit in batches
#     while n < texts_cut_len:
#         tokenizer_fact.fit_on_texts(texts=big_fact_cut[n:n + 10000])
#         n += 10000
#         if n < texts_cut_len:
#             print('tokenizer finish fit %d samples' % n)
#         else:
#             print('tokenizer finish fit %d samples' % texts_cut_len)
#     print('finish big_fact_cut_%d_%d' % (i * 100000, i * 100000 + 100000))
#
def jieba_cut(sequences):
    jieba.setLogLevel(20)
    jieba.enable_parallel(8)
    for sequence in sequences:
        data = jieba.cut(sequence)
        yield ' '.join(data)
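A brief usage sketch for the generator above (the sample sentences are illustrative only; note that jieba.enable_parallel relies on POSIX fork and is not available on Windows):

# Illustrative usage, not part of the original file.
for joined in jieba_cut(["我爱自然语言处理", "今天天气不错"]):
    print(joined)  # each line is one input sentence with its tokens joined by spaces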
def get_tag_list(filename):
    txt = filename.split('.')[0]
    jieba.setLogLevel(60)
    seg_list = jieba.cut(txt)
    return [i for i in seg_list if len(i) >= 2]
def __init__(self, datapath='../data'):
    self.datapath = datapath if datapath.endswith('/') else datapath + '/'
    jieba.setLogLevel('NOTSET')
# -*- coding:utf-8 -*-
# Created Time: Fri Jun 5 11:59:09 2015
# Purpose: extract web article
# Mail: [email protected]
__author__ = "Wayne Ho"

import re
import os
import sys
import json
from bs4 import BeautifulSoup
import urllib, cStringIO
from PIL import Image
import jieba
import jieba.analyse

jieba.setLogLevel(60)  # disable loading messages

import logging
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    filename="backtest.log",
    filemode='w'
)
logging.getLogger().addHandler(logging.StreamHandler())


def readFile(fname):
    """read a file, or via stdin

    @param fname: file name
    @type fname: str
    @return: str
""" 分词 """ import jieba.posseg as psg import jieba import logging import config import string from lib import stopwords # 设置jieba的日志等级 jieba.setLogLevel(logging.INFO) # 所有的小写字母 abcdefghijklmnopqrstuvwxyz lower_letters = string.ascii_lowercase + "+" # 所有的标点符号 punctuations = [" ", "?", ",", "。", "!", ":", "?", ",", ".", "!", ":"] # 加载词典 jieba.load_userdict(config.user_dict_path) def _cut_sentence_by_word(sentence, with_pos, use_stopwords): """ 英汉都按照词来切分 "python和c++哪个难?" --> ["python","和","c++","哪个", "难", "?"] """ if with_pos: ret = psg.lcut(sentence) # 结果是jieba自定义的pair对象 ret = [(i.word, i.flag) for i in ret] # 将pair对象转成元组 if use_stopwords: ret = [i for i in ret if i[0] not in stopwords] return ret
def test_cut():
    jieba.setLogLevel(logging.INFO)
    # logging.basicConfig(level=logging.INFO)
    c = Cut()
    test_cases = [
        {
            "cht": {"word": ["吋"]},
            "eng": {
                "ansh": [],
                "dash": [],
                "hash": [],
                "nash": ["475mm"],
                "num": ["19"],
                "word": ["nwb"]
            },
            "input": "日本nwb 三節式雨刷 19吋/475mm",
            "num_list": ["19"],
            "tag_list": ["475mm", "nwb", "三節式", "日本", "雨刷"]
        },
        {
            "cht": {"word": ["跨域", "全", "音域"]},
            "eng": {
                "ansh": ["o3"],
                "dash": [],
                "hash": [],
                "nash": [],
                "num": [],
                "word": ["s", "spearx", "t"]
            },
            "input": "【spearx 聲特科技】 spearx 跨域美聲 t+s o3全音域留聲耳機-黑色",
            "num_list": [],
            "tag_list": ["o3", "s", "spearx", "t", "留聲", "科技", "美聲", "耳機", "聲特", "黑色"]
        }
    ]
    for test_case in test_cases:
        assert test_case == c.cut(test_case['input'])
""" @author:XuMing([email protected]) @description: 配置切词器 """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import logging import os import jieba from jieba import posseg jieba.setLogLevel(log_level="ERROR") def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a peice of text.""" text = text.strip() if not text: return [] tokens = text.split() return tokens def segment(sentence, cut_type='word', pos=False): """ 切词 :param sentence:
# -*- coding: utf-8 -*-
import re
import time
import random
import jieba
import torch
import logging
import torch.nn as nn
from torchnet import meter
from model import EncoderRNN, LuongAttnDecoderRNN
from utils.greedysearch import GreedySearchDecoder
from dataload import get_dataloader
from config import Config

jieba.setLogLevel(logging.INFO)  # silence jieba's loading messages


def maskNLLLoss(inp, target, mask):
    '''
    inp: shape [batch_size, voc_length]
    target: shape [batch_size]; view()-ed into [batch_size, 1] so it has the same number of
            dimensions as inp and can be used with gather.
    target indexes inp along dim=1, giving a tensor shaped like target ([batch_size, 1]);
    that dimension is squeezed to [batch_size], the negative log is taken, only the positions
    where mask == 1 are kept for the loss, and the result is averaged.
    So the loss is the mean over the batch column: the average loss of a sentence at one
    time step (t), and nTotal is the number of sentences that have a real token at that step.
    mask: shape [batch_size]
    loss: average per-sentence loss at time step t
    '''
    nTotal = mask.sum()  # padding is 0 and non-padding is 1, so the sum is the number of real tokens
    crossEntropy = -torch.log(
        torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
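The snippet above stops mid-function. A minimal sketch of how a masked NLL loss of the shape described in the docstring is typically finished (this completion is an assumption, not the original file's code):

import torch

def mask_nll_loss_sketch(inp, target, mask):
    # inp: [batch_size, voc_length] probabilities; target, mask: [batch_size]
    n_total = mask.sum()
    cross_entropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    # keep only the positions where mask == 1, then average
    loss = cross_entropy.masked_select(mask.bool()).mean()
    return loss, n_total.item()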
    obj = pickle.load(open(file_name, 'rb'))
    return obj


#------------------------------------------------------------------------------
if __name__ == '__main__':
    '''
    DEBUG
    INFO
    WARNING
    ERROR
    CRITICAL
    '''
    import logging
    jieba.setLogLevel(logging.WARNING)

    def save_ngram(n):
        global gram_level
        print('count_ngram', n, gram_level)
        count_ngram = wikichs_count_ngram(n)
        count_ngram = Counter({k: v for k, v in count_ngram.items() if v > 10})  # save memory
        save_obj(count_ngram, 'data/count_ngram' + str(n) + gram_level)

    gram_level = 'char'
    save_ngram(1)
    save_ngram(2)

    gram_level = 'word'
'''
This class converts raw data into data the model can read.
'''
import jieba
import logging
import joblib
import numpy as np

jieba.setLogLevel(logging.INFO)  # silence jieba's log output


class DataSet:
    def __init__(self, sentence_len, dictionary_path):
        # Load the Chinese dictionary
        self.__Chinese_dict = joblib.load(dictionary_path)
        self.sentence_len = int(sentence_len)  # sentence length, in words

    # Convert each sentence in the list into an encoded list
    def data_to_train(self, list_str):
        Chinese_dict = self.__Chinese_dict  # the Chinese dictionary
        Chinese_reverse = {v: k for k, v in Chinese_dict.items()}  # reversed dictionary
        # Encode the list of sentences
        list_coding = []  # holds the whole encoded table
        # Iterate over every sentence
        for row in list_str:
            text_coding = []  # holds the encoded sentence
            # word segmentation
import json
import jieba
import random
import plotly.express as px
import argparse
import sys

jieba.setLogLevel(20)  # disable initialization info

parser = argparse.ArgumentParser()
parser.add_argument(
    "-v",
    "--visualize",
    required=False,
    default="False",
    help="whether to visualize results",
)
parser.add_argument(
    "-l",
    "--location",
    required=False,
    default="data/known.txt",
    help="location for known user vocabulary",
)
parser.add_argument(
    "-t",
    "--typesplit",
    required=False,
    default="wo",
    help=
#coding=utf8
import sys
import os
import pickle as pkl
from os.path import exists as os_exists
import time
from tqdm import tqdm
import jieba
import numpy as np
from torch.utils.data import Dataset, DataLoader

jieba.setLogLevel(log_level=0)

from PublicConfig import PublicConfig

public_config = PublicConfig()


def tokenizer(sentence, cut="jieba"):
    if not isinstance(sentence, str):
        return [public_config.UNK]
    if cut == "jieba":
        return list(jieba.cut(sentence))
    elif cut == "char":
        return list(sentence)
    else:
        raise NotImplementedError("available tokenizers: jieba or char")
import argparse
import os
import numpy as np
import pandas as pd
import pickle
import jieba

jieba.setLogLevel(20)


def set_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_data_path', default='data\\train.csv')
    parser.add_argument('--test_data_path', default='data\\test_public.csv')
    parser.add_argument('--embed_data_path', default='data\\Tencent_AILab_ChineseEmbedding.txt')
    parser.add_argument('--out_embed_dir', default='output')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = set_args()
    # load data, get vocab
    print('start load data.....')
    # argparse returns a Namespace, so use attribute access rather than indexing
    train_contents = pd.read_csv(args.train_data_path)['content'].tolist()
    test_contents = pd.read_csv(args.test_data_path)['content'].tolist()
    contents = train_contents + test_contents
    wordset = set()
    for content in contents:
    fin.close()
    return alltext, accu_label, law_label, time_label


def train_SVC(vec, label, class_weight=None):
    SVC = LinearSVC(class_weight=class_weight)
    # SVC = LinearSVC()
    SVC.fit(vec, label)
    return SVC


if __name__ == '__main__':
    import logging
    jieba.setLogLevel(logging.CRITICAL)

    logfilename = "train.log"
    root_logger = create_logger()
    logger = create_logger(logfilename)

    dim = int(sys.argv[1])
    seg_method = sys.argv[2]
    ngram = int(sys.argv[3])
    min_df = int(sys.argv[4])
    train_fname = sys.argv[5]
    test_filename = sys.argv[6]
    class_weight = sys.argv[7]
    if class_weight == 'none':
        class_weight = None

    # train
help="don't use the Hidden Markov Model") parser.add_argument("-q", "--quiet", action="store_true", default=False, help="don't print loading messages to stderr") parser.add_argument("-V", '--version', action='version', version="Jieba " + jieba.__version__) parser.add_argument("filename", nargs='?', help="input file") args = parser.parse_args() if args.quiet: jieba.setLogLevel(60) delim = text_type(args.delimiter) cutall = args.cutall hmm = args.hmm fp = open(args.filename, 'r') if args.filename else sys.stdin if args.dict: jieba.initialize(args.dict) else: jieba.initialize() if args.user_dict: jieba.load_userdict(args.user_dict) ln = fp.readline() while ln: l = ln.rstrip('\r\n')
    '''
    Cut Chinese string into multiple sentences.
    '''
    para = re.sub('([,;:。!?\?])([^』」”’])', r"\1\n\2", para)
    para = re.sub('(\.{6})([^』」”’])', r"\1\n\2", para)
    para = re.sub('(\…{2})([^』」”’])', r"\1\n\2", para)
    para = re.sub('([。!?\?][』」”’])([^,。!?\?])', r'\1\n\2', para)
    para = para.rstrip()
    return para.split('\n')


if __name__ == '__main__':
    assert not os.path.exists('finished_files')
    cmd = os.popen('mkdir finished_files/')
    cmd = os.popen('mkdir finished_files/train')
    cmd = os.popen('mkdir finished_files/val')
    cmd = os.popen('mkdir finished_files/test')

    jieba.setLogLevel(logging.ERROR)
    jieba.initialize()

    # Count entries in LCSTS
    filename = 'LCSTS/DATA/PART_I.txt'
    cmd = os.popen('cat %s | grep -c "</doc>"' % filename)
    data_count = int(cmd.read().strip())

    # Data counter
    data_iter = 0
    train_iter = 0
    val_iter = 0
    test_iter = 0
    vocab_counter = Counter()

    # Collect lines for soup
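A self-contained sketch of the sentence-splitting idea used at the top of the snippet above (the function name and sample text are mine, not the original file's): insert a newline after sentence-ending punctuation unless it is followed by a closing quote, then split on newlines.

import re

def split_sentences_sketch(para):
    # simplified: only the plain sentence-ending punctuation rule
    para = re.sub(u'([。!?\?])([^』」”’])', r"\1\n\2", para)
    para = para.rstrip()
    return para.split('\n')

print(split_sentences_sketch(u'今天天气很好。我们去公园吧!好啊?'))
# ['今天天气很好。', '我们去公园吧!', '好啊?']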
# coding: utf-8
import re
import jieba
import logging
from functools import partial

jieba.setLogLevel(logging.INFO)

PUNCTS_PATTERN = re.compile(ur"[.,;:!?'\"~\[\]\(\)\{\}_—。….,;、:!?‘’“”〕《》【】〖〗()「」~]")
SPACES_PATTERN = re.compile(ur"[\r\n\t\u00a0 ]")
SENT_SEP = u'。,!?~;:.,!?:;'


def encode_from_unicode(text):
    """Convert text to str (UTF-8 bytes)."""
    return text.encode('utf-8') if isinstance(text, unicode) else text


def decode_to_unicode(text):
    """Convert text to unicode."""
    return text.decode('utf-8') if isinstance(text, str) else text


def to_halfwidth(text):
    """Convert full-width characters in the text to half-width characters."""
    text = decode_to_unicode(text)
    res = u''
    for uchar in text:
# -*- coding: UTF-8 -*-
import sqlite3
import jieba
import logging

jieba.setLogLevel(logging.INFO)  # suppress jieba's messages

conn = sqlite3.connect('./QA_data/QA.db')
cursor = conn.cursor()

stop_words = []
with open('./QA_data/stop_words.txt', encoding='gbk') as f:
    for line in f.readlines():
        stop_words.append(line.strip('\n'))


def match(input_question):
    res = []
    cnt = {}
    question = list(jieba.cut(input_question, cut_all=False))  # segment the query string
    for word in reversed(question):  # remove stop words
        if word in stop_words:
            question.remove(word)
    for tag in question:  # build a query for each tag
        keyword = "'%" + tag + "%'"
        result = cursor.execute("select * from QA where tag like " + keyword)
        for row in result:
            if row[0] not in cnt.keys():
                cnt[row[0]] = 0
            cnt[row[0]] += 1  # count how often each record appears
    try:
# [email protected]
# featurize the text data in the ./data folder and save it as a .npz file
# do not import this module
import os
import pdb
import logging
import numpy as np
from time import time
import jieba
import jieba.analyse

jieba.setLogLevel(logging.ERROR)


#########################################################
#### functions ##########################################
#########################################################
def load_data(type):
    assert type == 'train' or type == 'test'
    label_file = 'data/' + type + '2.rlabelclass'
    sample_path = 'data/' + type + '2'

    dataset = []  # list of data, each element is a tuple (filename, label, content)
    label_dict = {}

    lines = open(label_file).read().strip().split('\n')
def __init__(self, input_queue):
    self._input_queue = input_queue
    self._stopWordList()
    jieba.setLogLevel(30)
# -*- coding: utf-8 -*-
# Licensed under the GNU GPLv2 - http://www.gnu.org/licenses/gpl-2.0.html
import sys
import time
import numpy as np
import jieba
import jieba.posseg as pseg
import cPickle as pickle

jieba.setLogLevel('NOTSET')


def property(val):
    if isinstance(val, str) or isinstance(val, unicode):
        cutlist = pseg.cut(val, HMM=False)
        return ''.join([flag for word, flag in cutlist])
    else:
        raise TypeError("param must be unicode")


def scalemax(vec):
    vec = np.array(vec, dtype=np.float32)
    if vec.max() != 0:
        vec = vec / vec.max()
    return list(vec)


def normalize(vec):
    vec = np.array(vec, dtype=np.float32)
    denominator = np.sum(vec)
    if denominator != 0:
        vec = vec / denominator
    return list(vec)
from typing import List
from pathlib import Path

import jieba
from torch import tensor

from ad_detector.logger import Logger
from ad_detector.config import device

jieba.setLogLevel('INFO')


def sentence2tensor(sentence: str, content_size: int, word2idx: dict, stop_words: List[str] = None) -> tensor:
    words = jieba.lcut(sentence)  # tokenize
    if stop_words is not None:
        words = [i for i in words if i not in stop_words]  # delete stop words
    ret = list()
    for i in words:  # word -> idx
        if i not in word2idx.keys():
            word2idx[i] = len(word2idx) + 1
        ret.append(word2idx[i])
    # if len(ret) > content_size:
    #     Logger('sentence2tensor').warning('content length out of size, result will be truncated.')
    while len(ret) < content_size:  # padding
        ret.append(0)
    ret = ret[:content_size]
    return tensor(ret, device=device)
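A small usage sketch for sentence2tensor above (the vocabulary and sentence are illustrative; ad_detector.config.device comes from the original project):

# Illustrative only: builds a fresh vocabulary and pads/truncates to 8 ids.
word2idx = {}
t = sentence2tensor("这是一个测试句子", content_size=8, word2idx=word2idx)
print(t.shape)  # torch.Size([8]); unseen words are added to word2idx, and 0 is the padding id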
                    nargs='?', const=' ',
                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all", action="store_true", dest="cutall",
                    default=False, help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)
delim = unicode(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

jieba.initialize()
ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
    ln = fp.readline()

fp.close()
@Author: randolph
@Date: 2020-06-01 23:58:59
@LastEditors: randolph
@LastEditTime: 2020-06-03 19:49:16
@version: 1.0
@Contact: [email protected]
@Description: Process the "300 Tang Poems" exercise with jieba
'''
import logging
import re
from collections import Counter

import jieba                     # natural language processing library
import jieba.posseg as pseg

jieba.setLogLevel(logging.INFO)  # raise jieba's log level to silence its debug output
jieba.initialize()               # initialize jieba manually to speed up later calls

POEM_FILE = 'e:/randolph/husky_pywork/poem_300/poem.txt'  # path to the source poem file


def route(ori_data):
    '''Entry-point dispatch on user input.'''
    flag = input('')
    if flag == "作者":
        n = int(input(''))
        count_authors(ori_data, n)  # count author name frequencies
    elif flag == "人物":
        n = int(input(''))
        count_names(n)              # count person name frequencies