def __init__(self, max_vocab=None, pad_token="<pad>", unk_token="<unk>", pad_id=0, unk_id=1,
             tokenize_method="char", user_dict=None, min_count=None):
    self.max_vocab = max_vocab
    self.pad_token = pad_token
    self.unk_token = unk_token
    self.pad_id = pad_id
    self.unk_id = unk_id
    self.word2index = {pad_token: pad_id, unk_token: unk_id}
    self.index2word = {pad_id: pad_token, unk_id: unk_token}
    self.min_count = min_count
    if tokenize_method.lower() == "char":
        self.tokenize_method = self.char_tokenize
    elif tokenize_method.lower() == "word":
        jieba.setLogLevel(20)
        self.tokenize_method = self.jieba_tokenize
        if user_dict is not None:
            jieba.load_userdict(user_dict)
    else:
        raise TypeError(f"bad tokenize method: {tokenize_method}")
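
# A minimal usage sketch of the constructor above; the class name "Vocab", the dictionary
# path and the sample sentence are hypothetical, and char_tokenize/jieba_tokenize are
# assumed to be defined elsewhere on the same class.
vocab = Vocab(max_vocab=50000, tokenize_method="word", user_dict="user_dict.txt")
tokens = vocab.tokenize_method("今天天气不错")  # dispatches to jieba_tokenize in "word" mode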
def bare_dict():
    negations = set(json.load(open(os.path.join(DATA_DIR, 'negations.json'))))
    with open(os.path.join(DATA_DIR, "degrees.json")) as f:
        degrees = json.load(f)
    with open(os.path.join(DATA_DIR, 'pos.txt')) as f:
        pos_emotion = set([x.strip() for x in f.readlines()])
    with open(os.path.join(DATA_DIR, 'neg.txt')) as f:
        neg_emotion = set([x.strip() for x in f.readlines()])
    # with open(os.path.join(DATA_DIR, 'pos_eva.txt')) as f:
    #     pos_envalute = set([x.strip() for x in f.readlines()])
    # with open(os.path.join(DATA_DIR, 'neg_eva.txt')) as f:
    #     neg_envalute = set([x.strip() for x in f.readlines()])
    # places = os.path.join(os.path.dirname(__file__), "../dictionaries/places.txt")
    # tokenizer.load_userdict(places)
    # with open(os.path.join(DATA_DIR, 'pos_sentence.txt')) as f1,\
    #         open(os.path.join(DATA_DIR, 'neg_sentence.txt')) as f2:
    #     s1 = set([x.strip() for x in f1.readlines()])
    #     s2 = set([x.strip() for x in f2.readlines()])
    #     pos_emotion.union(s1)
    #     neg_emotion.union(s2)
    pos_neg = pos_emotion.union(neg_emotion)
    # pos_neg_eva = pos_envalute.union(neg_envalute)
    tokenizer.load_userdict(pos_neg)
def evaluate(cut_mode):
    if cut_mode == "word":
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    Model = create_model(sess, 'test')
    Model.batch_size = 1
    sys.stdout.write('>')
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    sentence = sentence_cutter(sentence, cut_mode)
    while sentence:
        print('sentence: ', sentence)
        token_ids = dataset.convert_to_token(sentence, vocab_map)
        print('token_ids: ', token_ids)
        encoder_input, encoder_length, _ = Model.get_batch([(0, token_ids)])
        print('encoder_input: ', encoder_input, encoder_input.shape)
        print('encoder_length: ', encoder_length)
        score = Model.step(sess, encoder_input, encoder_length)
        print('Score: ', score[0][0])
        print('>', end='')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        sentence = sentence_cutter(sentence, cut_mode)
def my_wordcloud(filename):
    punct = str.maketrans("!.,:;-?※></()=,、。/[]《》", " ")
    plt.rcParams['font.sans-serif'] = 'PingFang TC'  # set the font
    # read the stop-word list
    stop = [line.strip() for line in open('stopwords.txt').readlines()]
    print('停用字長度', len(stop))
    all_segs = []
    with open(filename) as file:
        for line in file:
            # print(line)
            line = line.translate(punct)
            segs = line.split(' ')
            for anyy in segs:
                if len(anyy.strip()) > 2:
                    all_segs.append(anyy.strip())
    # print(all_segs)
    print(len(all_segs))
    jieba.load_userdict('userdict.txt')
    word_appear_times = {}
    for i in all_segs:
        # print('-'*30)
        # print(i, ':', list(jieba.cut(i, cut_all=False)))
        for anyy in list(jieba.cut(i, cut_all=True)):
            anyy = anyy.lower()
            if anyy not in stop and len(anyy.strip()) > 2:
                # print(anyy)
                if anyy not in word_appear_times:
                    word_appear_times[anyy] = 1
                else:
                    word_appear_times[anyy] += 1
            else:
                continue
        # print('-'*30, '\n')
        # time.sleep(3)
    # print(word_appear_times)
    word_appear_times_ordered = sorted(word_appear_times.items(), key=lambda x: x[1], reverse=True)
    top150 = word_appear_times_ordered[0:150]
    # print(top150)
    top150_word = ' '.join([x[0] for x in top150])
    print(top150_word)
    cloud_mask = np.array(Image.open("cloud_mask.png"))
    wc = WordCloud(colormap='RdYlGn', mask=cloud_mask, max_words=150, background_color="black",
                   scale=4, font_path='/System/Library/Fonts/PingFang.ttc')
    # generate the word cloud
    wc.generate(top150_word)
    wc.to_file(f'{filename[:-4]}.jpg')
def make_segment_file(file_path):
    print("start seg file")
    jieba.load_userdict("./seg_dict.txt")
    with open(file_path) as f:
        document = f.read()
    d_cut = jieba.cut(document)
    res = " ".join(d_cut)
    with open("./segment_wiki.txt", "w") as f:
        f.write(res)
    print("segment file ok")
def map_get_words(txts, kind="char", return_type="str"):
    if isinstance(txts, str):
        with open(txts, "r") as f:
            txts = [row.strip() for row in f.readlines()]
    jieba = None
    if kind == "word":
        import jieba_fast as jieba
        jieba.initialize()
        jieba.load_userdict("dict_fasttext.txt")
    txts = list(map(lambda txt: get_words(txt, kind, return_type, jieba), txts))
    return txts
def extract_tag():
    # word segmentation with jieba_fast
    jieba.load_userdict(dictionary)
    data = []
    jiebaAnalyse.set_stop_words("../resources/stopWord.txt")
    with open(data_file, 'r', encoding='utf8') as f:
        for no, line in enumerate(f):
            data.append(jiebaAnalyse.extract_tags(line, topK=50, allowPOS=['n']))
    print(data)
def word_seg(input_file, output_file, mode):
    if mode == 'word':
        jieba.load_userdict(dict_path)
    with open(output_file, 'w') as f, open(input_file, 'r') as fi:
        for l in fi:
            # remove all whitespace characters
            l = ''.join(l.split())
            if mode == 'char':
                f.write(' '.join(list(l)) + '\n')
            else:
                seg = jieba.cut(l, cut_all=False)
                f.write(' '.join(seg) + '\n')
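
# A minimal call sketch for word_seg above; it assumes the module already defines
# `dict_path` and `import jieba_fast as jieba`. The file names are placeholders.
word_seg("raw_corpus.txt", "segmented_word.txt", mode="word")  # jieba word segmentation
word_seg("raw_corpus.txt", "segmented_char.txt", mode="char")  # character-level split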
def update_userdict(userdict_old_file, userdict_new_file):
    userdict_old_df = pd.read_csv(userdict_old_file, names=['word', 'freq', 'pos_tag'],
                                  sep=' ', encoding='utf-8')  # [['word',]]
    userdict_old_df['word'] = userdict_old_df['word'].astype('str')
    # pool = Pool(processes=num_of_cpu)
    # userdict_old_df = pool.map(read_userdict, userdict_old_file)
    # pool.close()
    # userdict_old_df.drop('word_len', axis=1, inplace=True)
    userdict_old_df['word_len'] = userdict_old_df['word'].str.len()
    userdict_old_df.sort_values('word_len', ascending=True, inplace=True)
    # for i in tqdm(range(1, 37)):
    word_len_list = sorted(userdict_old_df['word_len'].value_counts().index.tolist())
    for i in tqdm(word_len_list):
        try:
            jieba.load_userdict(userdict_new_file)
            print('loaded new userdict')
        except:
            print('cannot load new userdict')
            pass
        df_processing = userdict_old_df[(userdict_old_df['word_len'] == i)]
        print('words_len: {}, no. of words in old userdict: {}'.format(i, df_processing.shape[0]))
        # if df_processing.shape[0] == 0:
        #     print('No word with length of {}'.format(i))
        #     pass
        # else:
        #     print('processing word with length of {}'.format(i))
        df_processing.drop('word_len', axis=1, inplace=True)
        df_processing.drop_duplicates('word', keep='last', inplace=True)
        df_processing['freq'] = df_processing['word'].swifter.apply(cal_jieba_freq)
        print('words_len: {}, new userdict: {}'.format(i, df_processing.shape[0]))
        # print(df_processing.head(20))
        df_processing.to_csv(userdict_new_file, mode='a', sep=' ', index=None,
                             header=None, encoding='utf-8')
def test(filename):
    if FLAGS.src_word_seg == 'word':
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    sess = tf.Session()
    src_vocab_dict, _ = data_utils.read_map(source_mapping)
    trg_vocab_dict, _ = data_utils.read_map(target_mapping)
    model = create_seq2seq(sess, 'TEST')
    model.batch_size = 1
    # model.decoder_max_len = None
    # sources = ["你是誰", "你是誰"]
    # targets = ["你是不是想人家", "我是說你是我老婆"]
    df = pd.read_csv(filename)
    df = df.fillna('')
    sources = list(df["context"])
    targets = list(df["utterance"])
    scores = []
    for source, target in zip(sources, targets):
        if FLAGS.src_word_seg == 'word':
            source = ' '.join(jieba.lcut(source))
        elif FLAGS.src_word_seg == 'char':
            source = ' '.join([s for s in source])
        if FLAGS.trg_word_seg == 'word':
            target = ' '.join(jieba.lcut(target))
        elif FLAGS.trg_word_seg == 'char':
            target = ' '.join([t for t in target])
        src_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(source), src_vocab_dict, False)
        trg_token_ids = data_utils.convert_to_token(tf.compat.as_bytes(target), trg_vocab_dict, False)
        trg_len = len(trg_token_ids)
        for i, bucket in enumerate(buckets):
            if bucket[0] >= len(src_token_ids):
                bucket_id = i
                break
        encoder_input, decoder_input, weight = model.get_batch({bucket_id: [(src_token_ids, [])]}, bucket_id)
        output = model.run(sess, encoder_input, decoder_input, weight, bucket_id)[:trg_len]
        output = [o[0][t] for t, o in zip(trg_token_ids, output)]
        output = np.mean(output)
        scores.append(output)
    scores = np.mean(scores)
    return scores
def gen_tokenize_method(self, split_type, user_dict=None, bert_vocab=None):
    lower_split_type = split_type.lower()
    if lower_split_type == "char":
        return self._char_split
    if lower_split_type == "word":
        jieba.setLogLevel(20)
        if user_dict is not None:
            jieba.load_userdict(user_dict)
        return self._word_split
    if lower_split_type == "word_piece":
        bert_vocab = bert_vocab or self.local_bert
        tokenizer = BertTokenizer.from_pretrained(bert_vocab)
        return partial(self._piece_split, tokenizer)
    raise TypeError(f"error tokenize type: {split_type}")
def get_name(self, sentence):
    # extract the company name from `sentence`
    # load the company-name dictionary
    jieba.load_userdict(os.path.join(self.abs_path, 'company_list.txt'))
    # no full-mode segmentation: company names are relatively coarse-grained
    sentence_seged = jieba.cut(sentence.strip(), cut_all=False)
    stopwords = self.stopwordslist(os.path.join(self.abs_path, "HIT_STOP.txt"))
    com_name = ''  # the company name
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                com_name = com_name + word + ' '
    # filter by POS tag
    com_name = jieba_fast.analyse.extract_tags(com_name, topK=1, withWeight=False, allowPOS=('x'))
    return com_name[0] if len(com_name) > 0 else ' '  # return the company name
def _data_pre(self, title_list, content_list, label_list, pred_list):
    if len(title_list) == len(content_list) == len(label_list):
        self.data_size = len(title_list)
    else:
        print("The lengths of the input lists should be equal.")
        return
    data = [a + b[:min(400, len(b))] for a, b in zip(title_list, content_list)]
    count_vect = CountVectorizer()
    jieba_fast.load_userdict("/data1/sina_dw/shichen/FastText/cppjieba/dict/user.dict.utf8")
    train_data_terms = [self._cut(r) for r in data]
    train_data = count_vect.fit_transform(train_data_terms)
    lb = sklearn.preprocessing.LabelBinarizer()
    Y = lb.fit_transform(label_list)
    if self.classes_ is None:
        self.classes_ = lb.classes_
    if self.vocabulary_ is None:
        self.vocabulary_ = count_vect.vocabulary_
    return train_data, Y, train_data_terms, count_vect
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.vocab import Vectors
from config import user_dict
from utils.text_util import pretreatment
from utils.ml_util import init_unk
from functools import partial
import jieba_fast as jieba
import torch

jieba.setLogLevel(20)
jieba.load_userdict(user_dict)


class BatchWrapper(object):
    def __init__(self, batch_iter, x_var, y_vars):
        self.batch_iter = batch_iter
        self.x_var = x_var
        self.y_vars = y_vars

    def __iter__(self):
        for batch in self.batch_iter:
            x, lengths = getattr(batch, self.x_var)
            y_tensors = [getattr(batch, y_var).unsqueeze(1) for y_var in self.y_vars]
            y = torch.cat(tuple(y_tensors), dim=1)
            yield x, y, lengths

    def __len__(self):
        # assumed body: delegate the length to the wrapped iterator
        return len(self.batch_iter)
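
# A hypothetical usage sketch for BatchWrapper: wrap a torchtext BucketIterator so that
# iteration yields (x, y, lengths). The dataset and field names ("text", "label_a",
# "label_b") are assumptions, and the text Field is assumed to be built with
# include_lengths=True so that getattr(batch, x_var) unpacks into (tensor, lengths).
train_iter = BucketIterator(train_dataset, batch_size=32, sort_key=lambda ex: len(ex.text))
train_loader = BatchWrapper(train_iter, x_var="text", y_vars=["label_a", "label_b"])
for x, y, lengths in train_loader:
    print(x.shape, y.shape)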
'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')
"""data processing"""
'''
'''
jieba word-segmentation module
'''
import time
import jieba_fast
import jieba_fast.posseg as pseg
import sys
reload(sys)
sys.setdefaultencoding('utf8')

jieba_fast.load_userdict('/Users/zhuxinquan/Desktop/mykeyword.dict')
jieba_fast.add_word('烤鸭炉')

# store the stop words
fid2 = '/Users/zhuxinquan/Desktop/停用词调整_二手.txt'
stopword = {}
fid2 = open(fid2, 'r')
for j in fid2.readlines():
    stopword[j.strip().decode("utf-8")] = 1


def stop_word(line):
    data_line = line.strip()
    wordList = jieba_fast.cut(data_line)  # wordList is a generator
    outStr = ''
    t1 = time.time()
    for word in wordList:
def __init__(self):
    # load the Ant-related user dictionary
    jieba.load_userdict(JIEBA_DICT_SELF_DEFINE)
    pass
import glob, codecs, re, gzip
import jieba_fast as jieba

pdpath = '/home/fqx/Documents/pd-corpus/**/*.txt'
corpuspath = 'corpus/pd-aio.txt.gz'
paragraphbreak = re.compile('[‖ ]')
linebreak = re.compile('[【】。!?… ]')

jieba.load_userdict('names.txt')
jieba.enable_parallel(4)

aiofile = gzip.open(corpuspath, 'wt', encoding='utf-8')
pdfiles = glob.glob(pdpath, recursive=True)
for addr in pdfiles:
    print('Processing %s' % addr)
    try:
        file = codecs.open(addr, 'r', 'GB18030')
        lines = file.readlines()
        file.close()
    except UnicodeDecodeError:
        print('Decoding Error!')
        continue
    for line in lines:
        paras = re.split(paragraphbreak, line)
        for para in paras:
            reallines = re.split(linebreak, para)
            for realline in reallines:
                if len(realline) > 19:
                    words = jieba.cut(realline)
                    realwords = ' '.join(words)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re, time, glob
import jieba_fast as jieba
from gensim import corpora, models, similarities
import logging
import gensim
from gensim.similarities import Similarity
import thulac

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# thu1 = thulac.thulac(user_dict='E:\\codetest\\fintech\\topic2\\THUOCL_caijing.txt', seg_only=True, filt=True)
dicts = glob.glob('E:\\codetest\\fintech\\topic2\\userdict\\*')
for d in dicts:
    print(d)
    jieba.load_userdict(d)


def readFile():
    train_data = 'E:\\codetest\\fintech\\topic2\\train_data.csv'
    test_data = 'E:\\codetest\\fintech\\topic2\\test_data.csv'
    stop = 'E:\\codetest\\fintech\\topic2\\stopwords.dat'
    trainD = pd.read_csv(train_data, skiprows=0, index_col='id')
    testD = pd.read_csv(test_data, index_col='id')
    stopword = [line.strip() for line in open(stop, encoding='utf-8').readlines()]
    return trainD, testD, stopword


def tcutword(data, stopword):
    corpora_documents = []
    for i in data.index:
        text = data.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)  # strip special symbols
import sys
from google_images_download import google_images_download
import speech_recognition as sr
from jieba_fast.analyse import extract_tags
import jieba_fast as jieba

jieba.set_dictionary("app/dictionary.txt")
jieba.load_userdict("app/dictionary.txt")
with open("app/dictionary.txt", "r") as f:
    places_list = [d.strip().split(' ')[0] for d in f.readlines()]


def asr_result(way="file", file_path=None):
    counter = 0
    r = sr.Recognizer()
    if way == "mic":
        with sr.Microphone() as source:
            print("請開始說話:")
            audio = r.listen(source)
    elif way == "file":
        with sr.AudioFile(file_path) as source:
            audio = r.record(source)  # read the entire audio file
    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        # instead of `r.recognize_google(audio)`
        result = r.recognize_google(audio, language='zh-tw')
        print("辨識結果: " + result)
        return result
    except sr.UnknownValueError:
        counter += 1
                   sep=' ', names=['word', 'ner'])

#####---------------- Load jieba data ----------------------------------------------------
jieba_dict_folder = existing_ner_folder
jieba_dict_filename = 'userdict.txt'
jieba_dict_file = os.path.join(jieba_dict_folder, jieba_dict_filename)
print('loading jieba_dict')
jieba_dict_df = pd.read_csv(jieba_dict_file, sep=' ', names=['word', 'freq', 'pos'])
print('loading jieba.set_dictionary')
jieba.set_dictionary(jieba_dict_file)
print('loading jieba.load_userdict')
jieba.load_userdict(jieba_dict_file)

#####---------------- Load ner2pos data ----------------------------------------------------
ner2pos_folder = existing_ner_folder
ner2pos_filename = 'ner2pos.json'
ner2pos_file = os.path.join(ner2pos_folder, ner2pos_filename)
print('loading ner2pos')
with open(ner2pos_file, 'r') as fp:
    ner2pos = json.load(fp)
pos2ner = {v: k for k, v in ner2pos.items()}
# print(pos2ner)
print('\n\n')

#####---------------- Load ner2pos data ----------------------------------------------------
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        for w, f in jieba_fast.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    cutfunc = jieba.cut

delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
    if PY2:
        result = result.encode(default_encoding)
    print(result)
    ln = fp.readline()

fp.close()
def tag_one_file_test(file: str):
    '''
    Test version: prints the extracted tags to the console.
    Needs internally: the path of a txt dictionary of NBA-specific terms, and the path of a stop-word txt file.
    :param file: path of a json file whose keys are title, url, reply, views, comefrom, time, text, tags
    :return:
    '''
    import json
    import jieba_fast as jb
    jb.load_userdict('dict3.txt')  # domain-specific dictionary
    import jieba_fast.analyse as jbana
    textranker = jbana.TextRank()
    tfidfer = jbana.TFIDF()  # keyword extractors
    textranker.set_stop_words('stop.txt')
    tfidfer.set_stop_words('stop.txt')  # stop words
    try:
        json_file = open(file, 'r', encoding='utf-8')
    except IOError:
        print('fail to open', file, IOError, sep=' ')
        return
    doc_list = json.load(json_file)
    for doc in doc_list:
        title = doc['title']
        text = doc['text']
        true_tag = doc['tags'] if 'tags' in doc.keys() else None
        whole = title + ' '
        for string in text:
            whole = whole + string  # concatenate the whole article
        split_gen = jb.cut(whole)  # a generator
        split_whole = ''
        for s in split_gen:
            split_whole = split_whole + ' ' + s  # the segmented article
        tag1w = tfidfer.extract_tags(split_whole, topK=10, withWeight=True, allowPOS=('nr', 'nz'))
        tag2w = tfidfer.extract_tags(split_whole, topK=10, withWeight=True,
                                     allowPOS=('an', 'b', 'j', 'l', 'Ng', 'n', 'nr', 'ns', 'nz', 'nt'))
        tag3w = textranker.textrank(
            split_whole, topK=10, withWeight=True
            # , allowPOS=('nr', 'nz', 'n')
            # , allowPOS=(
            #     'Ag', 'a', 'ad', 'an', 'b', 'c', 'dg', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'Ng',
            #     'n', 'nr', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 's', 'tg', 't', 'u'
            #     , 'vg', 'v', 'vd', 'vn', 'w', 'x', 'y', 'z', 'un')  # all pos
            # , allowPOS=(
            #     'Ag', 'a', 'ad', 'an', 'b', 'c', 'dg', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'Ng',
            #     'n', 'nr', 'ns', 'nt', 'nz', 'o', 'p', 'q', 'r', 's', 'u', 'w', 'y', 'z', 'un')
            , allowPOS=('an', 'b', 'j', 'k', 'l', 'Ng', 'n', 'nr', 'ns', 'nz', 'vn', 's'))
        tag1 = []
        wei1 = []
        for tup in tag1w:
            tag1.append(tup[0])
            wei1.append(tup[1])
        tag2 = []
        wei2 = []
        for tup in tag2w:
            tag2.append(tup[0])
            wei2.append(tup[1])
        tag3 = []
        wei3 = []
        for tup in tag3w:
            tag3.append(tup[0])
            wei3.append(tup[1])
        #: below, the three extractors vote for the final keywords
        # Scheme 1:
        # final_tag = set()
        # import random
        # choose_size = 5
        # if len(tag1) < choose_size:
        #     final_tag = final_tag.union(set(tag1))
        # else:
        #     cho1 = random.choices(tag1, weights=wei1, k=choose_size)
        #     final_tag = final_tag.union(set(cho1))
        #
        # if len(tag2) < choose_size:
        #     final_tag = final_tag.union(set(tag2))
        # else:
        #     cho2 = random.choices(tag2, weights=wei2, k=choose_size)
        #     final_tag = final_tag.union(set(cho2))
        #
        # if len(tag3) < choose_size:
        #     final_tag = final_tag.union(set(tag3))
        # else:
        #     cho3 = random.choices(tag3, weights=wei3, k=choose_size)
        #     final_tag = final_tag.union(set(cho3))
        # Scheme 2:
        final_tag = list()
        import random
        choose_size = 5  # a parameter controlling how many keywords are drawn in each random selection
        if len(tag1) < choose_size:
            final_tag.extend(tag1w)
        else:
            # weighted random draw of choose_size keywords, using the extractor's weights
            cho1 = random.choices(tag1w, weights=wei1, k=choose_size)
            final_tag.extend(cho1)  # merge each extractor's proposals
        if len(tag2) < choose_size:
            final_tag.extend(tag2w)
        else:
            cho2 = random.choices(tag2w, weights=wei2, k=choose_size)
            final_tag.extend(cho2)
        if len(tag3) < choose_size:
            final_tag.extend(tag3w)
        else:
            cho3 = random.choices(tag3w, weights=wei3, k=choose_size)
            final_tag.extend(cho3)
        final_tag.sort(key=lambda x: float(x[1]), reverse=True)  # sort all proposed keywords by weight (duplicates possible)
        tag = []
        wei = []
        for tup in final_tag:
            tag.append(tup[0])
            wei.append(tup[1])
        if len(tag) < 2 * choose_size:
            final_tag = tag
        else:
            choose = random.choices(tag, weights=wei, k=2 * choose_size)
            final_tag = choose  # one more weighted random draw, of size 2 * choose_size
        final_tag = set(final_tag)  # deduplicate
        print('get tags:', final_tag)
        # possible refinement: deduplicate first while keeping the descending order;
        # the current results already look fine, sometimes even better than the sorted version
        # final_tag = set(final_tag)
        # final_tag = sorted(final_tag, key=lambda x: float(x[1]), reverse=True)  # sort keywords by weight
        # tag = []
        # wei = []
        # for tup in final_tag:
        #     tag.append(tup[0])
        #     wei.append(tup[1])
        # if len(tag) < 2 * choose_size:
        #     final_tag = tag
        # else:
        #     choose = random.choices(tag, weights=wei, k=2 * choose_size)
        #     final_tag = choose
    json_file.close()
        if isNature:
            nture = w.nature
            chN = natures[nture.toString()]
            r.append(word + ":( " + nture.toString() + chN + ")")
        else:
            r.append(word.strip())
    return r


if __name__ == '__main__1':
    out_file = "./data/userdict1.txt"
    user_dict(out_file)

if __name__ == '__main__':
    jieba.load_userdict('./data/userdict.txt')
    stopwords = [
        line.rstrip() for line in open('./data/stopwords.txt', encoding='utf-8')
    ]
    file = "./data/hotword.txt"
    with open(file, mode="r", encoding="utf-8") as fp:
        i = 0
        for line in fp.readlines()[:]:
            text = json.loads(line)['content']
            print(i)
            cleanedText = clean_text(text)
            print("-" * 60 + "清洗之后:")
            print(cleanedText)
            c = [word for word in jieba.cut(cleanedText)]
# e_ner = pd.read_csv(e_ner_file, sep=' ', header=0, encoding='utf-8')['word'].values
#
#
sentences_folder = 'sentences'
NER_resources_folder = 'NER_resources'
bio_folder = 'bio_corpus'

ner_txtfilename = 'ner_results.txt'
ner_txtfile = os.path.join(NER_resources_folder, ner_txtfilename)
ners = pd.read_csv(ner_txtfile, sep=' ', names=['word', 'ner'], encoding='utf-8')

jieba.set_dictionary(os.path.join(NER_resources_folder, 'userdict_ner.txt'))
jieba.load_userdict(os.path.join(NER_resources_folder, 'userdict_ner.txt'))

sentence_filename = 'whole.txt'
sentence_file = os.path.join(sentences_folder, sentence_filename)


def convert_sentence_bio(sentence):
    word_list, label_list = [], []
    for c in jieba.lcut(sentence.strip('\n'), cut_all=False, HMM=False):
        if c not in ners.word.values:
            c_tag = 'O'
            word_tag = list(zip(list(c), [c_tag] * len(c)))
        else:
            c_ner = ners[ners['word'] == c]['ner'].values[0]
def __init__(self):
    if comConfig.use_dict:
        jieba.load_userdict(fileConfig.dir_jieba + fileConfig.file_jieba_dict)
from train import run
from flags import FLAGS
import jieba_fast as jieba

jieba.load_userdict("data/dict_fasttext.txt")

if __name__ == "__main__":
    # infer_file = "data/train_pos.csv"
    infer_file = FLAGS.inference_data_path
    mean_prob = run([infer_file], mode="infer", jieba=jieba)
    print("coh2 score: ", mean_prob)
    if FLAGS.log_path:
        with open(FLAGS.log_path, "a") as f:
            f.write("coh2: %s\n" % mean_prob)
import json
import os
import jieba_fast as jieba
import gensim
from gensim import corpora

jieba.load_userdict("F:/代码code/DSTS/dict/dict_baidu_utf8.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/dict_pangu.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/dict_sougou_utf8.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/dict_tencent_utf8.txt")
jieba.load_userdict("F:/代码code/DSTS/dict/my_dict.txt")

stopwords = []  # build the stop-word list
for line in open('F:/代码code/DSTS/dict/Stopword.txt', encoding='UTF-8'):
    x = line.split('\n')[0]
    stopwords.append(x)


def LDA_topic(text):
    vector = []
    seg_list = jieba.lcut(text, cut_all=False)  # segment the text into a list of words
    result = []
    for j in seg_list:  # drop stop words
        if j not in stopwords and j != ' ':
            result.append(j)
    result = [result]
    dictionary = corpora.Dictionary(result)  # build a dictionary that assigns an index to every word
    # using that dictionary, convert the document list (corpus) into a document-term matrix
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in result]
    model = gensim.models.LdaModel.load('F:/代码code/DSTS/LDA/lda.model')  # load the trained model
    for e, values in enumerate(model.inference(doc_term_matrix)[0]):
        for ee, value in enumerate(values):
def tag_one_file2(file: str):
    '''
    Server version: extracts tags for one json file and writes the whole tagged json back to the original file.
    Needs internally (can be hard-coded): the path of the NBA-specific dictionary ('../NBAdict.txt'),
    the path of the stop-word list ('../stopword.txt'), and the choose_size parameter that controls how many
    of the extractors' proposals are randomly accepted.
    :param file: path of a json file; an array whose elements are news reports with keys
                 title, url, reply, views, comefrom, time, text, tags
    :return: outputs the json file with the added tags
    '''
    import json
    import jieba as jb0
    jb0.setLogLevel(logging.INFO)
    import jieba_fast as jb
    jb.setLogLevel(logging.INFO)  # keep jieba from printing debug messages
    try:
        jb.load_userdict('NBAdict.txt')  # domain-specific dictionary
    except:
        logger.exception('fail to open dictionary')
    from jieba_fast.analyse.textrank import TextRank
    from jieba_fast.analyse.tfidf import TFIDF
    textranker = TextRank()
    tfidfer = TFIDF()  # keyword extractors
    try:
        textranker.set_stop_words('stopword.txt')
        tfidfer.set_stop_words('stopword.txt')  # stop words
    except:
        logger.exception('fail to set stop words')
    try:
        json_file = open(file, 'r', encoding='utf-8')
    except IOError as ioe:
        logger.exception('fail to open ' + file)  # failed to open the file
        raise ioe
    try:
        doc_list = json.load(json_file)
    except Exception as e:  # error type unknown
        logger.exception('fail to load json file:' + file)  # failed to load the file
        json_file.close()
        raise e
    for doc in doc_list:  # for each news report
        keys = doc.keys()
        title = doc['title'] if 'title' in keys else ''
        text = doc['text'] if 'text' in keys else ''
        old_tag = doc['tags'] if 'tags' in keys else []  # the originally crawled tags (empty list so the merge below works)
        whole = title + ' '
        for string in text:  # iterating an empty value simply skips the loop
            whole = whole + string  # concatenate the whole article
        split_gen = jb.cut(whole)  # segment using the dictionary; a generator
        split_whole = ''
        for s in split_gen:
            split_whole = split_whole + ' ' + s  # the segmented article
        # three extractors; each proposal is a list of (tag, weight) pairs
        tag1w = tfidfer.extract_tags(split_whole, topK=10, withWeight=True, allowPOS=('nr', 'nz'))
        tag2w = tfidfer.extract_tags(split_whole, topK=10, withWeight=True,
                                     allowPOS=('an', 'b', 'j', 'l', 'Ng', 'n', 'nr', 'ns', 'nz', 'nt'))
        tag3w = textranker.textrank(split_whole, topK=10, withWeight=True,
                                    allowPOS=('an', 'b', 'j', 'k', 'l', 'Ng', 'n', 'nr', 'ns', 'nz', 'vn', 's'))
        #: collect the weights into lists for the weighted random selection below
        # tag1 = []
        wei1 = []
        for tup in tag1w:
            # tag1.append(tup[0])
            wei1.append(tup[1])
        # tag2 = []
        wei2 = []
        for tup in tag2w:
            # tag2.append(tup[0])
            wei2.append(tup[1])
        # tag3 = []
        wei3 = []
        for tup in tag3w:
            # tag3.append(tup[0])
            wei3.append(tup[1])
        final_tagw = list()
        import random
        choose_size = 5  # a parameter controlling how many keywords are drawn in each random selection
        if len(tag1w) < choose_size:
            final_tagw.extend(tag1w)
        else:
            # weighted random draw of choose_size keywords, using the extractor's weights
            cho1 = random.choices(tag1w, weights=wei1, k=choose_size)
            final_tagw.extend(cho1)  # merge each extractor's proposals
        if len(tag2w) < choose_size:
            final_tagw.extend(tag2w)
        else:
            cho2 = random.choices(tag2w, weights=wei2, k=choose_size)
            final_tagw.extend(cho2)
        if len(tag3w) < choose_size:
            final_tagw.extend(tag3w)
        else:
            cho3 = random.choices(tag3w, weights=wei3, k=choose_size)
            final_tagw.extend(cho3)
        final_tagw.sort(key=lambda x: float(x[1]), reverse=True)  # sort all proposed keywords by weight (duplicates possible)
        tag = []
        wei = []
        for tup in final_tagw:
            tag.append(tup[0])
            wei.append(tup[1])
        final_tag = list()
        if len(tag) < 2 * choose_size:
            final_tag = tag
        else:
            choose = random.choices(tag, weights=wei, k=2 * choose_size)
            final_tag = choose  # one more weighted random draw, of size 2 * choose_size
        final_tag = set(final_tag + old_tag)  # merge with the original tags and deduplicate
        doc['tags'] = list(final_tag)  # overwrite the original tags
    json_file.close()
    with open(file, 'wt', encoding='utf-8') as fo:
        json.dump(doc_list, fo, ensure_ascii=False)
def my_wordcloud(filename):
    t1 = time.time()
    # read the stop-word list
    stopwords = [line.strip() for line in open('stopwords.txt').readlines()]
    print('停用字長度', len(stopwords))
    print('filename : ', filename)
    jieba.load_userdict('userdict.txt')
    words = []
    with open(filename, 'r') as fileread:
        for line in fileread.readlines():
            line = line.replace(' ', '')
            line = line.replace('7-11', '統一超商')
            line = line.replace('711', '統一超商')
            line = line.replace('SEVEN', '統一超商')
            line = line.replace('7-eleven', '統一超商')
            line = line.replace('小7', '統一超商')
            line = line.replace('7Eleven', '統一超商')
            line = line.replace('seven', '統一超商')
            line = line.replace('小七', '統一超商')
            cutted = jieba.cut(line, cut_all=False)
            for word in cutted:
                if word.lower() not in stopwords:
                    words.append(word.lower())
    t2 = time.time()
    print(f'分詞使用時間 {t2-t1:.4f} s')
    print(len(words))
    # words_in_string = ' '.join(words)
    word_appear_times = {}
    for i in words:
        if i not in word_appear_times:
            word_appear_times[i] = 1
        else:
            word_appear_times[i] += 1
    # print(word_appear_times)
    word_appear_times_ordered = sorted(word_appear_times.items(), key=lambda x: x[1], reverse=True)
    # top150 = word_appear_times_ordered[0:150]
    # print(top150)
    top150_word = ' '.join([x[0] for x in word_appear_times_ordered[0:150]])
    # print(top150_word)
    cloud_mask = np.array(Image.open("cloud_mask.png"))
    wc = WordCloud(repeat=False, include_numbers=False, max_words=150, min_word_length=2,
                   colormap='RdYlGn', mask=cloud_mask, background_color="black", scale=4,
                   font_path='/System/Library/Fonts/PingFang.ttc')
    wc.generate(top150_word)
    wc.to_file(f'{filename[:-4]}.jpg')
    t3 = time.time()
    print(f'畫圖使用時間 {t3-t2:.4f} s')
from sklearn.metrics import accuracy_score

model_path = r'/home/sunxianwei/python/result/kind2level4/test_model_path'
bs_kind_cat4_system_path = r'/home/sunxianwei/python/data/kind/test_config_filepath/kind_system_allcat_20180601.csv'
stopwords_path = r'/home/sunxianwei/python/data/kind/test_config_filepath/filter_char.txt'
dataseg_path = r'/home/sunxianwei/python/data/kind/test_config_filepath/prod_clothes20180601_to_keywords.csv'
userdict_path = False
tfidf_max_features_config_dict = {1: None, 3: None, 5: None, 7: None, 10: None}

if userdict_path:
    print('加载自定义词典')
    jieba.load_userdict(userdict_path)

bs_kind_cat4_system = pd.read_csv(bs_kind_cat4_system_path, encoding='utf8')
bs_kind_cat4_system.fillna(0, inplace=True)
bs_kind_cat4_system.parent_catid = bs_kind_cat4_system.parent_catid.astype(int)
bs_kind_cat4_system = [{'catid': row['cat_id'], 'catname': row['cat_name'], 'parent_catid': row['parent_catid']}
                       for index, row in bs_kind_cat4_system.iterrows()]


def readdata_from_pgsql(username, passwd, queryStr):
    '''
    Two key fields need to be normalized; the other fields are kept as-is:
        prod_name: product name
        catid: category ID
    '''