def set_sentiment_and_viewpoint(self):
    self.hotelnlp = HotelNLP()
    self.thu = thulac("")
    comm_list = self.hotel_dao.get_remarks()
    print len(comm_list)
    sentiment_comm_list = []
    i = 0
    for comm in comm_list:
        if comm[8] is None or comm[9] is None:
            sentiment_value = None
            viewpoint = None
            remark = re.sub(u"\@", u"", comm[2])
            try:
                sentiment_value = self.hotelnlp.sentiment(remark.encode("utf-8"))
                sentiment_value = round(sentiment_value*1000)/1000
                print sentiment_value
            except:
                print comm[2]
                traceback.print_exc()
            try:
                viewpoint = self.hotelnlp.viewpoint(remark.encode("utf-8"), decoding="utf-8")
                viewpoint = json.dumps(viewpoint, ensure_ascii=False)
            except:
                print remark
                traceback.print_exc()
            comm = {"guid": comm[0], "senti_value": sentiment_value, "viewpoint": viewpoint}
            sentiment_comm_list.append(comm)
            if len(sentiment_comm_list) == 10000:
                i += 1
                print "update %d time" % i
                self.hotel_dao.update_remarks(sentiment_comm_list)
                sentiment_comm_list = []
def createTable(num): start = time.time() thu = thulac.thulac() file = open('agri_economic.json', encoding='utf-8') print("begin!") f = json.load(file) count = 0 file_text = "" for p in f: count += 1 if int(count/100) != num: continue if count % 10 == 0: cur = time.time() print("now id : " + str(count) + " table size :" ) print("Running Time : " + str(int(cur-start)) + " s......") detail = p['detail'] # if len(detail) > 600: # detail = detail[0:600] title = p['title'] # 分词 text = thu.cut(detail) wordList = createWordList(text) file_text += title for word in wordList: file_text += ' ' + word file_text += '\n' file_object = open('article'+str(num)+".txt",'w') file_object.write(file_text) file_object.close()
def count_word_frq(self):
    self.thu = thulac("-input cs.txt")
    comm_list = self.hotel_dao.get_remarks()
    sentiment_comm_list = []
    i = 0
    for comm in comm_list:
        a_dict = {}
        try:
            cut_comm = map(lambda x: x.split("_"), self.thu.cut(comm[2].encode("utf-8")))
        except:
            cut_comm = []
            print comm[2]
            traceback.print_exc()
        for word in cut_comm:
            if word[1].decode("utf-8") == "a":
                if word[0].decode("utf-8") not in a_dict:
                    a_dict[word[0].decode("utf-8")] = 1
                else:
                    a_dict[word[0].decode("utf-8")] += 1
        comm = {"guid": comm[0], "word_freq": json.dumps(a_dict, ensure_ascii=False)}
        sentiment_comm_list.append(comm)
        if len(sentiment_comm_list) == 10000:
            i += 1
            print "update %d time" % i
            self.hotel_dao.update_hotel_comm_word_freq(sentiment_comm_list)
            sentiment_comm_list = []
def preprocess(self, filepath, char_or_word='char'):
    cleaned_data = list()
    cleaned_label = list()
    with open(filepath, 'rb') as fr:
        for line in fr:
            items = line.strip().split('\t')
            label = items[0]
            review = items[1].decode()
            if char_or_word == 'word':
                import thulac
                cutter = thulac.thulac(seg_only=True, T2S=True, filt=True)
                words = cutter.cut(review)
                if len(words) < 1:
                    continue
                words, _ = zip(*words)
            elif char_or_word == 'char':
                words = list(review)
            else:
                raise ValueError('You must make sure the value of '
                                 '[char_or_word] is either char or word')
            if words in ([''], [' ']):
                continue
            words = map(lambda kk: kk.decode(), words)
            cleaned_data.append(words)
            cleaned_label.append(int(label))
    return cleaned_data, cleaned_label
def createTable(num): start = time.time() thu = thulac.thulac() file = open('agri_economic.json', encoding='utf-8') print("begin!") table = set() f = json.load(file) count = 0 file_text = "" for p in f: count += 1 if int(count/2000) != num: continue if count % 10 == 0: cur = time.time() print("now id : " + str(count) + " table size :" + str(len(table))) print("Running Time : " + str(int(cur-start)) + " s......") detail = p['detail'] # if len(detail) > 600: # detail = detail[0:600] title = p['title'] table.add(title) # 分词 text = thu.cut(detail) table = table | createWordSet(text) for t in table: file_text += t+' ' file_object = open('table'+str(num)+".txt",'w') file_object.write(file_text) file_object.close()
def __init__(self):
    self.tfidf = joblib.load('predictor/model/tfidf.model')
    self.law = joblib.load('predictor/model/law.model')
    self.accu = joblib.load('predictor/model/accu.model')
    self.time = joblib.load('predictor/model/time.model')
    self.batch_size = 1
    self.cut = thulac.thulac(seg_only=True)
def split(label='20020101am', output='20020101am_split.txt',
          input='20020101am_clean.txt', lac=thulac.thulac(seg_only=True)):
    # f_stopwords = open('./list/stop_words.txt', 'r')
    # stopwords = []
    # for line in f_stopwords.readlines():
    #     stopwords.append(line.strip())
    # f_stopwords.close()
    lac.cut_f(input, output)
    print 'DONE: ', label
def ch_seg_line(eachline):
    seg_line = ""
    thu1 = thulac.thulac(seg_only=True)  # segment only, no POS tagging
    seg_line = thu1.cut(eachline, text=True)  # segmented input_ch as a space-joined string
    seg_line = seg_line.strip()
    return seg_line
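# Usage sketch for ch_seg_line() above -- not part of the original snippet.
# Assumes the thulac model files are installed; with text=True, thulac returns
# one space-joined string, so this prints the segmented sentence directly.
if __name__ == "__main__":
    print(ch_seg_line("我爱北京天安门"))  # roughly: "我 爱 北京 天安门"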
def __init__(self, cwd=".", tfidf='statement_tfidf.model', gbt='statement_som_gbt.model'):
    print('train tfidf...', self.print_mem())
    self.tfidf = joblib.load(os.path.join(cwd, tfidf))
    print('train gbt...', self.print_mem())
    self.gbt = joblib.load(os.path.join(cwd, gbt))
    self.cut = thulac.thulac(seg_only=True)
def CutArticle(article):
    file = open(article, 'rb')
    data = file.read().decode('utf-8')
    file.close()
    thu = thulac.thulac()
    text = thu.cut(data)
    length = len(text)
    demo = createGenerator(text)
    return demo, length
def cut(content, method=1):
    """Segment `content`; method=0 uses thulac, otherwise return it unchanged."""
    if method == 0:
        import thulac
        thu1 = thulac.thulac(seg_only=True, filt=True)
        words = thu1.cut(content, text=True)  # segment a single sentence
    else:
        words = content
    return words
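# Hypothetical usage of cut() above (inputs invented for illustration): with
# method=0 the text goes through thulac; any other method returns it as-is.
if __name__ == "__main__":
    print(cut("我爱北京天安门", method=0))       # e.g. "我 爱 北京 天安门"
    print(cut("pre-segmented text", method=1))  # returned unchanged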
def testrmSpace():
    test_text1 = "而荔 波 肉又 丧 心 病 狂 的不肯悔改"
    test_text2 = "我爱北京天 安 门"
    thu = thulac.thulac(seg_only=True, rm_space=False)
    gold1 = thu.cut(test_text1, text=True)
    gold2 = thu.cut(test_text2, text=True)
    print(gold1, gold2)
    assert gold1 == "而 荔 波 肉 又 丧 心 病 狂 的 不 肯 悔改"
    assert gold2 == "我 爱 北京 天 安 门"
def thulac(source_text):
    """THULAC segmentator."""
    url = 'http://localhost:5000/'
    data = {'source_text': source_text}
    # resp = requests.post(url, data=data)
    # return resp.text.split(' ')
    thu = thulacSeg.thulac(seg_only=True, model_path="thulac/models/")
    segtxt = [x[0] for x in thu.cut(source_text)]
    return segtxt
def seg_pos(self, text):
    thu4car = thulac.thulac(user_dict=self.user_dict, seg_only=True)
    postagger = Postagger()
    postagger.load(self.pos_file)
    item = thu4car.cut(text, text=True)
    words = item.split(' ')
    postag = postagger.postag(words)
    print('|'.join([w + '_' + p for w, p in zip(words, postag)]))
    return words, postag
def __init__(self, filename):
    fin = open(filename, 'r')
    self.text = fin.read()
    mthulac = thulac.thulac(user_dict=None, model_path=None, T2S=False,
                            seg_only=False, filt=False)  # default mode
    lst = mthulac.cut(self.text)
    self.words = []
    for w in lst:
        self.words.append((w[0], w[1]))
    fin.close()
def main(args):
    with open(args.input, encoding='utf8') as f:
        lines = f.read().splitlines()
    if args.format == 'lines':
        lines.append('<song>')
    tot = len(lines)
    parsed_line = []
    thu1 = thulac.thulac(seg_only=True)
    with open(args.output, encoding='utf8', mode='w') as f:
        cnt = 0
        for line in lines:
            if args.format == 'lines':
                line = HanziConv.toSimplified(line)
            if cnt % 100 == 0:
                print('status: %d/%d' % (cnt, tot))
            cnt += 1
            if line == '<song>':
                if len(parsed_line) == 0:
                    continue
                n = len(parsed_line)
                # cap the total length of each output line at maxlen
                for i in range(n):
                    l = len(parsed_line[i])
                    if l > args.maxlen:
                        continue
                    ctrl_list = parsed_line[i]
                    for k in range(i + 1, n + 1):
                        if k == n or l + len(parsed_line[k]) + 1 > args.maxlen:
                            f.write(' '.join(ctrl_list) + '\n')
                            break
                        ctrl_list.append('<lbreak>')
                        ctrl_list += parsed_line[k]
                        l += len(parsed_line[k]) + 1
                parsed_line = []
                continue
            # segment with thulac or jieba
            if args.segment == 0:
                seg_list = jieba.lcut(line)
            else:
                seg_list = thu1.cut(line)
                seg_list = [t[0] for t in seg_list]
            seg_list2 = []
            for word in seg_list:
                seg_list2 += parse_segged_word(word)
            seg_list = seg_list2
            if args.segment == 0:
                seg_list2 = []
                for word in seg_list:
                    if word == '<num>':
                        seg_list2.append(word)
                    else:
                        seg_list2 += list(word)
                seg_list = seg_list2
            if len(seg_list) > 0:
                parsed_line.append(seg_list)
    print('Finished')
def test_cut_from_file(self):
    thu = thulac("-input cs.txt")
    neg_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            '..\\..\\main\\python\\service\\nlp\\pos.txt')
    neg_docs = codecs.open(neg_path, 'r', 'utf-8').readlines()
    for sent in neg_docs:
        try:
            thu.cut(sent.encode("utf-8"))
        except:
            print sent
            continue
def run(self):
    starttime = time.time()
    thu = thulac.thulac(seg_only=True)
    print('Thulac:')
    print("".join([(x + '/ ') for x in thu.cut(self.test_text, text=True) if x not in self.stopwords]))
    endtime = time.time()
    print('time cost:' + str(round((endtime - starttime), 4)) + ' seconds.\n')
def __init__(self, word2VecModelFilePath='Data/wiki_han_word2vec_300维度.model',
             weightpara=1e-3, isRemovePc=1, isUseThulac=True):
    self.weightpara = weightpara
    self.isRemovePc = isRemovePc
    self.model = Word2Vec.load(word2VecModelFilePath)
    self.word_index_map = {}
    for index, word in enumerate(self.model.wv.index2entity):
        self.word_index_map[word] = index
    self.vectors = self.model.wv.vectors
    self.isUseThulac = isUseThulac
    if isUseThulac:
        self._thulac = thulac.thulac(seg_only=True)
def __init__(self, pos_tags: bool = False, simplify: bool = False, filt: bool = False,
             only_tokens: bool = True, user_dict: str = None) -> None:
    seg_only = not pos_tags
    if user_dict and os.path.exists(user_dict):
        self.thunlp = thulac.thulac(seg_only=seg_only, T2S=simplify, filt=filt,
                                    user_dict=user_dict)
    else:
        self.thunlp = thulac.thulac(seg_only=seg_only, T2S=simplify, filt=filt)
    self._only_tokens = only_tokens
def Segmentation(wiki_file, Words_dict, output_text):
    thu1 = thulac.thulac(user_dict=Words_dict, seg_only=True)
    input_file = open(wiki_file, 'r', encoding="UTF-8")
    input_lines = input_file.readlines()
    output_object = open(output_text, 'w', encoding="UTF-8")
    for line in tqdm(input_lines):
        stripped = line.replace(" ", "")  # renamed to avoid shadowing the built-in str
        t = thu1.cut(stripped, text=True)
        output_object.write(t + "\n")
    input_file.close()
    output_object.close()
    print("清华分词处理完成")
def __init__(self):
    self.user_dict = None
    self.model_path = None   # defaults to thulac's bundled model_path
    self.T2S = True          # convert Traditional to Simplified Chinese
    self.seg_only = True     # segmentation only, no POS tags
    self.filt = False        # filter out meaningless words
    self.tokenizer = thulac.thulac(user_dict=self.user_dict, model_path=self.model_path,
                                   T2S=self.T2S, seg_only=self.seg_only, filt=self.filt)
def cut_text(alltext):
    count = 0
    cut = thulac.thulac(seg_only=True)
    train_text = []
    for text in alltext:
        count += 1
        if count % 2000 == 0:
            print(count)
        train_text.append(cut.cut(text, text=True))
    return train_text
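# Illustrative call of cut_text() above (inputs invented for the example).
# Each element of the returned list is one space-joined, segmented document.
if __name__ == "__main__":
    segmented = cut_text(["我爱北京天安门", "今天天气不错"])
    print(segmented)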
def __init__(self):
    '''
    prop_dic is a dict holding every property name in the knowledge base and its frequency.
    char_2_prop is an inverted index from characters to properties, used to speed up fuzzy matching.
    '''
    self.prop_dic = pickle.load(open('../data/prop_dic.pkl', 'rb'))  # keys are stored without quotes
    self.char_2_prop = pickle.load(open('../data/char_2_prop.pkl', 'rb'))
    self.segger = thulac.thulac()
    self.question2mention = pickle.load(open('../data/question_2_mention.pkl', 'rb'))
    print('prop extractor loaded')
def cut_qts_to_words(qts_file, saved_words_file):
    save_dir = os.path.dirname(saved_words_file)
    dumped_file = os.path.join(save_dir, 'qts_words_stat_result.pkl')
    if os.path.exists(dumped_file) and os.path.exists(saved_words_file):
        print('found preprocessed statistics, loading directly...')
        with open(dumped_file, 'rb') as f:
            char_counter, author_counter, vocab, word_counter, genre_counter = pickle.load(f)
    else:
        char_counter = Counter()              # character frequencies
        author_counter = Counter()            # number of poems per author
        vocab = set()                         # vocabulary
        word_counter = Counter()              # word frequencies
        genre_counter = defaultdict(Counter)  # one Counter per POS tag
        fid_save = open(saved_words_file, 'w', encoding='utf-8')
        lex_analyzer = thulac.thulac()        # segmenter
        line_cnt = 0
        with open(qts_file, 'r', encoding='utf-8') as f:
            for line in f:
                text_segs = line.split()
                author = text_segs[2]
                author_counter[author] += 1
                poem = text_segs[-1]
                # drop everything that is not a Chinese character or basic punctuation
                valid_char_list = [c for c in poem
                                   if '\u4e00' <= c <= '\u9fff' or c == ',' or c == '。']
                for char in valid_char_list:
                    char_counter[char] += 1
                regularized_poem = ''.join(valid_char_list)
                word_genre_pairs = lex_analyzer.cut(regularized_poem)
                word_list = []
                for word, genre in word_genre_pairs:
                    word_list.append(word)
                    vocab.add(word)
                    word_counter[word] += 1
                    genre_counter[genre][word] += 1
                save_line = ' '.join(word_list)
                fid_save.write(save_line + '\n')
                if line_cnt % 10 == 0:
                    print('%d poems processed.' % line_cnt)
                line_cnt += 1
        fid_save.close()
        # persist the statistics
        dumped_data = [char_counter, author_counter, vocab, word_counter, genre_counter]
        with open(dumped_file, 'wb') as f:
            pickle.dump(dumped_data, f)
    return char_counter, author_counter, genre_counter
def clean_with_tf_idf(in_file_name, tf_idf_name):
    thu0 = thulac.thulac()
    data = json.load(open(in_file_name, encoding='utf-8'))
    tf_idf = json.load(open(tf_idf_name, encoding='utf-8'))
    for index_, item_ in enumerate(data):
        text_ = item_['text']
        label_ = item_['merged_label']
        tmp_ = ' '.join([c[0] for c in thu0.fast_cut(text_) if c[0] in tf_idf[str(label_)]])
        item_['text'] = tmp_
    json.dump(data, open('okoo-merged-clean-cut-data.json', 'w', encoding='utf-8'),
              ensure_ascii=False, indent=2, separators=(',', ': '))
def __init__(self):
    self.tfidf = joblib.load(os.path.join(data_path, 'generated/tfidf.model'))
    self.law = joblib.load(os.path.join(data_path, 'generated/law.model'))
    self.accu = joblib.load(os.path.join(data_path, 'generated/accu.model'))
    self.time = joblib.load(os.path.join(data_path, 'generated/time.model'))
    self.batch_size = 1
    self.cut = thulac.thulac(seg_only=True)
def parse_args():
    parser = argparse.ArgumentParser(description='Check pairwise similarities of .docx files.')
    parser.add_argument('--dir', help='the directory containing .docx files', required=True)
    parser.add_argument('--out', help='the output file path', required=True)
    parser.add_argument('--hash-width', help='the word length of a hashing block (default 8)',
                        type=int, default=8)
    parser.add_argument('--hash-step', help='the word step between hashing block (default 1)',
                        type=int, default=1)
    parser.add_argument('--sample-cnt', help='sample count (default 1000)', type=int, default=1000)
    args = parser.parse_args()
    args.thu = thulac.thulac(seg_only=True)
    return args
def __init__(self):
    with open("chengyu_index_r.json", "r") as json_file:
        data = json_file.read()
    self.data_dict = json.loads(data)
    # for Chinese segmentation
    self.thu = thulac.thulac(
        user_dict=None,
        model_path=None,
        T2S=False,      # convert Traditional to Simplified Chinese
        seg_only=True,  # segmentation only
        filt=True,      # filter out meaningless words
        deli='_')       # delimiter between word and POS tag
def file_tag(src_dir, dst_dir, user_dict_path):
    src_dir = pathlib.Path(src_dir)
    dst_dir = pathlib.Path(dst_dir)
    seg = thulac.thulac(user_dict=user_dict_path)
    for src_file in src_dir.glob('*.txt'):
        assert src_file.is_file(), "Not found {0}".format(src_file)
        tag_path = pathlib.Path(dst_dir / src_file.name)
        # read file and process
        fr = src_file.open(mode='r')
        contents = []
        for line in fr.readlines():
            # skip blank lines
            if len(line.strip()) == 0:
                continue
            line = content_process(line)
            contents.append(line)
        # word tag
        with tag_path.open('w') as fw:
            for content in contents:
                for seg_clip in seg.cut(content):
                    words, tag = seg_clip[0], seg_clip[1]
                    word_status = 'o'
                    for word in words:
                        print(words, len(words))
                        # filter url
                        if tag in ['nz', 'ns', 'n', 'ni', 'uw'] and len(words) > 1 and len(words) < 10:
                            if word_status in ['IT', 'o']:
                                fw.write(word + ' ' + tag_map['n_begin'] + '\n')
                                word_status = 'BE'
                            else:
                                fw.write(word + ' ' + tag_map['n_internal'] + '\n')
                                word_status = 'IE'
                        elif tag in ['v', 'vn'] and len(words) > 1 and len(words) < 10:
                            if word_status in ['IE', 'o']:
                                fw.write(word + ' ' + tag_map['v_begin'] + '\n')
                                word_status = 'BT'
                            else:
                                fw.write(word + ' ' + tag_map['v_internal'] + '\n')
                                word_status = 'IT'
                        else:
                            fw.write(word + ' ' + tag_map['other'] + '\n')
def preprocess(dataset: str):
    global segment_tool, dictionary
    print('Loading Segment Model...')
    segment_tool = thulac(rm_space=True)
    print('Loading dictionary')
    dictionary = set(map(lambda s: s.rstrip('\n'),
                         open('dataset/dictionary.txt', encoding='utf-8').readlines()))
    dataset_list = (['train', 'test'], [dataset])
    for dataset_type, dataset_name in product(*dataset_list):
        with open('dataset/%s/%s_seg.txt' % (dataset_name, dataset_type), 'w', encoding='utf-8') as f:
            for line in handle_data('dataset/%s/%s.txt' % (dataset_name, dataset_type)):
                f.write(json.dumps(line, ensure_ascii=False) + '\n')
def thulac_pos(string):
    print('THULAC的分词和词性标注:')
    num = len(string)
    print(num)
    start_time = datetime.now()
    for s in string:
        seg = thulac.thulac()  # load the model (a user dictionary could be supplied here)
        pos_list = seg.cut(s)
    all_time = (datetime.now() - start_time).total_seconds()
    avg = all_time / num
    print('pos_tag time used: {} sec'.format(avg))
    print('\n\n')
def main():
    files = getfiles(ROOT_ARTICLE)
    thu = thulac.thulac(seg_only=True)
    out_sentence = open(SENTENCE_FILE, 'w')
    out_sentence_split = open(SENTENCE_SPLIT_FILE, 'w')
    for f in tqdm(files):
        with open(f) as r:
            read = r.read()
        preprocessed_s = preprocess(read)
        preprocessed_ss = [thu.cut(sen, text=True) for sen in preprocessed_s]
        out_sentence.write('\n'.join(preprocessed_s) + '\n')
        out_sentence_split.write('\n'.join(preprocessed_ss) + '\n')
def run_write():
    logger.info("write file starting, loading data...")
    update_path()
    set_logger_file()
    logger.info("loading JVM...")
    jpype.startJVM(common_keys.JVM_PATH, "-Djava.class.path=" + common_keys.JAR_PATH)
    logger.info("loading tokenizer...")
    thu1 = thulac.thulac(model_path=common_keys.THULAC_MODEL_PATH)
    file = create_single_file()
    write_file(file, thu1)
def cut():
    cutmodel = thulac.thulac()
    train_dirs = os.listdir("data/train/")
    ans = {}
    for dir in train_dirs:
        if dir.endswith('.txt'):
            train = open("data/train/" + dir, 'r', encoding='utf-8').read()
            res = cutmodel.cut(train)
            ans[dir] = res
            print(dir + ' Done!')
    with open("data/cutresult.json", 'w') as f:
        json.dump(obj=ans, fp=f)
        f.close()
def get_thulac_result(sentences):
    """
    Ref to: http://thulac.thunlp.org/
    Install by: `pip install thulac`
    """
    import thulac
    preds = []
    lac = thulac.thulac(seg_only=True)
    for sentence in sentences:
        sent_seg = lac.cut(sentence, text=True)
        sent_seg = to_unicode(sent_seg)
        preds.append(sent_seg)
    return preds
def _tokenize(user_dict=None):
    tokenizer = thulac.thulac(user_dict=user_dict)

    def _tokenize(text):
        words = []
        pos = []
        pairs = tokenizer.cut(text)
        for pair in pairs:
            words.append(pair[0])
            pos.append(pair[1])
        return words, pos

    return _tokenize
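# Hypothetical usage of the _tokenize() factory above: build the tagger once
# and reuse the returned closure; cut() yields [word, tag] pairs by default.
if __name__ == "__main__":
    tokenize = _tokenize()
    words, pos = tokenize("我爱北京天安门")
    print(list(zip(words, pos)))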
def cut_words():
    cutter = thulac.thulac(T2S=True, seg_only=True)
    with open('../data/reviews/reviews.txt', 'rb') as fr:
        sys.stdout.write('\r\rStart')
        sys.stdout.flush()
        fw = open('../data/reviews/cut_reviews.txt', 'wb')
        for i, line in enumerate(fr):
            items = line.strip().decode()
            words = cutter.cut(items)
            if len(words) < 2:
                continue
            words, _ = zip(*words)
            fw.write(' '.join(words) + '\n')
            sys.stdout.write('\r\rFinish %d' % i)
            sys.stdout.flush()
        fw.close()
def main():
    src_dir = "/home/zihao/Text"
    out_dir = "/home/zihao/segText2"
    input_filename_list = os.listdir(src_dir)
    input_filename_list = [f for f in input_filename_list if f.endswith(".txt")]
    thu = thulac.thulac("-seg_only")
    counter = 0
    for name in input_filename_list:
        segText(thu, name, src_dir, out_dir)
        counter += 1
        if counter % 1000 == 0:
            print("Done {} passages.".format(counter))
            localtime = time.asctime(time.localtime(time.time()))
            print "Time :", localtime, "\n"
    print("All done!!")
def set_sentiment_and_viewpoint(self):
    self.thu = thulac("")
    comm_list = dao.get_hotel_comments()
    sentiment_comm_list = []
    for comm in comm_list:
        if comm[7] is None or comm[8] is None:
            sentiment_value = None
            viewpoint = None
            try:
                sentiment_value = self.hotelnlp.sentiment(comm[2].encode("utf-8"))
            except:
                print comm[2]
                traceback.print_exc()
            try:
                viewpoint = self.hotelnlp.viewpoint(comm[2].encode("utf-8"), decoding="utf-8")
                viewpoint = json.dumps(viewpoint, ensure_ascii=False)
            except:
                print comm[2]
                traceback.print_exc()
            comm = {"guid": comm[0], "senti_value": sentiment_value, "viewpoint": viewpoint}
            sentiment_comm_list.append(comm)
    print len(sentiment_comm_list)
    dao.update_hotel_comm(sentiment_comm_list)
import os
import config
import json
import thulac

index = "law_thulac"
doc_type = "big_data"
dir_path = "/mnt/new/"
model_path = "/home/zhx/elasticsearch-5.5.2/plugins/thulac/models"

server_dir = os.path.dirname(os.path.realpath(__file__))
config_file = os.path.join(server_dir, 'config.py')
local_config_file = os.path.join(server_dir, 'local_config.py')

cutter = thulac.thulac(seg_only=True, model_path=model_path, T2S=True)


def cut(text):
    res = cutter.cut(text.encode('utf8'))
    result = ""
    first = True
    for x in res:
        if first:
            first = False
        else:
            result = result + " "
        result = result + x[0]
    return result


if __name__ == '__main__':
# coding:utf-8
import thulac

# segmentation-only mode; see the README to download the model files and place
# them in the thulac root directory, or give the model path here
thu1 = thulac.thulac(seg_only=True, model_path="请查看README下载相关模型放到thulac根目录或在这里写路径")
a = thu1.cut("我爱北京天安门")
print(a)
import sys
import os
import json
import xlrd
reload(sys)
sys.setdefaultencoding('utf-8')
import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')
import thulac
from collections import defaultdict

# filt : remove the useless words
# T2S  : convert Traditional Chinese to Simplified Chinese
cutter = thulac.thulac(seg_only=True, T2S=True)


# parse the DUT (Dalian University of Technology) sentiment lexicon
def DUTParser():
    dut = xlrd.open_workbook(u'../dict/sentimentwords/DUT/file/情感词汇本体.xlsx')
    sheet1 = dut.sheet_by_index(0)
    words = sheet1.col_values(0)
    word_senses_num = sheet1.col_values(2)
    word_emotion_strengths = sheet1.col_values(5)
    word_emotion_types = sheet1.col_values(6)
    dut_dict = defaultdict(int)
    for i, word in enumerate(words):
        if i == 0:
            continue
        word = str(word).decode()
# coding=utf-8
import thulac

thu1 = thulac.thulac("-seg_only")       # segmentation-only mode
thu1.run()                              # run the segmenter with these arguments, reading from and writing to the console
print " ".join(thu1.cut("我爱北京天安门"))  # segment one sentence

# ==============================================

thu2 = thulac.thulac("-input cs.txt")   # segmentation plus POS tagging mode
thu2.run()                              # run segmentation and tagging, reading from cs.txt and printing to the console
print " ".join(thu2.cut("我爱北京天安门"))  # segment and tag one sentence
def testUserDict():
    test_text = "我爱北京天安门"
    thu = thulac.thulac(seg_only=True, user_dict=prefix + "/userDict.txt")
    gold = thu.cut(test_text, text=True)
    assert gold == "我爱北京天安门"
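# Note (assumption, not from the original test suite): the assertion above only
# holds if userDict.txt lists the whole phrase as a single entry, since thulac
# user dictionaries take one word per line. A minimal sketch of such a file:
#
#   userDict.txt
#   我爱北京天安门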
# coding: utf-8
import thulac
import sys

prefix = sys.path[0]
thu = thulac.thulac(seg_only=True)


def readFile(file_name):
    with open(file_name) as result:
        for line in result:
            return line


def testCutFile():
    thu.cut_f(prefix + "/textForTest/input.txt", prefix + "/textForTest/output.txt")
    print(readFile(prefix + "/textForTest/output.txt"))
    assert readFile(prefix + "/textForTest/output.txt") == "我 爱 北京 天安门\n"


def testFastCut():
    test_text = "我爱北京天安门"
    gold = thu.fast_cut(test_text, text=True)
    assert gold == "我 爱 北京 天安门"


def testFastCutFile():
    thu.fast_cut_f(prefix + "/textForTest/input.txt", prefix + "/textForTest/output.txt")
    print(readFile(prefix + "/textForTest/output.txt"))
    assert readFile(prefix + "/textForTest/output.txt") == "我 爱 北京 天安门\n"
import sys
import thulac

seg_only = False
if len(sys.argv) >= 4 and sys.argv[3] == "-seg_only":
    seg_only = True
lac = thulac.thulac(seg_only=seg_only)
lac.cut_f(sys.argv[1], sys.argv[2])
def __init__(self):
    self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"],
                        dao_setting["user"], dao_setting["password"])
    self.thu = thulac("-input cs.txt")
    self.hotelnlp = HotelNLP()
def testSegOnly():
    test_text = "我爱北京天安门"
    thu = thulac.thulac(seg_only=True)
    gold = thu.cut(test_text, text=True)
    assert gold == "我 爱 北京 天安门"
def testFilt():
    test_text = "我可以爱北京天安门"
    thu = thulac.thulac(seg_only=True, filt=True)
    gold = thu.cut(test_text, text=True)
    print(gold)
    assert gold == "我 爱 北京 天安门"
def testT2S():
    test_text = "我愛北京天安門"
    thu = thulac.thulac(seg_only=True, T2S=True)
    gold = thu.cut(test_text, text=True)
    print(gold)
    assert gold == "我 爱 北京 天安门"
reload(sys)
import xlrd
sys.setdefaultencoding('utf-8')
import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S')
import thulac
from collections import defaultdict
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold

cutter = thulac.thulac(T2S=True, seg_only=True)


# load the dictionaries: negation words, degree words and sentiment words
def stop_words_parser():
    # stop words: merged from a general web list, the HIT list and the SCU Machine Intelligence Lab list
    stop_words = set()
    with open(u'../dict/stopwords/中文停用词库.txt') as fr:
        for line in fr:
            item = line.strip().decode()
            stop_words.add(item)
    with open(u'../dict/stopwords/哈工大停用词表.txt') as fr:
        for line in fr:
            item = line.strip().decode()
            stop_words.add(item)
    with open(u'../dict/stopwords/四川大学机器智能实验室停用词库.txt') as fr:
        for line in fr:
def __init__(self):
    self.thul = thulac.thulac()
def __init__(self):
    self.classifier = Bayes()
    self.thu = thulac("-seg_only")
    train_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sentiment.marshal')
    self.load(train_file)
# -*- coding: utf-8 -*-
import thulac
import csv
import sys
import os
sys.path.append("..")
from Model.neo_models import Neo4j
from Model.mongo_model import Mongo
from toolkit.vec_API import word_vector_model
from toolkit.tree_API import TREE

pre_load_thu = thulac.thulac()  # default mode
print('thulac open!')

neo_con = Neo4j()  # preload the neo4j connection
neo_con.connectDB()
print('neo4j connected!')

predict_labels = {}  # preload the entity-to-label mapping
filePath = os.getcwd()
with open(filePath + '/toolkit/predict_labels.txt', 'r', encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile, delimiter=' ')
    for row in reader:
        predict_labels[str(row[0])] = int(row[1])
print('predicted labels load over!')

# load the word vectors
wv_model = word_vector_model()
# wv_model.read_vec('toolkit/vector_5.txt')  # for testing, to save loading time
# wv_model.read_vec('toolkit/vector.txt')
def test_pos(self):
    thu2 = thulac("-input cs.txt")  # segmentation plus POS tagging mode
    # thu2.run()  # run segmentation and tagging, reading from cs.txt and printing to the console
    print " ".join(thu2.cut("住宿都是途牛给推荐的,杭州的两天说实话,有点偏,吃饭打车都不太方便,又赶上下雨带着孩子游玩比较费劲")).decode("utf-8")
def test_cut(self):
    s = "住宿都是途牛给推荐的,杭州的两天说实话,有点偏,吃饭打车都不太方便,又赶上下雨带着孩子游玩比较费劲。不过住的挺舒服的。南京这个酒店不知道是订单到酒店的问题,还是什么问题,第一天到了,我们要的是两间房1天,可是酒店的订单是1间房2天。而且我们在这个酒店还预定了隔一天的房间,也没有订单。给途牛打电话,说肯定没问题。结果第3天回来,没房。投诉了半天,才解决的(以后还是提前给预定的酒店打电话确认)。在扬州的住宿非常好。"
    thu = thulac("-seg_only")
    print " ".join(thu.cut(s)).decode("utf-8")
def testTagAndDeli():
    test_text = "我爱北京天安门"
    thu = thulac.thulac(deli='#')
    gold = thu.cut(test_text, text=True)
    assert gold == "我#r 爱#v 北京#ns 天安门#ns"