import os import re import jieba from random import shuffle from util import load_word_re, load_type_re, load_pair, word_replace path_stop_word = 'dict/stop_word.txt' path_type_dir = 'dict/word_type' path_homo = 'dict/h**o.csv' path_syno = 'dict/syno.csv' stop_word_re = load_word_re(path_stop_word) word_type_re = load_type_re(path_type_dir) homo_dict = load_pair(path_homo) syno_dict = load_pair(path_syno) path_cut_word = 'dict/cut_word.txt' jieba.load_userdict(path_cut_word) def save_train(path, texts, labels): label_texts = dict() for text, label in zip(texts, labels): if label not in label_texts: label_texts[label] = list() cut_text = ' '.join(jieba.cut(text)) label_texts[label].append(cut_text) head = 'label,cut_doc' with open(path, 'w') as f:
import json
import pickle as pk
import re

from util import load_word_re


# Dictionary files that feed the two module-level patterns below.
path_pre_name = 'dict/pre_name.txt'
path_digit = 'dict/digit.txt'
# NOTE(review): load_word_re lives in util and is not visible here; it
# presumably builds one regex from the word list at the given path -- confirm.
pre_name_re = load_word_re(path_pre_name)
digit_re = load_word_re(path_digit)

path_label_ind = 'feat/label_ind.pkl'


def include_pre_name(word) -> bool:
    """Return True if ``pre_name_re`` matches anywhere in ``word``.

    Args:
        word: Text to scan with the module-level ``pre_name_re`` pattern.

    Returns:
        True when the pattern matches at least once, otherwise False.
    """
    # re.search stops at the first hit; truthiness is identical to checking
    # whether re.findall returned a non-empty list.
    return bool(re.search(pre_name_re, word))


def include_digit(word) -> bool:
    """Return True if ``digit_re`` matches anywhere in ``word``.

    Args:
        word: Text to scan with the module-level ``digit_re`` pattern.

    Returns:
        True when the pattern matches at least once, otherwise False.
    """
    return bool(re.search(digit_re, word))


def sent2feat(triples):
    sent_feat = list()