import random

import joblib
from sklearn.model_selection import train_test_split

# load_dataset/processing_null live in the project's load_dataset module;
# preprocess, preprocess2 and TextClassifier are the project's own helper
# classes and must be imported from wherever this project defines them.
from load_dataset import load_dataset, processing_null


def model_xunlian():
    # Load the four category datasets
    df_bingyin_list = load_dataset('病因')      # pathogeny
    df_zhenduan_list = load_dataset('诊断')     # diagnosis
    df_zhengzhuang_list = load_dataset('症状')  # symptom
    df_zhiliao_list = load_dataset('治疗')      # treatment

    # Drop null/empty entries from each category
    df_bingyin_word = processing_null(df_bingyin_list)
    # print(len(df_bingyin_word))
    df_zhenduan_word = processing_null(df_zhenduan_list)
    df_zhengzhuang_word = processing_null(df_zhengzhuang_list)
    df_zhiliao_word = processing_null(df_zhiliao_list)

    bingyin = df_bingyin_word.values.tolist()
    zhenduan = df_zhenduan_word.values.tolist()
    zhengzhuang = df_zhengzhuang_word.values.tolist()
    zhiliao = df_zhiliao_word.values.tolist()

    # Merge all categories into one (text, label) list and shuffle it
    sentences = []
    prep = preprocess(sentences, bingyin, zhenduan, zhengzhuang, zhiliao)
    prep.preprocess_text(bingyin, sentences, 'pathogeny')
    prep.preprocess_text(zhenduan, sentences, 'diagnosis')
    prep.preprocess_text(zhengzhuang, sentences, 'symptom')
    prep.preprocess_text(zhiliao, sentences, 'treatment')
    random.shuffle(sentences)

    # Also keep one processed list per category
    bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list = [], [], [], []
    prep = preprocess2(bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
                       bingyin, zhenduan, zhengzhuang, zhiliao)
    prep.preprocess_lines(bingyin, bingyin_list, 'pathogeny')
    prep.preprocess_lines(zhenduan, zhenduan_list, 'diagnosis')
    prep.preprocess_lines(zhengzhuang, zhengzhuang_list, 'symptom')
    prep.preprocess_lines(zhiliao, zhiliao_list, 'treatment')

    # Split the shuffled data into training and test sets
    x, y = zip(*sentences)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

    # Train the classifier
    text_classifier = TextClassifier()
    text_classifier.fit(x_train, y_train)

    # Persist the trained model
    joblib.dump(text_classifier, 'text_classifier.pkl')
    # new_text_classifier = joblib.load('text_classifier.pkl')
    # precision = text_classifier.score(x_test, y_test)

    return (bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
            x_train, x_test, y_train, y_test)
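# Usage sketch (hedged): reload the pickled model and score it on the
# held-out split. Assumes TextClassifier exposes the scikit-learn style
# score() hinted at by the commented-out line above.
(bingyin_list, zhenduan_list, zhengzhuang_list, zhiliao_list,
 x_train, x_test, y_train, y_test) = model_xunlian()

new_text_classifier = joblib.load('text_classifier.pkl')
print('held-out accuracy:', new_text_classifier.score(x_test, y_test))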
import jieba
import numpy as np
import pandas as pd

from load_dataset import load_dataset, processing_null

# Load the four category datasets
df_bingyin_list = load_dataset('病因')
df_zhenduan_list = load_dataset('诊断')
df_zhengzhuang_list = load_dataset('症状')
df_zhiliao_list = load_dataset('治疗')

# Load the stop-word list
stopwords = pd.read_csv('data/stopwords.txt', index_col=False, quoting=3,
                        sep="\t", names=['stopword'], encoding='utf-8')
stopwords = stopwords['stopword'].values

# Drop null/empty entries and keep the first 1000 rows of each category
df_bingyin_word = processing_null(df_bingyin_list)[0:1000]
# print(len(df_bingyin_word))
df_zhenduan_word = processing_null(df_zhenduan_list)[0:1000]
df_zhengzhuang_word = processing_null(df_zhengzhuang_list)[0:1000]
df_zhiliao_word = processing_null(df_zhiliao_list)[0:1000]


def build_sentence_vector(text, size, imdb_w2v):
    # Average the word vectors of all non-stop-words in `text`
    vec = np.zeros(size).reshape((1, size))
    count = 0
    contents = jieba.lcut(text)
    for word in contents:
        if word not in stopwords:
            try:
                vec += imdb_w2v[word].reshape((1, size))
                count += 1
            # The original snippet was truncated here; skipping
            # out-of-vocabulary words and averaging at the end is the
            # standard completion of this pattern.
            except KeyError:
                continue
    if count != 0:
        vec /= count
    return vec
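# Usage sketch (hedged): feed build_sentence_vector a Word2Vec model.
# Assumptions: gensim>=4 is installed (vector_size= keyword; older versions
# use size=), and `docs` is a hypothetical stand-in for the real corpus
# lines. The function indexes vectors as imdb_w2v[word], so passing the
# KeyedVectors object w2v.wv matches that usage.
from gensim.models import Word2Vec

docs = ['头痛伴有发热', '咳嗽三天未见好转']  # hypothetical raw lines
w2v = Word2Vec([jieba.lcut(d) for d in docs], vector_size=100, min_count=1)

# One averaged 100-dimensional vector per document, stacked into a matrix
features = np.concatenate([build_sentence_vector(d, 100, w2v.wv) for d in docs])
print(features.shape)  # (2, 100)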
import csv
import os

import joblib
import pandas as pd

from load_dataset import processing_null


# Classify a single line of text with the saved model.
# Note: joblib.load runs on every call; hoisting it out of the loop below
# would avoid repeated disk reads.
def test(line):
    text_classifier = joblib.load('text_classifier.pkl')
    content = text_classifier.process_line(line)
    leibie = text_classifier.predict(content)[0]
    return leibie


# Walk the data folder, classify every line, and stream the results to CSV
path = "C:\\Users\\Administrator\\Desktop\\GBDT_predicted\\ziliao"
folder_list = os.listdir(path)
with open('new_txt.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['line', 'leibie'])
    for folder in folder_list:
        folder_path = os.path.join(path, folder)
        df = pd.read_csv(folder_path, encoding='gbk')
        # df = pd.read_csv('ziliao/aixiao.csv', encoding='gbk')
        df = processing_null(df)  # assumed to yield an iterable of text lines
        for line in df:
            # print(type(line))
            leibie = test(line)
            # new = pd.DataFrame([{'title': line, 'leibie': leibie}], index=['0'])
            writer.writerow([line, leibie])
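# Sanity-check sketch: reload the CSV written above and count how many
# lines landed in each predicted category (column names come from the
# header row written by the script).
out = pd.read_csv('new_txt.csv')
print(out['leibie'].value_counts())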