def model_train(self, n_lay, key_num, word_len, out_lay, train):
    """Build, train, and persist the question-classification network.

    Args:
        n_lay: embedding output dimension.
        key_num: vocabulary size (embedding ``input_dim``).
        word_len: padded sequence length (embedding ``input_length``).
        out_lay: number of output classes (softmax units).
        train: tuple ``(train_x, train_y)`` of padded sequences and
            one-hot labels.

    Returns:
        The trained Sequential model (also saved to ./kara_model_q.h5).
    """
    alpha_logger.info("Model Train Config loading...")
    train_x, train_y = train
    model_question = keras.models.Sequential()
    model_question.add(keras.layers.Embedding(output_dim=n_lay,
                                              input_dim=key_num,
                                              input_length=word_len))
    model_question.add(keras.layers.Flatten())
    model_question.add(keras.layers.Dense(units=256, activation='relu'))
    model_question.add(keras.layers.Dropout(0.3))
    model_question.add(keras.layers.Dense(units=out_lay, activation='softmax'))
    # Bug fix: summary() prints to stdout and returns None, so the original
    # logged "None"; route each summary line through the logger instead.
    model_question.summary(print_fn=alpha_logger.info)
    alpha_logger.info("Model Train Config load Completed.")
    alpha_logger.info("Model Starting...")
    model_question.compile(optimizer='adam',
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])
    history_q = model_question.fit(x=train_x, y=train_y,
                                   validation_split=0.2,
                                   epochs=10, batch_size=128, verbose=1)
    alpha_logger.info("Model Train Completed.")
    self.plot_model(history_q)
    model_question.save('./kara_model_q.h5')
    return model_question
def plot_model(history):
    """Plot training/validation loss and accuracy curves from a Keras History.

    Shows the loss figure, clears it, then shows the accuracy figure.

    NOTE(review): this is invoked elsewhere as ``self.plot_model(history_q)``;
    unless it is wrapped as a staticmethod outside this view it needs a
    ``self`` parameter — confirm against the class definition.

    Args:
        history: object with a ``history`` dict mapping metric names to
            per-epoch value lists (a ``keras.callbacks.History``).
    """
    import matplotlib.pyplot as plt
    hist = history.history
    # Keras >= 2.3 records 'accuracy'/'val_accuracy'; older versions used
    # 'acc'/'val_acc'. Accept either so the plot survives a Keras upgrade.
    acc = hist.get('acc', hist.get('accuracy'))
    val_acc = hist.get('val_acc', hist.get('val_accuracy'))
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = range(1, len(acc) + 1)
    plt.plot(epochs, loss, 'r', label='Training loss')
    plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
    plt.clf()
    # (Removed unused acc_values/val_acc_values locals from the original.)
    plt.plot(epochs, acc, 'r', label='Training acc')
    plt.plot(epochs, val_acc, 'b', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()
    alpha_logger.info("Plot Completed")
def word_cloud(self, df):
    """Render a 4x4 grid of word clouds, one subplot per clinic.

    Args:
        df: DataFrame with Q_Tag (numeric clinic code), CLINIC (clinic name)
            and Q_D (space-joined segmented question text) columns.
    """
    cli_text = dict()
    # Map numeric tag -> clinic name (assumes rows with the same tag always
    # carry the same clinic name — TODO confirm upstream invariant).
    tag_to_clinic = dict(zip(df.Q_Tag, df.CLINIC))
    for cli in self.qa['CLINIC'].value_counts().index:
        text = df.loc[df['CLINIC'] == cli, 'Q_D']
        # Join all questions of this clinic, then re-split into one flat
        # token list for frequency counting.
        text = (' '.join(map(str, text))).split(' ')
        cli_text[cli] = text
    fig, axes = plt.subplots(4, 4, figsize=(30, 38))
    k = 1  # clinic tag codes start at 1
    for i in range(4):
        for j in range(4):
            cat = tag_to_clinic[k]
            # Top-100 most frequent tokens feed the word-cloud helper.
            most100 = Counter(cli_text[cat]).most_common(100)
            ax = axes[i, j]
            ax.imshow(self.generate_wordcloud(most100), interpolation="bilinear")
            ax.axis('off')
            ax.set_title("{} Top 100".format(cat), fontsize=10)
            k += 1
            # Only tags 1..14 are plotted; the last two grid cells stay empty.
            if k == 15:
                break
        if k == 15:
            break
    plt.show()
    alpha_logger.info("Word Cloud Completed")
def __init__(self, db_config, db_driver='mysql'):
    """Connect to MySQL and load the QA table into a DataFrame.

    Args:
        db_config: object exposing USER_NAME, PASSWORD, HOST and DBNAME.
        db_driver: kept for interface compatibility; currently unused —
            the connection URL is hard-wired to mysql+mysqldb.
    """
    # Simplified from the original str(r"..." + '%s' + "...") concatenation;
    # the resulting URL string is identical.
    self.engine = create_engine(
        "mysql+mysqldb://%s:%s@%s/%s?charset=utf8"
        % (db_config.USER_NAME, db_config.PASSWORD,
           db_config.HOST, db_config.DBNAME))
    self.conn = self.engine.connect()
    self.cur_path = os.getcwd()
    alpha_logger.info(self.cur_path)
    # Clinic name -> 1-based class code.
    self.clinic_code = {
        "中医科": 1, "产科": 2, "儿科": 3, "内科": 4, "口腔颌面科": 5,
        "外科": 6, "妇科": 7, "男科": 8, "皮肤性病科": 9, "眼科": 10,
        "耳鼻咽喉科": 11, "肿瘤及防治科": 12, "营养科": 13, "骨伤科": 14,
        "全部科室": 15
    }
    # Clinic name -> 15-dim one-hot vector, derived from clinic_code so the
    # two mappings can never drift apart (replaces 15 hand-written lists).
    n_clinics = len(self.clinic_code)
    self.clinic_one_hot_code = {
        name: [1 if i == code - 1 else 0 for i in range(n_clinics)]
        for name, code in self.clinic_code.items()
    }
    # Reverse map: numeric code -> clinic name.
    self.clinic_code_T = {
        value: key for key, value in self.clinic_code.items()
    }
    self.sql_select_qa = """SELECT * FROM dataManage_qalist WHERE CLINIC != "全部科室" """
    alpha_logger.info("DataPreProcess loading...\n----------------------")
    self.qa = pd.read_sql(sql=self.sql_select_qa, con=self.conn)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('max_colwidth', 100)
def model_train(data):
    """Fit a TF-IDF + multinomial naive Bayes classifier on question text.

    Args:
        data: DataFrame with 'Q_D' (space-joined segmented question text)
            and 'Q_Tag' (numeric clinic label) columns.

    Returns:
        (clf, count_vec): the fitted MultinomialNB classifier and the
        CountVectorizer needed to transform new input.
    """
    alpha_logger.info("Model Train Config loading...")
    # The held-out portion is not used here; the split call is kept so the
    # training subset (random_state=0, stratified) stays identical for any
    # evaluation code that repeats the same split.
    X_train, _X_test, y_train, _y_test = train_test_split(
        data['Q_D'], data['Q_Tag'], random_state=0, stratify=data['Q_Tag'])
    count_vec = CountVectorizer()
    X_train_counts = count_vec.fit_transform(raw_documents=X_train)
    tf_idf_transformer = TfidfTransformer()
    X_train_tf_idf = tf_idf_transformer.fit_transform(X_train_counts)
    clf = MultinomialNB().fit(X_train_tf_idf, y_train)
    alpha_logger.info("Model Train Config load Completed.")
    return clf, count_vec
def show_data_message(self):
    """Log the per-clinic question counts and draw them as a bar chart."""
    counts = self.qa['CLINIC'].value_counts()
    df_clinic = pd.DataFrame(
        data={'Clinic': counts.index, 'Count': counts}
    ).reset_index(drop=True)
    alpha_logger.info(df_clinic)
    df_clinic.plot(x='Clinic', y='Count', kind='bar',
                   legend=False, figsize=(8, 5))
    plt.title("科室问题统计")
    plt.ylabel('Count', fontsize=18)
    plt.xlabel('Clinic', fontsize=18)
def divide_and_generate(self):
    """Clean and segment questions, build TF-IDF features, and report the
    most chi2-correlated uni-/bi-grams per clinic.

    Returns:
        (qa, features, labels): the augmented DataFrame, the sparse TF-IDF
        feature matrix, and the numeric clinic tag Series.
    """
    q_all_list = []  # NOTE(review): never used below — candidate for removal
    stop_words = self.stop_words_list("./chineseStopWords.txt")
    qa = self.qa.copy(deep=True)
    qa["Q_Clean"] = qa["QUESTION"].apply(self.remove_punctuation)
    # Segment with jieba search mode and drop stop words; Q_D becomes a
    # space-joined token string, the format TfidfVectorizer expects.
    qa["Q_D"] = qa["Q_Clean"].apply(lambda x: " ".join(
        [w for w in jb.lcut_for_search(x) if w not in stop_words]))
    # Unknown clinic names map to tag 0.
    qa["Q_Tag"] = qa["CLINIC"].apply(lambda x: self.clinic_code[x]
                                     if x in self.clinic_code else 0)
    # 生成词云 (generate the per-clinic word clouds)
    self.word_cloud(qa)
    tf_idf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
    features = tf_idf.fit_transform(qa.Q_D)
    labels = qa.Q_Tag
    alpha_logger.info(features.shape)
    alpha_logger.info(features)
    N = 2  # number of top-correlated n-grams reported per clinic
    for cli, cli_tag in self.clinic_code.items():
        # chi2 of every feature against the one-vs-rest label for this clinic;
        # [0] is the chi2 statistic array (index [1] would be p-values).
        features_chi2 = chi2(features, labels == cli_tag)
        indices = np.argsort(features_chi2[0])
        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
        # newer versions require get_feature_names_out().
        feature_names = np.array(tf_idf.get_feature_names())[indices]
        uni_grams = [v for v in feature_names if len(v.split(' ')) == 1]
        bi_grams = [v for v in feature_names if len(v.split(' ')) == 2]
        print("# '{}':".format(cli))
        print(" . Most correlated uni-grams:\n . {}".format(
            '\n . '.join(uni_grams[-N:])))
        print(" . Most correlated bi-grams:\n . {}".format(
            '\n . '.join(bi_grams[-N:])))
    alpha_logger.info("相关性展示")
    return qa, features, labels
def divide_q(self):
    """Segment questions, index/pad them, and one-hot encode clinic labels.

    Side effects: pickles the fitted Tokenizer to ./token_pk.pkl.

    Returns:
        (qa, train_data, token): the augmented DataFrame, a tuple
        ``(padded_sequence_array, one_hot_label_array)``, and the fitted
        Tokenizer.
    """
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('max_colwidth', 100)
    qa = self.qa.copy(deep=True)
    # jieba search-mode segmentation; Q_D holds token lists.
    qa["Q_D"] = qa["QUESTION"].apply(lambda x: jb.lcut_for_search(x))
    # Unknown clinic names map to tag 0.
    qa["Q_Tag"] = qa["CLINIC"].apply(lambda x: self.clinic_code[x]
                                     if x in self.clinic_code else 0)
    token = keras.preprocessing.text.Tokenizer(num_words=6000)
    q_d_list = qa["Q_D"].to_list()
    token.fit_on_texts(q_d_list)
    alpha_logger.info("词索引 ")
    alpha_logger.info(token.word_index)
    alpha_logger.info("词数 ")
    alpha_logger.info(token.word_counts)
    qa["Q_S"] = token.texts_to_sequences(q_d_list)
    q_seq_list = qa["Q_S"].to_list()
    q_seq_array = keras.preprocessing.sequence.pad_sequences(
        q_seq_list, padding='post', truncating='post', maxlen=50)
    qa["Q_S_padding"] = q_seq_array.tolist()
    clinic_tag_list = qa["Q_Tag"].tolist()
    # Bug fix: to_categorical lives at keras.utils, not at
    # keras.preprocessing.sequence.utils (which raises AttributeError).
    clinic_tag_array = keras.utils.to_categorical(clinic_tag_list,
                                                  num_classes=16)
    qa["Clinic_S"] = clinic_tag_array.tolist()
    alpha_logger.info("\n %s", qa.head(n=2))
    alpha_logger.info("Divide & Tag & Sequence Complete.")
    train_data = (q_seq_array, clinic_tag_array)
    # Context manager guarantees the pickle handle is closed even on error.
    with open('./token_pk.pkl', 'wb') as token_pk_file:
        pk.dump(token, token_pk_file)
    return qa, train_data, token
    # Tail of the enclosing __init__ (its start is outside this view):
    # reverse map numeric code -> clinic name for reporting predictions.
    self.pre_clinic_code_T = {value: key for key, value in self.pre_clinic_code.items()}

    def pre_for_input(self, input_x):
        """Tokenize one question, pad it to length 50, and log the predicted
        clinic name.

        Args:
            input_x: raw question string.

        Returns:
            int: always 0; the prediction is only printed.
        """
        print(input_x)
        # texts_to_sequences expects a batch, hence the single-element list.
        input_list = [jb.lcut_for_search(input_x)]
        input_x_seq = self.token_pre.texts_to_sequences(input_list)
        print(input_x_seq)
        # Same padding parameters as training — presumably must match the
        # maxlen used in divide_q; verify if that changes.
        input_x_pad = keras.preprocessing.sequence.pad_sequences(input_x_seq, padding='post', truncating='post', maxlen=50)
        print(input_x_pad)
        output_x = self.model_pre.predict(input_x_pad)
        # argmax over the softmax output picks the most probable clinic code.
        print("Predict Output:", self.pre_clinic_code_T[np.argmax(output_x)])
        return 0


if __name__ == '__main__':
    # 分词 (word segmentation) — entry point: preprocess, train, sanity-check,
    # then run one interactive prediction.
    data_pre = DataPreProcess(mysql_db_config)
    qa_, train_data_, token_ = data_pre.divide_q()
    Q_model = data_pre.model_train(n_lay=32, key_num=6000, word_len=50, out_lay=16, train=train_data_)
    alpha_logger.info("PreProcessing completed.")
    data_pre.test_model(Q_model, train_data_, qa_, 0)
    trigger = PreTrigger(token_x=token_, model_x=Q_model)
    trigger.pre_for_input("手淫导致尿痛是尿道炎吗?")
def before_process(self):
    """Log dataset size and a sample, plot clinic stats, then run the
    feature-generation pipeline and return its result."""
    total = len(self.qa)
    alpha_logger.info("训练数据总量:%d" % total)
    alpha_logger.info(self.qa.sample(10))
    self.show_data_message()
    return self.divide_and_generate()