Example #1

import os
import time

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import thulac
from tensorflow.keras import layers
from tensorflow.keras.callbacks import TensorBoard

import Kc  # project-local helper module (DataFrameToXY, TextToVector_doc2vec, Mice_numeric, SplitGroup, Metrics)

# PreprocessData below is excerpted from a larger class (hence `self`).
 def PreprocessData(self):
     self.data = pd.read_csv(
         os.path.join(self.path_net, "save_CNN_text_2_datas.csv"))
     numeric_xys = Kc.DataFrameToXY(
         self.data, [
             "Sex", "Years", "Temperature", "Pulse", "Breath",
             "Blood_Pressure_Systolic", "Blood_Pressure_Diastolic", "Width",
             "Height", "Corneal_Endothelium_Right",
             "Corneal_Endothelium_Left", "Right_Intraocular_Pressure",
             "Left_Intraocular_Pressure"
         ],
         "MajorDiagnosisCoding",
         manualFilePath=os.path.join(self.path_net,
                                     r"CodingToClass_1to16 - new 多分类.csv"))
     text_xys = Kc.DataFrameToXY(self.data, ["HistoryOfPastIllness"],
                                 "MajorDiagnosisCoding",
                                 manualFilePath=os.path.join(
                                     self.path_net,
                                     r"CodingToClass_1to16 - new 多分类.csv"))
     # Exclude rows with unwanted labels (coded as "-1")
     numeric_xys = numeric_xys[numeric_xys["MajorDiagnosisCoding"] != "-1"]
     text_xys = text_xys[text_xys["MajorDiagnosisCoding"] != "-1"]
     # Vectorize the free-text field with doc2vec
     model, text_x_vds = Kc.TextToVector_doc2vec(
         text_xys["HistoryOfPastIllness"].values.tolist())
     model.save(os.path.join(self.path_net, r'model_text_2.model'))
     text_xys.insert(1, 'HistoryOfPastIllness_vec', text_x_vds)
     text_xys.drop(['HistoryOfPastIllness'], axis=1, inplace=True)
     text_xys_np = text_xys.values
     # Impute missing numeric values (MICE)
     numeric_xys = Kc.Mice_numeric(numeric_xys)
     # Merge numeric and text features (alignment relies on matching row indices)
     text_xys["MajorDiagnosisCoding"] = text_xys[
         "MajorDiagnosisCoding"].astype("float64")
     text_xys = text_xys.reset_index(drop=True)
     all_xys = numeric_xys.copy()
     all_xys['HistoryOfPastIllness_vec'] = text_xys[
         'HistoryOfPastIllness_vec']
     # Data validation
     # Remove readings with low intraocular pressure (<= 10 in either eye)
     xys_c_0 = all_xys.copy()
     # xys_c_0.loc[:, 'MajorDiagnosisCoding'] += 1
     xys_c_1 = xys_c_0.loc[(xys_c_0["Right_Intraocular_Pressure"] > 10)
                           & (xys_c_0["Left_Intraocular_Pressure"] > 10)].copy()
     xys_c_1.loc[xys_c_1[(xys_c_1["Right_Intraocular_Pressure"] <= 21) & (
         xys_c_1["Left_Intraocular_Pressure"] <= 21)].index,
                 "MajorDiagnosisCoding"] = 0
     xys_c_end = xys_c_1
     # Save the result
     xys_c_end = xys_c_end.reset_index(drop=True)
     self.xys_c_end = xys_c_end
     return xys_c_end
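
Kc.Mice_numeric is not shown in this example. Below is a minimal sketch of what a MICE-style numeric imputation helper could look like, assuming it wraps scikit-learn's IterativeImputer, leaves the label column untouched, and receives already-numeric feature columns; the name mice_numeric_sketch and this behavior are assumptions, not the project's actual implementation.

def mice_numeric_sketch(df, label_col="MajorDiagnosisCoding"):
    """Rough stand-in for Kc.Mice_numeric: MICE-style imputation of numeric features."""
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (must be imported before IterativeImputer)
    from sklearn.impute import IterativeImputer

    out = df.copy()
    feature_cols = [c for c in out.columns if c != label_col]
    # Impute only the feature columns, assuming they are numeric
    out[feature_cols] = IterativeImputer(random_state=0).fit_transform(
        out[feature_cols])
    return out
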
def main():
    # path_net = os.path.join('/', 'public', 'home', 'liqi', 'data', 'analysis', 'transcribe_CNN')
    path_net = os.path.join('Z:\\', 'datas', 'analysis')

    datas = pd.read_csv(os.path.join(path_net, "save_CNN_text_2_datas.csv"))

    text_xys = Kc.DataFrameToXY(datas, ["HistoryOfPastIllness"],
                                "MajorDiagnosisCoding",
                                manualFilePath=os.path.join(
                                    path_net,
                                    r"CodingToClass_1to16 - new 多分类.csv"))

    text_xys = text_xys[text_xys["MajorDiagnosisCoding"] != "-1"]
    # Reset the index so the positional loop below lines up with the remaining rows
    text_xys = text_xys.reset_index(drop=True)

    # Vectorize the text field
    text_x_all = ""
    text_x_thus = []
    text_x_d2v_train = []
    thu1 = thulac.thulac(seg_only=True)  # default mode, segmentation only
    for i in range(len(text_xys["HistoryOfPastIllness"])):
        try:
            index_text_y = text_xys["HistoryOfPastIllness"][i]
        except KeyError:
            index_text_y = ""

        # Word segmentation
        temp_thu1_rt = thu1.cut(index_text_y, text=True)
        # document = gensim.models.doc2vec.TaggedDocument(temp_thu1_rt, tags=[i])
        text_x_d2v_train.append(temp_thu1_rt.split())  # Word2Vec needs a list of tokens, not a raw string

    model = gensim.models.word2vec.Word2Vec(text_x_d2v_train,
                                            min_count=1,
                                            iter=20)
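    # Note: `iter` is the epoch-count argument in gensim < 4.0; it was renamed
    # `epochs` in gensim 4.x.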
    model.save(os.path.join(path_net, r'model_text_word2vec.model'))
    # model = gensim.models.Doc2Vec(text_x_d2v_train, size=50, window=8, min_count=5, workers=4)
    # model.save(os.path.join(path_net, r'model_text_2.model'))
    # Word2Vec has no per-document vectors (unlike Doc2Vec), so represent each
    # document by the mean of its word vectors (zeros for empty documents).
    text_x_vds = [
        np.mean([model.wv[w] for w in doc], axis=0)
        if doc else np.zeros(model.vector_size) for doc in text_x_d2v_train
    ]
    text_xys.insert(1, 'HistoryOfPastIllness_vec', text_x_vds)
    text_xys.drop(['HistoryOfPastIllness'], axis=1, inplace=True)

    text_xys_np = text_xys.values

    x_train, y_train, x_test, y_test = Kc.SplitGroup(text_xys_np)
    y_train = y_train.astype(np.float64) - 1  # only classes 1 and 2 remain; shift them to 0 and 1
    y_test = y_test.astype(np.float64) - 1  # only classes 1 and 2 remain; shift them to 0 and 1
    x_train = np.array(list(map(lambda x: x[0].tolist(), x_train.tolist())))
    x_test = np.array(list(map(lambda x: x[0].tolist(), x_test.tolist())))

    x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
    x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)
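    # x_train/x_test now have shape (samples, vector_length, 1): each document
    # vector is treated as a single-channel sequence for the Conv1D/LSTM stack.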

    numclasses = int(y_train.max()) + 1  # number of target classes
    onesharp = len(x_train[0])  # length of each document vector

    # Build the neural network
    print("Building the neural network")
    model = tf.keras.Sequential([
        layers.LSTM(16, return_sequences=True, input_shape=(onesharp, 1)),
        layers.Conv1D(32, 5, padding='same', activation=tf.nn.relu),
        layers.Conv1D(64, 5, padding='same', activation=tf.nn.relu),
        layers.MaxPooling1D(5),
        layers.Dropout(0.25),
        layers.LSTM(64, return_sequences=True),
        layers.Conv1D(128, 5, padding='same', activation=tf.nn.relu),
        layers.Conv1D(64, 5, padding='same', activation=tf.nn.relu),
        layers.MaxPooling1D(5),
        layers.Dropout(0.25),
        layers.LSTM(64, return_sequences=True),
        layers.Conv1D(32, 5, padding='same', activation=tf.nn.relu),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.5),
        layers.Dense(numclasses, activation=tf.nn.softmax)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    print(model.summary())
    print("训练神经网络")
    metrics = Kc.Metrics(validation_data=(x_test, y_test))
    history = model.fit(
        x_train,
        y_train,
        batch_size=400,
        validation_data=(x_test, y_test),
        epochs=500,
        callbacks=[
            metrics,
            TensorBoard(log_dir=os.path.join(
                'logs', 'model-{}'.format(int(time.time()))))
        ])

    nowtime = str(int(time.time()))
    try:
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.plot(metrics.val_f1s)
        plt.plot(metrics.val_recalls)
        plt.plot(metrics.val_precisions)
        plt.legend([
            'training', 'validation', 'val_f1', 'val_recall', 'val_precision'
        ],
                   loc='upper left')

        plt.savefig(
            os.path.join(path_net, 'result', 'CNN_text_2',
                         nowtime + '_cnn_result' + '.png'))
        plt.show()
    except Exception:
        print("Unable to plot")
    finally:
        with open(
                os.path.join(path_net, 'result', 'CNN_text_2',
                             nowtime + '_history.txt'), 'w+') as f:
            f.write(str(history.history))
    # Archive a copy of this script next to the results
    with open('cnn_text.py', 'r', encoding='utf-8') as f:
        source_code = f.read()
    with open(
            os.path.join(path_net, 'result', 'CNN_text_2',
                         nowtime + '_code' + '.py'), 'w+') as nf:
        nf.write(source_code)
    print('Done')
    return 0
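
Kc.Metrics is also project-specific. A minimal sketch of such a callback, assuming it follows the common pattern of computing F1, recall, and precision on the validation set at the end of every epoch with scikit-learn; the class name MetricsSketch and the macro averaging are assumptions, while the constructor signature and the val_f1s/val_recalls/val_precisions attributes mirror how the callback is used above.

class MetricsSketch(tf.keras.callbacks.Callback):
    """Rough stand-in for Kc.Metrics: per-epoch F1/recall/precision on the validation set."""

    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        from sklearn.metrics import f1_score, precision_score, recall_score
        x_val, y_val = self.validation_data
        # Predicted class indices from the softmax output
        y_pred = np.argmax(self.model.predict(x_val), axis=-1)
        self.val_f1s.append(f1_score(y_val, y_pred, average='macro'))
        self.val_recalls.append(recall_score(y_val, y_pred, average='macro'))
        self.val_precisions.append(precision_score(y_val, y_pred, average='macro'))


When run as a script, main() would typically be invoked behind the usual guard:

if __name__ == '__main__':
    main()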