Beispiel #1
0
    def SplitData(self, index_load: str = "false"):
        """Split ``self.xys_c_end`` into train/test arrays and cache them on ``self``.

        Args:
            index_load: ``"false"`` (default) computes a fresh split through
                ``Kc.SplitGroup``; ``"true"`` calls ``self.SplitData_Load()``
                and slices the data with ``self.train_index_list`` and
                ``self.test_index_list``. Booleans and other letter cases of
                the two strings are accepted as well.

        Returns:
            tuple: ``(x_train, y_train, x_test, y_test)``. The labels are cast
            to ``np.float64``. The same four arrays are stored on ``self``
            under the same names.

        Raises:
            ValueError: if ``index_load`` is not a recognised flag. Previously
                such a value fell through both branches and crashed with an
                ``UnboundLocalError``.
        """
        # Normalise the flag first so an invalid value fails before any state
        # on ``self`` is touched.
        if isinstance(index_load, bool):
            mode = "true" if index_load else "false"
        else:
            mode = str(index_load).strip().lower()
        if mode not in ("true", "false"):
            raise ValueError(
                "index_load must be 'true' or 'false', got {!r}".format(
                    index_load))

        xys_dd = self.xys_c_end.values

        if mode == "false":
            # The two lists are handed to SplitGroup, presumably so it can
            # record the chosen row indices in them -- TODO confirm in Kc.
            self.train_index_list = []
            self.test_index_list = []
            x_train, y_train, x_test, y_test = Kc.SplitGroup(
                xys_dd,
                train_index_list=self.train_index_list,
                test_index_list=self.test_index_list)
        else:
            # SplitData_Load is expected to provide train_index_list and
            # test_index_list on self -- verify against its definition.
            self.SplitData_Load()
            # Column 0 is the label, the remaining columns are the features.
            y_test = xys_dd[self.test_index_list, 0]
            x_test = xys_dd[self.test_index_list, 1:]
            y_train = xys_dd[self.train_index_list, 0]
            x_train = xys_dd[self.train_index_list, 1:]

        y_train = y_train.astype(np.float64)
        y_test = y_test.astype(np.float64)

        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test

        return x_train, y_train, x_test, y_test
def main():
    """Train a sequence classifier on word2vec vectors of patient-history text.

    Steps performed:
      1. Read ``save_CNN_text_2_datas.csv`` under ``path_net`` and convert it
         with ``Kc.DataFrameToXY``; rows whose ``MajorDiagnosisCoding`` equals
         ``"-1"`` are discarded.
      2. Segment every ``HistoryOfPastIllness`` text with THULAC, train and
         save a gensim Word2Vec model, and represent each text as the mean of
         its word vectors.
      3. Split with ``Kc.SplitGroup``, then build and train an LSTM/Conv1D
         Keras model.
      4. Write the metric plot, the training history and a copy of
         ``cnn_text.py`` to ``<path_net>/result/CNN_text_2``.

    Returns:
        int: always 0.
    """
    # path_net = os.path.join('/', 'public', 'home', 'liqi', 'data', 'analysis', 'transcribe_CNN')
    path_net = os.path.join('Z:\\', 'datas', 'analysis')
    result_dir = os.path.join(path_net, 'result', 'CNN_text_2')

    datas = pd.read_csv(os.path.join(path_net, "save_CNN_text_2_datas.csv"))

    text_xys = Kc.DataFrameToXY(datas, ["HistoryOfPastIllness"],
                                "MajorDiagnosisCoding",
                                manualFilePath=os.path.join(
                                    path_net,
                                    r"CodingToClass_1to16 - new 多分类.csv"))

    # Copy so the later insert/drop work on an independent frame rather than
    # on a filtered view of the original.
    text_xys = text_xys[text_xys["MajorDiagnosisCoding"] != "-1"].copy()

    # Vectorise the free text.
    thu1 = thulac.thulac(seg_only=True)  # default mode
    # Iterate positionally: after the filter above the index labels are no
    # longer 0..n-1, so label lookups would pair texts with the wrong rows.
    tokenized_docs = []
    for raw_text in text_xys["HistoryOfPastIllness"].tolist():
        # Non-string cells (e.g. NaN) are treated as empty text.
        text = raw_text if isinstance(raw_text, str) else ""
        # Word segmentation: with text=True THULAC returns one
        # space-separated string, while Word2Vec expects a list of tokens.
        tokenized_docs.append(thu1.cut(text, text=True).split())

    w2v_model = gensim.models.word2vec.Word2Vec(tokenized_docs,
                                                min_count=1,
                                                iter=20)
    w2v_model.save(os.path.join(path_net, r'model_text_word2vec.model'))

    # A Word2Vec model has no ``docvecs`` (that attribute belongs to Doc2Vec),
    # so a document vector is built as the mean of its word vectors; an empty
    # document maps to the zero vector.
    text_x_vds = [
        np.mean([w2v_model.wv[token] for token in tokens], axis=0)
        if tokens else np.zeros(w2v_model.vector_size, dtype=np.float32)
        for tokens in tokenized_docs
    ]
    text_xys.insert(1, 'HistoryOfPastIllness_vec', text_x_vds)
    text_xys.drop(['HistoryOfPastIllness'], axis=1, inplace=True)

    x_train, y_train, x_test, y_test = Kc.SplitGroup(text_xys.values)
    # Shift labels down by one so they start at 0 (original note: only
    # classes 1 and 2 remain after filtering).
    y_train = y_train.astype(np.float64) - 1
    y_test = y_test.astype(np.float64) - 1
    # Each feature row holds a single vector; unpack it into a dense 2-D array.
    x_train = np.array([row[0].tolist() for row in x_train.tolist()])
    x_test = np.array([row[0].tolist() for row in x_test.tolist()])

    # Add a trailing channel axis: (samples, vector_length, 1).
    x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
    x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

    # Dense needs an integer unit count; the labels are float64.
    numclasses = int(y_train.max()) + 1
    seq_len = x_train.shape[1]

    # Build the neural network.
    print("搭建神经网络")
    model = tf.keras.Sequential([
        layers.LSTM(16, return_sequences=True, input_shape=(seq_len, 1)),
        layers.Conv1D(32, 5, padding='same', activation=tf.nn.relu),
        layers.Conv1D(64, 5, padding='same', activation=tf.nn.relu),
        layers.MaxPooling1D(5),
        layers.Dropout(0.25),
        layers.LSTM(64, return_sequences=True),
        layers.Conv1D(128, 5, padding='same', activation=tf.nn.relu),
        layers.Conv1D(64, 5, padding='same', activation=tf.nn.relu),
        layers.MaxPooling1D(5),
        layers.Dropout(0.25),
        layers.LSTM(64, return_sequences=True),
        layers.Conv1D(32, 5, padding='same', activation=tf.nn.relu),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.5),
        layers.Dense(numclasses, activation=tf.nn.softmax)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    print(model.summary())
    print("训练神经网络")
    metrics = Kc.Metrics(validation_data=(x_test, y_test))
    tensorboard = TensorBoard(log_dir=os.path.join(
        'logs', "模型名-{}".format(int(time.time()))))
    history = model.fit(
        x_train,
        y_train,
        batch_size=400,
        validation_data=(x_test, y_test),
        epochs=500,
        callbacks=[metrics, tensorboard])

    nowtime = str(int(time.time()))
    # Make sure the output directory exists so the results of a long training
    # run are not lost to a missing folder.
    os.makedirs(result_dir, exist_ok=True)
    try:
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.plot(metrics.val_f1s)
        plt.plot(metrics.val_recalls)
        plt.plot(metrics.val_precisions)
        plt.legend(
            ['training', 'validation', 'val_f1', 'val_recall',
             'val_precision'],
            loc='upper left')

        plt.savefig(os.path.join(result_dir, nowtime + '_cnn_result' + '.png'))
        plt.show()
    except Exception as exc:
        # Plotting is best effort (e.g. no display available); report the
        # cause instead of hiding it, and let Ctrl-C propagate.
        print("无法画图", repr(exc))
    finally:
        with open(os.path.join(result_dir, nowtime + '_history.txt'),
                  'w', encoding='utf-8') as history_file:
            history_file.write(str(history.history))

    # Archive the script next to its results. The script is looked up in the
    # current working directory. It is read as UTF-8, so it is written back
    # as UTF-8 too instead of the platform default encoding.
    with open('cnn_text.py', 'r', encoding='utf-8') as source_file:
        source_code = source_file.read()
    with open(os.path.join(result_dir, nowtime + '_code' + '.py'),
              'w', encoding='utf-8') as archive_file:
        archive_file.write(source_code)
    print('结束')
    return 0
Beispiel #3
0
def main():
    """Train the combined text + numeric VGG16-style model and archive the run.

    Steps performed:
      1. Load ``xys_c_end.csv`` from ``path_net``, parse the stored
         ``HistoryOfPastIllness_vec`` strings back into lists, binarise
         ``MajorDiagnosisCoding`` (every non-zero code becomes 1) and pass the
         frame through ``Generate_TLCPD``.
      2. Split with ``Kc.SplitGroup`` and save the raw split arrays.
      3. Build the model with ``DefineModel_textAndNumerics_VGG16x``, train it
         and predict on the test set.
      4. Save the metric plot, history, model, test arrays, predictions and a
         copy of this script under ``result/`` and ``model/``.

    Returns:
        int: always 0.
    """
    nowtime = str(int(time.time()))
    py_filename = "cnn_mix_vgg16x_preSet"
    path_net = os.path.join('/', 'public', 'home', 'liqi', 'data', 'analysis', 'transcribe_CNN')
    # path_net = os.path.join('Z:', 'datas', 'analysis')
    result_dir = os.path.join(path_net, 'result', py_filename)
    model_dir = os.path.join(path_net, 'model', py_filename)
    run_dir = os.path.join(result_dir, nowtime + r'_' + py_filename)

    xys_c_end = pd.read_csv(os.path.join(path_net, r'xys_c_end.csv'))
    # Drop the extra row-number column that came along with the CSV.
    xys_c_end.drop(xys_c_end.columns[[0]], axis=1, inplace=True)
    # The vectors are stored as whitespace-separated text; commas are inserted
    # after digits so the text can be evaluated as a list literal.
    # NOTE(security): eval() executes whatever the CSV contains. Only use this
    # with trusted files, or switch to a non-executing parser.
    xys_c_end['HistoryOfPastIllness_vec'] = xys_c_end['HistoryOfPastIllness_vec'].apply(
        lambda x: eval(re.sub(r'(?<=\d)\s+', ',', x, flags=re.S)))
    # Collapse every non-zero diagnosis code into class 1.
    xys_c_end.loc[xys_c_end['MajorDiagnosisCoding'] != 0, 'MajorDiagnosisCoding'] = 1

    xys_c_end = Generate_TLCPD(xys_c_end.copy())

    # Convert to np.array and split.
    xys_dd = xys_c_end.values
    x_train, y_train, x_test, y_test = Kc.SplitGroup(xys_dd)
    y_train = y_train.astype(np.float64)
    y_test = y_test.astype(np.float64)

    # Create the output directories before the first write: the per-run
    # directory used to be created only after training, so the saves below
    # failed on a missing folder.
    Kc.CreateDir(result_dir)
    Kc.CreateDir(model_dir)
    Kc.CreateDir(run_dir)

    for file_name, array in (('x_train.npy', x_train),
                             ('x_test.npy', x_test),
                             ('y_train.npy', y_train),
                             ('y_test.npy', y_test)):
        np.save(os.path.join(run_dir, file_name), array)

    # Separate the text vector (column 13) from the numeric features
    # (columns 0-11).
    # NOTE(review): column 12 is fed to neither input -- confirm this is
    # intentional.
    x_train_text = np.array([row[13] for row in x_train])
    x_train_numerics = np.array([row[0:12] for row in x_train], dtype=np.float64)
    x_test_text = np.array([row[13] for row in x_test])
    x_test_numerics = np.array([row[0:12] for row in x_test], dtype=np.float64)

    # Add a trailing channel axis: (samples, features, 1).
    x_train_text = x_train_text.reshape(x_train_text.shape[0], x_train_text.shape[1], 1)
    x_train_numerics = x_train_numerics.reshape(x_train_numerics.shape[0], x_train_numerics.shape[1], 1)
    x_test_text = x_test_text.reshape(x_test_text.shape[0], x_test_text.shape[1], 1)
    x_test_numerics = x_test_numerics.reshape(x_test_numerics.shape[0], x_test_numerics.shape[1], 1)

    # The labels are float64; the class count must be an integer.
    numclasses = int(y_train.max()) + 1

    # Build the neural network.
    print("搭建神经网络")

    model = DefineModel_textAndNumerics_VGG16x([x_train_text.shape[1], x_train_numerics.shape[1]], numclasses)
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    print(model.summary())
    print("训练神经网络")
    test_inputs = [x_test_text, x_test_numerics]
    metrics1 = Kc.Mertics_f1s_recalls_precisions(validation_data=(test_inputs, y_test))
    history = model.fit([x_train_text, x_train_numerics], y_train,
                        batch_size=400,
                        validation_data=(test_inputs, y_test),
                        epochs=300,
                        callbacks=[metrics1])

    y_test_result = model.predict(test_inputs)

    try:
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.plot(metrics1.val_f1s)
        plt.plot(metrics1.val_recalls)
        plt.plot(metrics1.val_precisions)
        plt.legend(['training', 'validation', 'val_f1', 'val_recall', 'val_precision'], loc='upper left')

        plt.savefig(os.path.join(result_dir, nowtime + '_cnn_result' + '.png'))
        plt.ion()
        plt.show()
    except Exception as exc:
        # Plotting is best effort; report the cause instead of hiding it, and
        # let Ctrl-C propagate.
        print("无法画图", repr(exc))
    finally:
        # Persist the results whether or not plotting worked.
        with open(os.path.join(result_dir, nowtime + '_history.txt'),
                  'w', encoding='utf-8') as history_file:
            history_file.write(str(history.history))
        model.save(os.path.join(model_dir, nowtime + r'_' + py_filename + '.model'))

        # y_test.npy was already written right after the split, so it is not
        # saved a second time here.
        np.save(os.path.join(run_dir, 'x_test_text.npy'), x_test_text)
        np.save(os.path.join(run_dir, 'x_test_numeric.npy'), x_test_numerics)
        np.save(os.path.join(run_dir, 'y_test_result.npy'), y_test_result)

    # Archive the script next to its results. It is looked up in the current
    # working directory and read as UTF-8, so it is written back as UTF-8.
    with open(py_filename + '.py', 'r', encoding='utf-8') as source_file:
        source_code = source_file.read()
    with open(os.path.join(result_dir, nowtime + '_code' + '.py'),
              'w', encoding='utf-8') as archive_file:
        archive_file.write(source_code)
    print('结束')

    return 0
Beispiel #4
0
def main():
    """Train the combined text + numeric model and store its results.

    Steps performed:
      1. Load ``xys_c_end.csv`` from ``path_net`` and parse the stored
         ``HistoryOfPastIllness_vec`` strings back into lists.
      2. Split with ``Kc.SplitGroup`` and separate the text vector from the
         numeric features.
      3. Build the model with ``DefineModel_textAndNumerics`` and train it.
      4. Save the metric plot, the history, the model and a copy of the script
         under ``result/cnn_mix_1`` and ``model/cnn_mix_1``.

    Returns:
        int: always 0.
    """
    path_net = os.path.join('/', 'public', 'home', 'liqi', 'data', 'analysis',
                            'transcribe_CNN')
    # path_net = os.path.join('Z:', 'datas', 'analysis')
    result_dir = os.path.join(path_net, 'result', 'cnn_mix_1')
    model_dir = os.path.join(path_net, 'model', 'cnn_mix_1')

    xys_c_end = pd.read_csv(os.path.join(path_net, r'xys_c_end.csv'))

    # The vectors are stored as whitespace-separated text; commas are inserted
    # after digits so the text can be evaluated as a list literal.
    # NOTE(security): eval() executes whatever the CSV contains. Only use this
    # with trusted files, or switch to a non-executing parser.
    xys_c_end['HistoryOfPastIllness_vec'] = xys_c_end[
        'HistoryOfPastIllness_vec'].apply(
            lambda x: eval(re.sub(r'(?<=\d)\s+', ',', x, flags=re.S)))

    # Convert to np.array and split.
    xys_dd = xys_c_end.values
    x_train, y_train, x_test, y_test = Kc.SplitGroup(xys_dd)
    y_train = y_train.astype(np.float64)
    y_test = y_test.astype(np.float64)

    # Separate the text vector (column 13) from the numeric features
    # (columns 0-11).
    # NOTE(review): column 12 is fed to neither input -- confirm this is
    # intentional.
    x_train_text = np.array([row[13] for row in x_train])
    x_train_numerics = np.array([row[0:12] for row in x_train],
                                dtype=np.float64)
    x_test_text = np.array([row[13] for row in x_test])
    x_test_numerics = np.array([row[0:12] for row in x_test],
                               dtype=np.float64)

    # Insert a length-1 middle axis: (samples, 1, features).
    x_train_text = x_train_text.reshape(x_train_text.shape[0], 1,
                                        x_train_text.shape[1])
    x_train_numerics = x_train_numerics.reshape(x_train_numerics.shape[0], 1,
                                                x_train_numerics.shape[1])
    x_test_text = x_test_text.reshape(x_test_text.shape[0], 1,
                                      x_test_text.shape[1])
    x_test_numerics = x_test_numerics.reshape(x_test_numerics.shape[0], 1,
                                              x_test_numerics.shape[1])

    # The labels are float64; the class count must be an integer.
    numclasses = int(y_train.max()) + 1

    # Build the neural network.
    print("搭建神经网络")

    model = DefineModel_textAndNumerics(
        [x_train_text.shape[2], x_train_numerics.shape[2]], numclasses)
    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    print(model.summary())
    print("训练神经网络")
    test_inputs = [x_test_text, x_test_numerics]
    metrics = Kc.Mertics_f1s_recalls_precisions(
        validation_data=(test_inputs, y_test))
    history = model.fit([x_train_text, x_train_numerics],
                        y_train,
                        batch_size=200,
                        validation_data=(test_inputs, y_test),
                        epochs=700,
                        callbacks=[metrics])

    nowtime = str(int(time.time()))
    # Make sure the output directories exist so the results of a long
    # training run are not lost to a missing folder.
    os.makedirs(result_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)
    try:
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.plot(metrics.val_f1s)
        plt.plot(metrics.val_recalls)
        plt.plot(metrics.val_precisions)
        plt.legend(
            ['training', 'validation', 'val_f1', 'val_recall',
             'val_precision'],
            loc='upper left')

        plt.savefig(os.path.join(result_dir, nowtime + '_cnn_result' + '.png'))
        plt.show()
    except Exception as exc:
        # Plotting is best effort; report the cause instead of hiding it, and
        # let Ctrl-C propagate.
        print("无法画图", repr(exc))
    finally:
        # Persist the results whether or not plotting worked.
        with open(os.path.join(result_dir, nowtime + '_history.txt'),
                  'w', encoding='utf-8') as history_file:
            history_file.write(str(history.history))

        model.save(os.path.join(model_dir, nowtime + r'_cnn_mix_1.model'))

    # Archive the script next to its results. It is looked up in the current
    # working directory and read as UTF-8, so it is written back as UTF-8.
    # NOTE(review): the archived file is 'cnn_text.py' although every output
    # is named cnn_mix_1 -- confirm this is the intended script.
    with open('cnn_text.py', 'r', encoding='utf-8') as source_file:
        source_code = source_file.read()
    with open(os.path.join(result_dir, nowtime + '_code' + '.py'),
              'w', encoding='utf-8') as archive_file:
        archive_file.write(source_code)
    print('结束')

    return 0