# sk_split is assumed to be sklearn's train_test_split imported under an alias;
# process_split_ratio and split_pandas_data_with_ratios are utilities from the
# snippet's own project.
from sklearn.model_selection import train_test_split as sk_split


def python_random_split(data, ratio=0.75, seed=42):
    """Pandas random splitter
    The splitter randomly splits the input data.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two halves and the ratio argument indicates the ratio 
            of training data set; if it is a list of float numbers, the splitter splits 
            data into several portions corresponding to the split ratios. If a list is 
            provided and the ratios are not summed to 1, they will be normalized.
        seed (int): Seed.
        
    Returns:
        list: Splits of the input data as pd.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    if multi_split:
        splits = split_pandas_data_with_ratios(data, ratio, shuffle=True, seed=seed)
        # Drop the bookkeeping split_index column added by the helper.
        splits_new = [x.drop('split_index', axis=1) for x in splits]

        return splits_new
    else:
        return sk_split(data, test_size=None, train_size=ratio, random_state=seed)
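A minimal usage sketch of the two ratio modes described in the docstring (the toy DataFrame is illustrative, and python_random_split is assumed to be importable):

import pandas as pd

df = pd.DataFrame({"user": range(10), "rating": range(10)})

# Single float: one train/test split, 75% / 25%.
train, test = python_random_split(df, ratio=0.75, seed=42)

# List of ratios: three portions; [3, 1, 1] is normalized to [0.6, 0.2, 0.2].
train, validate, test = python_random_split(df, ratio=[3, 1, 1], seed=42)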
Example #2
def main():
    path = 'heart.csv'
    feature, target = Read_data(path)
    save_path = "test-output/"

    # Split the data randomly; fix the random seed so the split is reproducible.
    train_feature, test_feature, train_target, test_target = sk_split(
        feature, target, test_size=0.3, random_state=10)

    # Model selection: limit the maximum tree depth to avoid overfitting.
    model = clf_rtree(max_depth=5, min_samples_leaf=4, n_estimators=10, random_state=10)
    model.fit(train_feature, train_target)

    # Model prediction.
    pre_target = model.predict(test_feature)

    # True/false positives, sensitivity, ROC.
    t_f_postive(test_target, pre_target)

    # Accuracy.
    score = model.score(test_feature, test_target)
    print('score: ', score)

    # Influence (importance) of each feature.
    var_influence(model, test_feature, save_path)

    # Draw the random forest's trees.
    column = train_feature.columns
    draw_tree(model, column, save_path)
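The helpers above (Read_data, clf_rtree, t_f_postive, var_influence, draw_tree) are defined elsewhere in that project; clf_rtree appears to alias a random forest classifier. A self-contained sketch of the same split-fit-score flow using sklearn directly (the 'target' column name follows the common heart-disease CSV and is an assumption):

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Assumes a CSV with a binary 'target' label column.
data = pd.read_csv('heart.csv')
feature = data.drop('target', axis=1)
target = data['target']

train_x, test_x, train_y, test_y = train_test_split(
    feature, target, test_size=0.3, random_state=10)

model = RandomForestClassifier(
    max_depth=5, min_samples_leaf=4, n_estimators=10, random_state=10)
model.fit(train_x, train_y)
print('score:', model.score(test_x, test_y))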
Example #3
# Assuming standalone Keras 2.x, where Adam(lr=..., decay=...) and
# ModelCheckpoint(period=...) are valid arguments.
import keras
from keras.optimizers import Adam
from keras.callbacks import CSVLogger, ModelCheckpoint


def train(model, x, y):
    record_path = 'model_save/training.csv'

    # Split the data randomly; fix the random seed so the split is reproducible.
    train_x, test_x, train_y, test_y = sk_split(
        x, y, test_size=0.3, random_state=10)
    train_y = keras.utils.to_categorical(train_y, 2)
    test_y = keras.utils.to_categorical(test_y, 2)

    adam = Adam(lr=0.05, decay=3e-4)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    # Log each epoch's results to a CSV file so the training run can be visualized later.
    csv_logger = CSVLogger(record_path)
    filepath = 'model_save'
    # Checkpoint the best model every 10 epochs; the filename embeds the validation accuracy.
    checkpointer = ModelCheckpoint(filepath=filepath +
                                   '/weights-{val_acc:.2f}.hdf5',
                                   verbose=1,
                                   save_best_only=True,
                                   period=10)
    # model.fit(train_x, train_y, batch_size=20, epochs=500, validation_data=(test_x, test_y), verbose=1, callbacks=[csv_logger, checkpointer])
    train_plot(record_path)
    analizy(filepath, x, y)
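train_plot and analizy are defined elsewhere in that project. As a hypothetical sketch only, a train_plot helper could read the CSVLogger output with pandas and matplotlib; the acc/val_acc column names follow old Keras 2.x conventions and are an assumption:

import pandas as pd
import matplotlib.pyplot as plt


def train_plot(record_path):
    # CSVLogger writes one row per epoch: epoch, acc, loss, val_acc, val_loss (Keras 2.x names).
    history = pd.read_csv(record_path)
    plt.plot(history['epoch'], history['acc'], label='train accuracy')
    plt.plot(history['epoch'], history['val_acc'], label='validation accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.savefig('model_save/training.png')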