Esempio n. 1
0
def test_split_data_2():
    '''
    Check that the training partition is roughly 80% of the dataset.

    The fraction is measured against the length of the raw ``data``
    object (taken before cleaning) and may differ from 0.80 by at most
    0.1 in absolute terms.
    '''
    # Measure the raw length first, before the data is handed to cleaning.
    raw_length = len(data)
    cleaned = clean_split_data.clean_data(data)
    # Only the first element of the split (training features) is inspected.
    train_features, _, _, _ = clean_split_data.split_data(cleaned)
    fraction = len(train_features) / raw_length
    within_tolerance = math.isclose(fraction, 0.80, abs_tol=0.1)
    assert within_tolerance, "Training set is not at specified 80% of dataset"
Esempio n. 2
0
def test_split_data_1():
    '''
    Check that splitting keeps the overall length unchanged.

    The train and test feature sets together must be as long as the raw
    ``data`` object measured before cleaning.
    '''
    # Measure the raw length first, before the data is handed to cleaning.
    expected_length = len(data)
    cleaned = clean_split_data.clean_data(data)
    # The two target sets are not needed for a length comparison.
    train_features, test_features, _, _ = clean_split_data.split_data(cleaned)
    combined_length = len(train_features) + len(test_features)
    assert math.isclose(expected_length, combined_length), (
        "Length of data is not the same as before splitting")
import pandas as pd

# Import Scikit-Learn library for decision tree models
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Import plotting libraries
import matplotlib

# Raise matplotlib's global default font size to 18; this is a
# process-wide setting applied at import time.
matplotlib.rcParams.update({'font.size': 18})

# ### Data
# Load the dataset at import time. The relative path is resolved against
# the current working directory, so this must run from the directory
# that contains `data/`.
data = pd.read_csv('data/data.csv')
# NOTE(review): `clean_data` and `split_data` are not imported or defined in
# the visible part of this file -- confirm where they come from.
data = clean_data(data)
# `split_data` is expected to return a 4-tuple in this order:
# train features, test features, train targets, test targets.
X_train, X_test, y_train, y_test = split_data(data)

# ### Classifier
# Decision tree limited to a maximum depth of 5, fitted once at import time
# on the training portion only.
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)


# ### Optimized Decision Tree Predictor
def feature_names():
    '''
    Returns array of input features of best
    performing backwards stepwise selection test.
    '''

    return [
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',