import os

from data_processing import DataProcessor


def load_rf_data(cur_path):
    # Build a platform-independent path to the Titanic data folder
    data_folder = os.path.join("data", "titanic")
    processed_data_folder = os.path.join(cur_path, data_folder)
    # Note: test.csv is not used because it does not include the Survived label,
    #       so we could not assess how well the model performs on it.
    data_file_path = os.path.join(processed_data_folder, "train.csv")
    data = DataProcessor(data_file_path, processed_data_folder)

    try:
        # Try to load data
        data.load_processed_data()
    except FileNotFoundError:
        # No data found, so process it
        # 10% test, 10% validation, 80% training samples from data
        splits = (0.1, 0.1, 0.8)
        # Only use certain columns
        use_cols = (  # 0, #PassengerID
            1,  # Survived
            2,  # Pclass
            # 3, #Name
            4,  # Sex
            5,  # Age
            6,  # SibSp
            7,  # Parch
            # 8, #Ticket
            9,  # Fare
            # 10, #Cabin
            11,  # Embarked
        )
        # Mark features as categorical (so we can one-hot-encode them later)
        # categorical_cols = ()
        categorical_cols = (2,  # Pclass
                            4,  # Sex
                            11  # Embarked
                            )
        # Convert certain columns to float values (so we can use numpy arrays)
        converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
                      11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]}
        data.process_data(splits=splits, use_cols=use_cols, categorical_cols=categorical_cols, converters=converters,
                          filter_missing=True)
    return data
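
# A minimal usage sketch (not part of the original example), assuming this
# function lives next to the "data/titanic" folder. The commented-out attribute
# name below is a hypothetical guess at the DataProcessor interface, not an API
# confirmed by the example.
if __name__ == "__main__":
    titanic_data = load_rf_data(os.path.dirname(__file__))
    # The returned DataProcessor holds the 10/10/80 test/validation/training
    # splits produced by process_data(); inspect it via the real attributes, e.g.:
    # print(titanic_data.x_train.shape)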
Example #2
import os
import sys

from data_processing import DataProcessor
from matplotlib import pyplot

cur_path = os.path.dirname(__file__)
data_folder = os.path.join("data", "titanic")
processed_data_folder = os.path.join(cur_path, data_folder)
# Note: test.csv is not used because it does not include the Survived label,
#       so we could not assess how well the model performs on it.
data_file_path = os.path.join(processed_data_folder, "train.csv")
data_processor = DataProcessor(data_file_path, processed_data_folder, "ffnn_processed.npz")

# Load data
try:
    # Try to load data
    data_processor.load_processed_data()

except FileNotFoundError:
    # No data found, so process it
    # 20% test, 20% validation, 60% training samples from data
    splits = (0.2, 0.2, 0.6)
    # Only use certain columns
    use_cols = (  # 0, #PassengerID
                    1,  # Survived
                    2,  # Pclass
                    # 3, #Name
                    4,  # Sex
                    5,  # Age
                    6,  # SibSp
                    7,  # Parch
                    # 8, #Ticket