Esempio n. 1
0
def main(input_train, input_test, output_train, output_test):
    """Run preprocessing over the raw train and test sets.

    Both sets are read through a single ``DataSet`` built from the two
    input locations. Each set is then handed, together with its matching
    output argument, to ``TitanicPreProcessing``.

    Args:
        input_train: First positional argument given to ``DataSet``.
        input_test: Second positional argument given to ``DataSet``.
        output_train: Passed to ``TitanicPreProcessing`` with the train set.
        output_test: Passed to ``TitanicPreProcessing`` with the test set.
    """
    logging.getLogger(__name__).info('making final data set from raw data')

    source = DataSet(input_train, input_test)

    # Both sets are fetched up front, before any preprocessing starts.
    frames = (source.get_train_set(), source.get_test_set())
    destinations = (output_train, output_test)

    # TitanicPreProcessing is called only for its effects; the result is
    # discarded. Train is processed first, then test.
    for frame, destination in zip(frames, destinations):
        TitanicPreProcessing(frame, destination)
Esempio n. 2
0
def main(input_filepath, output_filepath):
    """Preprocess the raw train/test CSV files and pass them on for output.

    The raw sets are read via ``DataSet`` from ``train.csv`` and
    ``test.csv`` under ``input_filepath``. A ``DataWrangling`` instance,
    configured with ``train_clean.csv`` and ``test_clean.csv`` under
    ``output_filepath``, preprocesses each set with ``'Survived'`` as the
    target and then receives the results through ``processed_train_data``
    and ``processed_test_data``.

    Args:
        input_filepath: Directory prefix for the raw CSV files; the file
            names are appended with a leading ``/``.
        output_filepath: Directory prefix for the cleaned CSV files; the
            file names are appended with a leading ``/``.
    """
    logging.getLogger(__name__).info('making final data set from raw data')

    source = DataSet(
        train_dir=input_filepath + '/train.csv',
        test_dir=input_filepath + '/test.csv',
    )
    wrangler = DataWrangling(
        train_dir=output_filepath + '/train_clean.csv',
        test_dir=output_filepath + '/test_clean.csv',
    )

    raw_train = source.get_train_set()
    raw_test = source.get_test_set()

    # Same preprocessing for both sets, train first and then test.
    clean_train, clean_test = [
        wrangler.apply_preprocessing(frame, target='Survived')
        for frame in (raw_train, raw_test)
    ]

    wrangler.processed_train_data(clean_train)
    wrangler.processed_test_data(clean_test)
Esempio n. 3
0
def main(input_train, input_test, input_model, output_prediction):
    """Predict outcomes for the test set with a stored model and save a CSV.

    The test set is loaded through ``DataSet``, its features are extracted
    with ``get_features``, and the model loaded from ``XGBClassifier``
    inside ``input_model`` predicts on them. The predictions are written,
    without the index, to ``submission_<model.name>.csv`` inside
    ``output_prediction`` with the columns ``PassengerId`` and ``Survived``.

    Args:
        input_train: Passed to ``DataSet`` as ``train_dir``.
        input_test: Passed to ``DataSet`` as ``test_dir``.
        input_model: Directory containing the stored model. A trailing
            path separator is optional.
        output_prediction: Directory that receives the submission file.
            A trailing path separator is optional.
    """
    # Local import keeps this block self-contained regardless of what the
    # module already imports at the top.
    import os

    logger = logging.getLogger(__name__)
    logger.info('predicting outcomes')

    data = DataSet(train_dir=input_train, test_dir=input_test)
    test = data.get_test_set()
    X_test = data.get_features(test)

    # os.path.join inserts a separator only when one is missing, so both
    # 'models' and 'models/' resolve to the same file. Plain concatenation
    # produced 'modelsXGBClassifier' for the former.
    model = Model.load(os.path.join(input_model, 'XGBClassifier'))
    y_pred = model.predict(X_test)

    output = pd.DataFrame({
        'PassengerId': test['PassengerId'],
        'Survived': y_pred,
    })
    submission_path = os.path.join(
        output_prediction, 'submission_{}.csv'.format(model.name))
    output.to_csv(submission_path, index=False)