コード例 #1
0
ファイル: pipelines.py プロジェクト: fariha23/titanic
def run_titanic_analysis(filename):
    """Load the Titanic CSV, clean it, and run both models on a split.

    The ``Name``, ``Sex``, ``Age`` and ``Survived`` columns are read,
    missing ``Age`` values are replaced by the column median, the columns
    are cast to fixed dtypes, and ``data.extract_title`` is applied. The
    result is handed to ``models.data_preparation`` and the four returned
    pieces are passed to ``models.run_majority_vote`` and then to
    ``models.run_logistic_regression``.

    Parameters
    ----------
    filename: str
        Path to the Titanic CSV input data

    Returns
    -------
    None
        The values returned by the model runners are discarded.
    """
    logging.info('Starting the data analysis pipeline')

    wanted_columns = ['Name', 'Sex', 'Age', 'Survived']
    column_dtypes = {
        'Age': 'float64',
        'Name': 'object',
        'Sex': 'category',
        'Survived': 'int64',
    }

    # Same steps as a .pipe() chain, spelled out one stage at a time.
    raw_frame = pd.read_csv(filename, usecols=wanted_columns)
    # The median is taken before filling, i.e. over the non-missing ages.
    imputed_frame = raw_frame.fillna({'Age': raw_frame['Age'].median()})
    typed_frame = imputed_frame.astype(column_dtypes)
    titled_frame = data.extract_title(typed_frame)

    X_train, X_test, y_train, y_test = models.data_preparation(
        titled_frame, test_size=0.2, random_state=0)

    models.run_majority_vote(X_train, X_test, y_train, y_test)
    models.run_logistic_regression(X_train, X_test, y_train, y_test)

    logging.info('The data analysis pipeline has terminated')
コード例 #2
0
def test_run_logistic_regression():
    """Check that logistic regression out-scores the majority-vote model.

    The CSV referenced by ``validation_data`` is loaded and preprocessed
    (median fill for ``Age``, dtype casts, ``data.extract_title``), split
    with ``models.data_preparation`` (``test_size=0.2``,
    ``random_state=0``), and both models are obtained from their runner
    functions. Each model's predictions on the test split are scored with
    ``accuracy_score``; the logistic regression accuracy must be strictly
    greater than the majority-vote accuracy.
    """
    wanted_columns = ['Name', 'Sex', 'Age', 'Survived']
    column_dtypes = {
        'Age': 'float64',
        'Name': 'object',
        'Sex': 'category',
        'Survived': 'int64',
    }

    # Preprocessing, one explicit stage per line.
    frame = pd.read_csv(validation_data, usecols=wanted_columns)
    frame = frame.fillna({'Age': frame['Age'].median()})
    frame = frame.astype(column_dtypes)
    frame = data.extract_title(frame)

    X_train, X_test, y_train, y_test = models.data_preparation(
        frame, test_size=0.2, random_state=0)

    majority_model = models.run_majority_vote(
        X_train, X_test, y_train, y_test)
    logistic_model = models.run_logistic_regression(
        X_train, X_test, y_train, y_test)

    majority_accuracy = accuracy_score(
        y_true=y_test, y_pred=majority_model.predict(X_test))
    logistic_accuracy = accuracy_score(
        y_true=y_test, y_pred=logistic_model.predict(X_test))

    assert logistic_accuracy > majority_accuracy