Ejemplo n.º 1
0
def make_submission_file(predicted_vals, name_prefix, create_gz=True):
    """Write predictions to a timestamped submission CSV, optionally gzipped.

    Reads ``data/sample_submission.csv`` as a template, replaces its
    ``PredictedProb`` column with ``predicted_vals`` and saves the result as
    ``submissions/<name_prefix><timestamp>.csv``.

    Args:
        predicted_vals: values assigned to the ``PredictedProb`` column.
        name_prefix: prefix of the generated file name.
        create_gz: when true, also write a gzip copy at ``<csv path>.gz``.

    Returns:
        The submission DataFrame that was written, whether or not a gzip
        copy was created.
    """
    current_ts = time.strftime("%a_%d%b%Y_%H%M%S")
    submission_filepath = "submissions/%s%s.csv" % (name_prefix, current_ts)
    submission = pd.read_csv("data/sample_submission.csv")
    # Item assignment always targets the column; attribute assignment would
    # silently create a plain attribute if the column were missing.
    submission["PredictedProb"] = predicted_vals
    submission.to_csv(submission_filepath, index=False, quoting=csv.QUOTE_NONE)

    if not create_gz:
        logger.info("See %s", submission_filepath)
        return submission

    gz_filepath = submission_filepath + '.gz'
    with open(submission_filepath, 'rb') as f_in, \
            gzip.open(gz_filepath, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    logger.info("See %s", gz_filepath)
    return submission
Ejemplo n.º 2
0
def make_submission_file(predicted_vals, name_prefix, create_gz=True):
    """Write predictions to a timestamped submission CSV, optionally gzipped.

    Reads ``data/sample_submission.csv`` as a template, replaces its
    ``PredictedProb`` column with ``predicted_vals`` and saves the result as
    ``submissions/<name_prefix><timestamp>.csv``.

    Args:
        predicted_vals: values assigned to the ``PredictedProb`` column.
        name_prefix: prefix of the generated file name.
        create_gz: when true, also write a gzip copy at ``<csv path>.gz``.

    Returns:
        The submission DataFrame that was written, whether or not a gzip
        copy was created.
    """
    current_ts = time.strftime("%a_%d%b%Y_%H%M%S")
    submission_filepath = "submissions/%s%s.csv" % (name_prefix, current_ts)
    submission = pd.read_csv("data/sample_submission.csv")
    # Item assignment always targets the column; attribute assignment would
    # silently create a plain attribute if the column were missing.
    submission["PredictedProb"] = predicted_vals
    submission.to_csv(submission_filepath, index=False, quoting=csv.QUOTE_NONE)

    if not create_gz:
        logger.info("See %s", submission_filepath)
        return submission

    gz_filepath = submission_filepath + '.gz'
    with open(submission_filepath, 'rb') as f_in, \
            gzip.open(gz_filepath, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    logger.info("See %s", gz_filepath)
    return submission
Ejemplo n.º 3
0
def find_best_estimator(base_estimator,
                        X,
                        y,
                        cfg,
                        section,
                        grid_search_params_key,
                        random_search=True,
                        scoring="accuracy",
                        verbosity=3):
    """Tune ``base_estimator`` with a cross-validated search and return the best.

    Args:
        base_estimator: estimator instance whose hyper-parameters are tuned.
        X: training data passed to the search's ``fit``.
        y: training targets passed to the search's ``fit``.
        cfg: configuration mapping. ``cfg[section]`` supplies ``cv_nfold``,
            ``n_jobs``, ``use_random_search`` and the parameter space;
            ``cfg[section]["n_iters"]`` and ``cfg["common"]["seed"]`` are
            read only when random search is used.
        section: key of the configuration section to read.
        grid_search_params_key: key under the indicated section of the
            configuration YML file containing the grid search parameters.
        random_search: set to False to force an exhaustive grid search.
        scoring: scoring name handed to the search object.
        verbosity: ``verbose`` level handed to the search object.

    Returns:
        ``best_estimator_`` of the fitted search object.
    """
    section_cfg = cfg[section]
    cv_nfold = section_cfg["cv_nfold"]
    name = type(base_estimator).__name__
    n_jobs = section_cfg["n_jobs"]
    param_dist = section_cfg[grid_search_params_key]
    logger.info("Finding the best %s based on %s score", name, scoring)

    # Random search is used only when both the caller and the configuration
    # ask for it. The former equality test also picked random search when
    # both flags were False.
    if random_search and section_cfg["use_random_search"]:
        logger.info("Using random search to find the best %s", name)
        search = grid_search.RandomizedSearchCV(
            estimator=base_estimator,
            param_distributions=param_dist,
            n_iter=section_cfg["n_iters"],
            n_jobs=n_jobs,
            cv=cv_nfold,
            random_state=cfg["common"]["seed"],
            scoring=scoring,
            verbose=verbosity)
    else:
        logger.info("Using grid search to find the best %s", name)
        search = grid_search.GridSearchCV(estimator=base_estimator,
                                          param_grid=param_dist,
                                          n_jobs=n_jobs,
                                          cv=cv_nfold,
                                          scoring=scoring,
                                          verbose=verbosity)

    logger.info(search)
    start = time.time()
    search.fit(X, y)
    logger.info("Took %.2f seconds to find the best %s.",
                time.time() - start, name)
    report_grid_search_scores(search.grid_scores_, n_top=3)
    return search.best_estimator_
Ejemplo n.º 4
0
def convert_categorical_features(df, cat_features, random_delete_one=False):
    """Replace each listed categorical column with its dummy (0/1) columns.

    Args:
        df: input DataFrame; it is not modified, a new frame is returned.
        cat_features: names of the columns to expand with
            ``pd.get_dummies`` (dummy columns are prefixed with the name).
        random_delete_one: when true, one randomly chosen dummy column per
            feature is dropped.

    Returns:
        A DataFrame in which every column in ``cat_features`` has been
        removed and its dummy columns appended on the right.
    """
    for f in cat_features:
        dummy_cols_df = pd.get_dummies(df[f], prefix=f)
        cols = list(dummy_cols_df.columns.values)
        num_ones = [int(dummy_cols_df[col].sum()) for col in cols]
        # Materialise the pairs: a bare zip object would be logged as
        # "<zip object ...>" on Python 3 instead of the distribution.
        cols_w_len = list(zip(cols, num_ones))
        logger.info("Categorical feature '%s' has %d unique values "
                    "with distribution %s", f, len(cols), cols_w_len)
        if random_delete_one:
            # Deleting one of the dummy variables
            col = random.choice(cols)
            logger.info("Deleting column %s", col)
            dummy_cols_df = dummy_cols_df.drop([col], axis=1)
            # Doing so helps avoid the Multicollinearity problem.
            # Tip Credit : http://stackoverflow.com/a/22130844
        df = df.drop([f], axis=1)
        df = pd.concat((df, dummy_cols_df), axis=1)

    return df
Ejemplo n.º 5
0
def convert_categorical_features(df, cat_features, random_delete_one=False):
    """Replace each listed categorical column with its dummy (0/1) columns.

    Args:
        df: input DataFrame; it is not modified, a new frame is returned.
        cat_features: names of the columns to expand with
            ``pd.get_dummies`` (dummy columns are prefixed with the name).
        random_delete_one: when true, one randomly chosen dummy column per
            feature is dropped.

    Returns:
        A DataFrame in which every column in ``cat_features`` has been
        removed and its dummy columns appended on the right.
    """
    for f in cat_features:
        dummy_cols_df = pd.get_dummies(df[f], prefix=f)
        cols = list(dummy_cols_df.columns.values)
        num_ones = [int(dummy_cols_df[col].sum()) for col in cols]
        # Materialise the pairs: a bare zip object would be logged as
        # "<zip object ...>" on Python 3 instead of the distribution.
        cols_w_len = list(zip(cols, num_ones))
        logger.info("Categorical feature '%s' has %d unique values "
                    "with distribution %s", f, len(cols), cols_w_len)
        if random_delete_one:
            # Deleting one of the dummy variables
            col = random.choice(cols)
            logger.info("Deleting column %s", col)
            dummy_cols_df = dummy_cols_df.drop([col], axis=1)
            # Doing so helps avoid the Multicollinearity problem.
            # Tip Credit : http://stackoverflow.com/a/22130844
        df = df.drop([f], axis=1)
        df = pd.concat((df, dummy_cols_df), axis=1)

    return df
Ejemplo n.º 6
0
def find_best_estimator(base_estimator, X, y, cfg, section,
                        grid_search_params_key,
                        random_search=True, scoring="accuracy", verbosity=3):
    """Tune ``base_estimator`` with a cross-validated search and return the best.

    Args:
        base_estimator: estimator instance whose hyper-parameters are tuned.
        X: training data passed to the search's ``fit``.
        y: training targets passed to the search's ``fit``.
        cfg: configuration mapping. ``cfg[section]`` supplies ``cv_nfold``,
            ``n_jobs``, ``use_random_search`` and the parameter space;
            ``cfg[section]["n_iters"]`` and ``cfg["common"]["seed"]`` are
            read only when random search is used.
        section: key of the configuration section to read.
        grid_search_params_key: key under the indicated section of the
            configuration YML file containing the grid search parameters.
        random_search: set to False to force an exhaustive grid search.
        scoring: scoring name handed to the search object.
        verbosity: ``verbose`` level handed to the search object.

    Returns:
        ``best_estimator_`` of the fitted search object.
    """
    section_cfg = cfg[section]
    cv_nfold = section_cfg["cv_nfold"]
    name = type(base_estimator).__name__
    n_jobs = section_cfg["n_jobs"]
    param_dist = section_cfg[grid_search_params_key]
    logger.info("Finding the best %s based on %s score", name, scoring)

    # Random search is used only when both the caller and the configuration
    # ask for it. The former equality test also picked random search when
    # both flags were False.
    if random_search and section_cfg["use_random_search"]:
        logger.info("Using random search to find the best %s", name)
        search = grid_search.RandomizedSearchCV(
            estimator=base_estimator,
            param_distributions=param_dist,
            n_iter=section_cfg["n_iters"],
            n_jobs=n_jobs,
            cv=cv_nfold,
            random_state=cfg["common"]["seed"],
            scoring=scoring,
            verbose=verbosity)
    else:
        logger.info("Using grid search to find the best %s", name)
        search = grid_search.GridSearchCV(estimator=base_estimator,
                                          param_grid=param_dist,
                                          n_jobs=n_jobs,
                                          cv=cv_nfold,
                                          scoring=scoring,
                                          verbose=verbosity)

    logger.info(search)
    start = time.time()
    search.fit(X, y)
    logger.info("Took %.2f seconds to find the best %s.",
                time.time() - start, name)
    report_grid_search_scores(search.grid_scores_, n_top=3)
    return search.best_estimator_
Ejemplo n.º 7
0
def report_grid_search_scores(grid_scores, n_top=5):
    """Log the ``n_top`` entries of ``grid_scores`` with the highest score.

    Entries are ranked in descending order of their element at index 1. For
    each one the rank, mean validation score, standard deviation of the
    per-fold scores and the parameters are logged.

    Credit : http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        mean_score = entry.mean_validation_score
        score_std = np.std(entry.cv_validation_scores)
        logger.info("Model with rank: {0}".format(rank))
        logger.info("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            mean_score, score_std))
        logger.info("Parameters: {0}".format(entry.parameters))
Ejemplo n.º 8
0
def report_grid_search_scores(grid_scores, n_top=5):
    """Log the ``n_top`` entries of ``grid_scores`` with the highest score.

    Entries are ranked in descending order of their element at index 1. For
    each one the rank, mean validation score, standard deviation of the
    per-fold scores and the parameters are logged.

    Credit : http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        mean_score = entry.mean_validation_score
        score_std = np.std(entry.cv_validation_scores)
        logger.info("Model with rank: {0}".format(rank))
        logger.info("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            mean_score, score_std))
        logger.info("Parameters: {0}".format(entry.parameters))
Ejemplo n.º 9
0
One-hot encoded, with the rest discarded.

@author: Nirmalya Ghosh
"""

import pandas as pd
from sklearn import cross_validation as cv
from sklearn import ensemble
from sklearn import metrics

import utils
from bnp_config import cfg, logger

# Script entry point: load the train/test CSVs, set the IDs and target aside,
# then stack both frames for shared preprocessing.
if __name__ == "__main__":
    # NOTE(review): presumably the cfg section key for this approach; it is
    # not used in the visible part of the script - confirm further down.
    section = "approach1"
    logger.info("Running script for BNP Approach 1")

    train = pd.read_csv("data/train.csv.gz")
    test = pd.read_csv("data/test.csv.gz")
    # Keep the ID columns and the training target before the target column
    # is removed from the feature frame.
    id_train = train.ID
    id_test = test.ID
    target = train.target
    train = train.drop(["target"], axis=1)

    # Combining the train and test to do some preprocessing
    # (rows are stacked and re-indexed from 0).
    df_all = pd.concat((train, test), axis=0,
                       ignore_index=True)  # 228714 x 132

    # Deal with missing values
    # (the imputation strategy itself lives in utils.BasicImputer).
    df_all_nmv = utils.BasicImputer().fit_transform(df_all)
Ejemplo n.º 10
0
@author: Nirmalya Ghosh
"""

import pandas as pd
from sklearn import cross_validation as cv
from sklearn import ensemble
from sklearn import metrics

import utils
from bnp_config import cfg, logger


# Script entry point: load the train/test CSVs, set the IDs and target aside,
# then stack both frames for shared preprocessing.
if __name__ == "__main__":
    # NOTE(review): presumably the cfg section key for this approach; it is
    # not used in the visible part of the script - confirm further down.
    section = "approach1"
    logger.info("Running script for BNP Approach 1")
    
    train = pd.read_csv("data/train.csv.gz")
    test = pd.read_csv("data/test.csv.gz")
    # Keep the ID columns and the training target before the target column
    # is removed from the feature frame.
    id_train = train.ID
    id_test = test.ID
    target = train.target
    train = train.drop(["target"], axis=1)

    # Combining the train and test to do some preprocessing
    # (rows are stacked and re-indexed from 0).
    df_all = pd.concat((train, test), axis=0, ignore_index=True) # 228714 x 132

    # Deal with missing values
    # (the imputation strategy itself lives in utils.BasicImputer).
    df_all_nmv = utils.BasicImputer().fit_transform(df_all)

    # Deal with categorical variables
Ejemplo n.º 11
0
import time

import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

import utils
from bnp_config import cfg, logger


# Script entry point: load the train/test CSVs, drop the configured columns
# and collect the names of the numeric feature columns.
if __name__ == '__main__':
    # Key of the configuration section read by this script (see cfg[s] below).
    s = "approach5"
    logger.info("Running script for BNP Approach 5, %s", cfg[s]["description"])

    train = pd.read_csv("data/train.csv.gz")
    test = pd.read_csv("data/test.csv.gz")
    # Keep the test IDs and the training target before columns are dropped.
    id_test = test.ID.values
    target = train.target.values

    # Remove the configured columns from both frames; the target column is
    # additionally removed from the training frame.
    drops = cfg[s]["columns_to_drop"]
    train = train.drop(drops, axis=1)
    train = train.drop(["target"], axis=1)
    test = test.drop(drops, axis=1)

    # Get the columns with numeric data
    # Credit : http://stackoverflow.com/a/28155580
    numeric_cols = list(
        train.select_dtypes(include=[np.number]).columns.values)
Ejemplo n.º 12
0
"""

import time

import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

import utils
from bnp_config import cfg, logger

# Script entry point: load the train/test CSVs, drop the configured columns
# and collect the names of the numeric feature columns.
if __name__ == '__main__':
    # Key of the configuration section read by this script (see cfg[s] below).
    s = "approach5"
    logger.info("Running script for BNP Approach 5, %s", cfg[s]["description"])

    train = pd.read_csv("data/train.csv.gz")
    test = pd.read_csv("data/test.csv.gz")
    # Keep the test IDs and the training target before columns are dropped.
    id_test = test.ID.values
    target = train.target.values

    # Remove the configured columns from both frames; the target column is
    # additionally removed from the training frame.
    drops = cfg[s]["columns_to_drop"]
    train = train.drop(drops, axis=1)
    train = train.drop(["target"], axis=1)
    test = test.drop(drops, axis=1)

    # Get the columns with numeric data
    # Credit : http://stackoverflow.com/a/28155580
    numeric_cols = list(
        train.select_dtypes(include=[np.number]).columns.values)