def make_submission_file(predicted_vals, name_prefix, create_gz=True):
    """Write predictions to a timestamped submission CSV, optionally gzipped.

    ``data/sample_submission.csv`` is loaded as a template, its
    ``PredictedProb`` column is replaced with ``predicted_vals`` and the
    result is saved as ``submissions/<name_prefix><timestamp>.csv``.

    Args:
        predicted_vals: Values assigned to the ``PredictedProb`` column.
        name_prefix: Prefix of the generated file name.
        create_gz: When true (default) a ``.gz`` copy of the CSV is written
            next to it; when false only the plain CSV is produced.

    Returns:
        None. The path of the produced file is reported through ``logger``.
    """
    current_ts = time.strftime("%a_%d%b%Y_%H%M%S")
    submission_filepath = "submissions/%s%s.csv" % (name_prefix, current_ts)

    submission = pd.read_csv("data/sample_submission.csv")
    # Item assignment always targets the column; attribute assignment would
    # silently create a plain Python attribute if the column were missing.
    submission["PredictedProb"] = predicted_vals
    submission.to_csv(submission_filepath, index=False,
                      quoting=csv.QUOTE_NONE)

    if not create_gz:
        # Use the project logger (not the root ``logging`` module) so both
        # code paths report through the same, configured logger.
        logger.info("See %s", submission_filepath)
        return

    gz_filepath = submission_filepath + ".gz"
    with open(submission_filepath, "rb") as f_in, \
            gzip.open(gz_filepath, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    logger.info("See %s", gz_filepath)
def find_best_estimator(base_estimator, X, y, cfg, section,
                        grid_search_params_key, random_search=True,
                        scoring="accuracy", verbosity=3):
    """Run a hyper-parameter search and return the best fitted estimator.

    Args:
        base_estimator: Estimator instance handed to the search object.
        X: Training features passed to ``search.fit``.
        y: Training targets passed to ``search.fit``.
        cfg: Configuration mapping; ``cfg[section]`` must provide
            ``cv_nfold``, ``n_iters``, ``n_jobs``, ``use_random_search`` and
            the entry named by ``grid_search_params_key``;
            ``cfg["common"]["seed"]`` provides the random state.
        section: Name of the configuration section to read.
        grid_search_params_key: Key under the indicated section of the
            configuration YML file containing the grid search parameters.
        random_search: Request a randomized search. It is only used when the
            configuration flag ``use_random_search`` is also true; otherwise
            an exhaustive grid search is run.
        scoring: Scoring name forwarded to the search object.
        verbosity: Verbosity level forwarded to the search object.

    Returns:
        ``search.best_estimator_`` of the fitted search object.
    """
    section_cfg = cfg[section]
    cv_nfold = section_cfg["cv_nfold"]
    name = type(base_estimator).__name__
    n_iter = section_cfg["n_iters"]
    n_jobs = section_cfg["n_jobs"]
    param_dist = section_cfg[grid_search_params_key]
    random_state = cfg["common"]["seed"]

    logger.info("Finding the best %s based on %s score", name, scoring)

    # Both the caller and the configuration must ask for a randomized search.
    # The previous equality test also selected it when both flags were False.
    use_random_search = bool(random_search
                             and section_cfg["use_random_search"])
    if use_random_search:
        logger.info("Using random search to find the best %s", name)
        search = grid_search.RandomizedSearchCV(
            estimator=base_estimator,
            param_distributions=param_dist,
            n_iter=n_iter,
            n_jobs=n_jobs,
            cv=cv_nfold,
            random_state=random_state,
            scoring=scoring,
            verbose=verbosity)
    else:
        logger.info("Using grid search to find the best %s", name)
        search = grid_search.GridSearchCV(
            estimator=base_estimator,
            param_grid=param_dist,
            n_jobs=n_jobs,
            cv=cv_nfold,
            scoring=scoring,
            verbose=verbosity)

    logger.info(search)
    start = time.time()
    search.fit(X, y)
    logger.info("Took %.2f seconds to find the best %s.",
                time.time() - start, name)
    report_grid_search_scores(search.grid_scores_, n_top=3)
    return search.best_estimator_
def convert_categorical_features(df, cat_features, random_delete_one=False):
    """Replace categorical columns of ``df`` with one-hot dummy columns.

    For every feature in ``cat_features`` the column is expanded with
    ``pd.get_dummies`` (dummy columns are prefixed with the feature name),
    the original column is dropped and the dummy columns are appended.

    Args:
        df: Input DataFrame; it is not modified in place.
        cat_features: Iterable of column names to convert.
        random_delete_one: When true, one randomly chosen dummy column per
            feature is removed before the dummies are appended.

    Returns:
        A new DataFrame with the categorical columns replaced by dummies.
    """
    for f in cat_features:
        dummy_cols_df = pd.get_dummies(df[f], prefix=f)
        cols = dummy_cols_df.columns.tolist()
        num_ones = [int(dummy_cols_df[col].sum()) for col in cols]
        # Materialise the pairs: on Python 3 a bare zip object would be
        # logged as "<zip object at ...>" instead of the distribution.
        cols_w_len = list(zip(cols, num_ones))
        logger.info("Categorical feature '%s' has %d unique values "
                    "with distribution %s", f, len(cols), cols_w_len)

        if random_delete_one and cols:
            # Deleting one of the dummy variables.
            # Doing so helps avoid the Multicollinearity problem.
            # Tip Credit : http://stackoverflow.com/a/22130844
            col = random.choice(cols)
            logger.info("Deleting column %s", col)
            dummy_cols_df = dummy_cols_df.drop([col], axis=1)

        df = df.drop([f], axis=1)
        df = pd.concat((df, dummy_cols_df), axis=1)
    return df
def report_grid_search_scores(grid_scores, n_top=5):
    """Log the ``n_top`` best entries of a hyper-parameter search.

    Entries are ranked in descending order of their item at index 1
    (presumably the mean validation score of the search-result tuples;
    confirm against the caller). For each reported entry the rank, the
    ``mean_validation_score``, the standard deviation of
    ``cv_validation_scores`` and the ``parameters`` are logged.

    Credit : http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html

    Args:
        grid_scores: Sequence of score entries supporting both index and
            attribute access.
        n_top: Maximum number of entries to report.
    """
    ranked = sorted(grid_scores, key=operator.itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        logger.info("Model with rank: {0}".format(rank))
        spread = np.std(entry.cv_validation_scores)
        summary = "Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, spread)
        logger.info(summary)
        logger.info("Parameters: {0}".format(entry.parameters))
One-hot encoded, with the rest discarded. @author: Nirmalya Ghosh """ import pandas as pd from sklearn import cross_validation as cv from sklearn import ensemble from sklearn import metrics import utils from bnp_config import cfg, logger if __name__ == "__main__": section = "approach1" logger.info("Running script for BNP Approach 1") train = pd.read_csv("data/train.csv.gz") test = pd.read_csv("data/test.csv.gz") id_train = train.ID id_test = test.ID target = train.target train = train.drop(["target"], axis=1) # Combining the train and test to do some preprocessing df_all = pd.concat((train, test), axis=0, ignore_index=True) # 228714 x 132 # Deal with missing values df_all_nmv = utils.BasicImputer().fit_transform(df_all)
@author: Nirmalya Ghosh """ import pandas as pd from sklearn import cross_validation as cv from sklearn import ensemble from sklearn import metrics import utils from bnp_config import cfg, logger if __name__ == "__main__": section = "approach1" logger.info("Running script for BNP Approach 1") train = pd.read_csv("data/train.csv.gz") test = pd.read_csv("data/test.csv.gz") id_train = train.ID id_test = test.ID target = train.target train = train.drop(["target"], axis=1) # Combining the train and test to do some preprocessing df_all = pd.concat((train, test), axis=0, ignore_index=True) # 228714 x 132 # Deal with missing values df_all_nmv = utils.BasicImputer().fit_transform(df_all) # Deal with categorical variables
import time
import numpy as np
import pandas as pd
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
import utils
from bnp_config import cfg, logger

# Script entry point for "BNP Approach 5": loads the data sets, removes the
# configured columns and determines which remaining columns are numeric.
if __name__ == '__main__':
    # Name of the configuration section read by this script.
    s = "approach5"
    logger.info("Running script for BNP Approach 5, %s", cfg[s]["description"])
    # Load the gzip-compressed training and test CSV files.
    train = pd.read_csv("data/train.csv.gz")
    test = pd.read_csv("data/test.csv.gz")
    # Keep the test IDs and the training labels as arrays; the label column
    # is removed from the training frame below.
    id_test = test.ID.values
    target = train.target.values
    # Drop the columns listed under "columns_to_drop" from both frames, and
    # additionally the "target" column from the training frame.
    drops = cfg[s]["columns_to_drop"]
    train = train.drop(drops, axis=1)
    train = train.drop(["target"], axis=1)
    test = test.drop(drops, axis=1)
    # Get the columns with numeric data
    # Credit : http://stackoverflow.com/a/28155580
    numeric_cols = list(
        train.select_dtypes(include=[np.number]).columns.values)
""" import time import numpy as np import pandas as pd from sklearn import cross_validation as cv from sklearn import metrics from sklearn.ensemble import ExtraTreesClassifier import utils from bnp_config import cfg, logger if __name__ == '__main__': s = "approach5" logger.info("Running script for BNP Approach 5, %s", cfg[s]["description"]) train = pd.read_csv("data/train.csv.gz") test = pd.read_csv("data/test.csv.gz") id_test = test.ID.values target = train.target.values drops = cfg[s]["columns_to_drop"] train = train.drop(drops, axis=1) train = train.drop(["target"], axis=1) test = test.drop(drops, axis=1) # Get the columns with numeric data # Credit : http://stackoverflow.com/a/28155580 numeric_cols = list( train.select_dtypes(include=[np.number]).columns.values)