def norm_tsne2(x, y):
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_TSNE2):
        return None
    y = y.reset_index(drop=True)
    norm_df = _normalize(x)
    tsne_df = _tsne(norm_df, 2)
    joined_df = pd.concat((norm_df, tsne_df, y), axis=1)
    assert norm_df.shape[0] == tsne_df.shape[0] == joined_df.shape[0]
    data_loading.save_data(joined_df, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_TSNE2)
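

# A minimal sketch of the `_normalize` and `_tsne` helpers assumed by
# `norm_tsne2` above. They are not shown in this module; this version uses
# scikit-learn's StandardScaler and TSNE, and the column labels are
# hypothetical, not necessarily the project's actual implementation.
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


def _normalize(x):
    # Scale every feature to zero mean and unit variance, keeping the
    # original column names and a fresh integer index (so the later
    # pd.concat aligns row-wise).
    scaled = StandardScaler().fit_transform(x)
    return pd.DataFrame(scaled, columns=x.columns).reset_index(drop=True)


def _tsne(norm_df, n_components):
    # Embed the normalized features into `n_components` dimensions.
    embedding = TSNE(n_components=n_components).fit_transform(norm_df)
    columns = ['tsne{}'.format(i) for i in range(n_components)]
    return pd.DataFrame(embedding, columns=columns)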


def norm_pca3(x, y):
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_PCA3):
        return None
    y = y.reset_index(drop=True)
    norm_df = _normalize(x)
    pca_df = _pca(norm_df, 3)
    joined_df = pd.concat((norm_df, pca_df, y), axis=1)
    assert norm_df.shape[0] == pca_df.shape[0] == joined_df.shape[0]
    data_loading.save_data(joined_df, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_PCA3)
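

# `norm_pca3` follows the same pattern with a `_pca` helper. A minimal
# sketch under the same assumptions, using scikit-learn's PCA; the column
# labels are again hypothetical.
import pandas as pd
from sklearn.decomposition import PCA


def _pca(norm_df, n_components):
    # Project the normalized features onto the first `n_components`
    # principal components.
    components = PCA(n_components=n_components).fit_transform(norm_df)
    columns = ['pca{}'.format(i) for i in range(n_components)]
    return pd.DataFrame(components, columns=columns)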


def main():
    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    constants.flush_project_results(constants.TMP_PATH, constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run and enforce
    # reproducibility of the results.

    # Load, save and split data.
    dataframe = data_loading.load_data(constants.DATA_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH)
    x_train, x_test, y_train, y_test = data_loading.train_test_split(
        dataframe)
    # Rationale: *Loading*: load data in the main module and pass it as a
    # first argument to every other defined function (that relates to the
    # data set), thus saving precious time with data loading. *Saving*: for
    # big data sets, saving the dataset in a fast-read format (such as HDF5)
    # saves time.

    # Load and combine data processing pipelines.
    # TODO:
    data_processing_pipelines = None

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Perform grid search.
    persistent_grid_object = sku.grid_search.PersistentGrid.load_from_path(
        persistent_grid_path=constants.PERSITENT_GRID_PATH,
        dataset_path=constants.DATA_PATH)
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(dataframe, constants.MODELS, data_processing_pipelines,
                     constants.GRIDS, persistent_grid_object)
    best_grids = grid_search.get_best_grids(  # noqa
        constants.MODELS, data_processing_pipelines, persistent_grid_object)
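

# A sketch of what `du.set_random_seed` could look like; the `du` utility
# module is not shown here, so this is an assumption: seeding Python's and
# NumPy's global random number generators so that every stochastic step in
# the project (splits, model initialization, t-SNE) is reproducible.
import random

import numpy as np


def set_random_seed(seed):
    # Seed both the standard library and NumPy generators from the same
    # value.
    random.seed(seed)
    np.random.seed(seed)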


def main():
    # Filter warnings that pollute the project stdout.
    filter_warnings()
    # Rationale: produce cleaner results.

    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    # constants.flush_project_results(constants.TMP_PATH,
    #                                 constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run and enforce
    # reproducibility of the results.

    # Download, load and save data.
    data_loading.main()
    dataframe = data_loading.load_data(constants.DATASET_PATH,
                                       constants.TMP_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH,
                           constants.DATASET_PATH)
    # Rationale: *Loading*: load data in the main module and pass it as a
    # first argument to every other defined function (that relates to the
    # data set), thus saving precious time with data loading. *Saving*: for
    # big data sets, saving the dataset in a fast-read format (such as HDF5)
    # saves time.

    # Load and combine data processing pipelines.
    data_processing.main(dataframe, nan_strategy='drop')
    # Rationale: prepare data to be fed into the models.
    # Different algorithms make use of different data structures. For
    # instance, XGBoost allows for NaNs; data transformations usually don't.

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Data split.
    # Removed.
    # Rationale: module 'models' should execute this.

    # Perform grid search.
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(constants.MODELS, constants.GRIDS)
    best_combination_of_datasets_and_grids = (
        grid_search.dict_of_best_datasets_and_grids(constants.MODELS,
                                                    constants.GRIDS))
    best_datasets = best_combination_of_datasets_and_grids['best_datasets']
    best_grids = best_combination_of_datasets_and_grids['best_grids']
    # Rationale: perform grid search as part of machine learning best
    # practices.

    # Summary of what was executed so far:
    # 1) Setting of the random seed for reproducibility.
    # 2) Flushing of intermediate results for a clean run.
    # 3) Data loading and data saving.
    # 4) Conduction of exploratory data analyses.
    # 5) Grid search of the best model hyperparameters.
    # To conclude our project we need the grand finale: model selection and
    # evaluation/comparison.
    models.main(constants.MODELS, best_datasets, best_grids,
                constants.MODEL_FITTING_PARAMETERS)
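

# `filter_warnings` is called above but not defined in this module. A
# minimal sketch using the standard library's warnings module; the exact
# warning categories silenced by the project are an assumption.
import warnings


def filter_warnings():
    # Silence noisy-but-usually-harmless warning categories so the
    # project's stdout stays readable.
    warnings.filterwarnings('ignore', category=DeprecationWarning)
    warnings.filterwarnings('ignore', category=FutureWarning)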


def no_transform(dataframe):
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_VANILLA):
        return None
    data_loading.save_data(dataframe, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_VANILLA)
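

# Every transformation above relies on `data_loading.dataframe_already_exists`
# to make the processing pipeline idempotent: a step is skipped when its
# output is already on disk, which is what makes repeated runs cheap. A
# minimal sketch, assuming one output file per dataset name under the given
# directory; the file-naming scheme is hypothetical.
import os


def dataframe_already_exists(directory, dataset_name):
    # Return True when a previous run already wrote this processed dataset.
    return os.path.isfile(os.path.join(directory, dataset_name))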