import numpy as np

# Seed the global NumPy RNG so feature selection / imputation is reproducible.
# NOTE(review): the source literally read "rnp.seed(3056)" with no numpy import
# in sight — presumably a garbled "np.random.seed(3056)". Confirm the intended
# RNG before relying on exact reproducibility.
np.random.seed(3056)

########################################

from io_helper import IOHelper
from data_helper import DataHelper
from config_helper import ConfigHelper
from metrics_helper import MetricsHelper

if __name__ == "__main__":
    # --- Training-set preprocessing pipeline ---
    train_data = IOHelper.read_dataset("train")
    train_X, train_y = DataHelper.extract_feature_labels(train_data)
    predef = ConfigHelper.use_predefined_cols

    # Column-level cleaning. These helpers appear to mutate train_X in place
    # (their return values are discarded) — TODO confirm against DataHelper.
    DataHelper.add_nan_indication_cols(train_X)
    DataHelper.remove_high_nan_rate_cols(train_X, predef)
    DataHelper.remove_small_variance_cols(train_X, predef)

    # Row-level cleaning: dropping rows changes the label vector too,
    # so the pruned labels are taken from the return value.
    train_y = DataHelper.remove_high_nan_rate_rows(train_X, train_y)

    # Imputation / encoding / scaling, fitted on the training split.
    DataHelper.fill_missing_data(train_X, is_train=True)
    train_X = DataHelper.split_categorical_cols(train_X, is_train=True)
    DataHelper.scale_continuous_cols(train_X, is_train=True)

    # Feature selection down to at most ConfigHelper.max_nb_features columns.
    # Second argument (test features) is None here: selection is fitted on
    # train only at this point.
    train_X = DataHelper.select_best_features(train_X, None, train_y,
                                              ConfigHelper.max_nb_features,
                                              is_train=True)

    test_X = IOHelper.read_dataset("test")
import time

from io_helper import IOHelper
from data_helper import DataHelper
from config_helper import ConfigHelper
from metrics_helper import MetricsHelper

if __name__ == "__main__":
    # --- Shared preprocessing before cross-validation ---
    data = IOHelper.read_dataset("train")
    feats, labels = DataHelper.extract_feature_labels(data)
    predef = ConfigHelper.use_predefined_cols

    # Column-level cleaning; helpers appear to mutate feats in place
    # (return values discarded) — TODO confirm against DataHelper.
    DataHelper.add_nan_indication_cols(feats)
    DataHelper.remove_high_nan_rate_cols(feats, predef)
    DataHelper.remove_small_variance_cols(feats, predef)

    # Repeated k-fold cross-validation: nb_executions outer repetitions,
    # each over the folds yielded by ConfigHelper.k_fold_cv(labels).
    # NOTE: original used Python-2-only `print` statements and `xrange`;
    # rewritten as `print(...)` / `range` — identical output, valid on
    # both Python 2 and 3.
    for e in range(ConfigHelper.nb_executions):
        print("Execution: " + str(e + 1))
        MetricsHelper.reset_metrics()

        for f, (train_idxs, val_idxs) in enumerate(ConfigHelper.k_fold_cv(labels)):
            start_time = time.time()  # wall-clock start of this fold
            print("Fold: " + str(f + 1))

            # Scaler must be re-fitted per fold to avoid leaking
            # validation statistics into training.
            DataHelper.reset_scaler()
            # NOTE(review): source chunk appears truncated here — the
            # per-fold body continues beyond the visible text.