import logging
import os

import numpy as np

from . import data, model  # project-local modules: dataset I/O and model registry (import path assumed)


def train(ctx, dataset_fpath, all_data, max_depth, model_fpath, name, test):
    """Train the classifier registered under `name` and save it to `model_fpath`."""
    if not os.path.isfile(dataset_fpath):
        logging.info('No dataset was provided, building with default settings')
        data.save_dataset(dataset_fpath)

    dataset = data.load_dataset(dataset_fpath, return_arrays=False)
    clf = model.REGISTRY[name](max_depth=max_depth)

    X_train, y_train = dataset['X_train'], dataset['y_train']
    X_test, y_test = dataset['X_test'], dataset['y_test']
    if all_data:
        # Fold the held-out split back in to train on every available sample.
        X_train = np.concatenate((X_train, X_test), axis=0)
        y_train = np.concatenate((y_train, y_test), axis=0)

    clf.fit(X_train, y_train)
    model.save_model(clf, model_fpath)

    acc = clf.score(X_train, y_train)
    logging.info("Accuracy on training set: {}".format(acc))
    if test:
        # Note: when all_data is set, the test split was part of training,
        # so this score is optimistic.
        acc = clf.score(X_test, y_test)
        logging.info("Accuracy on the test set: {}".format(acc))
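# train() resolves its estimator through model.REGISTRY[name](max_depth=...).
# A minimal sketch of the shape that registry could take (hypothetical: the
# actual `model` module is not shown in this section; the estimator names,
# classes, and joblib persistence below are assumptions, not the project's code):
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

REGISTRY = {
    # maps a CLI-friendly name to an estimator class accepting max_depth
    'tree': DecisionTreeClassifier,
    'forest': RandomForestClassifier,
}

def save_model(clf, model_fpath):
    # joblib serializes fitted estimators (and their numpy arrays) efficiently
    joblib.dump(clf, model_fpath)

def load_model(model_fpath):
    return joblib.load(model_fpath)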
def create_datasets(X, X_test, y, datasets=None, use_cache=True):
    """Generate datasets as needed with different sets of features
    and save them to disk.

    The datasets are created by combining a base feature set
    (combinations of the original variables) with extracted feature
    sets, with some additional variants.

    The nomenclature is as follows:

    Base datasets:
        - basic: the original columns, minus role1, role2, and role_code
        - tuples: all order-2 combinations of the original columns
        - triples: all order-3 combinations of the original columns
        - greedy[1,2,3]: three different datasets obtained by performing
          greedy feature selection with different seeds on the triples
          dataset
        - effects: experimental. Created to try out a suggestion by Gxav
          after the competition

    Feature sets and variants (denoted by the letters after the
    underscore in the base dataset name):
        - s: the base dataset has been sparsified using one-hot encoding
        - c: the rare features have been consolidated into one category
        - f: extracted features have been appended, with a different set
          for linear models than for tree-based models
        - b: Benjamin's extracted features
        - d: interactions for the extracted feature set have been added
        - l: the extracted features have been log-transformed
    """
    if datasets is None:  # avoid a mutable default argument
        datasets = []

    if use_cache:
        # Check if all files exist. If not, generate the missing ones.
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset, 'rb'):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                DATASETS.append(dataset.split('_')[0])
    else:
        DATASETS = ["basic", "tuples", "triples",
                    "greedy", "greedy2", "greedy3"]

    # Datasets that require external code to be generated
    for dataset, module in EXTERNAL_DATASETS.items():  # iteritems() is Python 2 only
        if not get_dataset(dataset):
            module.create_features()

    # Generate the missing datasets
    if DATASETS:
        bsfeats, bsfeats_test = get_dataset('bsfeats')

        basefeats, basefeats_test = create_features(X, X_test, 3)
        save_dataset("base_feats", basefeats, basefeats_test)

        lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0))
        save_dataset("lrfeats", lrfeats, lrfeats_test)

        feats, feats_test = pre_process(*create_features(X, X_test, 1))
        save_dataset("features", feats, feats_test)

        meta, meta_test = pre_process(*create_features(X, X_test, 2),
                                      normalize=False)
        save_dataset("metafeatures", meta, meta_test)

        X = X[:, SELECTED_COLUMNS]
        X_test = X_test[:, SELECTED_COLUMNS]
        save_dataset("basic", X, X_test)

        Xt = create_tuples(X)
        Xt_test = create_tuples(X_test)
        save_dataset("tuples", Xt, Xt_test)

        # Order-3 combinations per the docstring (assumes a create_triples
        # helper mirroring create_tuples; the original text reused
        # create_tuples here, which would make "triples" identical to "tuples").
        Xtr = create_triples(X)
        Xtr_test = create_triples(X_test)
        save_dataset("triples", Xtr, Xtr_test)

        Xe, Xe_test = create_effects(X, X_test, y)
        save_dataset("effects", Xe, Xe_test)

        feats_d, feats_d_test = pre_process(basefeats, basefeats_test,
                                            create_divs=True)
        bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test,
                                                create_divs=True)
        feats_l, feats_l_test = pre_process(basefeats, basefeats_test,
                                            log_transform=True)
        lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test,
                                                log_transform=True)
        bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test,
                                                log_transform=True)

        for ds in DATASETS:
            Xg, Xg_test = get_dataset(ds)
            save_dataset(ds + '_b', Xg, Xg_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_f', Xg, Xg_test, feats, feats_test)
            save_dataset(ds + '_fd', Xg, Xg_test, feats_d, feats_d_test)
            save_dataset(ds + '_bd', Xg, Xg_test, bsfeats_d, bsfeats_d_test)

            Xs, Xs_test = sparsify(Xg, Xg_test)
            save_dataset(ds + '_sf', Xs, Xs_test, lrfeats, lrfeats_test)
            save_dataset(ds + '_sfl', Xs, Xs_test, lrfeats_l, lrfeats_l_test)
            save_dataset(ds + '_sfd', Xs, Xs_test, feats_d, feats_d_test)
            save_dataset(ds + '_sb', Xs, Xs_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_sbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test)
            save_dataset(ds + '_sbd', Xs, Xs_test, bsfeats_d, bsfeats_d_test)

            if issubclass(Xg.dtype.type, np.integer):
                consolidate(Xg, Xg_test)
                save_dataset(ds + '_c', Xg, Xg_test)
                save_dataset(ds + '_cf', Xg, Xg_test, feats, feats_test)
                save_dataset(ds + '_cb', Xg, Xg_test, bsfeats, bsfeats_test)

                Xs, Xs_test = sparsify(Xg, Xg_test)
                save_dataset(ds + '_sc', Xs, Xs_test)
                save_dataset(ds + '_scf', Xs, Xs_test, feats, feats_test)
                save_dataset(ds + '_scfl', Xs, Xs_test, feats_l, feats_l_test)
                save_dataset(ds + '_scb', Xs, Xs_test, bsfeats, bsfeats_test)
                save_dataset(ds + '_scbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test)
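# The suffix letters compose, so a cached file name encodes its full recipe:
# "greedy2_scbl", for instance, is the second greedy-selection dataset,
# sparsified, consolidated, with Benjamin's features appended after a log
# transform. A small decoder of this nomenclature (illustrative helper only,
# not part of the project code):
VARIANT_LETTERS = {
    's': "sparsified (one-hot encoded)",
    'c': "rare features consolidated into one category",
    'f': "extracted features appended",
    'b': "Benjamin's extracted features appended",
    'd': "interactions for the extracted feature set added",
    'l': "extracted features log-transformed",
}

def describe_dataset(name):
    """Split a name like 'greedy2_scbl' into its base dataset and variants."""
    base, _, suffix = name.partition('_')
    return base, [VARIANT_LETTERS[letter] for letter in suffix]

# Example:
#   describe_dataset("greedy2_scbl")
#   -> ('greedy2', ['sparsified (one-hot encoded)',
#                   'rare features consolidated into one category',
#                   "Benjamin's extracted features appended",
#                   'extracted features log-transformed'])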
def download_data(ctx, fpath):
    """Build the dataset with default settings and save it to `fpath`."""
    data.save_dataset(fpath)
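# Both train() and download_data() rely on the project's `data` module for
# dataset I/O (distinct from the save_dataset helper used by create_datasets
# above, which writes feature caches). A hypothetical sketch of the contract
# implied by the calls: only the split keys and the return_arrays flag are
# taken from the calling code; everything else (pickle format, the stand-in
# data source, the 80/20 split) is an assumption:
import pickle

from sklearn.datasets import load_iris  # stand-in data source for the sketch
from sklearn.model_selection import train_test_split

def save_dataset(fpath):
    # Build a dataset with default settings and persist it as a dict of splits.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    dataset = {'X_train': X_train, 'y_train': y_train,
               'X_test': X_test, 'y_test': y_test}
    with open(fpath, 'wb') as f:
        pickle.dump(dataset, f)

def load_dataset(fpath, return_arrays=True):
    with open(fpath, 'rb') as f:
        dataset = pickle.load(f)
    if return_arrays:
        # Unpack the dict into arrays; train() passes return_arrays=False
        # to receive the dict itself.
        return (dataset['X_train'], dataset['y_train'],
                dataset['X_test'], dataset['y_test'])
    return dataset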