Example 1
import logging
import os

import numpy as np

# `data` and `model` are assumed to be project-local modules providing
# dataset I/O helpers and a registry of model constructors.


def train(ctx, dataset_fpath, all_data, max_depth, model_fpath, name, test):

    if not os.path.isfile(dataset_fpath):
        logging.info('No dataset was provided, building with default settings')
        data.save_dataset(dataset_fpath)

    dataset = data.load_dataset(dataset_fpath, return_arrays=False)
    clf = model.REGISTRY[name](max_depth=max_depth)

    X_train, y_train = dataset['X_train'], dataset['y_train']
    X_test, y_test = dataset['X_test'], dataset['y_test']
    # Optionally fold the test split into the training data, e.g. for a
    # final fit on all available labels.
    if all_data:
        X_train = np.concatenate((X_train, X_test), axis=0)
        y_train = np.concatenate((y_train, y_test), axis=0)

    clf.fit(X_train, y_train)

    model.save_model(clf, model_fpath)

    acc = clf.score(X_train, y_train)
    logging.info("Accuracy on training set: {}".format(acc))

    if test:
        # Note: if all_data was set, the test split was seen during training
        # and this score is not a held-out estimate.
        acc = clf.score(X_test, y_test)
        logging.info("Accuracy on the test set: {}".format(acc))
Example 2

# Relies on module-level helpers (create_features, pre_process, save_dataset,
# get_dataset, sparsify, create_tuples, consolidate) and constants
# (EXTERNAL_DATASETS, SELECTED_COLUMNS) from the surrounding project.

def create_datasets(X, X_test, y, datasets=None, use_cache=True):
    """
    Generate datasets with different feature sets as needed and save them
    to disk. Each dataset combines a base feature set (combinations of the
    original variables) with extracted feature sets, plus some additional
    variants.

    The nomenclature is as follows:
    Base datasets:
        - basic: the original columns, minus role1, role2, and role_code
        - tuples: all order 2 combinations of the original columns
        - triples: all order 3 combinations of the original columns
        - greedy[1,2,3]: three different datasets obtained by performing
            greedy feature selection with different seeds on the triples
            dataset
        - effects: experimental. Created to try out a suggestion by Gxav
            after the competition

    Feature sets and variants (denoted by the letters after the underscore
    in the base dataset name):
        - s: the base dataset has been sparsified using One-Hot encoding
        - c: the rare features have been consolidated into one category
        - f: extracted features have been appended, with a different set for
            linear models than for tree-based models
        - b: Benjamin's extracted features
        - d: interactions for the extracted feature set have been added
        - l: the extracted features have been log transformed

    For example, "greedy2_scbl" is the greedy2 base dataset with rare
    categories consolidated, one-hot sparsified, and Benjamin's extracted
    features appended after a log transform.
    """
    datasets = datasets or []  # tolerate None for the datasets argument
    if use_cache:
        # Check if all files exist. If not, generate the missing ones
        DATASETS = []
        for dataset in datasets:
            try:
                with open("cache/%s.pkl" % dataset, 'rb'):
                    pass
            except IOError:
                logger.warning("couldn't load dataset %s, will generate it",
                               dataset)
                # Queue the base dataset (the part before the underscore)
                # for regeneration.
                DATASETS.append(dataset.split('_')[0])
    else:
        DATASETS = ["basic", "tuples", "triples",
                    "greedy", "greedy2", "greedy3"]

    # Datasets that require external code to be generated
    for dataset, module in EXTERNAL_DATASETS.items():
        if not get_dataset(dataset):
            module.create_features()

    # Generate the missing datasets
    if DATASETS:
        bsfeats, bsfeats_test = get_dataset('bsfeats')

        basefeats, basefeats_test = create_features(X, X_test, 3)
        save_dataset("base_feats", basefeats, basefeats_test)

        lrfeats, lrfeats_test = pre_process(*create_features(X, X_test, 0))
        save_dataset("lrfeats", lrfeats, lrfeats_test)

        feats, feats_test = pre_process(*create_features(X, X_test, 1))
        save_dataset("features", feats, feats_test)

        meta, meta_test = pre_process(*create_features(X, X_test, 2),
                                      normalize=False)
        save_dataset("metafeatures", meta, meta_test)

        X = X[:, SELECTED_COLUMNS]
        X_test = X_test[:, SELECTED_COLUMNS]
        save_dataset("basic", X, X_test)

        Xt = create_tuples(X)
        Xt_test = create_tuples(X_test)
        save_dataset("tuples", Xt, Xt_test)

        # Order-3 combinations; create_triples is assumed to mirror
        # create_tuples for the "triples" dataset described in the docstring.
        Xtr = create_triples(X)
        Xtr_test = create_triples(X_test)
        save_dataset("triples", Xtr, Xtr_test)

        Xe, Xe_test = create_effects(X, X_test, y)
        save_dataset("effects", Xe, Xe_test)

        feats_d, feats_d_test = pre_process(basefeats, basefeats_test,
                                            create_divs=True)
        bsfeats_d, bsfeats_d_test = pre_process(bsfeats, bsfeats_test,
                                                create_divs=True)
        feats_l, feats_l_test = pre_process(basefeats, basefeats_test,
                                            log_transform=True)
        lrfeats_l, lrfeats_l_test = pre_process(lrfeats, lrfeats_test,
                                                log_transform=True)
        bsfeats_l, bsfeats_l_test = pre_process(bsfeats, bsfeats_test,
                                                log_transform=True)

        # Combine each base dataset with the extracted feature sets; the
        # suffix letters follow the docstring nomenclature (s = sparsified,
        # c = consolidated, f/b = extracted features, d = interactions,
        # l = log-transformed).
        for ds in DATASETS:
            Xg, Xg_test = get_dataset(ds)
            save_dataset(ds + '_b', Xg, Xg_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_f', Xg, Xg_test, feats, feats_test)
            save_dataset(ds + '_fd', Xg, Xg_test, feats_d, feats_d_test)
            save_dataset(ds + '_bd', Xg, Xg_test, bsfeats_d, bsfeats_d_test)
            Xs, Xs_test = sparsify(Xg, Xg_test)
            save_dataset(ds + '_sf', Xs, Xs_test, lrfeats, lrfeats_test)
            save_dataset(ds + '_sfl', Xs, Xs_test, lrfeats_l, lrfeats_l_test)
            save_dataset(ds + '_sfd', Xs, Xs_test, feats_d, feats_d_test)
            save_dataset(ds + '_sb', Xs, Xs_test, bsfeats, bsfeats_test)
            save_dataset(ds + '_sbl', Xs, Xs_test, bsfeats_l, bsfeats_l_test)
            save_dataset(ds + '_sbd', Xs, Xs_test, bsfeats_d, bsfeats_d_test)

            if issubclass(Xg.dtype.type, np.integer):
                consolidate(Xg, Xg_test)
                save_dataset(ds + '_c', Xg, Xg_test)
                save_dataset(ds + '_cf', Xg, Xg_test, feats, feats_test)
                save_dataset(ds + '_cb', Xg, Xg_test, bsfeats, bsfeats_test)
                Xs, Xs_test = sparsify(Xg, Xg_test)
                save_dataset(ds + '_sc', Xs, Xs_test)
                save_dataset(ds + '_scf', Xs, Xs_test, feats, feats_test)
                save_dataset(ds + '_scfl', Xs, Xs_test, feats_l, feats_l_test)
                save_dataset(ds + '_scb', Xs, Xs_test, bsfeats, bsfeats_test)
                save_dataset(ds + '_scbl', Xs, Xs_test,
                             bsfeats_l, bsfeats_l_test)
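
A hedged usage sketch for `create_datasets`; the variant names follow the docstring nomenclature, and `X`, `X_test`, `y` are assumed to be NumPy arrays prepared upstream:

# With use_cache=True, only the variants whose cache/<name>.pkl files are
# missing get rebuilt; the base name before the underscore selects which
# base dataset to regenerate.
create_datasets(X, X_test, y,
                datasets=['basic_f', 'tuples_sf', 'greedy2_scbl'],
                use_cache=True)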
Example 3
def download_data(ctx, fpath):
    # Build the dataset with default settings and write it to fpath.
    data.save_dataset(fpath)
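
Read together with Example 1, this looks like the bootstrap step. A sketch of the intended flow when calling the functions directly (the paths and registry key are illustrative; Click would normally supply `ctx`, so `None` stands in for it here):

# Build the default dataset once, then train and evaluate against it.
download_data(None, 'data/dataset.pkl')
train(None, 'data/dataset.pkl', all_data=False, max_depth=5,
      model_fpath='models/model.pkl', name='decision_tree', test=True)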