Beispiel #1
0
def get_dataset(args):
    """Build and return the Criteo dataset selected by ``args.dataset``.

    Resolves the dataset name ("kaggle" or "terabyte") through a local
    registry to a dataset class, a pre-processing function and extra
    constructor kwargs, then instantiates the dataset with the sampling /
    aggregation options carried on ``args``.

    Raises:
        KeyError: if ``args.dataset`` is not a registered dataset name.
    """
    # Local import: criteo is only needed once a dataset is actually built.
    import criteo

    # name -> (dataset class, pre-process fn, post-process object, extra ctor kwargs)
    registry = {
        "kaggle": (
            criteo.Criteo,
            criteo.pre_process_criteo_dlrm,
            criteo.DlrmPostProcess(),
            {"randomize": 'total', "memory_map": True},
        ),
        "terabyte": (
            criteo.Criteo,
            criteo.pre_process_criteo_dlrm,
            criteo.DlrmPostProcess(),
            {"randomize": 'total', "memory_map": True},
        ),
    }

    dataset_cls, pre_proc, _post_proc, extra_kwargs = registry[args.dataset]

    # --count-samples can be used to limit the number of samples used for testing
    return dataset_cls(
        data_path=args.dataset_path,
        name=args.dataset,
        pre_process=pre_proc,  # currently an identity function
        use_cache=args.cache,  # currently not used
        count=args.count_samples,
        samples_to_aggregate_fix=args.samples_to_aggregate_fix,
        samples_to_aggregate_min=args.samples_to_aggregate_min,
        samples_to_aggregate_max=args.samples_to_aggregate_max,
        samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file,
        samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file,
        test_num_workers=0,
        max_ind_range=args.max_ind_range,
        sub_sample_rate=args.data_sub_sample_rate,
        mlperf_bin_loader=args.mlperf_bin_loader,
        **extra_kwargs,
    )
Beispiel #2
0
except KeyError:
    # Presumably the (off-screen) try block above imports the dlrm reference
    # code from $DLRM_DIR; without the env var that lookup raises KeyError.
    # NOTE(review): exits with status 0 even though this is an error path —
    # confirm whether callers depend on that before changing it to non-zero.
    print("ERROR: Please set DLRM_DIR environment variable to the dlrm code location")
    sys.exit(0)

# Root logger at INFO; "main" identifies this harness in log output.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("main")

# Unit-conversion factors: nanoseconds per second, milliseconds per second.
NANO_SEC = 1e9
MILLI_SEC = 1000

# pylint: disable=missing-docstring

# the datasets we support
# name -> (dataset class, pre-process fn, post-process object, extra ctor kwargs)
SUPPORTED_DATASETS = {
    "kaggle":
        (criteo.Criteo, criteo.pre_process_criteo_dlrm, criteo.DlrmPostProcess(),
         {"randomize": 'total',  "memory_map": True}),
    "terabyte":
        (criteo.Criteo, criteo.pre_process_criteo_dlrm, criteo.DlrmPostProcess(),
         {"randomize": 'total',  "memory_map": True}),
}

# pre-defined command line options to simplify things. They are used as defaults and can be
# overwritten from the command line

SUPPORTED_PROFILES = {
    "defaults": {
        "dataset": "terabyte",
        "inputs": "continuous and categorical features",
        "outputs": "probability",
        "backend": "pytorch-native",