def get_dataset(args):
    # pick the dataset to use from the module-level SUPPORTED_DATASETS table
    wanted_dataset, pre_proc, _, kwargs = SUPPORTED_DATASETS[args.dataset]

    # --count-samples can be used to limit the number of samples used for testing
    ds = wanted_dataset(
        data_path=args.dataset_path,
        name=args.dataset,
        pre_process=pre_proc,  # currently an identity function
        use_cache=args.cache,  # currently not used
        count=args.count_samples,
        samples_to_aggregate_fix=args.samples_to_aggregate_fix,
        samples_to_aggregate_min=args.samples_to_aggregate_min,
        samples_to_aggregate_max=args.samples_to_aggregate_max,
        samples_to_aggregate_quantile_file=args.samples_to_aggregate_quantile_file,
        samples_to_aggregate_trace_file=args.samples_to_aggregate_trace_file,
        test_num_workers=0,
        max_ind_range=args.max_ind_range,
        sub_sample_rate=args.data_sub_sample_rate,
        mlperf_bin_loader=args.mlperf_bin_loader,
        **kwargs)
    return ds
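
# Illustrative sketch (not part of the benchmark): get_dataset() reads its options
# from an argparse-style namespace built elsewhere in this tool. The helper below
# shows the attributes it expects; every value here is an assumption chosen for
# demonstration only, not a benchmark default.
def _example_get_dataset_usage():
    from types import SimpleNamespace
    args = SimpleNamespace(
        dataset="terabyte",
        dataset_path="/path/to/criteo",  # hypothetical data location
        cache=0,
        count_samples=None,
        samples_to_aggregate_fix=None,
        samples_to_aggregate_min=None,
        samples_to_aggregate_max=None,
        samples_to_aggregate_quantile_file=None,
        samples_to_aggregate_trace_file=None,
        max_ind_range=-1,
        data_sub_sample_rate=0.0,
        mlperf_bin_loader=False,
    )
    return get_dataset(args)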
import logging
import os
import sys

# the dlrm code location must be on the python path because criteo.py builds on
# the reference DLRM implementation
try:
    dlrm_dir_path = os.environ["DLRM_DIR"]
    sys.path.append(dlrm_dir_path)
except KeyError:
    print("ERROR: Please set DLRM_DIR environment variable to the dlrm code location")
    sys.exit(0)

# import criteo only after the dlrm code is on the path
import criteo

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("main")

NANO_SEC = 1e9
MILLI_SEC = 1000

# pylint: disable=missing-docstring

# the datasets we support
SUPPORTED_DATASETS = {
    "kaggle": (criteo.Criteo, criteo.pre_process_criteo_dlrm, criteo.DlrmPostProcess(),
               {"randomize": "total", "memory_map": True}),
    "terabyte": (criteo.Criteo, criteo.pre_process_criteo_dlrm, criteo.DlrmPostProcess(),
                 {"randomize": "total", "memory_map": True}),
}

# pre-defined command line options to simplify things. They are used as defaults and can be
# overwritten from the command line
SUPPORTED_PROFILES = {
    "defaults": {
        "dataset": "terabyte",
        "inputs": "continuous and categorical features",
        "outputs": "probability",
        "backend": "pytorch-native",