def _search_pipeline(dataset, problem, template, input_dir, output_dir,
                     budget, checkpoints, splits, db, tuner_type):
    # Load the TRAIN split of the dataset and problem as a D3MDS object
    dataset_path, problem_path = _get_dataset_paths(input_dir, dataset, 'TRAIN', problem)
    d3mds = D3MDS(dataset_path, problem_path)

    # Run the pipeline search over the given template within the given budget
    searcher = PipelineSearcher(output_dir, cv_splits=splits, db=db, tuner_type=tuner_type)

    return searcher.search(d3mds, template, budget=budget, checkpoints=checkpoints)


def _test_pipeline(dataset, problem, pipeline_id, input_dir, output_dir):
    dataset_path, problem_path = _get_dataset_paths(input_dir, dataset, 'TEST', problem)

    # Load the fitted pipeline that was pickled into output_dir during the search phase
    pipeline_path = os.path.join(output_dir, '{}.pkl'.format(pipeline_id))
    with open(pipeline_path, 'rb') as pipeline_pkl:
        pipeline = cloudpickle.load(pipeline_pkl)

    print('Executing best pipeline {}'.format(pipeline))

    d3mds = D3MDS(dataset_path, problem_path)

    # Silence warnings raised by the underlying primitives while predicting
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        predictions = pipeline.predict(d3mds)

    return predictions


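# Hedged usage sketch (not part of the original module): how _search_pipeline and
# _test_pipeline might be chained in a single run. The argument values, the template
# identifier and the assumption that the search result can be indexed for the best
# pipeline id are illustrative only.
def _example_search_then_test():
    result = _search_pipeline(
        dataset='185_baseball',       # illustrative dataset name
        problem=None,                 # assume the default problem for the dataset
        template='default_template',  # illustrative template identifier
        input_dir='data',
        output_dir='output',
        budget=10,                    # assumed semantics: number of tuning iterations
        checkpoints=None,
        splits=5,
        db=None,
        tuner_type='gp',
    )

    # Assumption: the search result exposes the id of the best pipeline, which
    # _search_pipeline pickled into output_dir as '<pipeline_id>.pkl'.
    best_id = result['pipeline']

    return _test_pipeline('185_baseball', None, best_id, 'data', 'output')

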
def load_d3mds(dataset, root=DATA_PATH, force_download=False):
    """Load a dataset in D3MDS format, downloading and extracting its tarfile from S3 as necessary.

    If the root directory is not the default one, it is treated as read-only
    and no downloads or extractions are attempted.

    Args:
        dataset (str):
            dataset identifier.
        root (path-like, optional):
            root directory to store tarfiles and extracted datasets.
            Defaults to './data/'.
        force_download (boolean, optional):
            download the tarfile even if it already exists, also causing
            the files to be re-extracted. Defaults to False.

    Returns:
        mit_d3m.dataset.D3MDS
    """
    read_only = root != DATA_PATH
    if not read_only and not os.path.exists(root):
        os.makedirs(root)

    # Strip the extra suffix from the dataset identifier, if present
    if dataset.endswith(DATASET_EXTRA_SUFFIX):
        dataset = dataset[:-len(DATASET_EXTRA_SUFFIX)]

    dataset_dir = get_dataset_dir(root, dataset)
    dataset_tarfile = get_dataset_tarfile_path(root, dataset)
    dataset_key = get_dataset_s3_key(dataset)

    requires_download = force_download or not os.path.exists(dataset_tarfile)
    if not read_only and requires_download:
        download_dataset(BUCKET, dataset_key, dataset_tarfile)

    requires_extraction = (force_download
                           or not os.path.exists(dataset_dir)
                           or not contains_files(dataset_dir))
    if not read_only and requires_extraction:
        if os.path.exists(dataset_dir) and os.path.isdir(dataset_dir):
            # probably was an error in a previous extraction attempt
            shutil.rmtree(dataset_dir, ignore_errors=True)

        extract_dataset(dataset_tarfile, root)

    phase_root = os.path.join(dataset_dir, 'TRAIN')
    dataset_path = os.path.join(phase_root, 'dataset_TRAIN')
    problem_path = os.path.join(phase_root, 'problem_TRAIN')

    return D3MDS(dataset=dataset_path, problem=problem_path)


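# Hedged usage sketch (not in the original module): loading a dataset with the
# default cache root vs. a custom, read-only root. '185_baseball' and the custom
# path are illustrative assumptions.
def _example_load_d3mds():
    # Download and extract into the default DATA_PATH, then wrap it as a D3MDS object
    d3mds = load_d3mds('185_baseball')

    # With a non-default root no download or extraction is attempted; the data is
    # expected to already be in place under that directory
    d3mds_ro = load_d3mds('185_baseball', root='/opt/d3m/data')

    return d3mds, d3mds_ro

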
def get_d3mds(dataset, path, phase, problem):
    config = build_config(dataset, path, phase, problem)
    dataset_key = 'training' if phase == 'TRAIN' else 'test'
    d3mds = D3MDS(
        dataset=config[dataset_key + '_data_root'],
        problem=config['problem_root'],
    )
    return d3mds


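# Hedged usage sketch (not in the original module): building the TRAIN and TEST
# views of the same dataset through get_d3mds. The path and problem values are
# illustrative assumptions.
def _example_get_d3mds():
    train = get_d3mds('185_baseball', 'data/185_baseball', 'TRAIN', None)
    test = get_d3mds('185_baseball', 'data/185_baseball', 'TEST', None)
    return train, test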