Code Example #1
def _search_pipeline(dataset, problem, template, input_dir, output_dir, budget,
                     checkpoints, splits, db, tuner_type):

    # Resolve the TRAIN dataset and problem paths inside the input directory.
    dataset_path, problem_path = _get_dataset_paths(input_dir, dataset,
                                                    'TRAIN', problem)

    d3mds = D3MDS(dataset_path, problem_path)

    # Configure the searcher to write candidate pipelines into output_dir.
    searcher = PipelineSearcher(output_dir,
                                cv_splits=splits,
                                db=db,
                                tuner_type=tuner_type)

    return searcher.search(d3mds,
                           template,
                           budget=budget,
                           checkpoints=checkpoints)
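
For context, a hypothetical invocation might look like the following. All argument values here are illustrative assumptions, not AutoBazaar defaults:

best_pipeline = _search_pipeline(
    dataset='185_baseball',   # illustrative D3M dataset identifier
    problem=None,             # assumed: fall back to the dataset's default problem
    template=None,            # assumed: let the searcher pick a template
    input_dir='input',
    output_dir='output',
    budget=10,                # tuning iterations to spend
    checkpoints=None,
    splits=5,                 # cross-validation folds
    db=None,                  # optional results database (assumed)
    tuner_type='gp',          # assumed tuner name
)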
Code Example #2
File: __main__.py Project: bigdatamatta/AutoBazaar
def _test_pipeline(dataset, problem, pipeline_id, input_dir, output_dir):

    dataset_path, problem_path = _get_dataset_paths(input_dir, dataset, 'TEST',
                                                    problem)

    # Load the fitted pipeline that the search step serialized with cloudpickle.
    pipeline_path = os.path.join(output_dir, '{}.pkl'.format(pipeline_id))
    with open(pipeline_path, 'rb') as pipeline_pkl:
        pipeline = cloudpickle.load(pipeline_pkl)

    print('Executing best pipeline {}'.format(pipeline))

    d3mds = D3MDS(dataset_path, problem_path)

    # Suppress library warnings while generating predictions.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        predictions = pipeline.predict(d3mds)

    return predictions
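
A matching test-phase call could look like this. The pipeline id is a placeholder for whatever the search step produced, and the predictions are assumed (not confirmed) to come back as a pandas DataFrame:

pipeline_id = 'pipeline-uuid'          # placeholder identifier
predictions = _test_pipeline(
    dataset='185_baseball',            # illustrative dataset identifier
    problem=None,                      # assumed: default problem
    pipeline_id=pipeline_id,
    input_dir='input',
    output_dir='output',
)
print(predictions.head())              # assuming a pandas DataFrame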
Code Example #3
def load_d3mds(dataset, root=DATA_PATH, force_download=False):
    """Load dataset into D3MDS format, as necessary downloading tarfile from S3 and extracting

    If the root directory is

    Args:
        dataset (str): dataset identifier
        root (path-like, optional): root directory to store tarfiles and extracted datasets.
            Defaults to './data/'.
        force_download (boolean, optional): download the tarfile even if it already exists,
            also causing the files to be re-extracted. Defaults to False.

    Returns:
        mit_d3m.dataset.D3MDS
    """
    # A non-default root is treated as read-only: never download or extract.
    read_only = root != DATA_PATH

    if not read_only and not os.path.exists(root):
        os.makedirs(root)

    if dataset.endswith(DATASET_EXTRA_SUFFIX):
        # Strip the suffix from the end; note the negative slice index.
        dataset = dataset[:-len(DATASET_EXTRA_SUFFIX)]

    dataset_dir = get_dataset_dir(root, dataset)
    dataset_tarfile = get_dataset_tarfile_path(root, dataset)
    dataset_key = get_dataset_s3_key(dataset)

    requires_download = force_download or not os.path.exists(dataset_tarfile)
    if not read_only and requires_download:
        download_dataset(BUCKET, dataset_key, dataset_tarfile)

    requires_extraction = (force_download or not os.path.exists(dataset_dir)
                           or not contains_files(dataset_dir))
    if not read_only and requires_extraction:
        if os.path.exists(dataset_dir) and os.path.isdir(dataset_dir):
            # probably was an error in a previous extraction attempt
            shutil.rmtree(dataset_dir, ignore_errors=True)
        extract_dataset(dataset_tarfile, root)

    phase_root = os.path.join(dataset_dir, 'TRAIN')
    dataset_path = os.path.join(phase_root, 'dataset_TRAIN')
    problem_path = os.path.join(phase_root, 'problem_TRAIN')

    return D3MDS(dataset=dataset_path, problem=problem_path)
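
A minimal usage sketch, assuming the default './data/' root and the get_data / get_targets accessors on mit_d3m.dataset.D3MDS (names may differ by version):

d3mds = load_d3mds('196_autoMpg')      # illustrative dataset identifier
X = d3mds.get_data()                   # assumed D3MDS accessor
y = d3mds.get_targets()                # assumed D3MDS accessor
print(X.shape, y.shape)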
Code Example #4
File: stats.py Project: micahjsmith/mit-d3m
def get_d3mds(dataset, path, phase, problem):
    config = build_config(dataset, path, phase, problem)
    # The config holds separate data roots for the TRAIN and TEST phases.
    dataset_key = 'training' if phase == 'TRAIN' else 'test'
    d3mds = D3MDS(dataset=config[dataset_key + '_data_root'],
                  problem=config['problem_root'])
    return d3mds
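
An illustrative call for this stats helper; the path and problem values are assumptions about the on-disk layout that build_config expects:

d3mds = get_d3mds('196_autoMpg',            # illustrative dataset identifier
                  path='data/196_autoMpg',  # assumed extracted dataset path
                  phase='TRAIN',
                  problem=None)             # assumed: default problem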