def search(dataset_root, problem, args):
    """Build a ``PipelineSearcher`` from ``args`` and run a search on ``problem``.

    The searcher is created from ``args.input``, ``args.output`` and
    ``args.static`` with dumping enabled and ``args.hard`` as the hard
    timeout. The search itself receives ``args.timeout``, ``args.budget``
    and ``args.template``.

    ``dataset_root`` is accepted as part of the signature but is not read
    inside this function.

    Returns:
        Whatever ``PipelineSearcher.search`` returns.
    """
    searcher = PipelineSearcher(
        args.input, args.output, args.static, dump=True, hard_timeout=args.hard
    )
    outcome = searcher.search(problem, args.timeout, args.budget, args.template)
    return outcome
Exemple #2
0
def search(dataset_root, problem, args):
    """Run a pipeline search on ``problem`` using the paths given in ``args``.

    A ``PipelineSearcher`` is created from ``args.input`` and ``args.output``
    with dumping enabled, and the search is limited by ``args.timeout`` and
    ``args.budget`` (both passed by keyword).

    ``dataset_root`` is accepted as part of the signature but is not read
    inside this function.

    Returns:
        Whatever ``PipelineSearcher.search`` returns.
    """
    searcher = PipelineSearcher(args.input, args.output, dump=True)
    outcome = searcher.search(
        problem,
        timeout=args.timeout,
        budget=args.budget,
    )
    return outcome
Exemple #3
0
def process_dataset(dataset_name, dataset, problem, args):
    """Search pipelines for one dataset and score the selected candidates.

    The function creates ``<args.output>/<dataset_name>``, runs a
    ``PipelineSearcher`` search there, and then scores every candidate
    returned by ``_select_candidates`` with ``score_pipeline``. The
    candidate with the highest test score is reported in the result.

    Failures never propagate to the caller: they are logged with their
    traceback and recorded in the ``'error'`` entry of the returned dict.

    Args:
        dataset_name: Name used for the output sub-directory, for the
            printed and logged messages, and for the ``'dataset'`` entry.
        dataset: Dataset object handed to the searcher and to
            ``score_pipeline``.
        problem: Problem object handed to the searcher and to
            ``score_pipeline``.
        args: Options object. The attributes read here are ``output``,
            ``input``, ``static``, ``hard``, ``ignore_errors``, ``folds``,
            ``subprocess_timeout``, ``max_errors``, ``timeout``, ``budget``
            and ``templates_csv``.

    Returns:
        dict: If the search raised, only ``'dataset'`` and ``'error'``.
        Otherwise the search result (minus its ``'summary'`` entry) plus
        ``'elapsed'`` and ``'dataset'``; when candidates were found also
        ``'test_score'``, ``'template'`` and ``'cv_score'`` of the best
        one; and ``'error'`` if the testing stage itself failed.
    """
    box_print("Processing dataset {}".format(dataset_name), True)

    output_path = os.path.join(args.output, dataset_name)
    os.makedirs(output_path, exist_ok=True)

    LOGGER.info("Searching Pipeline for dataset %s", dataset_name)
    try:
        start_ts = datetime.utcnow()
        pps = PipelineSearcher(args.input,
                               output_path,
                               args.static,
                               dump=True,
                               hard_timeout=args.hard,
                               ignore_errors=args.ignore_errors,
                               cv_folds=args.folds,
                               subprocess_timeout=args.subprocess_timeout,
                               max_errors=args.max_errors,
                               store_summary=True)
        result = pps.search(dataset, problem, args.timeout, args.budget,
                            args.templates_csv)

        result['elapsed'] = datetime.utcnow() - start_ts
        result['dataset'] = dataset_name

    except Exception as ex:
        # Keep the best-effort contract (return an error entry instead of
        # raising) but log the traceback so the failure can be diagnosed.
        LOGGER.exception("Error searching pipelines for dataset %s",
                         dataset_name)
        result = {
            'dataset': dataset_name,
            'error': '{}: {}'.format(type(ex).__name__, ex),
        }
    else:
        try:
            summary = result.pop('summary')
            candidates = _select_candidates(summary)
            if candidates.empty:
                box_print('No valid pipelines found for dataset {}'.format(
                    dataset_name))
            else:
                ranked_path = os.path.join(output_path, 'pipelines_ranked')
                test_scores = []
                for index, candidate in candidates.iterrows():
                    try:
                        pipeline = candidate.pipeline
                        pipeline_path = os.path.join(ranked_path, pipeline)
                        test_score = score_pipeline(dataset, problem,
                                                    pipeline_path, args.static,
                                                    output_path)
                        test_scores.append(test_score)
                    except Exception:
                        # One failing candidate must not abort the others:
                        # record a missing score, but leave a trace in the
                        # log instead of swallowing the error silently.
                        LOGGER.exception(
                            "Error scoring candidate %s for dataset %s",
                            index, dataset_name)
                        test_scores.append(None)

                candidates['test_score'] = test_scores
                candidates = candidates.sort_values('test_score',
                                                    ascending=False)

                # After the descending sort the first row is the best one.
                best = candidates.iloc[0]
                result['test_score'] = best.test_score
                result['template'] = best.template
                result['cv_score'] = best.score
                box_print('Best pipelines for dataset {}:\n{}'.format(
                    dataset_name, candidates.to_string()))

        except Exception as ex:
            LOGGER.exception('Error while testing the winner pipeline')
            result['error'] = 'TEST Error: {}'.format(ex)

    return result