Example #1
def do_search(core,
              problem,
              dataset_path,
              time_bound=30.0,
              pipelines_limit=0,
              pipeline_template=None):
    version = pb_core.DESCRIPTOR.GetOptions().Extensions[
        pb_core.protocol_version]

    search = core.SearchSolutions(
        pb_core.SearchSolutionsRequest(
            user_agent='ta3_stub',
            version=version,
            time_bound_search=time_bound,
            rank_solutions_limit=pipelines_limit,
            allowed_value_types=['CSV_URI'],
            problem=encode_problem_description(problem),
            template=pipeline_template,
            inputs=[pb_value.Value(dataset_uri='file://%s' % dataset_path, )],
        ))

    start_time = datetime.datetime.now()
    results = core.GetSearchSolutionsResults(
        pb_core.GetSearchSolutionsResultsRequest(search_id=search.search_id, ))
    solutions = {}
    for result in results:
        if result.solution_id:
            end_time = datetime.datetime.now()
            solutions[result.solution_id] = (result.internal_score,
                                             result.scores,
                                             str(end_time - start_time))

    return str(search.search_id), solutions
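
A minimal sketch of how this helper might be driven from a TA3 client. The import paths and the port are assumptions (they differ between builds of the TA3-TA2 API stubs), and `problem` stands for the parsed D3M problem description expected by encode_problem_description().

import grpc
from ta3ta2_api import core_pb2 as pb_core            # assumed import paths;
from ta3ta2_api import core_pb2_grpc as pb_core_grpc  # adjust to your build
from ta3ta2_api import value_pb2 as pb_value          # of the TA3-TA2 stubs

# 45042 is a commonly used TA2 port; point this at the server you run.
channel = grpc.insecure_channel('localhost:45042')
core = pb_core_grpc.CoreStub(channel)

search_id, solutions = do_search(core, problem, '/data/datasetDoc.json',
                                 time_bound=10.0)
for solution_id, (internal_score, scores, elapsed) in solutions.items():
    print(solution_id, internal_score, elapsed)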
Example #2
def do_score(core, problem, solutions, dataset_path):
    metrics = []

    for metric in problem['problem']['performance_metrics']:
        metrics.append(encode_performance_metric(metric))

    for solution in solutions:
        try:
            response = core.ScoreSolution(
                pb_core.ScoreSolutionRequest(
                    solution_id=solution,
                    inputs=[
                        pb_value.Value(dataset_uri='file://%s' %
                                       dataset_path, )
                    ],
                    performance_metrics=metrics,
                    users=[],
                    configuration=pb_core.ScoringConfiguration(
                        method='K_FOLD',
                        folds=4,
                        train_test_ratio=0.75,
                        shuffle=True,
                        random_seed=0),
                ))
            results = core.GetScoreSolutionResults(
                pb_core.GetScoreSolutionResultsRequest(
                    request_id=response.request_id, ))
            for _ in results:
                pass
        except Exception:
            logger.exception("Exception during scoring %r", solution)
Example #3
def do_test(core, fitted, dataset_path):
    tested = {}
    for fitted_solution in fitted.values():
        try:
            response = core.ProduceSolution(
                pb_core.ProduceSolutionRequest(
                    fitted_solution_id=fitted_solution,
                    inputs=[
                        pb_value.Value(dataset_uri='file://%s' %
                                       dataset_path, )
                    ],
                    expose_outputs=['outputs.0'],
                    expose_value_types=['CSV_URI'],
                    users=[],
                ))
            results = core.GetProduceSolutionResults(
                pb_core.GetProduceSolutionResultsRequest(
                    request_id=response.request_id, ))
            for result in results:
                if result.progress.state == pb_core.COMPLETED:
                    tested[fitted_solution] = result.exposed_outputs[
                        'outputs.0'].csv_uri
        except Exception:
            logger.exception("Exception testing %r", fitted_solution)

    return tested
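
The mapping returned by do_test holds file:// URIs pointing at the exposed predictions. A small sketch of loading them, assuming pandas is available and the URIs refer to local files:

import pandas as pd

tested = do_test(core, fitted, '/data/datasetDoc.json')
for fitted_id, csv_uri in tested.items():
    # Strip the file:// scheme so pandas can read the local path.
    predictions = pd.read_csv(csv_uri[len('file://'):])
    print(fitted_id, predictions.shape)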
Example #4
    def GetFitSolutionResults(self, request, context):
        """Wait for a training job to be done.
        """
        try:
            job_id = int(request.request_id, 16)
            queue = self._requests[job_id]
        except (ValueError, KeyError):
            raise error(context, grpc.StatusCode.NOT_FOUND, "Unknown ID %r",
                        request.request_id)

        for event, kwargs in queue.read():
            if not context.is_active():
                logger.info("Client closed GetFitSolutionsResults stream")
                break

            if event == 'training_start':
                yield pb_core.GetFitSolutionResultsResponse(
                    progress=pb_core.Progress(
                        state=pb_core.RUNNING,
                        status="Training in progress",
                    ), )
            elif event == 'training_success':
                pipeline_id = kwargs['pipeline_id']
                storage_dir = kwargs['storage_dir']
                steps_to_expose = kwargs['steps_to_expose']
                yield pb_core.GetFitSolutionResultsResponse(
                    progress=pb_core.Progress(
                        state=pb_core.COMPLETED,
                        status="Training completed",
                    ),
                    exposed_outputs={
                        step_id:
                        pb_value.Value(csv_uri='file://%s/fit_%s_%s.csv' %
                                       (storage_dir, pipeline_id, step_id))
                        for step_id in steps_to_expose
                    },
                    fitted_solution_id=str(pipeline_id),
                )
                break
            elif event == 'training_error':
                status = kwargs['error_msg']
                yield pb_core.GetFitSolutionResultsResponse(
                    progress=pb_core.Progress(
                        state=pb_core.ERRORED,
                        status=status,
                    ), )
                break
            elif event == 'done_searching':
                break
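
The error() helper used above is not part of the excerpt. A plausible minimal version, written purely as an assumption about its behaviour: it attaches the status to the gRPC context and hands back an exception for the handler to raise.

import logging

import grpc

logger = logging.getLogger(__name__)

def error(context, code, message, *args):
    # Format the message, record it, and put code/details on the gRPC
    # context; the caller raises the returned exception to end the RPC.
    if args:
        message = message % args
    logger.error(message)
    context.set_code(code)
    context.set_details(message)
    if code == grpc.StatusCode.NOT_FOUND:
        return KeyError(message)
    return ValueError(message)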
Example #5
    def GetScoreSolutionResults(self, request, context):
        """Wait for a scoring job to be done.
        """
        try:
            job_id = int(request.request_id, 16)
            queue = self._requests[job_id]
        except (ValueError, KeyError):
            raise error(context, grpc.StatusCode.NOT_FOUND, "Unknown ID %r",
                        request.request_id)

        for event, kwargs in queue.read():
            if not context.is_active():
                logger.info("Client closed GetScoreSolutionResults stream")
                break

            if event == 'scoring_start':
                yield pb_core.GetScoreSolutionResultsResponse(
                    progress=pb_core.Progress(
                        state=pb_core.RUNNING,
                        status="Scoring in progress",
                    ), )
            elif event == 'scoring_success':
                pipeline_id = kwargs['pipeline_id']
                scores = self._ta2.get_pipeline_scores(pipeline_id)
                scores = [
                    pb_core.Score(
                        metric=pb_problem.ProblemPerformanceMetric(
                            metric=m, k=0, pos_label=''),
                        value=pb_value.Value(raw=pb_value.ValueRaw(double=s)),
                    ) for m, s in scores.items()
                ]
                yield pb_core.GetScoreSolutionResultsResponse(
                    progress=pb_core.Progress(
                        state=pb_core.COMPLETED,
                        status="Scoring completed",
                    ),
                    scores=scores,
                )
                break
            elif event == 'scoring_error':
                status = kwargs['error_msg']
                yield pb_core.GetScoreSolutionResultsResponse(
                    progress=pb_core.Progress(
                        state=pb_core.ERRORED,
                        status=status,
                    ), )
                break
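
Both result handlers consume (event, kwargs) pairs from a per-request queue. A hypothetical sketch of the producer side for scoring; the queue.send() call and the kwargs keys are assumptions inferred from what the handler reads:

# Hypothetical scoring worker; queue.send() stands in for however this TA2
# actually enqueues events for the gRPC handlers to stream back.
queue.send('scoring_start', {})
try:
    run_scoring(pipeline_id)   # placeholder for the real scoring routine
    queue.send('scoring_success', {'pipeline_id': pipeline_id})
except Exception as e:
    queue.send('scoring_error', {'error_msg': str(e)})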
Example #6
        def msg_solution(pipeline_id):
            scores = self._ta2.get_pipeline_scores(pipeline_id)
            progress = session.progress

            if scores:
                if session.metrics and session.metrics[0][
                        'metric'].name in scores:
                    metric = session.metrics[0]['metric']
                    try:
                        internal_score = metric.normalize(scores[metric.name])
                    except Exception:
                        internal_score = scores[metric.name]
                        logger.warning(
                            'Problems normalizing metric, using the raw value: %.2f'
                            % scores[metric.name])
                else:
                    internal_score = float('nan')
                scores = [
                    pb_core.Score(
                        metric=pb_problem.ProblemPerformanceMetric(
                            metric=m, k=0, pos_label=''),
                        value=pb_value.Value(raw=pb_value.ValueRaw(double=s)),
                    ) for m, s in scores.items()
                ]
                scores = [pb_core.SolutionSearchScore(scores=scores)]
                return pb_core.GetSearchSolutionsResultsResponse(
                    done_ticks=progress.current,
                    all_ticks=progress.total,
                    progress=pb_core.Progress(
                        state=pb_core.RUNNING,
                        status="Solution scored",
                        start=to_timestamp(session.start),
                    ),
                    solution_id=str(pipeline_id),
                    internal_score=internal_score,
                    scores=scores,
                )
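
to_timestamp() is not shown in the excerpt; a minimal sketch of what it presumably does, using the standard protobuf well-known-types API to turn a datetime into a Timestamp:

from google.protobuf.timestamp_pb2 import Timestamp

def to_timestamp(dt):
    # Convert a datetime (or None) into a protobuf Timestamp message.
    if dt is None:
        return None
    ts = Timestamp()
    ts.FromDatetime(dt)
    return ts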
Example #7
def do_train(core, solutions, dataset_path):
    fitted = {}
    for solution in solutions:
        try:
            response = core.FitSolution(
                pb_core.FitSolutionRequest(
                    solution_id=solution,
                    inputs=[
                        pb_value.Value(dataset_uri='file://%s' %
                                       dataset_path, )
                    ],
                    expose_outputs=['outputs.0'],
                    expose_value_types=['CSV_URI'],
                    users=[],
                ))
            results = core.GetFitSolutionResults(
                pb_core.GetFitSolutionResultsRequest(
                    request_id=response.request_id, ))
            for result in results:
                if result.progress.state == pb_core.COMPLETED:
                    fitted[solution] = result.fitted_solution_id
        except Exception:
            logger.exception("Exception training %r", solution)
    return fitted
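
Taken together, the client helpers from Examples #1, #2, #3 and #7 chain into a simple end-to-end run; the dataset path and the problem dict are placeholders:

dataset = '/data/TRAIN/dataset_TRAIN/datasetDoc.json'   # placeholder path
search_id, solutions = do_search(core, problem, dataset, time_bound=10.0)
do_score(core, problem, solutions, dataset)
fitted = do_train(core, solutions, dataset)
tested = do_test(core, fitted, dataset)
print(tested)   # fitted_solution_id -> file:// URI with predictions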
Example #8
    def _add_step(self, steps, step_descriptions, modules, params,
                  module_to_step, mod):
        if mod.id in module_to_step:
            return module_to_step[mod.id]

        # Special case: the "dataset" module
        if mod.package == 'data' and mod.name == 'dataset':
            module_to_step[mod.id] = 'inputs.0'
            return 'inputs.0'
        elif mod.package != 'd3m':
            raise ValueError("Got unknown module '%s:%s'" %
                             (mod.package, mod.name))

        # Recursively walk upstream modules (to get `steps` in topological
        # order)
        # Add inputs to a dictionary, in deterministic order
        inputs = {}
        for conn in sorted(mod.connections_to, key=lambda c: c.to_input_name):
            step = self._add_step(steps, step_descriptions, modules, params,
                                  module_to_step, modules[conn.from_module_id])
            if step.startswith('inputs.'):
                inputs[conn.to_input_name] = step
            else:
                inputs[conn.to_input_name] = '%s.%s' % (step,
                                                        conn.from_output_name)

        klass = d3m_ta2_nyu.workflow.convert.get_class(mod.name)
        metadata = klass.metadata.query()
        metadata_items = {
            key: metadata[key]
            for key in ('id', 'version', 'python_path', 'name', 'digest')
            if key in metadata
        }

        arguments = {
            name: pb_pipeline.PrimitiveStepArgument(
                container=pb_pipeline.ContainerArgument(data=data, ))
            for name, data in inputs.items()
        }

        # If hyperparameters are set, export them
        step_hyperparams = {}
        if mod.id in params and 'hyperparams' in params[mod.id]:
            hyperparams = pickle.loads(params[mod.id]['hyperparams'])
            for k, v in hyperparams.items():
                step_hyperparams[k] = pb_pipeline.PrimitiveStepHyperparameter(
                    value=pb_pipeline.ValueArgument(data=pb_value.Value(
                        raw=encode_raw_value(v))))

        # Create step description
        step = pb_pipeline.PipelineDescriptionStep(
            primitive=pb_pipeline.PrimitivePipelineDescriptionStep(
                primitive=pb_primitive.Primitive(
                    id=metadata_items['id'],
                    version=metadata_items['version'],
                    python_path=metadata_items['python_path'],
                    name=metadata_items['name'],
                    digest=metadata_items['digest']),
                arguments=arguments,
                outputs=[pb_pipeline.StepOutput(id='produce')],
                hyperparams=step_hyperparams,
            ))

        step_descriptions.append(  # FIXME it's empty
            pb_core.StepDescription(
                primitive=pb_core.PrimitiveStepDescription()))
        step_nb = 'steps.%d' % len(steps)
        steps.append(step)
        module_to_step[mod.id] = step_nb

        return step_nb
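
A hedged sketch of how a caller inside the same class could assemble the steps produced by _add_step into a full PipelineDescription. The method name is hypothetical, and the single 'inputs' entry, the final output reference and the message field choices are assumptions:

    def _make_pipeline_description(self, pipeline_id, modules, params):
        # Walk every module, letting _add_step fill `steps` in topological
        # order, then wrap the result in a pipeline description message.
        steps, step_descriptions, module_to_step = [], [], {}
        for mod in modules.values():
            self._add_step(steps, step_descriptions, modules, params,
                           module_to_step, mod)

        return pb_pipeline.PipelineDescription(
            id=str(pipeline_id),
            inputs=[pb_pipeline.PipelineDescriptionInput(name='inputs')],
            outputs=[pb_pipeline.PipelineDescriptionOutput(
                name='output',
                # Point at the produce output of the last step added.
                data='steps.%d.produce' % (len(steps) - 1),
            )],
            steps=steps,
        )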