Example #1
# Add a preprocesser (features and labels as in Example #3 below)
training_pipeline.add_preprocesser(
    StandardPreprocesser(
        features=['times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi',
                  'pedigree', 'age'],
        labels=['has_diabetes'],
        overwrite={'has_diabetes': {
            'transform': [{'method': 'no_transform', 'parameters': {}}]}}
    ))

# Add a trainer
training_pipeline.add_trainer(FeedForwardTrainer(
    loss='binary_crossentropy',
    last_activation='sigmoid',
    output_units=1,
    metrics=['accuracy'],
    epochs=20))

# Add an evaluator
training_pipeline.add_evaluator(
    TFMAEvaluator(slices=[['has_diabetes']],
                  metrics={'has_diabetes': ['binary_crossentropy',
                                            'binary_accuracy']}))

# Run the pipeline on a Google Cloud VM and train on GCP as well.
# For this to work, the orchestrator and the training backend must live in the
# same GCP project, and the metadata store and artifact store must be
# accessible to both the orchestrator VM and the GCAIP worker VM.

# Note: If you are using a custom Trainer, you need to build a new Docker
# image based on the ZenML Trainer image and pass it via the `image`
# parameter of the SingleGPUTrainingGCAIPBackend, as sketched below.
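
# A minimal sketch of the custom-image case above. `image` is the parameter
# named in the note; `project` and `job_dir` are assumed arguments, and the
# image tag is hypothetical.
training_backend = SingleGPUTrainingGCAIPBackend(
    project=GCP_PROJECT,
    job_dir=TRAINING_JOB_DIR,
    image='gcr.io/my-project/my-custom-trainer:latest')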


# Define the orchestrator backend
orchestrator_backend = OrchestratorGCPBackend(
    cloudsql_connection_name=cloudsql_connection_name,
    project=GCP_PROJECT)  # `project` is an assumed argument here
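
# Sketch of wiring the backends into the pipeline run. Only `artifact_store`
# appears verbatim in Example #3; `backends` and `metadata_store` are assumed
# keyword arguments, and `metadata_store` / `artifact_store` are assumed to
# be defined earlier in the full script.
training_pipeline.run(
    backends=[orchestrator_backend, training_backend],
    metadata_store=metadata_store,
    artifact_store=artifact_store)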
Example #2
    def get_tfx_component_list(self, config: Dict[Text, Any]) -> List:
        """
        Builds the training pipeline as a series of TFX components.

        Args:
            config: A ZenML configuration in dictionary format.

        Returns:
            A chronological list of TFX components making up the training
            pipeline.
        """
        steps = config[keys.GlobalKeys.STEPS]

        component_list = []

        ############
        # RAW DATA #
        ############
        data_config = steps[keys.TrainingSteps.DATA]
        data = DataGen(source=data_config[keys.StepKeys.SOURCE],
                       source_args=data_config[keys.StepKeys.ARGS]).with_id(
                           GDPComponent.DataGen.name)

        statistics_data = StatisticsGen(
            examples=data.outputs.examples).with_id(
                GDPComponent.DataStatistics.name)

        schema_data = SchemaGen(
            statistics=statistics_data.outputs.output).with_id(
                GDPComponent.DataSchema.name)

        component_list.extend([data, statistics_data, schema_data])

        datapoints = data.outputs.examples

        #############
        # SPLITTING #
        #############
        # Split the ingested examples according to the configured split step
        split_config = steps[keys.TrainingSteps.SPLIT]
        splits = SplitGen(
            input_examples=datapoints,
            source=split_config[keys.StepKeys.SOURCE],
            source_args=split_config[keys.StepKeys.ARGS],
            schema=schema_data.outputs.schema,
            statistics=statistics_data.outputs.output,
        ).with_id(GDPComponent.SplitGen.name)

        datapoints = splits.outputs.examples

        statistics_split = StatisticsGen(examples=datapoints).with_id(
            GDPComponent.SplitStatistics.name)

        schema_split = SchemaGen(
            statistics=statistics_split.outputs.output).with_id(
                GDPComponent.SplitSchema.name)

        schema = schema_split.outputs.schema

        component_list.extend([splits, statistics_split, schema_split])

        ##############
        # SEQUENCING #
        ##############
        if keys.TrainingSteps.SEQUENCER in steps:
            sequencer_config = steps[keys.TrainingSteps.SEQUENCER]
            sequencer = Sequencer(
                input_examples=datapoints,
                schema=schema,
                statistics=statistics_split.outputs.statistics,
                source=sequencer_config[keys.StepKeys.SOURCE],
                source_args=sequencer_config[keys.StepKeys.ARGS]).with_id(
                    GDPComponent.Sequencer.name)

            sequencer_statistics = StatisticsGen(
                examples=sequencer.outputs.output_examples).with_id(
                    GDPComponent.SequencerStatistics.name)

            sequencer_schema = SchemaGen(
                statistics=sequencer_statistics.outputs.output,
                infer_feature_shape=True,
            ).with_id(GDPComponent.SequencerSchema.name)

            datapoints = sequencer.outputs.output_examples
            schema = sequencer_schema.outputs.schema

            component_list.extend(
                [sequencer, sequencer_statistics, sequencer_schema])

        #################
        # PREPROCESSING #
        #################
        transform = Transform(
            preprocessing_fn=constants.PREPROCESSING_FN,
            examples=datapoints,
            schema=schema,
            custom_config=steps[keys.TrainingSteps.PREPROCESSER]).with_id(
                GDPComponent.Transform.name)

        component_list.append(transform)

        ############
        # TRAINING #
        ############
        training_backend: TrainingLocalBackend = \
            self.backends_dict[TrainingLocalBackend.BACKEND_KEY]
        training_kwargs = {
            'custom_executor_spec': training_backend.get_executor_spec(),
            'custom_config': steps[keys.TrainingSteps.TRAINER]
        }
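        # Merge backend-specific settings into the trainer step's custom
        # config so the executor sees both the step and backend configuration.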
        training_kwargs['custom_config'].update(
            training_backend.get_custom_config())

        trainer = Trainer(
            transformed_examples=transform.outputs.transformed_examples,
            transform_graph=transform.outputs.transform_graph,
            run_fn=constants.TRAINER_FN,
            schema=schema,
            train_args=trainer_pb2.TrainArgs(),
            eval_args=trainer_pb2.EvalArgs(),
            **training_kwargs).with_id(GDPComponent.Trainer.name)

        component_list.append(trainer)

        #############
        # EVALUATOR #
        #############
        if keys.TrainingSteps.EVALUATOR in steps:
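            # Resolve the evaluator module to an absolute .py path so it can
            # be passed to the TFX Evaluator as a module_file.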
            from zenml.utils import source_utils
            eval_module = '.'.join(
                constants.EVALUATOR_MODULE_FN.split('.')[:-1])
            eval_module_file = constants.EVALUATOR_MODULE_FN.split('.')[-1]
            abs_path = source_utils.get_absolute_path_from_module(eval_module)
            custom_extractor_path = os.path.join(abs_path,
                                                 eval_module_file) + '.py'
            eval_step: TFMAEvaluator = TFMAEvaluator.from_config(
                steps[keys.TrainingSteps.EVALUATOR])
            eval_config = eval_step.build_eval_config()
            evaluator = Evaluator(
                examples=transform.outputs.transformed_examples,
                model=trainer.outputs.model,
                eval_config=eval_config,
                module_file=custom_extractor_path,
            ).with_id(GDPComponent.Evaluator.name)
            component_list.append(evaluator)

        ###########
        # SERVING #
        ###########
        if keys.TrainingSteps.DEPLOYER in steps:
            serving_args = steps[keys.TrainingSteps.DEPLOYER]['args']

            project_id = serving_args['project_id']
            output_base_dir = self.artifact_store.path
            if 'model_name' in serving_args:
                model_name = serving_args['model_name']
            else:
                model_name = self.pipeline_name().replace('-', '_')

            gcaip_deployer = GCAIPDeployer(output_base_dir=output_base_dir,
                                           project_id=project_id,
                                           model_name=model_name)

            pusher_config = gcaip_deployer.build_pusher_config()
            pusher_executor_spec = gcaip_deployer.get_executor_spec()

            pusher = Pusher(model_export=trainer.outputs.output,
                            custom_executor_spec=pusher_executor_spec,
                            **pusher_config).with_id(
                                GDPComponent.Deployer.name)

            component_list.append(pusher)

        return component_list
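
# Usage sketch for the method above, with hypothetical names: given a pipeline
# instance and a parsed ZenML config dict, build the TFX component graph.
# components = pipeline_instance.get_tfx_component_list(config_dict)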
Example #3
# Add a preprocesser
training_pipeline.add_preprocesser(
    StandardPreprocesser(
        features=['times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi',
                  'pedigree', 'age'],
        labels=['has_diabetes'],
        overwrite={'has_diabetes': {
            'transform': [{'method': 'no_transform', 'parameters': {}}]}}
    ).with_backend(processing_backend)
)

# Add a trainer
training_pipeline.add_trainer(FeedForwardTrainer(
    loss='binary_crossentropy',
    last_activation='sigmoid',
    output_units=1,
    metrics=['accuracy'],
    epochs=20))

# Add an evaluator
training_pipeline.add_evaluator(
    TFMAEvaluator(
        slices=[['has_diabetes']],
        metrics={'has_diabetes': ['binary_crossentropy', 'binary_accuracy']}
    ).with_backend(processing_backend)
)

# Define the artifact store
artifact_store = ArtifactStore(
    os.path.join(GCP_BUCKET, 'dataflow_processing/artifact_store'))

# Run the pipeline
training_pipeline.run(artifact_store=artifact_store)
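
# Optional follow-up, assuming the legacy ZenML convenience helpers (names
# may differ by version): inspect statistics and evaluation results after
# the run.
training_pipeline.view_statistics()
training_pipeline.evaluate()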