Example #1
    def load_vocab(self, path_to_vocab: Text):
        """
        Re-instantiates the tokenizer of this class from the output vocabulary / merges.

        Args:
            path_to_vocab: Path to vocab / merges files from a training run.
        """

        # inspect contents of output dir
        contents = path_utils.list_dir(path_to_vocab)

        try:
            vocab_file = next(f for f in contents if "vocab" in f)
        except StopIteration:
            vocab_file = None

        # update tokenizer params with vocab file name
        self.tokenizer_params.update({"vocab": vocab_file})

        # merges are only needed for BPE Tokenizers
        if "bpe" in self.tokenizer_name:
            try:
                merges_file = next(f for f in contents if "merge" in f)
            except StopIteration:
                merges_file = None

            self.tokenizer_params.update({"merges": merges_file})

        # reconstruct tokenizer object
        self.tokenizer = tokenizer_map.get(
            self.tokenizer_name)(**self.tokenizer_params)
        self.tokenizer.enable_padding(length=self.sentence_length)
        self.tokenizer.enable_truncation(max_length=self.sentence_length)
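
The vocab / merges lookup above can also be written with `next()`'s default argument instead of catching `StopIteration`. Below is a minimal, self-contained sketch of that pattern; `os.listdir` stands in for `path_utils.list_dir` and the directory paths are hypothetical.

import os

def find_first_match(directory, keyword):
    """Return the first file name in `directory` containing `keyword`,
    or None if nothing matches."""
    contents = os.listdir(directory)  # stand-in for path_utils.list_dir
    return next((f for f in contents if keyword in f), None)

# Hypothetical usage against a tokenizer training output directory:
# vocab_file = find_first_match("/tmp/tokenizer_output", "vocab")
# merges_file = find_first_match("/tmp/tokenizer_output", "merge")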
Example #2
    def input_fn(self,
                 file_pattern: List[Text],
                 tf_transform_output: tft.TFTransformOutput):
        """
        Loads TFRecords from disk into pandas DataFrames.

        Args:
            file_pattern: File pattern matching saved TFRecords on disk.
            tf_transform_output: Output of the preceding Transform /
             Preprocessing component.

        Returns:
            X: pandas DataFrame holding the feature columns.
            y: pandas DataFrame holding the label columns.
        """
        xf_feature_spec = tf_transform_output.transformed_feature_spec()

        xf_feature_spec = {x: xf_feature_spec[x]
                           for x in xf_feature_spec
                           if x.endswith('_xf')}

        root_path = [x.replace("*", "") for x in file_pattern][0]
        dataset = tf.data.TFRecordDataset(
            path_utils.list_dir(root_path),  # a bit ugly
            compression_type='GZIP')
        df = convert_raw_dataset_to_pandas(dataset, xf_feature_spec, 100000)

        # Separate the label columns from the feature columns
        X = df[[x for x in df.columns if 'label_' not in x]]
        y = df[[x for x in df.columns if 'label_' in x]]
        return X, y
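
The final feature / label split relies only on column naming. A minimal standalone sketch of the same idea on a toy DataFrame (the column names are invented for illustration):

import pandas as pd

df = pd.DataFrame({
    "age_xf": [25, 40, 31],
    "income_xf": [48000.0, 92000.0, 61000.0],
    "label_churn": [0, 1, 0],
})

# Columns whose names contain 'label_' become targets; the rest are features.
X = df[[c for c in df.columns if "label_" not in c]]
y = df[[c for c in df.columns if "label_" in c]]

print(X.columns.tolist())  # ['age_xf', 'income_xf']
print(y.columns.tolist())  # ['label_churn']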
Example #3
    def wrapper():
        repo: Repository = Repository.get_instance()
        repo.zenml_config.set_pipelines_dir(pipeline_root)

        for p_config in path_utils.list_dir(pipeline_root):
            y = yaml_utils.read_yaml(p_config)
            p: TrainingPipeline = TrainingPipeline.from_config(y)
            p.run()
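
The wrapper iterates over the pipeline config files in a directory and re-creates a pipeline from each YAML file. A rough standalone sketch of that loop using PyYAML and `os.listdir` in place of the ZenML helpers (the directory path is hypothetical):

import os
import yaml  # PyYAML

pipeline_root = "/tmp/pipelines"  # hypothetical pipelines directory

for file_name in os.listdir(pipeline_root):  # stand-in for path_utils.list_dir
    config_path = os.path.join(pipeline_root, file_name)
    with open(config_path) as f:
        config = yaml.safe_load(f)  # stand-in for yaml_utils.read_yaml
    # A real run would rebuild and run the pipeline from `config` here.
    print(config_path, "->", type(config).__name__)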
Example #4
    def wrapper():
        repo: Repository = Repository.get_instance()
        pipelines_dir = repo.zenml_config.get_pipelines_dir()
        for p_config in path_utils.list_dir(pipelines_dir):
            try:
                os.remove(p_config)
            except Exception as e:
                print(e)
Example #5
    def get_pipeline_file_paths(self, only_file_names: bool = False) -> \
            Optional[List[Text]]:
        """Gets list of pipeline file path"""
        self._check_if_initialized()

        pipelines_dir = self.zenml_config.get_pipelines_dir()

        if not path_utils.is_dir(pipelines_dir):
            return []
        return path_utils.list_dir(pipelines_dir, only_file_names)
Example #6
def read_files_from_disk(pipeline: beam.Pipeline,
                         base_path: Text) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to read data from a collection of CSV files
    on a local file system.
    Args:
        pipeline: Input beam.Pipeline object coming from a TFX Executor.
        base_path: Base path pointing either to the directory containing the
         CSV files, or to a (single) CSV file.

    Returns:
        A beam.PCollection of data points. Each row in the collection of
         CSV files represents a single data point.

    """
    wildcard_qualifier = "*"
    file_pattern = os.path.join(base_path, wildcard_qualifier)

    if path_utils.is_dir(base_path):
        csv_files = path_utils.list_dir(base_path)
        if not csv_files:
            raise RuntimeError(
                'Split pattern {} does not match any files.'.format(
                    file_pattern))
    else:
        if path_utils.file_exists(base_path):
            csv_files = [base_path]
        else:
            raise RuntimeError(f'{base_path} does not exist.')

    # Filter out files with disallowed extensions
    allowed_file_exts = [".csv", ".txt"]  # ".dat"
    csv_files = [
        uri for uri in csv_files
        if os.path.splitext(uri)[1] in allowed_file_exts
    ]

    logger.info(f'Matched {len(csv_files)} files: {csv_files}')

    # Always use header from file
    logger.info(f'Using header from file: {csv_files[0]}.')
    column_names = path_utils.load_csv_header(csv_files[0])
    logger.info(f'Header: {column_names}.')

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=base_path,
                                                 skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | 'ExtractParsedCSVLines' >>
        beam.Map(lambda x: dict(zip(column_names, x[0]))))

    return parsed_csv_lines
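
The directory-or-file resolution plus extension filtering above can be isolated from Beam entirely. A minimal sketch using only the standard library, mirroring the logic of the example (the helper name is invented):

import os

def resolve_csv_files(base_path):
    """Return the .csv / .txt files under `base_path`, which may be a
    directory or a single file."""
    if os.path.isdir(base_path):
        candidates = [os.path.join(base_path, f) for f in os.listdir(base_path)]
        if not candidates:
            raise RuntimeError(f"{base_path} does not contain any files.")
    elif os.path.isfile(base_path):
        candidates = [base_path]
    else:
        raise RuntimeError(f"{base_path} does not exist.")

    allowed_file_exts = {".csv", ".txt"}
    return [f for f in candidates if os.path.splitext(f)[1] in allowed_file_exts]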
Example #7
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:

        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]

        c = source_utils.load_source_path_class(source)
        tokenizer_step: BaseTokenizer = c(**args)

        tokenizer_location = artifact_utils.get_single_uri(
            output_dict["tokenizer"])

        split_uris, split_names, all_files = [], [], []
        for artifact in input_dict["examples"]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                split_names.append(split)
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
                all_files += path_utils.list_dir(uri)

        # Get output split path
        output_examples = artifact_utils.get_single_instance(
            output_dict["output_examples"])
        output_examples.split_names = artifact_utils.encode_split_names(
            split_names)

        if not tokenizer_step.skip_training:
            tokenizer_step.train(files=all_files)

            tokenizer_step.save(output_dir=tokenizer_location)

        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                input_uri = io_utils.all_files_pattern(uri)

                _ = (p
                     | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
                            file_pattern=input_uri)
                     | "ParseTFExFromString." + split >> beam.Map(
                            tf.train.Example.FromString)
                     | "AddTokens." + split >> beam.Map(
                            append_tf_example,
                            tokenizer_step=tokenizer_step)
                     | 'Serialize.' + split >> beam.Map(
                            lambda x: x.SerializeToString())
                     | 'WriteSplit.' + split >> WriteSplit(
                            get_split_uri(
                                output_dict["output_examples"],
                                split)))
Example #8
    def get_predictions(self, sample_size: int = 100000):
        """
        Samples prediction data as a pandas DataFrame.

        Args:
            sample_size: # of rows to sample.
        """
        base_uri = self.get_artifacts_uri_by_component(
            GDPComponent.Inferrer.name)[0]
        data_files = path_utils.list_dir(os.path.join(base_uri, 'examples'))
        dataset = tf.data.TFRecordDataset(data_files, compression_type='GZIP')
        schema_uri = self.get_artifacts_uri_by_component(
            GDPComponent.DataSchema.name)[0]
        spec = get_feature_spec_from_schema(schema_uri)
        return convert_raw_dataset_to_pandas(dataset, spec, sample_size)
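
Reading the listed files back as a `tf.data.Dataset` works the same way outside ZenML, as long as the compression type matches how the records were written. A small sketch with a hypothetical directory of GZIP-compressed TFRecord shards, using `os.listdir` in place of `path_utils.list_dir`:

import os
import tensorflow as tf

examples_dir = "/tmp/inference_output/examples"  # hypothetical path

# Collect the shard files (a stand-in for path_utils.list_dir).
data_files = [os.path.join(examples_dir, f) for f in os.listdir(examples_dir)]

# The compression type must match how the records were written (GZIP here).
dataset = tf.data.TFRecordDataset(data_files, compression_type="GZIP")

# Each element is a serialized tf.train.Example; inspect a few records.
for raw_record in dataset.take(3):
    example = tf.train.Example.FromString(raw_record.numpy())
    print(example)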
Example #9
    def _get_data_file_paths(self, pipeline):
        """
        Gets the paths where the data is stored, as a list of file paths.

        Args:
            pipeline: a pipeline with this datasource embedded.
        """
        if pipeline.datasource._id != self._id:
            raise AssertionError('This pipeline does not belong to this '
                                 'datasource.')
        # Take any pipeline and get the datagen
        data_uri = os.path.join(
            pipeline.get_artifacts_uri_by_component(
                GDPComponent.DataGen.name)[0], 'examples')
        data_files = path_utils.list_dir(data_uri)
        return data_files
Example #10
def test_get_artifacts_uri_by_component(repo):
    test_component_name = GDPComponent.SplitGen.name

    p_names = sorted(repo.get_pipeline_names())

    p: BasePipeline = repo.get_pipeline_by_name(p_names[0])

    uri_list = p.get_artifacts_uri_by_component(test_component_name)

    # assert it is not empty
    assert uri_list
    # assert artifact was written
    uri = uri_list[0]
    written_artifacts = path_utils.list_dir(uri)
    assert written_artifacts
    # TODO: Ugly TFRecord validation
    assert all(
        "tfrecord" in name and os.path.splitext(name)[-1] == ".gz"
        for _, _, files in os.walk(uri) for name in files)
Example #11
    def check_module_clean(self, source: Text):
        """
        Returns True if all files within source's module are committed.

        Args:
            source (str): relative module path pointing to a Class.
        """
        # import here to resolve circular dependency
        from zenml.utils import source_utils

        # Get the module path
        module_path = source_utils.get_module_source_from_source(source)

        # Get relative path of module because check_file_committed needs that
        module_dir = source_utils.get_relative_path_from_module_source(
            module_path)

        # Get absolute path of module because path_utils.list_dir needs that
        mod_abs_dir = source_utils.get_absolute_path_from_module_source(
            module_path)
        module_file_names = path_utils.list_dir(mod_abs_dir,
                                                only_file_names=True)

        # Go through each file in module and see if there are uncommitted ones
        for file_path in module_file_names:
            path = os.path.join(module_dir, file_path)

            # If the path is .gitignored, skip it and don't do anything
            if len(self.git_repo.ignored(path)) > 0:
                continue

            if path_utils.is_dir(os.path.join(mod_abs_dir, file_path)):
                logger.warning(
                    f'The step {source} is contained inside a module '
                    f'that '
                    f'has sub-directories (the sub-directory {file_path} at '
                    f'{mod_abs_dir}). For now, ZenML supports only a flat '
                    f'directory structure in which to place Steps. Please make'
                    f' sure that the Step does not utilize the sub-directory.')
            if not self.check_file_committed(path):
                return False
        return True
Example #12
    def sample_transformed_data(self,
                                split_name: Text = 'eval',
                                sample_size: int = 100000):
        """
        Samples transformed data as a pandas DataFrame.

        Args:
            split_name: name of the split to sample
            sample_size: # of rows to sample.
        """
        base_uri = self.get_artifacts_uri_by_component(
            GDPComponent.Transform.name)[0]
        transform_schema = os.path.join(base_uri, 'transformed_metadata')
        spec = get_feature_spec_from_schema(transform_schema)

        base_uri = Path(base_uri)
        id_ = base_uri.name
        transform_data_path = os.path.join(str(base_uri.parent.parent),
                                           'transformed_examples', id_)

        split_data_path = os.path.join(transform_data_path, split_name)
        data_files = path_utils.list_dir(split_data_path)
        dataset = tf.data.TFRecordDataset(data_files, compression_type='GZIP')
        return convert_raw_dataset_to_pandas(dataset, spec, sample_size)
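
The path arithmetic in the middle of this example derives the sibling `transformed_examples` directory from the Transform output URI. A self-contained sketch of just that step, assuming the artifact layout shown above (the concrete URI is invented):

import os
from pathlib import Path

# Hypothetical Transform artifact URI: .../Transform/transform_output/<id>
base_uri = Path("/pipelines/my_pipeline/Transform/transform_output/7")

id_ = base_uri.name  # artifact id, here "7"
transform_data_path = os.path.join(str(base_uri.parent.parent),
                                   "transformed_examples", id_)

split_data_path = os.path.join(transform_data_path, "eval")
print(split_data_path)
# /pipelines/my_pipeline/Transform/transformed_examples/7/eval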
Example #13
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:

        # Check the inputs
        if constants.EXAMPLES not in input_dict:
            raise ValueError(f'{constants.EXAMPLES} is missing from inputs')
        examples_artifact = input_dict[constants.EXAMPLES]

        input_uri = artifact_utils.get_single_uri(examples_artifact)
        if len(zenml_path_utils.list_dir(input_uri)) == 0:
            raise AssertionError(
                'ZenML cannot run the evaluation as the provided input '
                'configuration does not point to any data. Specifically, '
                'if you are using the agnostic evaluator, please make sure '
                'that you are using a proper test_fn in your trainer step to '
                'write these results.')

        else:
            # Check the outputs
            if constants.EVALUATION not in output_dict:
                raise ValueError(
                    f'{constants.EVALUATION} is missing from outputs')
            evaluation_artifact = output_dict[constants.EVALUATION]
            output_uri = artifact_utils.get_single_uri(evaluation_artifact)

            # Resolve the schema
            schema = None
            if constants.SCHEMA in input_dict:
                schema_artifact = input_dict[constants.SCHEMA]
                schema_uri = artifact_utils.get_single_uri(schema_artifact)
                reader = io_utils.SchemaReader()
                schema = reader.read(io_utils.get_only_uri_in_dir(schema_uri))

            # Create the step with the schema attached if provided
            source = exec_properties[StepKeys.SOURCE]
            args = exec_properties[StepKeys.ARGS]
            c = source_utils.load_source_path_class(source)
            evaluator_step: BaseEvaluatorStep = c(**args)

            # Check the execution parameters
            eval_config = evaluator_step.build_config()
            eval_config = tfma.update_eval_config_with_defaults(eval_config)
            tfma.verify_eval_config(eval_config)

            # Resolve the model
            if constants.MODEL in input_dict:
                model_artifact = input_dict[constants.MODEL]
                model_uri = artifact_utils.get_single_uri(model_artifact)
                model_path = path_utils.serving_model_path(model_uri)

                model_fn = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                      'custom_eval_shared_model'
                                      ) or tfma.default_eval_shared_model

                eval_shared_model = model_fn(
                    model_name='',  # TODO: Fix with model names
                    eval_saved_model_path=model_path,
                    eval_config=eval_config)
            else:
                eval_shared_model = None

            self._log_startup(input_dict, output_dict, exec_properties)

            # Main pipeline
            logging.info('Evaluating model.')
            with self._make_beam_pipeline() as pipeline:
                examples_list = []
                tensor_adapter_config = None

                if tfma.is_batched_input(eval_shared_model, eval_config):
                    tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
                        examples=[
                            artifact_utils.get_single_instance(
                                examples_artifact)
                        ],
                        telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
                        schema=schema,
                        raw_record_column_name=tfma_constants.
                        ARROW_INPUT_COLUMN)
                    for split in evaluator_step.splits:
                        file_pattern = io_utils.all_files_pattern(
                            artifact_utils.get_split_uri(
                                examples_artifact, split))
                        tfxio = tfxio_factory(file_pattern)
                        data = (pipeline
                                | 'ReadFromTFRecordToArrow[%s]' % split >>
                                tfxio.BeamSource())
                        examples_list.append(data)
                    if schema is not None:
                        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                            arrow_schema=tfxio.ArrowSchema(),
                            tensor_representations=tfxio.TensorRepresentations(
                            ))
                else:
                    for split in evaluator_step.splits:
                        file_pattern = io_utils.all_files_pattern(
                            artifact_utils.get_split_uri(
                                examples_artifact, split))
                        data = (pipeline
                                | 'ReadFromTFRecord[%s]' % split >> beam.io.
                                ReadFromTFRecord(file_pattern=file_pattern))
                        examples_list.append(data)

                # Resolve custom extractors
                custom_extractors = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                               'custom_extractors')
                extractors = None
                if custom_extractors:
                    extractors = custom_extractors(
                        eval_shared_model=eval_shared_model,
                        eval_config=eval_config,
                        tensor_adapter_config=tensor_adapter_config)

                # Resolve custom evaluators
                custom_evaluators = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                               'custom_evaluators')
                evaluators = None
                if custom_evaluators:
                    evaluators = custom_evaluators(
                        eval_shared_model=eval_shared_model,
                        eval_config=eval_config,
                        tensor_adapter_config=tensor_adapter_config)

                # Extract, evaluate and write
                (examples_list | 'FlattenExamples' >> beam.Flatten()
                 | 'ExtractEvaluateAndWriteResults' >>
                 tfma.ExtractEvaluateAndWriteResults(
                     eval_config=eval_config,
                     eval_shared_model=eval_shared_model,
                     output_path=output_uri,
                     extractors=extractors,
                     evaluators=evaluators,
                     tensor_adapter_config=tensor_adapter_config))
            logging.info('Evaluation complete. Results written to %s.',
                         output_uri)
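
The `try_get_fn(...) or default` pattern used above to resolve `custom_eval_shared_model`, `custom_extractors` and `custom_evaluators` falls back to a default whenever the user module does not define the hook. The helper itself is not shown on this page; below is a minimal sketch of how such a lookup could behave, with every name a hypothetical stand-in:

def try_get_fn(module, name):
    """Return `module.<name>` if it exists and is callable, else None."""
    fn = getattr(module, name, None)
    return fn if callable(fn) else None

def default_eval_shared_model(**kwargs):
    # Placeholder default used when no custom hook is provided.
    return {"kind": "default", **kwargs}

class CustomModule:
    """Stands in for evaluator_step.CUSTOM_MODULE; it may or may not
    define custom_eval_shared_model."""

model_fn = (try_get_fn(CustomModule, "custom_eval_shared_model")
            or default_eval_shared_model)
print(model_fn(model_name="", eval_saved_model_path="/tmp/model"))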