Code example #1
File: repo.py Project: sjoerdteunisse/zenml
    def __init__(self, path: Text = None):
        """
        Construct a reference to a ZenML repository.

        Args:
            path (str): Path to root of repository
        """
        if Repository.__instance__ is None:
            if path is None:
                try:
                    # Start from cwd and traverse up until find zenml config.
                    path = Repository.get_zenml_dir(os.getcwd())
                except Exception:
                    # If there isn't a zenml.config, use the cwd
                    path = os.getcwd()

            if not path_utils.is_dir(path):
                raise Exception(f'{path} does not exist or is not a dir!')
            self.path = path

            # Hook up git, path needs to have a git folder.
            self.git_wrapper = GitWrapper(self.path)

            # Load the ZenML config
            try:
                self.zenml_config = ZenMLConfig(self.path)
            except InitializationException:
                # We allow this because of the GCP orchestrator for now
                self.zenml_config = None

            Repository.__instance__ = self
        else:
            raise Exception("You cannot create another Repository class!")
Code example #2
    def get_pipeline_file_paths(self, only_file_names: bool = False) -> \
            Optional[List[Text]]:
        """Gets list of pipeline file path"""
        self._check_if_initialized()

        pipelines_dir = self.zenml_config.get_pipelines_dir()

        if not path_utils.is_dir(pipelines_dir):
            return []
        return path_utils.list_dir(pipelines_dir, only_file_names)
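A short usage sketch: given an initialized repository, the method returns either full paths or bare file names depending on the flag, and an empty list if the pipelines directory does not exist yet.

repo = Repository.get_instance()

# Full paths of all pipeline config files.
pipeline_paths = repo.get_pipeline_file_paths()

# Only the file names, without the pipelines directory prefix.
pipeline_names = repo.get_pipeline_file_paths(only_file_names=True)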
Code example #3
File: csv_data_step.py Project: zeta1999/zenml
def read_files_from_disk(pipeline: beam.Pipeline,
                         base_path: Text) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to read data from a collection of CSV files
    on a local file system.

    Args:
        pipeline: Input beam.Pipeline object coming from a TFX Executor.
        base_path: Base path pointing either to the directory containing the
         CSV files, or to a (single) CSV file.

    Returns:
        A beam.PCollection of data points. Each row in the collection of
         CSV files represents a single data point.

    """
    wildcard_qualifier = "*"
    file_pattern = os.path.join(base_path, wildcard_qualifier)

    if path_utils.is_dir(base_path):
        csv_files = path_utils.list_dir(base_path)
        if not csv_files:
            raise RuntimeError(
                f'Split pattern {file_pattern} does not match any files.')
    else:
        if path_utils.file_exists(base_path):
            csv_files = [base_path]
        else:
            raise RuntimeError(f'{base_path} does not exist.')

    # Filter out files with unsupported extensions
    allowed_file_exts = [".csv", ".txt"]  # ".dat"
    csv_files = [
        uri for uri in csv_files
        if os.path.splitext(uri)[1] in allowed_file_exts
    ]

    logger.info(f'Matched {len(csv_files)}: {csv_files}')

    # Always use the header from the first matched file
    logger.info(f'Using header from file: {csv_files[0]}.')
    column_names = path_utils.load_csv_header(csv_files[0])
    logger.info(f'Header: {column_names}.')

    # Use the wildcard pattern when base_path is a directory, otherwise read
    # the single file directly. Note that the extension filter above only
    # affects the header lookup; ReadFromText reads whatever the pattern
    # matches.
    read_pattern = file_pattern if path_utils.is_dir(base_path) else base_path

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=read_pattern,
                                                 skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | 'ExtractParsedCSVLines' >>
        beam.Map(lambda x: dict(zip(column_names, x[0]))))

    return parsed_csv_lines
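Because the function takes a beam.Pipeline and returns a PCollection, it composes like any other Beam source. A minimal sketch, with placeholder input and output paths:

import apache_beam as beam

with beam.Pipeline() as p:
    # Each element is a dict mapping column names to parsed values.
    data_points = read_files_from_disk(p, '/data/my_csvs')
    data_points | 'WriteOut' >> beam.io.WriteToText('/tmp/parsed_rows')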
Code example #4
File: yaml_utils.py Project: zeta1999/zenml
def write_json(file_path: Text, contents: Dict):
    """
    Write contents as JSON format to file_path.

    Args:
        file_path (str): Path to JSON file.
        contents (dict): Contents of JSON file as dict.
    """
    if not path_utils.is_remote(file_path):
        dir_ = str(Path(file_path).parent)
        if not path_utils.is_dir(dir_):
            # If it is a local path and it doesn't exist, raise an Exception.
            raise Exception(f'Directory {dir_} does not exist.')
    path_utils.write_file_contents(file_path, json.dumps(contents))
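Usage is straightforward for local paths; note that the parent directory must already exist, since the function deliberately raises instead of creating it:

# Succeeds as long as /tmp exists.
write_json('/tmp/config.json', {'epochs': 10, 'lr': 0.001})

# Raises: the parent directory is missing and is not created implicitly.
write_json('/tmp/does_not_exist/config.json', {'epochs': 10})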
Code example #5
File: torch_ff_trainer.py Project: zilongqiu/zenml
    def run_fn(self):
        train_dataset = self.input_fn(self.train_files,
                                      self.tf_transform_output)

        eval_dataset = self.input_fn(self.eval_files, self.tf_transform_output)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = self.model_fn(train_dataset, eval_dataset)

        model.to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        model.train()
        for e in range(1, self.epoch + 1):
            epoch_loss = 0
            epoch_acc = 0
            step_count = 0
            for x, y in train_dataset:
                step_count += 1
                X_batch, y_batch = x.to(device), y.to(device)
                optimizer.zero_grad()
                y_pred = model(X_batch)

                loss = criterion(y_pred, y_batch)
                acc = binary_acc(y_pred, y_batch)

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()

            print(f'Epoch {e:03}: | Loss: '
                  f'{epoch_loss / step_count:.5f} | Acc: '
                  f'{epoch_acc / step_count:.3f}')

        path_utils.create_dir_if_not_exists(self.serving_model_dir)
        if path_utils.is_remote(self.serving_model_dir):
            temp_model_dir = '__temp_model_dir__'
            temp_path = os.path.join(os.getcwd(), temp_model_dir)
            if path_utils.is_dir(temp_path):
                raise PermissionError(f'{temp_path} is used as a temp path '
                                      f'but it already exists. Please remove '
                                      f'it to continue.')
            # Stage the model in a local temp directory first, since
            # torch.save cannot write directly to remote artifact stores.
            path_utils.create_dir_if_not_exists(temp_path)
            torch.save(model, os.path.join(temp_path, 'model.pt'))
            path_utils.copy_dir(temp_path, self.serving_model_dir)
            path_utils.rm_dir(temp_path)
        else:
            torch.save(model, os.path.join(self.serving_model_dir, 'model.pt'))
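Since torch.save(model, ...) pickles the entire module, loading the result back is a single call. A sketch, assuming a local serving_model_dir path and that the model class is importable at load time:

import os
import torch

model = torch.load(os.path.join(serving_model_dir, 'model.pt'),
                   map_location='cpu')
model.eval()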
Code example #6
    def check_module_clean(self, source: Text):
        """
        Returns True if all files within source's module are committed.

        Args:
            source (str): relative module path pointing to a Class.
        """
        # import here to resolve circular dependency
        from zenml.utils import source_utils

        # Get the module path
        module_path = source_utils.get_module_source_from_source(source)

        # Get relative path of module because check_file_committed needs that
        module_dir = source_utils.get_relative_path_from_module_source(
            module_path)

        # Get absolute path of module because path_utils.list_dir needs that
        mod_abs_dir = source_utils.get_absolute_path_from_module_source(
            module_path)
        module_file_names = path_utils.list_dir(mod_abs_dir,
                                                only_file_names=True)

        # Go through each file in module and see if there are uncommitted ones
        for file_path in module_file_names:
            path = os.path.join(module_dir, file_path)

            # If it's .gitignored, skip it and don't do anything
            if len(self.git_repo.ignored(path)) > 0:
                continue

            if path_utils.is_dir(os.path.join(mod_abs_dir, file_path)):
                logger.warning(
                    f'The step {source} is contained inside a module that '
                    f'has sub-directories (the sub-directory {file_path} at '
                    f'{mod_abs_dir}). For now, ZenML supports only a flat '
                    f'directory structure in which to place Steps. Please '
                    f'make sure that the Step does not utilize the '
                    f'sub-directory.')
            if not self.check_file_committed(path):
                return False
        return True
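The .gitignore check relies on GitPython's Repo.ignored, which wraps git check-ignore and returns the subset of the given paths that match an ignore rule. A standalone sketch of that call:

from git import Repo

git_repo = Repo('.')

# Returns the paths (among those given) that are ignored by git.
ignored = git_repo.ignored('build/output.txt', 'src/main.py')
print(ignored)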
Code example #7
from zenml.logger import get_logger

logger = get_logger(__name__)

# Reset the pipeline root to the tests dir so the YAMLs are written there.
ZENML_ROOT = str(Path(zenml.__path__[0]).parent)
TEST_ROOT = os.path.join(ZENML_ROOT, "tests")
Repository.init_repo(TEST_ROOT, analytics_opt_in=False)

pipeline_root = os.path.join(TEST_ROOT, "pipelines")
csv_root = os.path.join(TEST_ROOT, "test_data")
image_root = os.path.join(csv_root, "images")


repo: Repository = Repository.get_instance()
if path_utils.is_dir(pipeline_root):
    path_utils.rm_dir(pipeline_root)
repo.zenml_config.set_pipelines_dir(pipeline_root)

try:
    for i in range(1, 6):
        training_pipeline = TrainingPipeline(name='csvtest{0}'.format(i))

        try:
            # Add a datasource. This will automatically track and version it.
            ds = CSVDatasource(name='my_csv_datasource',
                               path=os.path.join(csv_root, "my_dataframe.csv"))
        except AlreadyExistsException:
            ds = repo.get_datasource_by_name("my_csv_datasource")

        training_pipeline.add_datasource(ds)
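The try/except around the datasource mirrors a common pattern: the first loop iteration registers my_csv_datasource, and later iterations fetch the already-tracked datasource by name instead of failing on the duplicate. The snippet is truncated here; a hedged sketch of how each iteration would typically continue (the run() call is an assumption about the legacy ZenML pipeline API):

        # Hypothetical continuation: configure any remaining steps, then
        # execute the pipeline, which writes its YAML under pipeline_root.
        training_pipeline.run()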
Code example #8
    def run_fn(self):
        train_dataset = self.input_fn(self.train_files,
                                      self.tf_transform_output)

        eval_dataset = self.input_fn(self.eval_files, self.tf_transform_output)

        class LitModel(pl.LightningModule):
            def __init__(self):
                super().__init__()
                self.l1 = torch.nn.Linear(8, 64)
                self.layer_out = torch.nn.Linear(64, 1)

            def forward(self, x):
                x = torch.relu(self.l1(x))
                x = self.layer_out(x)
                return x

            def training_step(self, batch, batch_idx):
                x, y = batch
                y_hat = self(x)
                loss = F.binary_cross_entropy_with_logits(y_hat, y)
                tensorboard_logs = {'train_loss': loss}
                return {'loss': loss, 'log': tensorboard_logs}

            def configure_optimizers(self):
                return torch.optim.Adam(self.parameters(), lr=0.001)

            def train_dataloader(self):
                return train_dataset

            def validation_step(self, batch, batch_idx):
                x, y = batch
                y_hat = self(x)
                return {
                    'val_loss': F.binary_cross_entropy_with_logits(y_hat, y)
                }

            def validation_epoch_end(self, outputs):
                avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
                tensorboard_logs = {'val_loss': avg_loss}
                return {'avg_val_loss': avg_loss, 'log': tensorboard_logs}

            def val_dataloader(self):
                return eval_dataset

        model = LitModel()

        # most basic trainer, uses good defaults
        trainer = Trainer(
            default_root_dir=self.log_dir,
            max_epochs=self.epoch,
        )
        trainer.fit(model)

        path_utils.create_dir_if_not_exists(self.serving_model_dir)
        if path_utils.is_remote(self.serving_model_dir):
            temp_model_dir = '__temp_model_dir__'
            temp_path = os.path.join(os.getcwd(), temp_model_dir)
            if path_utils.is_dir(temp_path):
                raise PermissionError(f'{temp_path} is used as a temp path '
                                      f'but it already exists. Please remove '
                                      f'it to continue.')
            path_utils.create_dir_if_not_exists(temp_path)
            trainer.save_checkpoint(os.path.join(temp_path, 'model.ckpt'))
            path_utils.copy_dir(temp_path, self.serving_model_dir)
            path_utils.rm_dir(temp_path)
        else:
            trainer.save_checkpoint(
                os.path.join(self.serving_model_dir, 'model.ckpt'))
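Lightning checkpoints saved this way are restored with load_from_checkpoint on the LightningModule class. A sketch, assuming LitModel has been lifted out of run_fn so it is importable at load time, and a local serving_model_dir path:

# Restores the weights (and any saved hyperparameters) from the checkpoint.
model = LitModel.load_from_checkpoint(
    os.path.join(serving_model_dir, 'model.ckpt'))
model.eval()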
Code example #9
    def run_fn(self):
        train_dataset = self.input_fn(self.train_files,
                                      self.tf_transform_output)

        eval_dataset = self.input_fn(self.eval_files,
                                     self.tf_transform_output)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model = self.model_fn(train_dataset, eval_dataset)

        model.to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        writer = SummaryWriter(self.log_dir)

        model.train()

        total_count = 0

        for e in range(1, self.epochs + 1):
            epoch_loss = 0
            epoch_acc = 0
            step_count = 0
            for x, y, _ in train_dataset:
                step_count += 1
                total_count += 1

                x_batch = torch.cat([v.to(device) for v in x.values()], dim=-1)
                y_batch = torch.cat([v.to(device) for v in y.values()], dim=-1)
                optimizer.zero_grad()

                y_pred = model(x_batch)

                loss = criterion(y_pred, y_batch)
                acc = binary_acc(y_pred, y_batch)

                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                epoch_acc += acc.item()

                if e == 1 and step_count == 1:
                    writer.add_graph(model, x_batch)

                writer.add_scalar('training_loss', loss, total_count)
                writer.add_scalar('training_accuracy', acc, total_count)

            print(f'Epoch {e:03}: | Loss: '
                  f'{epoch_loss / step_count:.5f} | Acc: '
                  f'{epoch_acc / step_count:.3f}')

        # test
        test_results = self.test_fn(model, eval_dataset)
        utils.save_test_results(test_results, self.test_results)

        path_utils.create_dir_if_not_exists(self.serving_model_dir)
        if path_utils.is_remote(self.serving_model_dir):
            temp_model_dir = '__temp_model_dir__'
            temp_path = os.path.join(os.getcwd(), temp_model_dir)
            if path_utils.is_dir(temp_path):
                raise PermissionError(f'{temp_path} is used as a temp path '
                                      f'but it already exists. Please remove '
                                      f'it to continue.')
            # Stage the model in a local temp directory first, since
            # torch.save cannot write directly to remote artifact stores.
            path_utils.create_dir_if_not_exists(temp_path)
            torch.save(model, os.path.join(temp_path, 'model.pt'))
            path_utils.copy_dir(temp_path, self.serving_model_dir)
            path_utils.rm_dir(temp_path)
        else:
            torch.save(model, os.path.join(self.serving_model_dir, 'model.pt'))
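The SummaryWriter calls above write the model graph and per-step scalars as event files under self.log_dir, which is what TensorBoard renders. A sketch for inspecting those scalars programmatically instead of via the tensorboard --logdir CLI (log_dir is a placeholder):

from tensorboard.backend.event_processing.event_accumulator import \
    EventAccumulator

acc = EventAccumulator(log_dir)
acc.Reload()
print(acc.Tags()['scalars'])  # e.g. ['training_loss', 'training_accuracy']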