Ejemplo n.º 1
0
pipeline_root = os.path.join(TEST_ROOT, "pipelines")
csv_root = os.path.join(TEST_ROOT, "test_data")
image_root = os.path.join(csv_root, "images")

repo: Repository = Repository.get_instance()
if path_utils.is_dir(pipeline_root):
    path_utils.rm_dir(pipeline_root)
repo.zenml_config.set_pipelines_dir(pipeline_root)

try:
    for i in range(1, 6):
        training_pipeline = TrainingPipeline(name='csvtest{0}'.format(i))

        try:
            # Add a datasource. This will automatically track and version it.
            ds = CSVDatasource(name='my_csv_datasource',
                               path=os.path.join(csv_root, "my_dataframe.csv"))
        except:
            ds = repo.get_datasource_by_name("my_csv_datasource")

        training_pipeline.add_datasource(ds)

        # Add a split
        training_pipeline.add_split(
            CategoricalDomainSplit(categorical_column="name",
                                   split_map={
                                       'train': ["arnold", "nicholas"],
                                       'eval': ["lülük"]
                                   }))

        # Add a preprocessing unit
        training_pipeline.add_preprocesser(
Ejemplo n.º 2
0
    StandardPreprocesser
from zenml.core.steps.split.random_split import RandomSplit
from zenml.core.steps.trainer.feedforward_trainer import FeedForwardTrainer

artifact_store_path = 'gs://your-bucket-name/optional-subfolder'
project = 'PROJECT'  # the project to launch the VM in
cloudsql_connection_name = f'{project}:REGION:INSTANCE'
mysql_db = 'DATABASE'
mysql_user = '******'
mysql_pw = 'PASSWORD'
training_job_dir = artifact_store_path + '/gcaiptrainer/'

training_pipeline = TrainingPipeline(name='GCP Orchestrated')

# Add a datasource. This will automatically track and version it.
ds = CSVDatasource(name='Pima Indians Diabetes',
                   path='gs://zenml_quickstart/diabetes.csv')
training_pipeline.add_datasource(ds)

# Add a split
training_pipeline.add_split(RandomSplit(
    split_map={'train': 0.7, 'eval': 0.3}))

# Add a preprocessing unit
training_pipeline.add_preprocesser(
    StandardPreprocesser(
        features=['times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi',
                  'pedigree', 'age'],
        labels=['has_diabetes'],
        overwrite={'has_diabetes': {
            'transform': [{'method': 'no_transform', 'parameters': {}}]}}
    ))
Ejemplo n.º 3
0
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
#  or implied. See the License for the specific language governing
#  permissions and limitations under the License.

from examples.nlp.training.trainer import UrduTrainer
from zenml.core.datasources.csv_datasource import CSVDatasource
from zenml.core.pipelines.nlp_pipeline import NLPPipeline
from zenml.core.repo.repo import Repository
from zenml.core.steps.split.random_split import RandomSplit
from zenml.core.steps.tokenizer.hf_tokenizer import HuggingFaceTokenizerStep
from zenml.utils.exceptions import AlreadyExistsException

nlp_pipeline = NLPPipeline()

try:
    ds = CSVDatasource(name="my_text",
                       path="gs://zenml_quickstart/urdu_fake_news.csv")
except AlreadyExistsException:
    ds = Repository.get_instance().get_datasource_by_name(name="my_text")

nlp_pipeline.add_datasource(ds)

tokenizer_step = HuggingFaceTokenizerStep(text_feature="news",
                                          tokenizer="bert-wordpiece",
                                          vocab_size=3000)

nlp_pipeline.add_tokenizer(tokenizer_step=tokenizer_step)

nlp_pipeline.add_split(RandomSplit(split_map={"train": 0.9,
                                              "eval": 0.1}))

nlp_pipeline.add_trainer(UrduTrainer(model_name="distilbert-base-uncased",