pipeline_root = os.path.join(TEST_ROOT, "pipelines") csv_root = os.path.join(TEST_ROOT, "test_data") image_root = os.path.join(csv_root, "images") repo: Repository = Repository.get_instance() if path_utils.is_dir(pipeline_root): path_utils.rm_dir(pipeline_root) repo.zenml_config.set_pipelines_dir(pipeline_root) try: for i in range(1, 6): training_pipeline = TrainingPipeline(name='csvtest{0}'.format(i)) try: # Add a datasource. This will automatically track and version it. ds = CSVDatasource(name='my_csv_datasource', path=os.path.join(csv_root, "my_dataframe.csv")) except: ds = repo.get_datasource_by_name("my_csv_datasource") training_pipeline.add_datasource(ds) # Add a split training_pipeline.add_split( CategoricalDomainSplit(categorical_column="name", split_map={ 'train': ["arnold", "nicholas"], 'eval': ["lülük"] })) # Add a preprocessing unit training_pipeline.add_preprocesser(
StandardPreprocesser  # NOTE(review): stray tail of a statement cut at the chunk boundary — confirm against the full file

from zenml.core.steps.split.random_split import RandomSplit
from zenml.core.steps.trainer.feedforward_trainer import FeedForwardTrainer

# --- GCP configuration ----------------------------------------------------
artifact_store_path = 'gs://your-bucket-name/optional-subfolder'
project = 'PROJECT'  # the project to launch the VM in
cloudsql_connection_name = f'{project}:REGION:INSTANCE'
mysql_db = 'DATABASE'
mysql_user = '******'
mysql_pw = 'PASSWORD'
training_job_dir = artifact_store_path + '/gcaiptrainer/'

# Build a training pipeline that will be orchestrated on GCP.
training_pipeline = TrainingPipeline(name='GCP Orchestrated')

# Add a datasource. This will automatically track and version it.
ds = CSVDatasource(name='Pima Indians Diabetes',
                   path='gs://zenml_quickstart/diabetes.csv')
training_pipeline.add_datasource(ds)

# Add a split
training_pipeline.add_split(RandomSplit(split_map={'train': 0.7, 'eval': 0.3}))

# Add a preprocessing unit
training_pipeline.add_preprocesser(
    StandardPreprocesser(
        features=['times_pregnant', 'pgc', 'dbp', 'tst', 'insulin', 'bmi',
                  'pedigree', 'age'],
        labels=['has_diabetes'],
        overwrite={'has_diabetes': {
            'transform': [{'method': 'no_transform', 'parameters': {}}]}}
    ))
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express # or implied. See the License for the specific language governing # permissions and limitations under the License. from examples.nlp.training.trainer import UrduTrainer from zenml.core.datasources.csv_datasource import CSVDatasource from zenml.core.pipelines.nlp_pipeline import NLPPipeline from zenml.core.repo.repo import Repository from zenml.core.steps.split.random_split import RandomSplit from zenml.core.steps.tokenizer.hf_tokenizer import HuggingFaceTokenizerStep from zenml.utils.exceptions import AlreadyExistsException nlp_pipeline = NLPPipeline() try: ds = CSVDatasource(name="my_text", path="gs://zenml_quickstart/urdu_fake_news.csv") except AlreadyExistsException: ds = Repository.get_instance().get_datasource_by_name(name="my_text") nlp_pipeline.add_datasource(ds) tokenizer_step = HuggingFaceTokenizerStep(text_feature="news", tokenizer="bert-wordpiece", vocab_size=3000) nlp_pipeline.add_tokenizer(tokenizer_step=tokenizer_step) nlp_pipeline.add_split(RandomSplit(split_map={"train": 0.9, "eval": 0.1})) nlp_pipeline.add_trainer(UrduTrainer(model_name="distilbert-base-uncased",