Example #1
 def train(df):
     # https://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_logistic_regression_mnist.html
     clf = LogisticRegression(C=50. / 5000,
                              penalty='l1',
                              solver='saga',
                              tol=0.1)
     x_train, y_train = df[0], df[1]
     clf.fit(x_train, y_train)
     model_path = get_file_dir(__file__) + '/saved_model'
     if not os.path.exists(model_path):
         os.makedirs(model_path)
     model_timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
     model_path = model_path + '/' + model_timestamp
     dump(clf, model_path)
     model_meta: af.ModelMeta = execution_context.config.get(
         'model_info')
     print(model_meta.name)
     print(model_timestamp)
     # When registering a model, a corresponding event will be sent to the downstream job as well.
     af.register_model_version(
         model=model_meta,
         model_path=model_path,
         current_stage=ModelVersionStage.GENERATED)
     print(
         af.get_latest_generated_model_version(
             model_name=model_meta.name).model_path)
     return df
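A minimal counterpart (not part of the source listing) for reading the model back: it assumes that dump above is joblib's dump and that the saved_model directory contains only the timestamped files written by train().

import os
from joblib import load  # assumed to be the counterpart of the dump() used above

def load_latest_model(saved_model_dir):
    # Timestamps use %Y%m%d%H%M%S, so lexicographic order matches chronological order
    latest = sorted(os.listdir(saved_model_dir))[-1]
    return load(os.path.join(saved_model_dir, latest))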
Example #2
    def test_project_download_local_same_time(self):
        project_path = get_file_dir(__file__)
        upload_path = '/tmp/upload'
        download_path = '/tmp/download'
        if not os.path.exists(upload_path):
            os.makedirs(upload_path)
        if not os.path.exists(download_path):
            os.makedirs(download_path)
        config = {'local_repository': download_path, 'remote_repository': upload_path,
                  'blob_manager_class': 'ai_flow_plugins.blob_manager_plugins.local_blob_manager.LocalBlobManager'}

        blob_manager = BlobManagerFactory.get_blob_manager(config)
        uploaded_path = blob_manager.upload_project('1', project_path)
        download_project_path = "/tmp/download/workflow_1_project"
        if os.path.exists(download_project_path):
            shutil.rmtree(download_project_path)

        def download_project():
            blob_manager.download_project('1', uploaded_path)

        t1 = threading.Thread(target=download_project, args=())
        t1.daemon = True
        t1.start()
        t2 = threading.Thread(target=download_project, args=())
        t2.daemon = True
        t2.start()
        t1.join()
        t2.join()
        downloaded_path = blob_manager.download_project('1', uploaded_path)
        self.assertEqual('/tmp/download/workflow_1_project/blob_manager_plugins', downloaded_path)
        if os.path.exists(upload_path):
            shutil.rmtree(upload_path)
        if os.path.exists(download_path):
            shutil.rmtree(download_path)
Example #3
    def test_project_upload_download_local(self):
        project_path = get_file_dir(__file__)
        config = {'blob_manager_class': 'ai_flow_plugins.blob_manager_plugins.local_blob_manager.LocalBlobManager'}
        blob_manager = BlobManagerFactory.get_blob_manager(config)
        uploaded_path = blob_manager.upload_project('1', project_path)
        self.assertEqual(uploaded_path, project_path)

        downloaded_path = blob_manager.download_project('1', uploaded_path)
        self.assertEqual(project_path, downloaded_path)
Example #4
 def test_load_project_config(self):
     project_path = get_file_dir(__file__)
     project_config = ProjectConfig()
     project_config.load_from_file(os.path.join(project_path, 'project.yaml'))
     self.assertEqual(project_config.get_server_uri(), "localhost:50051")
     self.assertIsNone(project_config.get('ai_flow config', None))
     self.assertEqual(project_config['ai_flow_home'], '/opt/ai_flow')
     self.assertEqual(project_config['ai_flow_job_master.host'], 'localhost')
     self.assertEqual(project_config['ai_flow_job_master.port'], 8081)
     self.assertEqual(project_config['ai_flow_conf'], 'taskmanager.slot=2')
Example #5
    def test_project_upload_download_local_2(self):
        project_path = get_file_dir(__file__)
        config = {'local_repository': '/tmp', 'remote_repository': '/tmp',
                  'blob_manager_class': 'ai_flow_plugins.blob_manager_plugins.local_blob_manager.LocalBlobManager'}

        blob_manager = BlobManagerFactory.get_blob_manager(config)
        uploaded_path = blob_manager.upload_project('1', project_path)

        downloaded_path = blob_manager.download_project('1', uploaded_path)
        self.assertEqual('/tmp/workflow_1_project/blob_manager_plugins', downloaded_path)
Example #6
 def test_dump_load_configuration(self):
     config = AIFlowConfiguration()
     test_yaml = get_file_dir(__file__) + "/test.yaml"
     config['a'] = 'a'
     config.dump_to_file(test_yaml)
     config.clear()
     config.load_from_file(test_yaml)
     self.assertEqual('a', config['a'])
     if os.path.exists(test_yaml):
         os.remove(test_yaml)
Example #7
    def test_project_upload_download_oss(self):
        project_path = get_file_dir(__file__)
        config = {
            'blob_manager_class':
            'ai_flow_plugins.blob_manager_plugins.oss_blob_manager.OssBlobManager',
            'local_repository': '/tmp',
            'access_key_id': os.environ.get('access_key_id'),
            'access_key_secret': os.environ.get('access_key_secret'),
            'endpoint': os.environ.get('endpoint'),
            'bucket': os.environ.get('bucket'),
            'repo_name': os.environ.get('repo_name')
        }

        blob_manager = BlobManagerFactory.get_blob_manager(config)
        uploaded_path = blob_manager.upload_project('1', project_path)

        downloaded_path = blob_manager.download_project('1', uploaded_path)
        self.assertEqual('/tmp/workflow_1_project/project', downloaded_path)
Example #8
    def process(self, execution_context: ExecutionContext,
                input_list: List) -> List:
        """
        Train and save KNN model
        """
        model_meta: af.ModelMeta = execution_context.config.get('model_info')
        clf = KNeighborsClassifier(n_neighbors=5)
        x_train, y_train = input_list[0][0], input_list[0][1]
        clf.fit(x_train, y_train)

        # Save model to local
        model_path = get_file_dir(__file__) + '/saved_model'
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_timestamp = time.strftime('%Y_%m_%d_%H_%M_%S', time.localtime())
        model_path = model_path + '/' + model_timestamp
        dump(clf, model_path)
        af.register_model_version(model=model_meta, model_path=model_path)
        return []
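A sketch (not from the source) of how the version registered above could be consumed on the prediction side. It reuses af.get_latest_generated_model_version from Example #1 and assumes that load is joblib's counterpart to dump and that the version registered by the trainer lands in the GENERATED stage.

import ai_flow as af  # as in the examples above
from joblib import load  # assumed counterpart of the dump() used above

def predict_with_latest(model_meta, x_test):
    # Look up the newest GENERATED version registered by the trainer above
    version = af.get_latest_generated_model_version(model_name=model_meta.name)
    clf = load(version.model_path)
    return clf.predict(x_test)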
Example #9
 def test_build_project_context(self):
     project_path = get_file_dir(__file__)
     project_context = build_project_context(project_path)
     self.assertTrue(
         project_context.get_project_config_file().endswith('project.yaml'))
     self.assertTrue(
         project_context.get_generated_path().endswith('generated'))
     self.assertTrue(
         project_context.get_workflows_path().endswith('workflows'))
     self.assertTrue(
         project_context.get_dependencies_path().endswith('dependencies'))
     self.assertTrue(
         project_context.get_python_dependencies_path().endswith('python'))
     self.assertTrue(
         project_context.get_go_dependencies_path().endswith('go'))
     self.assertTrue(
         project_context.get_jar_dependencies_path().endswith('jar'))
     self.assertTrue(
         project_context.get_resources_path().endswith('resources'))
Example #10
    def process(self, execution_context: ExecutionContext,
                input_list: List) -> List:
        # https://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_logistic_regression_mnist.html
        model_meta: af.ModelMeta = execution_context.config.get('model_info')
        clf = LogisticRegression(C=50. / 5000,
                                 penalty='l1',
                                 solver='saga',
                                 tol=0.1)
        x_train, y_train = input_list[0][0], input_list[0][1]
        clf.fit(x_train, y_train)
        model_path = get_file_dir(__file__) + '/saved_model'
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
        model_path = model_path + '/' + model_timestamp
        dump(clf, model_path)
        af.register_model_version(model=model_meta, model_path=model_path)

        return []
Example #11
def run_workflow():
    # Init project
    af.init_ai_flow_context()

    artifact_prefix = af.current_project_config().get_project_name() + "."
    # Training of model
    with af.job_config('train'):
        # Register metadata of the training data (dataset) and read the dataset (i.e. the training dataset)
        train_dataset = af.register_dataset(name=artifact_prefix + 'train_dataset',
                                            uri=DATASET_URI.format('train'))
        train_read_dataset = af.read_dataset(dataset_info=train_dataset,
                                             read_dataset_processor=DatasetReader())

        # Register model metadata and train model
        train_model = af.register_model(model_name=artifact_prefix + 'KNN',
                                        model_desc='KNN model')
        train_channel = af.train(input=[train_read_dataset],
                                 training_processor=ModelTrainer(),
                                 model_info=train_model)

    # Validation of model
    with af.job_config('validate'):
        # Read validation dataset
        validate_dataset = af.register_dataset(name=artifact_prefix + 'validate_dataset',
                                               uri=DATASET_URI.format('test'))
        # Validate model before it is used to predict
        validate_read_dataset = af.read_dataset(dataset_info=validate_dataset,
                                                read_dataset_processor=ValidateDatasetReader())
        validate_artifact_name = artifact_prefix + 'validate_artifact'
        validate_artifact = af.register_artifact(name=validate_artifact_name,
                                                 uri=get_file_dir(__file__) + '/validate_result')
        validate_channel = af.model_validate(input=[validate_read_dataset],
                                             model_info=train_model,
                                             model_validation_processor=ModelValidator(validate_artifact_name))

    # Prediction (inference) using Flink
    with af.job_config('predict'):
        # Read test data and do prediction
        predict_dataset = af.register_dataset(name=artifact_prefix + 'predict_dataset',
                                              uri=DATASET_URI.format('test'))
        predict_read_dataset = af.read_dataset(dataset_info=predict_dataset,
                                               read_dataset_processor=Source())
        predict_channel = af.predict(input=[predict_read_dataset],
                                     model_info=train_model,
                                     prediction_processor=Predictor())
        # Save prediction result
        write_dataset = af.register_dataset(name=artifact_prefix + 'write_dataset',
                                            uri=get_file_dir(__file__) + '/predict_result.csv')
        af.write_dataset(input=predict_channel,
                         dataset_info=write_dataset,
                         write_dataset_processor=Sink())

    # Define the relation graph connected by control edges: train -> validate -> predict
    af.action_on_model_version_event(job_name='validate',
                                     model_version_event_type=ModelVersionEventType.MODEL_GENERATED,
                                     model_name=train_model.name)
    af.action_on_model_version_event(job_name='predict',
                                     model_version_event_type=ModelVersionEventType.MODEL_VALIDATED,
                                     model_name=train_model.name)
    # Submit workflow
    af.workflow_operation.submit_workflow(af.current_workflow_config().workflow_name)
    # Run workflow
    af.workflow_operation.start_new_workflow_execution(af.current_workflow_config().workflow_name)
Example #12
def run_workflow():
    af.init_ai_flow_context()

    artifact_prefix = af.current_project_config().get_project_name() + "."

    # The train job is configured as a periodic job, which means it will
    # run every `interval` seconds (`interval` is defined in workflow_config.yaml)
    with af.job_config('train'):
        # Register metadata of the raw training data (dataset) and read the dataset (i.e. the training dataset)
        train_dataset = af.register_dataset(name=artifact_prefix +
                                            'train_dataset',
                                            uri=DATASET_URI.format('train'))
        train_read_dataset = af.read_dataset(
            dataset_info=train_dataset, read_dataset_processor=DatasetReader())

        # Transform(preprocessing) dataset
        train_transform = af.transform(
            input=[train_read_dataset],
            transform_processor=DatasetTransformer())

        # Register model metadata and train model
        train_model = af.register_model(model_name=artifact_prefix +
                                        'logistic-regression',
                                        model_desc='logistic regression model')
        train_channel = af.train(input=[train_transform],
                                 training_processor=ModelTrainer(),
                                 model_info=train_model)
    with af.job_config('validate'):
        # Validation of model
        # Read validation dataset and validate model before it is used to predict

        validate_dataset = af.register_dataset(
            name=artifact_prefix + 'validate_dataset',
            uri=DATASET_URI.format('evaluate'))
        validate_read_dataset = af.read_dataset(
            dataset_info=validate_dataset,
            read_dataset_processor=ValidateDatasetReader())
        validate_transform = af.transform(
            input=[validate_read_dataset],
            transform_processor=ValidateTransformer())
        validate_artifact_name = artifact_prefix + 'validate_artifact'
        validate_artifact = af.register_artifact(name=validate_artifact_name,
                                                 uri=get_file_dir(__file__) +
                                                 '/validate_result')
        validate_channel = af.model_validate(
            input=[validate_transform],
            model_info=train_model,
            model_validation_processor=ModelValidator(validate_artifact_name))
    with af.job_config('push'):
        # Push model to serving
        # Register metadata of pushed model
        push_model_artifact_name = artifact_prefix + 'push_model_artifact'
        push_model_artifact = af.register_artifact(
            name=push_model_artifact_name,
            uri=get_file_dir(__file__) + '/pushed_model')
        af.push_model(
            model_info=train_model,
            pushing_model_processor=ModelPusher(push_model_artifact_name))

    with af.job_config('predict'):
        # Prediction(Inference)
        predict_dataset = af.register_dataset(
            name=artifact_prefix + 'predict_dataset',
            uri=DATASET_URI.format('predict'))
        predict_read_dataset = af.read_dataset(
            dataset_info=predict_dataset,
            read_dataset_processor=PredictDatasetReader())
        predict_transform = af.transform(
            input=[predict_read_dataset],
            transform_processor=PredictTransformer())
        predict_channel = af.predict(input=[predict_transform],
                                     model_info=train_model,
                                     prediction_processor=ModelPredictor())
        # Save prediction result
        write_dataset = af.register_dataset(
            name=artifact_prefix + 'write_dataset',
            uri=get_file_dir(__file__) + '/predict_result')
        af.write_dataset(input=predict_channel,
                         dataset_info=write_dataset,
                         write_dataset_processor=DatasetWriter())

    # Define the relation graph connected by control edges:
    # once a round of training is done the validator is launched, and the
    # pusher is launched if the new model is better. Prediction starts once
    # the first round of training is done; whenever the pusher pushes (deploys)
    # a new model, the predictor picks up the latest deployed model as well.
    af.action_on_model_version_event(
        job_name='validate',
        model_version_event_type=ModelVersionEventType.MODEL_GENERATED,
        model_name=train_model.name)
    af.action_on_model_version_event(
        job_name='push',
        model_version_event_type=ModelVersionEventType.MODEL_VALIDATED,
        model_name=train_model.name)

    # Run workflow
    af.workflow_operation.submit_workflow(
        af.current_workflow_config().workflow_name)
    af.workflow_operation.start_new_workflow_execution(
        af.current_workflow_config().workflow_name)
Example #13
def run_workflow():
    af.init_ai_flow_context()

    artifact_prefix = af.current_project_config().get_project_name() + "."

    with af.job_config('train'):
        # Register metadata of the raw training data (dataset) and read the dataset (i.e. the training dataset)
        train_dataset = af.register_dataset(name=artifact_prefix +
                                            'train_dataset',
                                            uri=DATASET_URI.format('train'))
        train_read_dataset = af.read_dataset(
            dataset_info=train_dataset,
            read_dataset_processor=TrainDatasetReader())
        train_transform = af.transform(
            input=[train_read_dataset],
            transform_processor=TrainDatasetTransformer())
        train_model = af.register_model(model_name=artifact_prefix +
                                        'logistic-regression',
                                        model_desc='logistic regression model')
        train_channel = af.train(input=[train_transform],
                                 training_processor=ModelTrainer(),
                                 model_info=train_model)
    with af.job_config('validate'):
        validate_dataset = af.register_dataset(
            name=artifact_prefix + 'validate_dataset',
            uri=DATASET_URI.format('evaluate'))
        validate_read_dataset = af.read_dataset(
            dataset_info=validate_dataset,
            read_dataset_processor=ValidateDatasetReader())
        validate_transform = af.transform(
            input=[validate_read_dataset],
            transform_processor=ValidateTransformer())
        validate_artifact_name = artifact_prefix + 'validate_artifact'
        validate_artifact = af.register_artifact(name=validate_artifact_name,
                                                 uri=get_file_dir(__file__) +
                                                 '/validate_result')
        validate_channel = af.model_validate(
            input=[validate_transform],
            model_info=train_model,
            model_validation_processor=ModelValidator(validate_artifact_name))
    with af.job_config('push'):
        # Push model to serving
        # Register metadata of pushed model
        push_model_artifact_name = artifact_prefix + 'push_model_artifact'
        push_model_artifact = af.register_artifact(
            name=push_model_artifact_name,
            uri=get_file_dir(__file__) + '/pushed_model')
        af.push_model(
            model_info=train_model,
            pushing_model_processor=ModelPusher(push_model_artifact_name))
    with af.job_config('predict'):
        predict_dataset = af.register_dataset(
            name=artifact_prefix + 'predict_dataset',
            uri=DATASET_URI.format('predict'))
        predict_read_dataset = af.read_dataset(
            dataset_info=predict_dataset,
            read_dataset_processor=PredictDatasetReader())
        predict_transform = af.transform(
            input=[predict_read_dataset],
            transform_processor=PredictTransformer())
        predict_channel = af.predict(input=[predict_transform],
                                     model_info=train_model,
                                     prediction_processor=ModelPredictor())
        write_dataset = af.register_dataset(
            name=artifact_prefix + 'export_dataset',
            uri=get_file_dir(__file__) + '/predict_result')
        af.write_dataset(input=predict_channel,
                         dataset_info=write_dataset,
                         write_dataset_processor=DatasetWriter())

    af.action_on_model_version_event(
        job_name='validate',
        model_version_event_type=ModelVersionEventType.MODEL_GENERATED,
        model_name=train_model.name)
    af.action_on_model_version_event(
        job_name='push',
        model_version_event_type=ModelVersionEventType.MODEL_VALIDATED,
        model_name=train_model.name)

    # Run workflow
    af.workflow_operation.submit_workflow(
        af.current_workflow_config().workflow_name)
    af.workflow_operation.start_new_workflow_execution(
        af.current_workflow_config().workflow_name)
Example #14
def run_workflow():
    af.init_ai_flow_context()
    artifact_prefix = af.current_project_config().get_project_name() + "."
    with af.job_config('train'):
        # Training of model
        # Register metadata of the raw training data (dataset) and read the dataset (i.e. the training dataset)
        train_dataset = af.register_dataset(name=artifact_prefix +
                                            'train_dataset',
                                            uri=DATASET_URI.format('train'))
        train_read_dataset = af.read_dataset(
            dataset_info=train_dataset, read_dataset_processor=DatasetReader())

        # Transform(preprocessing) dataset
        train_transform = af.transform(
            input=[train_read_dataset],
            transform_processor=DatasetTransformer())

        # Register model metadata and train model
        train_model = af.register_model(model_name=artifact_prefix +
                                        'logistic-regression',
                                        model_desc='logistic regression model')
        train_channel = af.train(input=[train_transform],
                                 training_processor=ModelTrainer(),
                                 model_info=train_model)

    with af.job_config('evaluate'):
        # Evaluation of model
        evaluate_dataset = af.register_dataset(
            name=artifact_prefix + 'evaluate_dataset',
            uri=DATASET_URI.format('evaluate'))
        evaluate_read_dataset = af.read_dataset(
            dataset_info=evaluate_dataset,
            read_dataset_processor=EvaluateDatasetReader())
        evaluate_transform = af.transform(
            input=[evaluate_read_dataset],
            transform_processor=EvaluateTransformer())
        # Register disk path used to save evaluate result
        evaluate_artifact_name = artifact_prefix + 'evaluate_artifact'
        evaluate_artifact = af.register_artifact(name=evaluate_artifact_name,
                                                 uri=get_file_dir(__file__) +
                                                 '/evaluate_result')
        # Evaluate model
        evaluate_channel = af.evaluate(
            input=[evaluate_transform],
            model_info=train_model,
            evaluation_processor=ModelEvaluator(evaluate_artifact_name))

    with af.job_config('validate'):
        # Validation of model
        # Read validation dataset and validate model before it is used to predict

        validate_dataset = af.register_dataset(
            name=artifact_prefix + 'validate_dataset',
            uri=DATASET_URI.format('evaluate'))
        validate_read_dataset = af.read_dataset(
            dataset_info=validate_dataset,
            read_dataset_processor=ValidateDatasetReader())
        validate_transform = af.transform(
            input=[validate_read_dataset],
            transform_processor=ValidateTransformer())
        validate_artifact_name = artifact_prefix + 'validate_artifact'
        validate_artifact = af.register_artifact(name=validate_artifact_name,
                                                 uri=get_file_dir(__file__) +
                                                 '/validate_result')
        validate_channel = af.model_validate(
            input=[validate_transform],
            model_info=train_model,
            model_validation_processor=ModelValidator(validate_artifact_name))
    with af.job_config('push'):
        # Push model to serving
        # Register metadata of pushed model
        push_model_artifact_name = artifact_prefix + 'push_model_artifact'
        push_model_artifact = af.register_artifact(
            name=push_model_artifact_name,
            uri=get_file_dir(__file__) + '/pushed_model')
        af.push_model(
            model_info=train_model,
            pushing_model_processor=ModelPusher(push_model_artifact_name))

    with af.job_config('predict'):
        # Prediction(Inference)
        predict_dataset = af.register_dataset(
            name=artifact_prefix + 'predict_dataset',
            uri=DATASET_URI.format('predict'))
        predict_read_dataset = af.read_dataset(
            dataset_info=predict_dataset,
            read_dataset_processor=PredictDatasetReader())
        predict_transform = af.transform(
            input=[predict_read_dataset],
            transform_processor=PredictTransformer())
        predict_channel = af.predict(input=[predict_transform],
                                     model_info=train_model,
                                     prediction_processor=ModelPredictor())
        # Save prediction result
        write_dataset = af.register_dataset(
            name=artifact_prefix + 'write_dataset',
            uri=get_file_dir(__file__) + '/predict_result')
        af.write_dataset(input=predict_channel,
                         dataset_info=write_dataset,
                         write_dataset_processor=DatasetWriter())

    # Define the relation graph connected by control edges: train -> evaluate -> validate -> push -> predict
    af.action_on_job_status('evaluate', 'train')
    af.action_on_job_status('validate', 'evaluate')
    af.action_on_job_status('push', 'validate')
    af.action_on_job_status('predict', 'push')

    # Run workflow
    af.workflow_operation.submit_workflow(
        af.current_workflow_config().workflow_name)
    af.workflow_operation.start_new_workflow_execution(
        af.current_workflow_config().workflow_name)
Example #15
        # Read training example using af.read_example()
        # example_info is the meta information of the example
        read_example_channel = af.read_example(example_info=read_example_meta, exec_args=ExecuteArgs(
            batch_properties=Args(header=None, names=["a", "b", "c"])))

        # Transform examples using af.transform()
        transform_channel = af.transform(input_data_list=[read_example_channel],
                                         executor=PythonObjectExecutor(python_object=SimpleTransform()))

        write_example_meta = af.register_example(name='write_example', support_type=ExampleSupportType.EXAMPLE_BATCH,
                                                 data_format='csv', data_type='pandas', batch_uri=sink_path)

        # Write example to specific path
        write = af.write_example(input_data=transform_channel, example_info=write_example_meta,
                                 exec_args=ExecuteArgs(batch_properties=Args(sep=',', header=False, index=False)))

    # Add a control dependency, which means the read_example job will start right after the command-line job finishes.
    af.stop_before_control_dependency(read_example_channel, cmd_job)

    transform_dag = 'simple_transform'
    af.deploy_to_airflow(project_root_path, dag_id=transform_dag)
    context = af.run(project_path=project_root_path,
                     dag_id=transform_dag,
                     scheduler_type=SchedulerType.AIRFLOW)


if __name__ == '__main__':
    project_path = os.path.dirname(get_file_dir(__file__))
    run_project(project_path)