Example #1
 def _fit(self, dataset):
     stages = self.getStages()
     for stage in stages:
         if not (isinstance(stage, Estimator) or isinstance(stage, Transformer)):
             raise TypeError(
                 "Cannot recognize a pipeline stage of type %s." % type(stage))
     indexOfLastEstimator = -1
     for i, stage in enumerate(stages):
         if isinstance(stage, Estimator):
             indexOfLastEstimator = i
     transformers = []
     for i, stage in enumerate(stages):
         if i <= indexOfLastEstimator:
             if isinstance(stage, Transformer):
                 transformers.append(stage)
                 dataset = stage.transform(dataset)
             elif isinstance(stage, RecursiveEstimator):
                 model = stage.fit(dataset, pipeline=PipelineModel(transformers))
                 transformers.append(model)
                 if i < indexOfLastEstimator:
                     dataset = model.transform(dataset)
             else:
                 model = stage.fit(dataset)
                 transformers.append(model)
                 if i < indexOfLastEstimator:
                     dataset = model.transform(dataset)
         else:
             transformers.append(stage)
     return PipelineModel(transformers)
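The snippet above is a recursive variant of Spark's Pipeline._fit, where a RecursiveEstimator is handed the partially built PipelineModel while fitting. A minimal sketch using plain pyspark.ml (no recursive stages, all names below are illustrative) shows the same last-estimator rule: transformers before the last estimator are applied eagerly, estimators are fitted, and anything after the last estimator is carried into the resulting PipelineModel unchanged.

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.master("local[1]").getOrCreate()
train = spark.createDataFrame(
    [(0, "a b c spark", 1.0), (1, "x y z", 0.0)], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="features", numFeatures=32)
lr = LogisticRegression(maxIter=5)

# fit() walks the stages much like _fit above and returns a PipelineModel
model = Pipeline(stages=[tokenizer, tf, lr]).fit(train)
print([type(s).__name__ for s in model.stages])  # Tokenizer, HashingTF, LogisticRegressionModel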
Example #2
def main():

    if len(sys.argv) == 5:

        print("load data and transform features...")
        df = load_data(sys.argv[1])
        df, labelIndexer = features_transform(df)
        trainingData, testData = df.randomSplit([0.7, 0.3], seed=6)
        trainingData.cache()

        print(
            "train or load RandomForest models with tuning parameters, then make a prediction on testData..."
        )
        if os.path.exists(sys.argv[2]):
            persistedModel = PipelineModel.load(sys.argv[2])
            evaluator = MulticlassClassificationEvaluator(metricName='f1')
            predictions = persistedModel.transform(testData)
        else:
            model1, predictions, evaluator = rf_model(trainingData, testData,
                                                      labelIndexer)
        print("rf model evaluation...")
        # print F1 score of the prediction
        print_scores(predictions, evaluator)

        print(
            "train or load LogisticRegression models with tuning parameters, then make a prediction on testData"
        )
        if os.path.exists(sys.argv[3]):
            persistedModel = PipelineModel.load(sys.argv[3])
            evaluator = MulticlassClassificationEvaluator(metricName='f1')
            predictions = persistedModel.transform(testData)
        else:
            model2, predictions, evaluator = lr_model(trainingData, testData,
                                                      labelIndexer)
        print("lr model evaluation...")
        # print F1 score of the prediction
        print_scores(predictions, evaluator)

        print(
            "train or load DecisionTree models with tuning parameters, then make a prediction on testData"
        )
        if os.path.exists(sys.argv[4]):
            persistedModel = PipelineModel.load(sys.argv[4])
            evaluator = MulticlassClassificationEvaluator(metricName='f1')
            predictions = persistedModel.transform(testData)
        else:
            model3, predictions, evaluator = dt_model(trainingData, testData,
                                                      labelIndexer)
        print("dt model evaluation...")
        # print F1 score of the prediction
        print_scores(predictions, evaluator)

    else:
        print('Please provide the filepath of the features data '
              'as the first argument and the filepaths for saving/loading the '
              'RandomForest, LogisticRegression and DecisionTree models as the '
              'second through fourth arguments.\n\nExample: python '
              'train_classifier.py ../data/user_item.csv rf lr dt')
Example #3
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
            tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #4
def load_xgb_model(path, m_type):
    """

    :param path: model输入路径
    :param m_type: model类型
    :return: 输出对应model
    """
    # 获取model目录下metadata dict
    metadata = DefaultParamsReader.loadMetadata(path, sc)
    # stages_dir = os.path.join(path, "stages")  # stage路径
    stages_dir = path + "/stages"
    stage_uids = metadata['paramMap']['stageUids']  # metadata中model的uid、路径名
    stage_paths = {}  # 构建空dict
    # 循环遍历
    for index, stage_uid in enumerate(stage_uids):
        # 遍历model,获取相应的stages目录下model的路径
        stage_path = \
            PipelineSharedReadWrite.getStagePath(stage_uid, index, len(stage_uids), stages_dir)
        # stage_paths.append(stage_path)
        # 获取model,以及将相应路径写入字典
        key = stage_uid.split('_')[0]
        stage_paths[key] = str(stage_path)

    # stage_paths = load_xgb_model(path, sc)
    # 根据model type选择相应的model load方法,并返回相应的model
    # model type为PipelineModel、XGBoostClassificationModel
    if m_type == 'PipelineModel':
        model_path = stage_paths['PipelineModel']
        model = PipelineModel.load(model_path)
    else:
        model_path = stage_paths['xgbc']
        model = XGBoostClassificationModel.load(model_path)

    return model
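A hedged usage sketch for load_xgb_model; the HDFS path below is a placeholder, and `sc` must already be a live SparkContext in the enclosing scope.

saved_path = "hdfs:///models/xgb_pipeline"  # hypothetical saved pipeline directory
# full_model = load_xgb_model(saved_path, "PipelineModel")
# xgb_stage = load_xgb_model(saved_path, "XGBoostClassificationModel")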
Example #5
def load_model(path, run_id=None, dfs_tmpdir=DFS_TMP):
    """
    Load the Spark MLlib model from the path.

    :param run_id: Run ID. If provided, combined with ``path`` to identify the model.
    :param path: Local filesystem path or run-relative artifact path to the model.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel
    """
    if run_id is not None:
        path = mlflow.tracking._get_model_log_dir(model_name=path, run_id=run_id)
    m = Model.load(os.path.join(path, 'MLmodel'))
    if FLAVOR_NAME not in m.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))
    conf = m.flavors[FLAVOR_NAME]
    model_path = os.path.join(path, conf['model_data'])
    tmp_path = _tmp_path(dfs_tmpdir)
    try:
        # Spark ML expects the model to be stored on DFS
        # Copy the model to a temp DFS location first.
        _HadoopFileSystem.copy_from_local_file(model_path, tmp_path, removeSrc=False)
        pipeline_model = PipelineModel.load(tmp_path)
        return pipeline_model
    finally:
        _HadoopFileSystem.delete(tmp_path)
Example #6
    def test_nested_pipeline_persistence(self):
        """
        Pipeline[HashingTF, Pipeline[PCA]]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.createDataFrame([(["a", "b", "c"], ),
                                             (["c", "d", "e"], )], ["words"])
            tf = HashingTF(numFeatures=10,
                           inputCol="words",
                           outputCol="features")
            pca = PCA(k=2, inputCol="features", outputCol="pca_features")
            p0 = Pipeline(stages=[pca])
            pl = Pipeline(stages=[tf, p0])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #7
    def test_python_transformer_pipeline_persistence(self):
        """
        Pipeline[MockUnaryTransformer, Binarizer]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.range(0, 10).toDF("input")
            tf = MockUnaryTransformer(
                shiftVal=2).setInputCol("input").setOutputCol("shiftedInput")
            tf2 = Binarizer(threshold=6,
                            inputCol="shiftedInput",
                            outputCol="binarized")
            pl = Pipeline(stages=[tf, tf2])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #8
def return_prediction(sample_id):
    '''
    INPUT:
    sample_id - (list) list of sampled userId in display order

    OUTPUT:
    actuals - (list) list of actual churn status (0 or 1)
    probas - (list) list of churn probabilities
    preds - (list) list of churn prediction (0 or 1)

    DESCRIPTION:
    Make prediction on sampled data and return labels, probabilities and
    predictions.
    '''

    # create a Spark session (in case of local workspace)
    '''please modify for actual Spark environment'''
    spark = SparkSession \
        .builder \
        .appName("Sparkify") \
        .master("local") \
        .getOrCreate()

    # load processed data
    feature_data_path = './data/micro_sparkify_features.parquet'
    print('Loading data...\n    DATASET: {}'.format(feature_data_path))
    feature_data = spark.read.load(feature_data_path)

    # load trained Gradient Boosted-Tree Classifier model from folder
    model_path = './models/webGbtModel'
    print('Loading model...\n    MODEL: {}'.format(model_path))
    classifierModel = PipelineModel.load(model_path)

    # extract subset of data for sample id
    print('Extracting samples...')
    sample_data = feature_data.filter(col('userId').isin(sample_id))

    # transform with classification pipeline (churn prediction)
    print('Classifying data...')
    classifiedData = classifierModel.transform(sample_data)
    pd_classified = classifiedData.select('userId', 'label', 'probability',
                                          'prediction').toPandas()
    print('Data classified!')

    # sort in display order
    pd_classified['userId'] = pd_classified['userId'].astype(int)
    pd_classified.set_index('userId', inplace=True)
    pd_classified = pd_classified.loc[sample_id, :]

    actuals = [x for x in pd_classified['label']]
    probas = [round(x[1], 3) for x in pd_classified['probability']]
    preds = [x for x in pd_classified['prediction']]

    # clear pyspark dataframe cache
    feature_data.unpersist()
    sample_data.unpersist()
    classifiedData.unpersist()
    spark.stop()

    return actuals, probas, preds
Example #9
def logistic_regression(tr_data=None,
                        t_data=None,
                        proc_type='train',
                        example=None):
    """Performs logistic regression pipelining."""
    lr = LogisticRegression(regParam=0.001, family='multinomial')

    pipeline = Pipeline(stages=[lr])

    if proc_type == 'load' or proc_type == 'test':
        model = PipelineModel.load(LR_PATH)
    else:
        model = pipeline.fit(tr_data)
        if os.path.exists(LR_PATH):
            shutil.rmtree(LR_PATH)
        model.save(LR_PATH)

    if proc_type == 'test':
        result = model.transform(example).collect()

        return result, 0.
    else:
        prediction = model.transform(t_data)
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="label",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(prediction)

        return None, rmse
Example #10
def binary_classification(tr_data=None,
                          t_data=None,
                          proc_type='train',
                          example=None):
    """Performs binary classification pipelining."""
    lr = LogisticRegression(tol=1e-6, fitIntercept=True)
    ovr = OneVsRest(classifier=lr)

    pipeline = Pipeline(stages=[ovr])

    if proc_type == 'load' or proc_type == 'test':
        model = PipelineModel.load(BC_PATH)
    else:
        model = pipeline.fit(tr_data)
        if os.path.exists(BC_PATH):
            shutil.rmtree(BC_PATH)
        model.save(BC_PATH)

    if proc_type == 'test':
        result = model.transform(example).collect()

        return result, 0.
    else:
        prediction = model.transform(t_data)
        evaluator = MulticlassClassificationEvaluator(
            labelCol='label', predictionCol='prediction', metricName='f1')
        f1Score = evaluator.evaluate(prediction)

        return None, f1Score
Example #11
def multi_class_classification(tr_data=None,
                               t_data=None,
                               proc_type='train',
                               example=None):
    """Performs multi-class classification pipelining."""
    rfc = RandomForestClassifier(labelCol='label',
                                 featuresCol='features',
                                 numTrees=10)

    pipeline = Pipeline(stages=[rfc])

    if proc_type == 'load' or proc_type == 'test':
        model = PipelineModel.load(MCC_PATH)
    else:
        model = pipeline.fit(tr_data)
        if os.path.exists(MCC_PATH):
            shutil.rmtree(MCC_PATH)
        model.save(MCC_PATH)

    if proc_type == 'test':
        result = model.transform(example).collect()

        return result, 0.
    else:
        prediction = model.transform(t_data)
        evaluator = MulticlassClassificationEvaluator(
            labelCol='label',
            predictionCol='prediction',
            metricName='accuracy')
        accuracy = evaluator.evaluate(prediction)

        return None, accuracy
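A hedged usage sketch for the three helpers in Examples #9-#11; LR_PATH/BC_PATH/MCC_PATH and the DataFrames are assumed to be defined elsewhere in the module.

# train once and report the metric on the held-out data
# _, rmse = logistic_regression(tr_data=train_df, t_data=test_df, proc_type='train')
# _, f1 = binary_classification(tr_data=train_df, t_data=test_df, proc_type='train')
# later, reuse the persisted model to score a single example
# rows, _ = multi_class_classification(proc_type='test', example=sample_df)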
Example #12
    def test_python_transformer_pipeline_persistence(self):
        """
        Pipeline[MockUnaryTransformer, Binarizer]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.range(0, 10).toDF('input')
            tf = MockUnaryTransformer(shiftVal=2)\
                .setInputCol("input").setOutputCol("shiftedInput")
            tf2 = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
            pl = Pipeline(stages=[tf, tf2])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #13
def main(account_name, account_key):
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    patient_records_container = 'patientrecords'
    glucose_levels_container = 'glucoselevelsaggs'
    preds_container = 'predictions'

    blob_service = BlobService(account_name=account_name, account_key=account_key)
    blob_service.create_container(preds_container)
    
    day_to_predict = get_most_recent_date(blob_service, glucose_levels_container)
    df = get_df_from_blob(blob_service, glucose_levels_container, patient_records_container, day_to_predict)
    
    project_path = 'wasb://model@{}.blob.core.windows.net/{}'
    si_pipe_model = PipelineModel.read().load(path=project_path.format(account_name, 'si_pipe_model'))
    oh_pipe_model = PipelineModel.read().load(path=project_path.format(account_name, 'oh_pipe_model'))
    model = RandomForestClassificationModel.read().load(path=project_path.format(account_name, 'model'))
    
    df_spark = sqlContext.createDataFrame(df)
    df_preds = si_pipe_model.transform(df_spark)
    df_preds = oh_pipe_model.transform(df_preds)
    
    num_var_names = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
                     'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses', 'glucose_min',
                     'glucose_max', 'glucose_mean', 'glucose_var']
    cat_var_names = ['race', 'gender', 'age', 'weight', 'admission_type_id', 'discharge_disposition_id',
                     'admission_source_id', 'payer_code', 'medical_specialty', 'max_glu_serum', 'A1Cresult', 'metformin',
                     'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide',
                     'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
                     'insulin', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
                     'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 'diabetesMed', 'diag_1_missing',
                     'diag_2_missing', 'diag_3_missing', 'race_missing', 'weight_missing', 'payer_code_missing',
                     'medical_specialty_missing']
    va = VectorAssembler(inputCols=(num_var_names + [c + "__encoded__" for c in cat_var_names]), outputCol='features')
    df_preds = va.transform(df_preds).select('features')
    
    df_preds = model.transform(df_preds)
    df_preds_pandas = df_preds.toPandas()
    df_preds_pandas = pd.concat([df[['patient_nbr', 'discharge_date']],
                                 df_preds_pandas['probability'].map(lambda x: x[1])], axis=1)
    
    # Save the predictions
    blob_service.put_block_blob_from_text(blob_name='-'.join(str(day_to_predict).split('/')) + '.csv',
                                          container_name=preds_container,
                                          text=df_preds_pandas.to_csv(index=False))
    return
Example #14
def _load_model(model_path, dfs_tmpdir=None):
    if dfs_tmpdir is None:
        dfs_tmpdir = DFS_TMP
    tmp_path = _tmp_path(dfs_tmpdir)
    # Spark ML expects the model to be stored on DFS
    # Copy the model to a temp DFS location first. We cannot delete this file, as
    # Spark may read from it at any point.
    model_path = _HadoopFileSystem.maybe_copy_from_local_file(model_path, tmp_path)
    return PipelineModel.load(model_path)
Example #15
def load_pyfunc(path):
    """
    Load the model as PyFunc.
    :param path: Local path
    :return: The model as PyFunc.
    """
    spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
        .master("local[1]").getOrCreate()
    return _PyFuncModelWrapper(spark, PipelineModel.load(path))
Example #16
 def pipeline_model_load(self, path):
     """
     Load a pipeline model.
     :param path: base path where the model is stored
     :return: the loaded PipelineModel
     """
     full_path = self.concat_path(path, self.model_key)
     model = PipelineModel.load(full_path)
     return model
Example #17
    def get_stages(pipeline: PipelineModel):
        """
        Extract the stages from a fit or unfit pipeline

        :param pipeline: a fit or unfit Spark pipeline
        :return: stages list
        """
        if hasattr(pipeline, 'getStages'):
            return pipeline.getStages()  # unfit pipeline
        return pipeline.stages  # fit pipeline
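A hedged usage sketch for get_stages above, showing that it accepts either an unfit Pipeline or a fitted PipelineModel (get_stages is assumed to be reachable as a plain function or staticmethod).

from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF

pipe = Pipeline(stages=[Tokenizer(inputCol="text", outputCol="words"),
                        HashingTF(inputCol="words", outputCol="features")])
print(get_stages(pipe))       # unfit: returns the configured stage list via getStages()
# model = pipe.fit(text_df)   # text_df is a placeholder DataFrame with a "text" column
# print(get_stages(model))    # fitted: returns model.stages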
Example #18
 def _transform(self, dataset):
     for t in self.stages:
         if isinstance(t, HasRecursiveTransform):
             # drops current stage from the recursive pipeline within
             dataset = t.transform_recursive(dataset, PipelineModel(self.stages[:-1]))
         elif isinstance(t, AnnotatorProperties) and t.getLazyAnnotator():
             pass
         else:
             dataset = t.transform(dataset)
     return dataset
Example #19
def load_pyfunc(path):
    """
    Load a Python Function model from a local file.

    :param path: Local path.
    :return: The model as PyFunc.
    """
    spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
        .master("local[1]").getOrCreate()
    # We do not need any DFS here as pyfunc should create its own SparkContext with no executors
    return _PyFuncModelWrapper(spark, PipelineModel.load("file:" + os.path.abspath(path)))
Example #20
def attach_tensorflow_model_to_pipeline(path,
                                        pipelineModel,
                                        inputCol,
                                        tfInput,
                                        tfOutput,
                                        predictionCol='predicted',
                                        tfDropout=None,
                                        toKeepDropout=False):
    spark_model = load_tensorflow_model(path, inputCol, tfInput, tfOutput,
                                        predictionCol, tfDropout,
                                        toKeepDropout)
    return PipelineModel(stages=[pipelineModel, spark_model])
Example #21
def load_pyfunc(path):
    """
    Load a persisted Spark MLlib PipelineModel as a ``python_function`` model.

    >>> pyfunc_model = load_pyfunc("/tmp/pyfunc-spark-model")
    >>> predictions = pyfunc_model.predict(test_pandas_df)

    :param path: Local filesystem path to the model saved by :py:func:`mlflow.spark.log_model`.
    :rtype: Pyfunc format model with function
            ``model.predict(pandas DataFrame) -> pandas DataFrame``.
    """
    spark = pyspark.sql.SparkSession.builder.config("spark.python.worker.reuse", True) \
        .master("local[1]").getOrCreate()
    # We do not need any DFS here as pyfunc should create its own SparkContext with no executors
    return _PyFuncModelWrapper(spark, PipelineModel.load("file:" + os.path.abspath(path)))
Example #22
def load_model(path, run_id=None):
    """
    Load the Spark MLlib model from the given path.
    :param run_id: Run ID. If provided it is combined with path to identify the model.
    :param path: Local filesystem path or Run-relative artifact path to the model.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel
    """
    if run_id is not None:
        path = mlflow.tracking._get_model_log_dir(model_name=path,
                                                  run_id=run_id)
    m = Model.load(os.path.join(path, 'MLmodel'))
    if FLAVOR_NAME not in m.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))
    conf = m.flavors[FLAVOR_NAME]
    return PipelineModel.load(os.path.join(path, conf['model_data']))
Example #23
def main(spark, df_user, model_file, output_file):

    # import model
    model = PipelineModel.load(model_file)
    print("imported model")

    # import user data
    df = spark.read.parquet(df_user)
    print("imported user data")

    # transform metadata to get track_index
    userdf = model.stages[0].transform(df)
    print("mapped user_index")

    # output a parquet file
    userdf.write.parquet(output_file)
Example #24
def attach_pytorch_model_to_pipeline(
        network: nn.Module,
        pipeline_model: PipelineModel,
        inputCol: str = 'features',
        predictionCol: str = 'predicted',
        useVectorOut: bool = False) -> PipelineModel:
    """
    Attaches a pytorch model to an existing pyspark pipeline.

    :param network: Pytorch Network
    :param pipeline_model: An existing spark pipeline model (This is a fitted pipeline)
    :param inputCol: The input column to the dataframe for the pytorch network
    :param predictionCol: The prediction column.
    :param useVectorOut: option to use a vector output.
    :return: a spark PipelineModel
    """

    spark_model = create_spark_torch_model(network, inputCol, predictionCol,
                                           useVectorOut)
    return PipelineModel(stages=[pipeline_model, spark_model])
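A hedged usage sketch for attach_pytorch_model_to_pipeline; `fitted_pipeline` and `feature_df` are placeholders, and the network below is purely illustrative.

import torch.nn as nn

net = nn.Sequential(nn.Linear(10, 2))  # placeholder network
# extended = attach_pytorch_model_to_pipeline(net, fitted_pipeline,
#                                             inputCol='features',
#                                             predictionCol='predicted')
# scored = extended.transform(feature_df)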
Example #25
 def test_save_pipeline(self):
     processed = self.generate_random_data()
     mg = build_graph(SparkFlowTests.create_random_model)
     spark_model = SparkAsyncDL(inputCol='features',
                                tensorflowGraph=mg,
                                tfInput='x:0',
                                tfLabel='y:0',
                                tfOutput='outer/Sigmoid:0',
                                tfOptimizer='adam',
                                tfLearningRate=.1,
                                iters=20,
                                partitions=2,
                                predictionCol='predicted',
                                labelCol='label')
     p = Pipeline(stages=[spark_model]).fit(processed)
     p.write().overwrite().save('example_pipeline')
     p = PysparkPipelineWrapper.unwrap(
         PipelineModel.load('example_pipeline'))
     data = p.transform(processed).take(10)
     nb_errors = SparkFlowTests.calculate_errors(data)
     self.assertTrue(nb_errors < len(data))
Example #26
def load_model(path, run_id=None, dfs_tmpdir=None):
    """
    Load the Spark MLlib model from the path.

    :param run_id: Run ID. If provided, combined with ``path`` to identify the model.
    :param path: Local filesystem path or run-relative artifact path to the model.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel

    >>> from mlflow import spark
    >>> model = mlflow.spark.load_model("spark-model")
    >>> # Prepare test documents, which are unlabeled (id, text) tuples.
    >>> test = spark.createDataFrame([
    ...   (4, "spark i j k"),
    ...   (5, "l m n"),
    ...   (6, "spark hadoop spark"),
    ...   (7, "apache hadoop")], ["id", "text"])
    >>>  # Make predictions on test documents.
    >>> prediction = model.transform(test)

    """
    dfs_tmpdir = dfs_tmpdir if dfs_tmpdir is not None else DFS_TMP
    if run_id is not None:
        path = mlflow.tracking.utils._get_model_log_dir(model_name=path,
                                                        run_id=run_id)
    m = Model.load(os.path.join(path, 'MLmodel'))
    if FLAVOR_NAME not in m.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))
    conf = m.flavors[FLAVOR_NAME]
    model_path = os.path.join(path, conf['model_data'])
    tmp_path = _tmp_path(dfs_tmpdir)
    # Spark ML expects the model to be stored on DFS
    # Copy the model to a temp DFS location first. We cannot delete this file, as
    # Spark may read from it at any point.
    _HadoopFileSystem.copy_from_local_file(model_path,
                                           tmp_path,
                                           removeSrc=False)
    pipeline_model = PipelineModel.load(tmp_path)
    eprint("Copied SparkML model to %s" % tmp_path)
    return pipeline_model
Example #27
    input_df = spark.read.option("header", True).csv(input_data)

    # transform the train data
    tmp_df = input_df.rdd \
        .map(single_line_transform)

    # build per-user features; the label and features columns must be named explicitly.
    # `features` is the column holding all the features as a Vectors.dense
    train_df = tmp_df.reduceByKey(single_instance_hanler) \
        .map(lambda r: Row(label=int(r[1].label), features=Vectors.dense(r[1].feats))) \
        .toDF()

    # simple version, save and load
    # https://spark.apache.org/docs/latest/ml-tuning.html
    pipe_lr_model = simple_train_model(train_df)
    print(pipe_lr_model)
    pipe_lr_model.write().overwrite().save(model_save_path)

    xgb_model = pipe_lr_model.stages[0]
    xgb_model.saveBooster(model_save_path + "_booster")


    result = pipe_lr_model.transform(train_df)
    print(result.take(3))

    # reload the model with pipeline
    loaded_model = PipelineModel.load(model_save_path)
    result = loaded_model.transform(train_df)

    print(result.take(3))
Example #28
# model-building algorithm (LogisticRegression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# create the model
model = lr.fit(assembled_training)

# generate predictions
model.transform(assembled_training).show()

# pipeline
pipeline = Pipeline(stages=[assembler, lr])

# create the pipeline model
pipelineModel = pipeline.fit(training)

# generate predictions using the pipeline model
pipelineModel.transform(training).show()

path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"

# save the models
model.write().overwrite().save(path1)
pipelineModel.write().overwrite().save(path2)

# load the saved models
loadedModel = LogisticRegressionModel.load(path1)
loadedPipelineModel = PipelineModel.load(path2)

spark.stop()
Example #29
def words_to_vector(tweets):
    model_path = 'hdfs://spark01.ctweb.inweb.org.br:9000/limonero/models/' \
                 'word_vector.0000'

    model = PipelineModel.load(model_path)
    return model.transform(tweets)
Example #30
                                       outputCol='features')

    # Demonstration of some options. Not all are required
    # Note: This uses the barrier execution mode, which is sensitive to the number of partitions
    spark_model = SparkTorch(inputCol='features',
                             labelCol='_c0',
                             predictionCol='predictions',
                             torchObj=torch_obj,
                             iters=50,
                             verbose=1,
                             validationPct=0.2,
                             miniBatch=128)

    # Create and save the Pipeline
    p = Pipeline(stages=[vector_assembler, spark_model]).fit(df)
    p.write().overwrite().save('simple_cnn')

    # Example of loading the pipeline
    loaded_pipeline = PysparkPipelineWrapper.unwrap(
        PipelineModel.load('simple_cnn'))

    # Run predictions and evaluation
    predictions = loaded_pipeline.transform(df).persist()

    evaluator = MulticlassClassificationEvaluator(labelCol="_c0",
                                                  predictionCol="predictions",
                                                  metricName="accuracy")

    accuracy = evaluator.evaluate(predictions)
    print("Train accuracy = %g" % accuracy)
Example #31
        df = df.withColumn(f, df[f].cast('string'))
df = df.dropna()

train, test = df.randomSplit([0.8, 0.2], seed=0)

class_index = StringIndexer(inputCol='class', outputCol='label')
vector = VectorAssembler(inputCols=feature_cols, outputCol='feature')
model = LinearSVC(featuresCol='feature', labelCol='label')
pipeline = Pipeline(stages=[class_index, vector, model])

pipeline = pipeline.fit(train)
if os.path.exists(MODEL_SAVE_PATH):
    shutil.rmtree(MODEL_SAVE_PATH)
pipeline.write().overwrite().save(MODEL_SAVE_PATH)  # equivalently: pipeline.save(MODEL_SAVE_PATH)

load_pipeline = PipelineModel.load(MODEL_SAVE_PATH)
test_predict = load_pipeline.transform(test)

evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='label')

print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderPR'}))

origin_test_df = df.select(feature_cols)

predict_df = load_pipeline.transform(origin_test_df)
print(predict_df.show(20))

from pyspark2pmml import PMMLBuilder
Example #32
        .map(single_line_transform)

    # merge each user's training data and build the training features; note that the
    # label and features columns must be named explicitly.
    # `features` holds all the features, built as a Vectors.dense
    train_df = tmp_df.reduceByKey(single_instance_hanler) \
        .map(lambda r: Row(label=int(r[1].label), features=Vectors.dense(r[1].feats))) \
        .toDF()

    # simple version: train the model and save it
    # for hyperparameter tuning and cross-validation, see https://spark.apache.org/docs/latest/ml-tuning.html
    pipe_lr_model = simple_train_model(train_df)
    print(pipe_lr_model)
    pipe_lr_model.write().overwrite().save(model_save_path)

    # reload the model
    loaded_model = PipelineModel.load(model_save_path)
    result = loaded_model.transform(train_df)

    print(result.take(3))

    # cross-validated model with hyperparameter tuning
    # split the data into train/test
    bst_model_path = model_save_path + "_bst_model"
    train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345)
    bst_model = train_with_tune(train_df)
    bst_model.write().overwrite().save(bst_model_path)

    # use the best model from tuning to predict on the test data
    # the prediction output has a structure like the following:
    #      features = Vectors.dense(...)
    #      label=0,
Example #33
                secure=True)

    cos.fget_object(cos_bucket_name, model_filepath, model_filepath)
    cos.fget_object(cos_bucket_name, train_data_filepath, train_data_filepath)
    cos.fget_object(cos_bucket_name, 'evaluation.json', 'evaluation.json')
    if aios_manifest_path:
        cos.fget_object(cos_bucket_name, aios_manifest_path,
                        aios_manifest_path)

    os.system('unzip %s' % model_filepath)
    print('model ' + model_filepath + ' is downloaded')
    os.system('unzip %s' % train_data_filepath)
    print('train_data ' + train_data_filepath + ' is downloaded')

    sc = SparkContext()
    model = PipelineModel.load(model_filepath.split('.')[0])
    pipeline = Pipeline(stages=model.stages)
    spark = SparkSession.builder.getOrCreate()
    train_data = spark.read.csv(path=train_data_filepath.split('.')[0],
                                sep=",",
                                header=True,
                                inferSchema=True)
    ''' Remove previous deployed model '''
    wml_client = WatsonMachineLearningAPIClient(WML_CREDENTIALS)
    model_deployment_ids = wml_client.deployments.get_uids()
    deleted_model_id = None
    for deployment_id in model_deployment_ids:
        deployment = wml_client.deployments.get_details(deployment_id)
        model_id = deployment['entity']['deployable_asset']['guid']
        if deployment['entity']['name'] == DEPLOYMENT_NAME:
            print('Deleting deployment id', deployment_id)
Example #34
File: run.py Project: RubySu/Uda7
from flask import Flask
from flask import render_template, request, jsonify
from plotly.graph_objs import Bar

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, concat, desc, explode, lit, min, max, split, udf, isnull, from_unixtime, instr, when
from pyspark.ml.pipeline import PipelineModel

app = Flask(__name__)

# load data
spark = SparkSession.builder.appName("Spark").getOrCreate()

# load model
model = PipelineModel.load("../models/lr")


# index webpage displays cool visuals and receives user input text for model
@app.route('/')
@app.route('/index')
def index():
    path = "../data/page_churn_byUser.csv"
    df = spark.read.csv(path, header=True, inferSchema=True)
    df.persist()
    genre_counts1 = df.filter(df.churn == 0)
    genre_names1 = genre_counts1.select(
        "page").distinct().toPandas()["page"].tolist()
    genre_counts2 = df.filter(df.churn == 1)
    genre_names2 = genre_counts2.select(
        "page").distinct().toPandas()["page"].tolist()