def createInitialDeepLearningDefinition():
    """Build the baseline H2ODeepLearning estimator used as a starting definition.

    Fixed seed plus reproducible=True makes training deterministic across runs.
    """
    featureColumns = ["AGE", "RACE", "DPROS", "DCAPS"]
    return H2ODeepLearning(
        seed=42,
        reproducible=True,
        labelCol="CAPSULE",
        featuresCols=featureColumns,
        hidden=[3])
def testLoadAndTrainMojo(prostateDataset):
    """A pre-exported MOJO must score identically to a freshly trained model.

    Both outputs are repartitioned to a single partition so the collected
    rows come back in a comparable order.
    """
    mojoPath = os.path.abspath("../ml/src/test/resources/deep_learning_prostate.mojo")
    mojo = H2OMOJOModel.createFromMojo("file://" + mojoPath)

    estimator = H2ODeepLearning(seed=42, reproducible=True, labelCol="CAPSULE")
    trainedModel = estimator.fit(prostateDataset)

    mojoRows = mojo.transform(prostateDataset).repartition(1).collect()
    modelRows = trainedModel.transform(prostateDataset).repartition(1).collect()

    assert len(mojoRows) == len(modelRows)
    for mojoRow, modelRow in zip(mojoRows, modelRows):
        assert mojoRow == modelRow
def testParams():
    """Every constructor parameter of H2ODeepLearning must round-trip through its getter.

    Fix over the original: None was compared with `==` and booleans with
    `== True` / `== False` — PEP 8 requires `is None` for None, and plain
    truthiness checks for booleans.
    """
    dl = H2ODeepLearning(modelId=None,
                         splitRatio=1.0,
                         labelCol="label",
                         weightCol=None,
                         featuresCols=[],
                         allStringColumnsToCategorical=True,
                         columnsToCategorical=[],
                         nfolds=0,
                         keepCrossValidationPredictions=False,
                         keepCrossValidationFoldAssignment=False,
                         parallelizeCrossValidation=True,
                         seed=-1,
                         distribution="AUTO",
                         epochs=10.0,
                         l1=0.0,
                         l2=0.0,
                         hidden=[200, 200],
                         reproducible=False,
                         convertUnknownCategoricalLevelsToNa=False,
                         foldCol=None,
                         predictionCol="prediction",
                         detailedPredictionCol="detailed_prediction",
                         withDetailedPredictionCol=False,
                         convertInvalidNumbersToNa=False)

    # Optional column parameters left unset must come back as None (identity check).
    assert dl.getModelId() is None
    assert dl.getWeightCol() is None
    assert dl.getFoldCol() is None

    assert dl.getSplitRatio() == 1.0
    assert dl.getLabelCol() == "label"
    assert dl.getFeaturesCols() == []
    assert dl.getAllStringColumnsToCategorical()
    assert dl.getColumnsToCategorical() == []
    assert dl.getNfolds() == 0
    assert not dl.getKeepCrossValidationPredictions()
    assert not dl.getKeepCrossValidationFoldAssignment()
    assert dl.getParallelizeCrossValidation()
    assert dl.getSeed() == -1
    assert dl.getDistribution() == "AUTO"
    assert dl.getEpochs() == 10.0
    assert dl.getL1() == 0.0
    assert dl.getL2() == 0.0
    assert dl.getHidden() == [200, 200]
    assert not dl.getReproducible()
    assert not dl.getConvertUnknownCategoricalLevelsToNa()
    assert dl.getPredictionCol() == "prediction"
    assert dl.getDetailedPredictionCol() == "detailed_prediction"
    assert not dl.getWithDetailedPredictionCol()
    assert not dl.getConvertInvalidNumbersToNa()
def test_load_mojo_deeplearning(self):
    """MOJO scoring must agree row-for-row with a model trained in this session."""
    from pysparkling.ml import H2OMOJOModel, H2ODeepLearning
    mojo_path = os.path.abspath("../ml/src/test/resources/deep_learning_prostate.mojo")
    mojo = H2OMOJOModel.create_from_mojo("file://" + mojo_path)
    prostate_frame = self._hc.as_spark_frame(
        h2o.upload_file(unit_test_utils.locate("smalldata/prostate/prostate.csv")))
    # NOTE(review): predictionCol="CAPSULE" appears to name the target column here;
    # newer API versions use labelCol for that purpose — confirm this is intended
    # for the pysparkling version under test.
    dl = H2ODeepLearning(seed=42, reproducible=True, predictionCol="CAPSULE")
    model = dl.fit(prostate_frame)
    pred_mojo = mojo.predict(prostate_frame).repartition(1).collect()
    pred_model = model.transform(prostate_frame).repartition(1).collect()
    assert len(pred_mojo) == len(pred_model)
    for row_mojo, row_model in zip(pred_mojo, pred_model):
        assert row_mojo == row_model
def h2o_deeplearning(df, label, columns, **kargs):
    """Fit an H2O deep-learning classifier and append a binary "prediction" column.

    The label column is string-indexed and the feature columns assembled into a
    vector before training.  Probability p1 > 0.5 is mapped to 1.0, else 0.0.

    Returns a (predictions_dataframe, fitted_model) pair.
    """
    H2OContext.getOrCreate(Spark.instance.spark)
    indexed = string_to_index(df, input_cols=label)
    assembled = vector_assembler(indexed, input_cols=columns)
    estimator = H2ODeepLearning(
        epochs=10,
        seed=1,
        l1=0.001,
        l2=0.0,
        hidden=[200, 200],
        featuresCols=columns,
        labelCol=label,
        **kargs)
    model = estimator.fit(assembled)
    scored = model.transform(assembled)
    is_positive = scored.prediction_output["p1"] > 0.5
    df_pred = scored.withColumn("prediction", when(is_positive, 1.0).otherwise(0.0))
    return df_pred, model
outputCol="filtered", stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"], caseSensitive=False) ## Hash the words hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(), outputCol="wordToIndex", numFeatures=1 << 10) ## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) ## Create H2ODeepLearning model dl = H2ODeepLearning(epochs=10, l1=0.001, l2=0.0, hidden=[200, 200], featuresCols=[idf.getOutputCol()], predictionCol="label") ## Remove all helper columns colPruner = ColumnPruner(columns=[ idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol() ]) ## Create the pipeline by defining all the stages pipeline = Pipeline( stages=[tokenizer, stopWordsRemover, hashingTF, idf, dl, colPruner])
def testPipelineSerializationDeepLearning(prostateDataset):
    """Exercise the grid-search serialization harness with a DeepLearning
    estimator whose label column is AGE."""
    estimator = H2ODeepLearning().setLabelCol("AGE")
    gridSearchTester(estimator, prostateDataset)
def testPipelineSerializationDeepLearning(prostateDataset):
    """Exercise the grid-search serialization harness with a default-configured
    DeepLearning estimator."""
    estimator = H2ODeepLearning()
    gridSearchTester(estimator, prostateDataset)
def testDeepLearningParameters(prostateDataset):
    """A fitted DeepLearning model must report the same parameter values as
    the estimator it was trained from."""
    featureColumns = ['AGE', 'RACE', 'DPROS', 'DCAPS', 'PSA']
    estimator = H2ODeepLearning(
        seed=1,
        labelCol="CAPSULE",
        featuresCols=featureColumns)
    fittedModel = estimator.fit(prostateDataset)
    compareParameterValues(estimator, fittedModel)
## Create inverse document frequencies model idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4) if algo == "gbm": ## Create GBM model algoStage = H2OGBM(splitRatio=0.8, seed=1, featuresCols=[idf.getOutputCol()], labelCol="label") elif algo == "dl": ## Create H2ODeepLearning model algoStage = H2ODeepLearning(epochs=10, seed=1, l1=0.001, l2=0.0, hidden=[200, 200], featuresCols=[idf.getOutputCol()], labelCol="label") elif algo == "automl": ## Create H2OAutoML model algoStage = H2OAutoML( convertUnknownCategoricalLevelsToNa=True, maxRuntimeSecs=60 * 100, # 100 minutes maxModels=3, seed=1, labelCol="label") elif algo == "xgboost": ## Create H2OXGBoost model algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True, featuresCols=[idf.getOutputCol()],