def predict_proba(rf_model, data):
    """Return per-row positive-class probabilities for a RandomForestModel.

    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel: it averages the individual trees' votes instead of
    returning only the majority label.

    Parameters
    ----------
    rf_model : RandomForestModel
        A trained PySpark MLlib random-forest model.
    data : RDD
        RDD of rows exposing SearchID, AdID, Position, ObjectType and
        HistCTR attributes (the feature columns).

    Returns
    -------
    RDD of float
        One averaged score per input row, in [0, 1] for binary trees.
    """
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as a JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # Build the feature-vector RDD ONCE and reuse it for every tree.
    # The original rebuilt this identical map inside the loop on each
    # iteration; RDDs are immutable, so hoisting it is safe.
    features = data.map(lambda row: [float(row.SearchID),
                                     float(row.AdID),
                                     float(row.Position),
                                     float(row.ObjectType),
                                     float(row.HistCTR)])

    scores = DecisionTreeModel(trees[0]).predict(features)

    # For each remaining tree, predict over the dataset and accumulate the
    # per-row results by zipping the aligned RDDs together.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(features))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees.
    return scores.map(lambda x: x / ntrees)
def get_probs_classify(model, data):
    """Average the per-tree votes of a random forest into soft scores.

    Parameters
    ----------
    model : RandomForestModel
        A trained PySpark MLlib random-forest model.
    data : RDD
        RDD of feature vectors accepted by DecisionTreeModel.predict.

    Returns
    -------
    RDD of float
        The mean of the individual trees' predictions for each row.
    """
    # The underlying Java model exposes the individual trees as a py4j
    # JavaArray; wrap each one back into a Python DecisionTreeModel.
    java_trees = model._java_model.trees()
    n = model.numTrees()

    # Seed the running totals with the first tree's predictions, then fold
    # in each remaining tree by zipping the aligned RDDs and summing pairs.
    totals = DecisionTreeModel(java_trees[0]).predict(data)
    for idx in range(1, n):
        tree = DecisionTreeModel(java_trees[idx])
        totals = totals.zip(tree.predict(data)).map(lambda pair: pair[0] + pair[1])

    # Normalize the vote totals by the forest size.
    return totals.map(lambda total: total / n)
def get_probs_classify(model, data):
    """Compute soft forest scores by averaging every tree's prediction.

    Parameters
    ----------
    model : RandomForestModel
        A trained PySpark MLlib random-forest model.
    data : RDD
        RDD of feature vectors accepted by DecisionTreeModel.predict.

    Returns
    -------
    RDD of float
        Per-row average of the individual trees' predictions.
    """
    # Pull the individual decision trees out of the wrapped Java model.
    forest = model._java_model.trees()
    tree_count = model.numTrees()

    # Start from tree 0, then accumulate the rest one at a time.
    accumulated = DecisionTreeModel(forest[0]).predict(data)
    tree_index = 1
    while tree_index < tree_count:
        current = DecisionTreeModel(forest[tree_index]).predict(data)
        accumulated = accumulated.zip(current)
        accumulated = accumulated.map(lambda ab: ab[0] + ab[1])
        tree_index += 1

    # Turn the summed votes into an average over the whole forest.
    return accumulated.map(lambda s: s / tree_count)
def predict_proba(rf_model, data):
    """Return per-row positive-class probabilities for a RandomForestModel.

    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel by averaging the individual trees' votes.

    Parameters
    ----------
    rf_model : RandomForestModel
        A trained PySpark MLlib random-forest model.
    data : RDD
        RDD of rows exposing SearchID, AdID, Position, ObjectType and
        HistCTR attributes (the feature columns).

    Returns
    -------
    RDD of float
        One averaged score per input row.
    """
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as a JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # Build the feature-extraction RDD a single time instead of re-creating
    # the identical map() on every loop iteration (RDDs are immutable, so
    # the hoist cannot change results).
    features = data.map(lambda row: [float(row.SearchID),
                                     float(row.AdID),
                                     float(row.Position),
                                     float(row.ObjectType),
                                     float(row.HistCTR)])

    scores = DecisionTreeModel(trees[0]).predict(features)

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(features))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees.
    return scores.map(lambda x: x / ntrees)
def predict_proba(model, data):
    """Expose per-row probabilities (confidences) for a random forest.

    Parameters
    ----------
    model : RandomForestModel
        A trained PySpark MLlib random-forest model.
    data : RDD of LabeledPoint
        Input points; only the `.features` attribute is used.

    Returns
    -------
    list of float
        One averaged (collected) probability per input point.
    """
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as a JavaArray defined by py4j.
    trees = model._java_model.trees()
    ntrees = model.numTrees()

    # Extract the feature vectors once and reuse the RDD for every tree
    # (the original rebuilt the same map() inside the loop).
    features = data.map(lambda point: point.features)

    scores = DecisionTreeModel(trees[0]).predict(features)

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    # NOTE: `range`, not Python-2-only `xrange`, which raises NameError on
    # Python 3 and is inconsistent with the rest of this file.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(features))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees and pull the
    # result back to the driver as a plain Python list.
    probabilities = scores.map(lambda x: float(x) / ntrees).collect()
    return probabilities