Ejemplo n.º 1
0
def predict_proba(rf_model, data):
    '''
    Average the predictions of every tree in a random forest.

    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel: instead of a single voted label per row, it returns
    the mean of the individual decision tree predictions.

    :param rf_model: model exposing ``numTrees()`` and a ``_java_model``
        whose ``trees()`` returns the underlying decision trees
        (presumably a PySpark MLlib ``RandomForestModel``).
    :param data: distributed collection (``map`` is called on it) of rows
        with ``SearchID``, ``AdID``, ``Position``, ``ObjectType`` and
        ``HistCTR`` attributes, each convertible with ``float``.
    :return: the result of mapping the summed per-tree predictions to
        ``sum / ntrees``, one value per input row.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # Build the feature vectors once: every tree scores exactly the same
    # input, so there is no need to redefine this mapping per tree.
    features = data.map(lambda row: [
        float(row.SearchID),
        float(row.AdID),
        float(row.Position),
        float(row.ObjectType),
        float(row.HistCTR),
    ])

    scores = DecisionTreeModel(trees[0]).predict(features)

    # For each remaining decision tree, apply its prediction to the entire
    # dataset and accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(features))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees. float() keeps
    # this a true division even under Python 2 integer semantics.
    return scores.map(lambda x: x / float(ntrees))
Ejemplo n.º 2
0
def get_probs_classify(model, data):
    """Return the per-row mean of all individual tree predictions.

    :param model: object exposing ``numTrees()`` and ``_java_model.trees()``
        (presumably a PySpark MLlib ``RandomForestModel``).
    :param data: dataset handed unchanged to each tree's ``predict``.
    :return: the summed tree predictions mapped to ``sum / number of trees``.
    """
    # The underlying Java model hands back its trees as JavaArray objects.
    java_trees = model._java_model.trees()
    tree_count = model.numTrees()

    def _score_with(index):
        # Wrap a single Java tree and score the whole dataset with it.
        return DecisionTreeModel(java_trees[index]).predict(data)

    # Start from the first tree, then fold every other tree's output in.
    total = _score_with(0)
    for index in range(1, tree_count):
        paired = total.zip(_score_with(index))
        total = paired.map(lambda pair: pair[0] + pair[1])

    # Turn the accumulated sums into averages.
    return total.map(lambda summed: summed / tree_count)
Ejemplo n.º 3
0
def get_probs_classify(model, data):
    """Average the predictions made by each tree of ``model`` over ``data``.

    :param model: object exposing ``numTrees()`` and ``_java_model.trees()``
        (presumably a PySpark MLlib ``RandomForestModel``).
    :param data: dataset passed as-is to every tree's ``predict``.
    :return: the element-wise sum of all tree predictions, divided by the
        number of trees.
    """
    # Individual decision trees, returned by the Java side as JavaArray items.
    forest = model._java_model.trees()
    n_trees = model.numTrees()

    def _add_pair(pair):
        # Combine the running sum with one tree's prediction for a row.
        return pair[0] + pair[1]

    def _average(total):
        # Normalise an accumulated score by the size of the forest.
        return total / n_trees

    accumulated = DecisionTreeModel(forest[0]).predict(data)

    # Walk the remaining trees, zipping each tree's predictions onto the
    # running sums and collapsing the pairs back into single numbers.
    idx = 1
    while idx < n_trees:
        per_tree = DecisionTreeModel(forest[idx]).predict(data)
        accumulated = accumulated.zip(per_tree).map(_add_pair)
        idx += 1

    return accumulated.map(_average)
Ejemplo n.º 4
0
def predict_proba(rf_model, data):
    '''
    Average the predictions of every tree in a random forest.

    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel by returning the mean of the individual decision tree
    predictions rather than a single voted label.

    :param rf_model: model exposing ``numTrees()`` and a ``_java_model``
        whose ``trees()`` returns the underlying decision trees
        (presumably a PySpark MLlib ``RandomForestModel``).
    :param data: distributed collection (``map`` is called on it) of rows
        with ``SearchID``, ``AdID``, ``Position``, ``ObjectType`` and
        ``HistCTR`` attributes, each convertible with ``float``.
    :return: the result of mapping the summed per-tree predictions to
        ``sum / ntrees``, one value per input row.
    '''
    def _to_features(row):
        # Numeric feature vector fed to every decision tree.
        return [
            float(row.SearchID),
            float(row.AdID),
            float(row.Position),
            float(row.ObjectType),
            float(row.HistCTR),
        ]

    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # The feature mapping is identical for all trees, so define it once
    # instead of repeating the conversion for each tree.
    features = data.map(_to_features)

    scores = DecisionTreeModel(trees[0]).predict(features)

    # For each remaining decision tree, apply its prediction to the entire
    # dataset and accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(features))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees. float() keeps
    # this a true division even under Python 2 integer semantics.
    return scores.map(lambda x: x / float(ntrees))
Ejemplo n.º 5
0
def predict_proba(model, data):
    '''
    Input: A PySpark RandomForestModel object, RDD of LabeledPoints
    Output: List of probabilities

    This wrapper exposes the probabilities (i.e. confidences) for a given
    prediction, computed as the mean of the individual tree predictions for
    each row. The result is collected to the driver, so it is a plain list
    with one float per element of ``data``.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = model._java_model.trees()
    ntrees = model.numTrees()

    # Every tree scores the same feature vectors; extract them only once.
    features = data.map(lambda x: x.features)

    scores = DecisionTreeModel(trees[0]).predict(features)

    # For each remaining decision tree, apply its prediction to the entire
    # dataset and accumulate the results using 'zip'.
    # `range` (not `xrange`) so the function also runs on Python 3.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(features))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees
    probabilities = scores.map(lambda x: float(x) / ntrees).collect()
    return probabilities