Example #1
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(
        data.map(lambda row: [
            float(row.SearchID),
            float(row.AdID),
            float(row.Position),
            float(row.ObjectType),
            float(row.HistCTR)
        ]))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(
            dtm.predict(
                data.map(lambda row: [
                    float(row.SearchID),
                    float(row.AdID),
                    float(row.Position),
                    float(row.ObjectType),
                    float(row.HistCTR)
                ])))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    return scores.map(lambda x: x / ntrees)
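The wrapper above re-runs the same feature-extraction map once per tree. A minimal refactor sketch (not from the original source; it assumes the same five named columns): build the feature RDD once and cache it so every tree reuses it.

from pyspark.mllib.tree import DecisionTreeModel

def predict_proba_cached(rf_model, data):
    # Extract the features once and cache; each per-tree predict()
    # call then reuses this RDD instead of recomputing the map.
    features = data.map(lambda row: [
        float(row.SearchID),
        float(row.AdID),
        float(row.Position),
        float(row.ObjectType),
        float(row.HistCTR)
    ]).cache()

    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(features)
    for i in range(1, ntrees):
        scores = scores.zip(DecisionTreeModel(trees[i]).predict(features)) \
            .map(lambda x: x[0] + x[1])
    return scores.map(lambda x: x / ntrees)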
Example #2
def get_probs_classify(model, data):
    # Collect the individual decision trees as JavaArray objects
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data)

    # For each tree, apply its prediction to the entire dataset and zip together the results
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    return scores.map(lambda x: x / ntrees)
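A hedged usage sketch for the helper above, assuming an active SparkContext named sc and hypothetical toy two-feature data; the result is an RDD of floats in [0, 1].

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

train = sc.parallelize([LabeledPoint(1.0, [1.0, 0.0]),
                        LabeledPoint(0.0, [0.0, 1.0])] * 50)
model = RandomForest.trainClassifier(train, numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=10)
# Each value is the fraction of trees voting class 1, i.e. P(class = 1).
probs = get_probs_classify(model, train.map(lambda lp: lp.features))
print(probs.take(5))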
Example #3
def predict_proba(model, data):
    '''
    Input: A PySpark RandomForestModel object, RDD of LabeledPoints
    Output: List of probabilities
    This wrapper exposes the probabilities (i.e. confidences) for a given prediction.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(
        data.map(lambda x: x.features))

    # For each decision tree, apply its prediction to the entire dataset and
    # accumulate the results using 'zip'.
    for i in range(1, ntrees):
        dtm = DecisionTreeModel(trees[i])
        scores = scores.zip(dtm.predict(data.map(lambda x: x.features)))
        scores = scores.map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees
    probabilities = scores.map(lambda x: float(x) / ntrees).collect()
    return probabilities
Example #4
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel. 
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # For each decision tree, predict over the entire dataset and collect
    # (features, prediction) pairs into one RDD.
    featsAndPredictions = sc.parallelize([])  # empty RDD
    for i in range(ntrees):
        dtm = DecisionTreeModel(trees[i])
        predictions = dtm.predict(data.map(lambda x: x.features))
        featsAndPredictions = featsAndPredictions.union(
            data.map(lambda lp: lp.features).zip(predictions))

    # Add up the predictions per feature vector and divide by the number of
    # trees. Keying on the raw feature vector assumes each row's features
    # are unique and hashable.
    return featsAndPredictions.reduceByKey(lambda a, b: a + b) \
        .map(lambda kv: (kv[0], kv[1] / ntrees))
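Keying reduceByKey on the raw feature vector, as above, assumes every row's features are unique and hashable. A hedged variant (function name hypothetical; assumes an active SparkContext named sc) keys on a stable row index instead:

from pyspark.mllib.tree import DecisionTreeModel

def predict_proba_by_index(rf_model, data):
    features = data.map(lambda lp: lp.features).cache()
    ntrees = rf_model.numTrees()
    scores = sc.parallelize([])
    for tree in rf_model._java_model.trees():
        pred = DecisionTreeModel(tree).predict(features) \
            .zipWithIndex() \
            .map(lambda pi: (pi[1], pi[0]))  # (row index, prediction)
        scores = scores.union(pred)
    # Average each row's accumulated votes over all trees.
    return scores.reduceByKey(lambda a, b: a + b) \
        .mapValues(lambda s: s / ntrees)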
Example #5
def predict_proba(rf_model, testRDD):
    # Collect the underlying Java decision tree models.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # One vote-count dict per row; hard-coded for up to 10 classes.
    scores_dict = {i: 0 for i in range(10)}
    scoresRDD = testRDD.map(lambda x: scores_dict.copy())

    # For each tree, add its predicted class to the per-row vote counts.
    for tree in trees:
        dtm = DecisionTreeModel(tree)
        currentScoreRDD = dtm.predict(testRDD)
        scoresRDD = scoresRDD.zip(currentScoreRDD)

        def reduceTuple(x):
            x[0][int(x[1])] += 1  # increment the vote for the predicted class
            return x[0]

        scoresRDD = scoresRDD.map(reduceTuple)
    return scoresRDD
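A follow-on sketch (not from the source; function name hypothetical): normalize the vote-count dictionaries returned above into per-class probabilities by dividing each count by the number of trees.

def predict_class_distribution(rf_model, testRDD):
    ntrees = float(rf_model.numTrees())
    votes = predict_proba(rf_model, testRDD)  # RDD of {class: vote count}
    return votes.map(lambda d: dict((c, n / ntrees) for c, n in d.items()))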
Example #6
                                         numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=n_estimators,
                                         featureSubsetStrategy="auto",
                                         impurity='gini')
    # Accuracy check on the test set: fraction of mispredicted labels.
    predictions = model.predict(test.map(lambda x: x.features))
    labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(test.count())

    n_unlabeled = unlabeled_data.count()

    rdd = sc.parallelize([])
    for tree in model._java_model.trees():
        predX = DecisionTreeModel(tree).predict(unlabeled_data.map(lambda _: _[0].features))\
            .zipWithIndex()\
            .map(lambda _: (_[1], _[0]))
        rdd = rdd.union(predX)

    classPrediction = rdd.groupByKey().mapValues(sum)
    classPrediction = classPrediction.sortByKey()
    # "Entropy" here is shorthand for the distance of the class-0
    # probability from 0.5 (smaller = more uncertain), not a true entropy.
    entropies = classPrediction.map(lambda _: abs(0.5 -
                                                  (1 - (_[1] / n_estimators))))

    unlabeled_entropies = unlabeled_indices.map(lambda _: _[0])\
        .zipWithIndex()\
        .map(lambda _: (_[1], _[0]))\
        .leftOuterJoin(entropies.zipWithIndex().map(lambda _: (_[1], _[0])))\
        .map(lambda _: _[1])

    sorted_unlabeled_entropies = unlabeled_entropies.sortBy(lambda _: _[1])
Example #7
                         scheduled_departure_time=t[1].scheduled_departure_time,
                         actual_departure_time=t[1].actual_departure_time,
                         departure_delay_minutes=t[1].departure_delay_minutes,
                         scheduled_arrival_time=t[1].scheduled_arrival_time,
                         actual_arrival_time=t[1].actual_arrival_time,
                         arrival_delay_minutes=t[1].arrival_delay_minutes,
                         crs_elapsed_flight_minutes=t[1].crs_elapsed_flight_minutes,
                         distance=t[1].distance)


if __name__ == "__main__":
    sc = SparkContext(appName="InsightEdge Python API Demo: prediction job")
    ssc = StreamingContext(sc, 3)
    sqlc = SQLContext(sc)

    zkQuorum = "localhost:2181"
    topic = "flights"

    model = DecisionTreeModel(Utils.load_model_from_grid("DecisionTreeFlightModel", sc))

    carrier_mapping = sc.broadcast(load_mapping("CarrierMap", sqlc))
    origin_mapping = sc.broadcast(load_mapping("OriginMap", sqlc))
    destination_mapping = sc.broadcast(load_mapping("DestinationMap", sqlc))

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.foreachRDD(predict_and_save)

    ssc.start()
    ssc.awaitTermination()
Example #8
    def selectNext(self):
        # get predictions from individual trees
        self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
            .leftOuterJoin(self.dataset.trainSet) \
            .map(lambda _: (_[0], _[1][1]))

        # zipping actual indices with dummy indices so that they can be traced later
        actualIndices = self.trainDataUnknown.map(lambda _: _[0]) \
            .zipWithIndex() \
            .map(lambda _: (_[1], _[0]))

        # an empty RDD
        rdd = sc.parallelize([])
        # NOTE: the underlying Java tree objects are not serializable,
        # so they cannot be distributed directly as an RDD.
        for x in self.model._java_model.trees():
            # zipping each prediction from each decision tree with individual sample index so that they can be added later
            predX = DecisionTreeModel(x) \
                .predict(self.trainDataUnknown.map(lambda _: _[1].features)) \
                .zipWithIndex() \
                .map(lambda _: (_[1], _[0]))
            predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
            rdd = rdd.union(predX)
        # Sum the class-1 votes for each sample across all trees.
        sumScore = rdd.groupByKey().mapValues(sum)
        totalEstimators = self.nEstimators

        # average of the predicted scores
        f_1 = sumScore.map(lambda _: (_[0], _[1] / totalEstimators))

        # standard deviation of predicted scores
        f_2 = sumScore.map(lambda _: getSD(_, totalEstimators))

        # - proportion of positive points
        nLabeled = self.trainDataKnown.count()
        nUnlabeled = self.trainDataUnknown.count()
        proportionPositivePoints = (self.trainDataKnown.map(
            lambda _: _[1].label).reduce(lambda x, y: x + y)) / nLabeled
        f_3 = f_1.map(lambda _: proportionPositivePoints)

        # - estimate the forest's variance as the average variance of the predictions
        estimateVariance = (
            f_2.map(lambda _: _[1]).reduce(lambda x, y: x + y)) / nUnlabeled
        f_6 = f_3.map(lambda _: estimateVariance)

        # - number of already labelled datapoints
        f_8 = f_3.map(lambda _: nLabeled)

        myDebugger.TIMESTAMP('features ready for transposing')

        # transposing start
        tempf_1 = f_1.map(lambda _: _[1]).zipWithIndex() \
            .map(lambda _: (_[1], _[0]))
        tempf_2 = f_2.map(lambda _: _[1]).zipWithIndex() \
            .map(lambda _: (_[1], _[0]))
        tempf_3 = f_3.zipWithIndex().map(lambda _: (_[1], _[0]))
        tempf_6 = f_6.zipWithIndex().map(lambda _: (_[1], _[0]))
        tempf_8 = f_8.zipWithIndex().map(lambda _: (_[1], _[0]))
        LALDataset = tempf_1\
            .leftOuterJoin(tempf_2)\
            .leftOuterJoin(tempf_3)\
            .leftOuterJoin(tempf_6)\
            .leftOuterJoin(tempf_8)\
            .map(lambda _: LabeledPoint(_[0], [
                _[1][0][0][0][0], _[1][0][0][0][1], _[1][0][0][1],
                _[1][0][1], _[1][1]]))

        myDebugger.TIMESTAMP('transposing done')

        # predict the expected reduction in the error by adding the point
        LALprediction = self.lalModel.predict(LALDataset.map(lambda _: _.features))\
            .zipWithIndex()\
            .map(lambda _: (_[1], _[0]))
        myDebugger.TIMESTAMP('prediction done')

        # Select the index with the highest predicted error reduction.
        selectedIndex1toN = LALprediction.max(key=lambda _: _[1])[0]

        # Take the selected index out of the unknown samples and add it to the known ones
        self.indicesKnown = self.indicesKnown.union(
            sc.parallelize([selectedIndex1toN]))

        # updating unknown indices
        self.indicesUnknown = self.indicesUnknown.filter(
            lambda _: _ != selectedIndex1toN)
        # debugging block
        myDebugger.TIMESTAMP('update unknown indices')
        myDebugger.DEBUG(selectedIndex1toN)
        myDebugger.DEBUG(self.indicesKnown.collect())
        myDebugger.DEBUG(self.indicesUnknown.collect())
        myDebugger.TIMESTAMP('DEBUGGING DONE')
Example #9
    def selectNext(self):
        # predict for the rest of the datapoints
        self.trainDataUnknown = self.indicesUnknown.map(lambda _: (_, None)) \
            .leftOuterJoin(self.dataset.trainSet) \
            .map(lambda _: (_[0], _[1][1]))

        actualIndices = self.trainDataUnknown.map(lambda _: _[0])\
            .zipWithIndex()\
            .map(lambda _: (_[1], _[0]))

        myDebugger.TIMESTAMP('zipping indices ')

        rdd = sc.parallelize([])
        # NOTE: the underlying Java tree objects are not serializable,
        # so they cannot be distributed directly as an RDD.
        for x in self.model._java_model.trees():
            # Zip each tree's predictions with the per-sample index so
            # that they can be added up later.
            predX = DecisionTreeModel(x)\
                .predict(self.trainDataUnknown.map(lambda _: _[1].features))\
                .zipWithIndex()\
                .map(lambda _: (_[1], _[0]))

            predX = actualIndices.leftOuterJoin(predX).map(lambda _: _[1])
            rdd = rdd.union(predX)

        myDebugger.TIMESTAMP('get individual tree predictions')
        # Sum the class-1 votes for each sample across all trees.
        classPrediction = rdd.groupByKey().mapValues(sum)

        myDebugger.TIMESTAMP('reducing ')

        # Referencing self.nEstimators inside the lambda below would try to
        # serialize self; copy it to a local variable first.
        totalEstimators = self.nEstimators
        # Distance of the predicted class-0 probability from 0.5
        # (smaller = more uncertain).
        classPrediction = classPrediction.map(
            lambda _: (_[0], abs(0.5 - (1 - (_[1] / totalEstimators)))))

        myDebugger.TIMESTAMP('mapping')

        # Select the index with the highest uncertainty, i.e. closest to probability 0.5
        selectedIndex1toN = classPrediction.sortBy(lambda _: _[1]).first()[0]

        myDebugger.TIMESTAMP('sorting')

        # Take the selected index out of the unknown samples and add it to the known ones
        self.indicesKnown = self.indicesKnown.union(
            sc.parallelize([selectedIndex1toN]))

        myDebugger.TIMESTAMP('update known indices')

        # remove the selected sample from the unlabeled ones
        self.indicesUnknown = self.indicesUnknown.filter(
            lambda _: _ != selectedIndex1toN)

        myDebugger.TIMESTAMP('update unknown indices')

        myDebugger.DEBUG(selectedIndex1toN)
        myDebugger.DEBUG(self.indicesKnown.collect())
        myDebugger.DEBUG(self.indicesUnknown.collect())

        myDebugger.TIMESTAMP('DEBUGGING DONE')
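The selection rule in Example #9 (and the "entropies" of Example #6) reduces to the distance of the class-0 vote share from 0.5. A minimal standalone sketch of that score, with a hypothetical function name:

def uncertainty_distance(class1_votes, n_trees):
    # Distance of the predicted class-0 probability from 0.5;
    # smaller means more uncertain.
    p0 = 1.0 - class1_votes / float(n_trees)
    return abs(0.5 - p0)

# 5 of 10 trees voting class 1 -> 0.0 (maximally uncertain, selected first)
# 9 of 10 trees voting class 1 -> 0.4 (confident, selected last)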