class FeatureSelection:
    """Estimate feature importances for the ``'bus'`` stream with a random forest.

    The constructor opens two consumers for ``'bus'`` on ``'localhost'``;
    only ``self.stream`` is read by the methods of this class.
    """

    def __init__(self):
        """Create the stream consumer and the Kafka consumer."""
        self.stream = Consumer('bus', 'localhost').get_stream()
        # kafka_stream and stream are both interchangeable
        # NOTE(review): kafka_stream is not used by this class, and unlike
        # ``stream`` it holds the consumer object itself rather than the
        # result of ``get_stream()`` -- confirm the two really are drop-in
        # replacements before swapping them.
        self.kafka_stream = ConsumerKafka('bus', 'localhost')

    @staticmethod
    def _is_numeric(message: object) -> bool:
        """Return True when *message* parses as a finite float.

        Used as the stream filter predicate. Unlike testing the truthiness
        of ``float(message)``, this keeps zero readings and rejects
        unparsable, ``nan`` and infinite values instead of raising.
        """
        import math  # local import: the file's import block is not visible here

        try:
            value = float(message)
        except (TypeError, ValueError):
            return False
        # round() raises on nan/inf, so exclude them before the map step.
        return math.isfinite(value)

    def select_feature(self):
        """Clean the stream and pass it through a ``VectorAssembler``.

        Messages that are not finite numbers are dropped, the remaining ones
        are rounded to the nearest integer, and every batch is sorted in
        ascending order.

        Returns:
            The result of ``VectorAssembler.transform`` applied to the
            cleaned stream, with the assembled vector in ``'features'``.
        """
        # Bind the static helper to a local so the stream callback does not
        # reference ``self``.
        is_numeric = self._is_numeric
        cleaned = (
            self.stream
            .filter(is_numeric)
            .map(lambda message: round(float(message)))
            # The mapped values are scalars, not (key, value) pairs, so sort
            # on the value itself; sortByKey() would fail on them.
            .transform(lambda rdd: rdd.sortBy(lambda value: value))
        )

        assembler = VectorAssembler(
            inputCols=['stop_id', 'delay', 'route_id', 'temperature'],
            outputCol='features')
        # NOTE(review): ``cleaned`` is a stream of scalar ints, while the
        # assembler is configured for four named input columns. A conversion
        # to a DataFrame with those columns appears to be missing -- confirm
        # the intended message schema before relying on this call.
        return assembler.transform(cleaned)

    def random_forests(self):
        """Fit a random forest and report its feature importances.

        Prints the importances and also returns them.

        Returns:
            ``featureImportances`` of the fitted model.
        """
        assembled = self.select_feature()
        # NOTE(review): 'temperature' is the label here and is also one of
        # the assembled input columns in select_feature(), so the label leaks
        # into the features -- verify this is intended.
        classifier = RandomForestClassifier(
            labelCol='temperature', featuresCol='features')
        training_df = assembled.select('features', 'temperature')
        model = classifier.fit(training_df)
        importances = model.featureImportances
        print(importances)
        return importances