import json
import os

import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

# Consumer and ConsumerKafka are the project's own stream consumers; their
# imports are assumed to live elsewhere in the repository.


class Regression:
    def __init__(self):
        self.consumer = Consumer('bus', 'localhost')
        self.stream = self.consumer.get_stream()
        # kafka_stream and stream are interchangeable
        self.kafka_stream = ConsumerKafka('bus', 'localhost')
        self.cleaned_stream = self.stream.map(self.clean_up)
        self.conf = SparkConf().setMaster('local').setAppName('linear_regression')
        self.sc = SparkContext(conf=self.conf)
        self.spark = SparkSession(self.sc)

    def clean_up(self, data):
        # Flatten one raw bus record into a dict per stop id, counting the
        # 'due' arrivals and looking up the stop's coordinates from d1.npy.
        essential_data = list()
        # allow_pickle is required by recent NumPy releases to load a pickled dict
        read_dictionary = np.load(os.getcwd() + '/model/d1.npy',
                                  allow_pickle=True).item()
        record = json.dumps(data, separators=(',', ':'))
        values = json.loads(record)  # round-trip through JSON; ast.literal_eval was unnecessary
        for i in values:  # fixed: values.get() without a key is invalid
            rec = values.get(i)
            item = dict()
            item['stopid'] = str(i)
            counter = 0
            for j in rec:
                if j['duetime'] == 'due':
                    counter = counter + 1
            item['due_count'] = str(counter)
            item['longitude'] = read_dictionary[i][0]
            item['latitude'] = read_dictionary[i][1]
            essential_data.append(item)
        return essential_data  # fixed: the cleaned records were never returned

    def create_data_frame(self):
        return self.spark.createDataFrame(self.cleaned_stream)  # fixed typo: createDateFrame

    def train_test_split(self, data):
        (train, test) = data.randomSplit([0.7, 0.3])  # fixed: train gets the 70% share
        return (train, test)

    def linear_regression(self, training_data):
        linear_regression = LinearRegression(maxIter=10)
        return linear_regression.fit(training_data)

    def predict(self, model, test_data):
        print('Coefficients: ' + str(model.coefficients))
        print('Intercept: ' + str(model.intercept))
        predictions = model.transform(test_data)
        return predictions.select('delay').show()
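# A minimal driver sketch for Regression, illustrative and not part of the
# original code. It assumes the cleaned stream has been materialized into rows
# carrying the 'features' vector and 'label' columns that Spark ML's
# LinearRegression expects by default; the class itself never assembles those
# columns, so that step is presumed to happen upstream.
regression = Regression()
data = regression.create_data_frame()
train, test = regression.train_test_split(data)
model = regression.linear_regression(train)
regression.predict(model, test)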
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler


class Classification:
    def __init__(self):
        self.consumer = Consumer('bus', 'localhost')
        # kafka_stream and stream are interchangeable
        self.kafka_stream = ConsumerKafka('bus', 'localhost')
        self.stream = self.consumer.get_stream()

    def logistic_regression(self):
        # read from the stream, keeping records with a temperature and a delay
        # over 10 seconds
        rdd = self.stream.filter(lambda message: float(message.temperature)) \
            .filter(lambda message: float(message.delay) > 10000) \
            .transform(lambda rdd: rdd.sortByKey())
        # select the required features
        log_reg = LogisticRegression(featuresCol='features', labelCol='delay')
        temperature_indexer = StringIndexer(inputCol='temperature',
                                            outputCol='temp_index')
        delay_encoder = OneHotEncoder(inputCol='delay', outputCol='delay_vector')
        # added: no stage produced the 'features' column the model expects; the
        # encoded label is deliberately left out to avoid label leakage
        assembler = VectorAssembler(inputCols=['temp_index'], outputCol='features')
        pipeline = Pipeline(stages=[temperature_indexer, delay_encoder,
                                    assembler, log_reg])
        columns = rdd.select(['stop_id', 'delay', 'route_id', 'temperature'])
        train, test = columns.randomSplit([0.7, 0.3])
        fit_model = pipeline.fit(train)
        results = fit_model.transform(test)
        return results
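# A hedged usage sketch for Classification, not in the original: fit the
# pipeline and score the held-out split with Spark ML's
# BinaryClassificationEvaluator. Treating 'delay' as a binary label follows
# the pipeline above and is an assumption, as is relying on the default
# 'rawPrediction' output column.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

classification = Classification()
results = classification.logistic_regression()
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='delay')
print('Area under ROC: ' + str(evaluator.evaluate(results)))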
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession, SQLContext


class K_Means:
    def __init__(self):
        self.spark = SparkSession.builder.master('local') \
            .appName('kmeans').getOrCreate()
        # fixed: constructing a second SparkContext alongside the session
        # raises an error; reuse the session's context instead
        self.sc = self.spark.sparkContext
        self.consumer = Consumer('bus', 'localhost')
        self.stream = self.consumer.get_stream()
        # kafka_stream and stream are interchangeable
        self.kafka_stream = ConsumerKafka('bus', 'localhost')

    def kmeans(self):
        rdd = self.stream.filter(lambda message: float(message.temperature)) \
            .filter(lambda message: float(message.delay) > 10000) \
            .transform(lambda rdd: rdd.sortByKey())
        sqlContext = SQLContext(self.sc)
        df = sqlContext.createDataFrame(rdd)
        # fixed: createOrReplaceTempView returns None, so the DataFrame itself
        # must be kept for the steps below
        df.createOrReplaceTempView('kmeans')
        assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
        final_df = assembler.transform(df)
        scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
        scaler_model = scaler.fit(final_df)
        return scaler_model.transform(final_df)
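# kmeans() above only assembles and scales features; the clustering step
# itself is sketched here with pyspark.ml.clustering.KMeans. k=3 and the
# selected column names are illustrative assumptions, not from the original
# code.
from pyspark.ml.clustering import KMeans

k_means = K_Means()
scaled = k_means.kmeans()
kmeans_model = KMeans(featuresCol='scaled_features', k=3).fit(scaled)
clustered = kmeans_model.transform(scaled)
clustered.select('prediction').show()  # cluster index assigned to each record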