def _model(self):
    """Build the unfitted regression estimator selected by ``self.family``.

    A ``LinearRegression`` is created when the family equals ``GAUSSIAN_``
    and a ``GeneralizedLinearRegression`` (binomial family, logit link)
    when it equals ``BINOMIAL_``. The estimator's label column is set to
    ``self.response`` and its maximum iteration count to
    ``self.__max_iter`` before it is returned.

    Returns:
        The configured estimator; no fitting happens here.

    Raises:
        NotImplementedError: If the family matches neither constant.
    """
    family = self.family
    if family == GAUSSIAN_:
        estimator = LinearRegression()
    elif family == BINOMIAL_:
        estimator = GeneralizedLinearRegression(
            family="binomial",
            link="logit",
        )
    else:
        raise NotImplementedError(
            "Family '{}' not implemented".format(family)
        )

    # Settings shared by both estimator types.
    estimator.setLabelCol(self.response)
    estimator.setMaxIter(self.__max_iter)
    return estimator
# Spark entry points and the Flask application are created once, at import time.
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession

app = Flask(__name__)

# Placeholder for the model; doTrain() declares this name as global.
model = None

# Read the GPA CSV, treating the first row as a header and inferring
# column types.
gpa_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load("./gpa_data.csv")
)

# Linear regression that reads features from "hs_gpa_vector" and the label
# from "c_gpa", capped at 20 iterations.
lr = LinearRegression(
    maxIter=20,
    featuresCol="hs_gpa_vector",
    labelCol="c_gpa",
)

# Pack the "hs_gpa" column into the vector column the regression expects.
assembler = VectorAssembler(
    inputCols=["hs_gpa"],
    outputCol="hs_gpa_vector",
)
output = assembler.transform(gpa_df)

# Two random partitions weighted 0.7 / 0.3.
# NOTE(review): presumably the train/test sets -- confirm where `split` is used.
split = output.randomSplit([0.7, 0.3])


@app.route('/home')
def doHome():
    """Return a fixed greeting for the ``/home`` route."""
    return 'Hello, World!'


@app.route('/train')
def doTrain():
    # NOTE(review): only the global declaration of this handler is visible
    # here; confirm the rest of its body before relying on its behavior.
    global model
# In[9]:

# Show the parameter documentation of the vectorizer built in an earlier cell.
print(vectorizer.explainParams())


# In[10]:

from pyspark.ml.regression import LinearRegression


# In[11]:

# Create the estimator and print its parameters before any are changed.
lr = LinearRegression()
print(lr.explainParams())


# In[12]:

# Point the estimator at the "EP" label and "features" vector columns,
# then fit it on the vectorized DataFrame.
lr.setLabelCol("EP").setFeaturesCol("features")
model = lr.fit(df_vect)


# In[13]:

# Bare expression kept from the notebook; it only displays a value when run
# as a notebook cell.
type(model)


# In[14]:

# Report the fit's R2 followed by the learned intercept and coefficients.
print("R2:", model.summary.r2)
print(
    "Intercept: ",
    model.intercept,
    "Coefficients",
    model.coefficients,
)


# In[15]:

# Apply the fitted model to the same DataFrame it was fitted on.
df_pred = model.transform(df_vect)