def prediction(hour, extern_temp):
    # Pick the model trained for this 6-hour window of the day.
    if 0 <= hour < 6:
        model_path = "/home/dhruv/Desktop/IoT/Assignment-3/Final/model1"
    elif 6 <= hour < 12:
        model_path = "/home/dhruv/Desktop/IoT/Assignment-3/Final/model2"
    elif 12 <= hour < 18:
        model_path = "/home/dhruv/Desktop/IoT/Assignment-3/Final/model3"
    else:  # 18 <= hour < 24
        model_path = "/home/dhruv/Desktop/IoT/Assignment-3/Final/model4"
    # Predict Temperature from the external temperature reading.
    temp_model = LinearRegressionModel.load(sc, model_path)
    temp_data = temp_model.predict(np.array([extern_temp]))
    print(temp_data)
    f.write('%.2f\n' % temp_data)
def load_parameters(self):
    self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_method')
    self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_method')
    self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
    self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
    self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
    amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
    trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

    # Methods with native MLlib persistence are loaded through their model
    # classes; anything else falls back to load_data_from_file.
    if self.amount_prediction_method == self.RANDOM_FOREST:
        amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
    elif self.amount_prediction_method == self.LINEAR_REGRESSION:
        amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
    else:
        amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

    if self.trend_prediction_method == self.RANDOM_FOREST:
        trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.NAIVE_BAYES:
        trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.SVM:
        trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
    else:
        trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

    return trend_model, amount_model
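# A minimal usage sketch for load_parameters above; `predictor` (an instance
# of the surrounding class) and the feature vector are hypothetical stand-ins.
# trend_model, amount_model = predictor.load_parameters()
# trend = trend_model.predict(features)    # direction of movement
# amount = amount_model.predict(features)  # size of movement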
def main(sc):
    features_cr = sc.pickleFile('/tmp/features_saved')
    linear_model = LinearRegressionModel.load(sc, "/tmp/linear_model")

    # Getting the features ready for predicting
    numberFeatures = len(features_cr.first()) - 1
    mappings = [get_mapping(features_cr, i) for i in range(0, numberFeatures)]

    # Month Dictionary
    dictio_month = {}
    for i in range(12):
        dictio_month[i + 1] = i
    mappings[1] = dictio_month
    cat_len = sum(map(len, mappings))

    if len(sys.argv) == 2:
        dateZoneGroup = str(sys.argv[1]).split(',')
        zones = get_group_zone(dateZoneGroup[5], 'zone', features_cr)
        groups = get_group_zone(dateZoneGroup[6], 'group', features_cr)
        year_feat = int(dateZoneGroup[0])
        month_feat = int(dateZoneGroup[1])
        day_feat = int(dateZoneGroup[3])
        startDate = date(year_feat, month_feat, day_feat)
        endDate = date(year_feat, month_feat, day_feat)
        print(startDate)
    else:
        startDate = datetime.strptime(str(sys.argv[1]), event_date).date()
        endDate = datetime.strptime(str(sys.argv[2]), event_date).date()
        zones = get_group_zone(str(sys.argv[3]), 'zone', features_cr)
        groups = get_group_zone(str(sys.argv[4]), 'group', features_cr)

    dateRangeWF = list(date_range(startDate, endDate))

    # Making predictions:
    feat_vs_pred = list()
    featPredShow = list()
    for g in groups:
        for z in zones:
            for dR in dateRangeWF:
                featureLine = format_date(dR, z, g, mappings, cat_len)
                #print(featureLine)
                featureShow = format_show(dR, z, g)
                predLine = linear_model.predict(featureLine)
                feat_vs_pred.append(list(featureLine) + [predLine])
                featPredShow.append(list(featureShow) + [predLine])

    scFeaturesPred = sc.parallelize(featPredShow)
    for j in scFeaturesPred.collect():
        print(j)
def prediction():
    year = yearprediction
    stations = sc.textFile(output + "/stationtextformat")
    stations = stations.map(getdata).map(lambda x: (x[0], int(year), float(x[1]), float(x[2])))

    lat = stations.map(lambda x: x[2]).cache()
    min_lat = lat.min()
    max_lat = lat.max()
    longitude = stations.map(lambda x: x[3]).cache()
    min_long = longitude.min()
    max_long = longitude.max()

    # Scale (year, lat, long) into a fixed range before feeding the model.
    max_ = [float(2050), max_lat, max_long]
    min_ = [float(1990), min_lat, min_long]
    stations = stations.map(lambda x: scalePoint(x, max_, min_)).cache()
    stationsDF = sqlContext.createDataFrame(stations)

    # load the model
    sameModel = LinearRegressionModel.load(sc, output + "/modelpath")

    # run the model; the result is a predicted value for each station
    # (stationId) in the given year
    stationidAndPreds = stations.map(lambda p: (p[0], float(sameModel.predict(p[1:]))))
    resultRdd = stationidAndPreds.map(rescale)
    rddschema = resultRdd.map(lambda ab: Row(station=ab[0], avg_prcp=ab[1])).cache()
    stationidAndPredsDF = sqlContext.createDataFrame(rddschema)
    stationidAndPredsDF.registerTempTable("stationPrediction")

    getCountries()
    countriesFile = sc.textFile(output + "/countries")
    countriesRdd = countriesFile.map(getdata)
    countries = countriesRdd.map(lambda ab: Row(station=ab[0], country=ab[1])).cache()
    countriesDF = sqlContext.createDataFrame(countries)
    countriesDF.registerTempTable("StationTable")
    countriesDF.cache()

    shortenstations = sqlContext.sql("SELECT SUBSTR(station, 1, 2) AS station, avg_prcp FROM stationPrediction")
    shortenstations.show()
    joinedresult = countriesDF.join(shortenstations) \
        .where(countriesDF.station == shortenstations.station) \
        .select(shortenstations.avg_prcp, countriesDF.country)
    joinedresult.registerTempTable("joinedresult")

    results = sqlContext.sql("SELECT country, AVG(avg_prcp) AS avg_prcp FROM joinedresult GROUP BY country")
    results.registerTempTable("results")
    outrdd = results.repartition(40).rdd.map(lambda l: str(l.country) + "," + str(l.avg_prcp)).coalesce(1)
    path = yearprediction
    outrdd.saveAsTextFile(output + '/prediction/' + path)
def CustomPredict(date_start, date_end, company):
    # create spark context
    sc = SparkContext(appName="Model")
    # create an api object for the requested date range and company
    api = NewsAPI.NewsAPI(date_start.month, date_start.day, date_start.year,
                          date_end.month, date_end.day, date_end.year,
                          company, '56283d7d6075b9d30773e1ceb440e1b2d029f438')
    # load the prediction model for the company
    model = LinearRegressionModel.load(sc, company)
    # getting data for the duration of time specified
    api.startGetData()
    # get the sentiment average of the days
    l = api.getSentimentScore()
    mean_sent = np.mean(l)
    print("mean sentiment: " + str(mean_sent))
    # make the prediction using the loaded model
    pred = model.predict([mean_sent])
    print("prediction: " + str(pred))
    # close the spark context and return the prediction
    sc.stop()
    return pred
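# A hypothetical invocation of CustomPredict above; the date range and the
# "AAPL" company name are illustrative only (note the company string doubles
# as the path of the saved model).
from datetime import date

predicted = CustomPredict(date(2017, 1, 1), date(2017, 1, 31), "AAPL")
print(predicted)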
def main():
    spark = SparkSession.builder.appName("TRAFFIC") \
        .config("spark.executor.cores", "4") \
        .config("spark.executor.memory", "4g") \
        .getOrCreate()
    sc = spark.sparkContext

    mapping = sc.textFile("s3a://insighttraffic/ML_model/mappings").collect()[0]
    mapping = ast.literal_eval(str(mapping))

    # One linear model per hour of the day.
    models = []
    for hour in range(0, 24):
        model = LinearRegressionModel.load(sc, "s3a://insighttraffic/ML_model/linear_model_log_" + str(hour))
        models.append(model)
    category_len = 154

    sqlContext = sql.SQLContext(sc)
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoop_conf.set("fs.s3n.awsAccessKeyId", 'awsAccessKeyId')
    hadoop_conf.set("fs.s3n.awsSecretAccessKey", 'awsSecretAccessKey')

    # set microbatch interval as 10 seconds, this can be customized according to the project
    ssc = StreamingContext(sc, 10)
    # directly receive the data under a certain topic
    kafkaStream = KafkaUtils.createDirectStream(ssc, ['data'], {"metadata.broker.list": 'Kafka-DNS:9092'})

    connection = psycopg2.connect(host='postgres-ip-address', database='postgres',
                                  user='******', password='******')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS realtimetraffic (sid text, location text, '
                   'latitude double precision, longitude double precision, '
                   'direction text, lanes integer, roadtype text, highway text, '
                   'current integer, historical double precision, level text, '
                   'PRIMARY KEY (sid));')
    cursor.execute('SELECT AddGeometryColumn (%s, %s, %s, 4326, %s, 2);',
                   ('public', 'realtimetraffic', 'geom', 'POINT'))

    # The inbound stream is a DStream of (key, value) Kafka messages.
    dstream = kafkaStream.map(lambda kv: json.loads(kv[1]))
    dstream.foreachRDD(lambda rdd: update(rdd, models, mapping))
        df = sparkSession.createDataFrame(
            [(time.strftime("%Y-%m-%d %H:%M:%S"), store_id, result)],
            ["timePredicted", "store_id", "value"])
        df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()
        return predicted
    else:
        print("No data received")


if __name__ == "__main__":
    sc = SparkContext(appName="NinoxStreaming")
    my_spark = SparkSession \
        .builder \
        .appName("Ninox") \
        .config("spark.mongodb.input.uri", "mongodb://172.254.0.4:27017/predictions.data") \
        .config("spark.mongodb.output.uri", "mongodb://172.254.0.4:27017/predictions.data") \
        .getOrCreate()
    ssc = StreamingContext(sc, 10)

    # Load model from HDFS
    model = LinearRegressionModel.load(sc, "hdfs://172.254.0.2:9000/user/root/models/first.model")

    # Create stream to get kafka messages
    directKafkaStream = KafkaUtils.createDirectStream(ssc, ["incomingData"],
                                                      {"metadata.broker.list": "172.254.0.7:9092"})

    # Predict and save to mongo
    directKafkaStream.foreachRDD(lambda time, rdd: predict(rdd, model, my_spark, time))

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
conf = SparkConf().setAppName('Linear least squares, Lasso, and ridge regression').setMaster('local[2]')
sc = SparkContext(conf=conf)


# load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])


data = sc.textFile('../data/lpsa.data')
parseData = data.map(parsePoint)

# build the model
model = LinearRegressionWithSGD.train(parseData, iterations=100, step=0.0000001)

# evaluate the model on training data
valuesAndPreds = parseData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda a, b: a + b) / valuesAndPreds.count()
print('mean squared error: ' + str(MSE))

# save and load model
model.save(sc, '../model/pythonLinearRegressionWithSGDModel')
sameModel = LinearRegressionModel.load(sc, '../model/pythonLinearRegressionWithSGDModel')
sc.stop()
#!usr/local/spark/python
import os
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel

sc = SparkContext("local")

# Read the feature vector from disk.
features = open("/root/Desktop/features.LOL", "r")
feature = features.read().strip().split(' ')
i = [float(x) for x in feature]

# The model path is stored on the third line of the parameters file.
param = open("/root/Desktop/parameters.LOL", "r")
temp = param.readlines()
supermodel = temp[2].strip()

Modelname = LinearRegressionModel.load(sc, supermodel)
r = str(Modelname.predict(i))
print(r)
ss = "echo %s >/root/Desktop/predicted.txt" % r
os.system(ss)
from pyspark.mllib.regression import LinearRegressionWithSGD, LinearRegressionModel, LabeledPoint
from pyspark import SparkContext


def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[2], [values[0], values[1]])


sc = SparkContext()
model = LinearRegressionModel.load(sc, "/home/khaled/project/tmp/lin_reg_model")

data_test = sc.textFile("/home/khaled/project/data_gen/test.csv")
data_test_parsed = data_test.map(parsePoint)
data_test2 = sc.textFile("/home/khaled/project/data_gen/test2.csv")
data_test_parsed2 = data_test2.map(lambda x: x.split(','))

predics = data_test_parsed.map(lambda x: model.predict(x.features))
predics2 = data_test_parsed2.map(lambda x: model.predict(x))
data_itr = predics.collect()
data_itr2 = predics2.collect()

f = open("predictions.txt", "w+")
f.write("good morning\n")
for i in data_itr:
    f.write("the output consumption is: " + str(i) + "\n")
for i in data_itr2:
    f.write("the output consumption is: " + str(i) + "\n")
f.close()
def parsePoint(line):
    values = [np.float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[6], values[0:6])


data = sc.textFile("/user/cloudera/hw1/train_nohead.csv")
wholedata = sc.textFile("/user/cloudera/hw1/wholedata.csv")
parsedData = data.map(parsePoint)
parsedWholeData = wholedata.map(parsePoint)

#Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.1)

#Evaluate the model
valuesAndPreds = parsedWholeData.map(lambda p: (p.label, model.predict(p.features)))
RMSE = np.sqrt(
    valuesAndPreds
    .map(lambda vp: (vp[0] - vp[1]) ** 2)
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
)
print("linear regression output :\n")
print("RMSE = {0}\n".format(RMSE))

#save and load model
model.save(sc, "/user/cloudera/hw1/results/2015310884_linear")
sameModel = LinearRegressionModel.load(sc, "/user/cloudera/hw1/results/2015310884_linear")
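# Equivalently, MLlib can compute these metrics directly; a short sketch
# reusing valuesAndPreds from the snippet above. RegressionMetrics expects
# (prediction, observation) pairs, hence the swap.
from pyspark.mllib.evaluation import RegressionMetrics

metrics = RegressionMetrics(valuesAndPreds.map(lambda vp: (float(vp[1]), float(vp[0]))))
print("RMSE via RegressionMetrics = {0}".format(metrics.rootMeanSquaredError))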
    LabeledPoint(2.0, [1.0, 1.4]),
    LabeledPoint(4.0, [2.0, 1.9]),
    LabeledPoint(6.0, [3.0, 4.0])
]  # training set

lrm = LinearRegressionWithSGD.train(sc.parallelize(data), iterations=100,
                                    initialWeights=np.array([1.0, 1.0]))
print(lrm.predict(np.array([2.0, 1.0])))  # predict with the trained regression model

import os, tempfile
from pyspark.mllib.regression import LinearRegressionModel
from pyspark.mllib.linalg import SparseVector

path = tempfile.mkdtemp()
lrm.save(sc, path)  # save the model to external storage
sameModel = LinearRegressionModel.load(sc, path)  # load the model back

# Predicting a single value from a SparseVector input.
print(sameModel.predict(SparseVector(2, {0: 100.0, 1: 150})))

test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(SparseVector(2, {0: i, 1: j}))
print(sameModel.predict(sc.parallelize(test_set)).collect())  # bulk predict; returns an RDD
print(sameModel.weights)  # the fitted weights

# ----------------- Ridge regression ------------------
from pyspark.mllib.regression import RidgeRegressionWithSGD
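# A minimal sketch of how the ridge-regression section presumably continues,
# mirroring the SGD call above; the step and regParam values are illustrative
# assumptions, not from the original.
rrm = RidgeRegressionWithSGD.train(sc.parallelize(data), iterations=100,
                                   step=1.0, regParam=0.01,
                                   initialWeights=np.array([1.0, 1.0]))
print(rrm.predict(np.array([2.0, 1.0])))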
    type=p[7],
    velocity=int(p[8]),
    error=p[9],
    integration=int(p[10]),
    station=p[11]
))

trafficDF = sqlContext.createDataFrame(trafficData)
trafficDF.registerTempTable("traffic")

query = sqlContext.sql("SELECT year, month, day, station, SUM(intensity) intensity "
                       "FROM traffic "
                       "WHERE error='N' AND station = '28079004' "
                       "GROUP BY year, month, day, station "
                       "LIMIT 1")

labelPoints = query.map(lambda line: [CommonFunctions.toWeekday(2000 + line[0], line[1], line[2]),
                                      CommonFunctions.clasification_intensity(line[4])])

model = LinearRegressionModel.load(sc, dirTrainingModel)
valueAir = model.predict(labelPoints.first())

data = query.map(lambda p: Row(
    valueAir=valueAir,
    year=int(p[0]),
    month=int(p[1]),
    day=int(p[2]),
    station=p[3],
    intensity=p[4]
))
print(data.collect())
# compute mean ndvi for comparison
meanNDVI = opsRDD.map(lambda x: x.label).mean()

##### use the MEAN NDVI as the prediction (for comparison)
meanRES = test.map(lambda x: (x.label, meanNDVI))
rmseVsMean = rmse(meanRES, numTest)
outString = "Simple Mean NDVI RMSE = " + str(rmseVsMean) + "\n\n"
fOut.write(outString)
######### MEAN NDVI

##### LINEAR REGRESSION WITH STOCHASTIC GRADIENT DESCENT
# if a model has already been trained, use it
# otherwise train a new one and save it
# (expanduser is needed so the "~" in the path is actually resolved)
modelDir = os.path.expanduser('~/CloudRepair/MODELS/lrmCR')
if os.path.exists(modelDir):
    lrm = LinearRegressionModel.load(sc, modelDir)
else:
    lrm = LinearRegressionWithSGD.train(training, iterations=10000,
                                        step=0.0000001, miniBatchFraction=0.10)
    lrm.save(sc, modelDir)

lrmPred = lrm.predict(test.map(lambda x: x.features))
lrmRES = test.map(lambda x: x.label).zip(lrmPred)
rmseLRM = rmse(lrmRES, numTest)
outString = "Linear Regression NDVI RMSE = " + str(rmseLRM) + "\n\n"
fOut.write(outString)
######### LINEAR REGRESSION WITH STOCHASTIC GRADIENT DESCENT

##### RANDOM FOREST optimization
for i in range(500):
    o.write(str(k[i][0]))
    o.write("        ")
    o.write(str(k[i][1]))
    o.write("<br>")
    o.write("\n")
o.write("</body>")
o.write("</html>")

os.system("rm -rf /root/Desktop/mpv2/mapredtest1/templates/valpred.html")
os.system("cp /root/Desktop/valpred.html /root/Desktop/mpv2/mapredtest1/templates/valpred.html")

'''
# Fraction of training predictions that are off by more than 20000:
trainerr = valuesAndPreds.filter(lambda vp: abs(vp[0] - vp[1]) > 20000).count() / float(parsedData.count())
print(valuesAndPreds.count())
print(parsedtrainData.take(5))
print(parsedtestData.take(1)[0].features)
print("answer =", model.predict(parsedtestData.take(1)[0].features))
print(parsedtestData.take(1)[0].label)
print(trainerr)
'''

# Save and load model
model.save(sc, supermodel)
sameModel = LinearRegressionModel.load(sc, supermodel)
from pyspark.mllib.regression import LinearRegressionWithSGD as lrSGD

ourModelWithLinearRegression = lrSGD.train(data=regressionLabelPointTrainData,
                                           iterations=200, step=0.02, intercept=True)
ourModelWithLinearRegression.intercept
ourModelWithLinearRegression.weights

#Step 9-6-5. Saving the created model.
ourModelWithLinearRegression.save(sc, '/home/pysparkbook/ourModelWithLinearRegression')

from pyspark.mllib.regression import LinearRegressionModel as linearRegressModel
ourModelWithLinearRegressionReloaded = linearRegressModel.load(sc, '/home/pysparkbook/ourModelWithLinearRegression')
ourModelWithLinearRegressionReloaded.intercept
ourModelWithLinearRegressionReloaded.weights

#Step 9-6-6. Predicting the data using model.
actualDataandLinearRegressionPredictedData = regressionLabelPointTestData.map(
    lambda data: (float(data.label), float(ourModelWithLinearRegression.predict(data.features))))
actualDataandLinearRegressionPredictedData.take(5)

#Step 9-6-7. Evaluating the model we have created.
from pyspark.mllib.evaluation import RegressionMetrics as rmtrcs

ourLinearRegressionModelMetrics = rmtrcs(actualDataandLinearRegressionPredictedData)
ourLinearRegressionModelMetrics.rootMeanSquaredError
ourLinearRegressionModelMetrics.r2
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))


# Save and load model (these mirror the MLlib wrapper's py4j bridge)
def save(self, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel(
        _py2java(sc, self._coeff), self.intercept)
    java_model.save(sc._jsc.sc(), path)


@classmethod
def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel.load(sc._jsc.sc(), path)
    weights = _java2py(sc, java_model.weights())
    intercept = java_model.intercept()
    model = LinearRegressionModel(weights, intercept)
    return model


# Save parameters, i.e. min_ and max_, on disk
f = open(params, "w")
f.write(str(min_) + "," + str(max_))
f.close()

model.save(sc, myModelPath)
sameModel = LinearRegressionModel.load(sc, myModelPath)
sample = testData.map(lambda p: p.features)
predictValues = sameModel.predict(sample)
def run_saved_model(pr_values, sc):
    values = list(pr_values)
    predict_model = LinearRegressionModel.load(
        sc, '/Users/xiaoru_zhu/PycharmProjects/HousingPriceDA/PricePrediction/model/LR.model')

    # Configure
    train_path = '/Users/xiaoru_zhu/PycharmProjects/HousingPriceDA/Dataset/train.csv'

    # Initialize RDD
    rdd_lines = sc.textFile(train_path)
    head = rdd_lines.first()
    rdd_lines = rdd_lines.filter(lambda ln: ln != head) \
        .mapPartitions(lambda x: csv.reader(x)) \
        .persist(StorageLevel(True, True, False, False, 1))  # MEMORY_AND_DISK

    # Prepare for normalization
    sub = []
    minimum = []
    for index in range(5, 8):
        max_ = float(rdd_lines.map(lambda attr: attr[index]).max(key=float))
        min_ = float(rdd_lines.map(lambda attr: attr[index]).min(key=float))
        subtract = max_ - min_
        minimum.append(min_)
        sub.append(subtract)

    # Normalization: (val - min) / (max - min) scales the numeric feature
    # values into [0, 1] and narrows down the error.
    def normalization(line):
        line[5] = (float(line[5]) - minimum[0]) / sub[0]
        line[6] = (float(line[6]) - minimum[1]) / sub[1]
        line[7] = (float(line[7]) - minimum[2]) / sub[2]
        return line

    values = normalization(values)

    # Extract the distinct values of each categorical column into a mapping
    # dict, e.g. {'BATH BEACH': 0, 'BAY RIDGE': 1, 'BEDFORD STUYVESANT': 2, ...}
    def be_mapped(rdd_arg, column):
        return rdd_arg.map(lambda attr: attr[column]) \
            .distinct() \
            .zipWithIndex() \
            .collectAsMap()

    mappings = [be_mapped(rdd_lines, i) for i in [0, 1, 2, 8]]  # one dict per categorical column
    print('category feature mapping dict:', mappings)
    cat_len = sum(map(len, mappings))      # number of categorical feature slots
    num_len = len(rdd_lines.first()[5:8])  # number of numeric features (indexes 5, 6, 7)
    total_len = num_len + cat_len          # total feature vector length

    rdd_lines = rdd_lines.map(lambda attr: normalization(attr))

    # Create the feature vector for linear regression: one-hot encode the
    # categorical columns, then append the normalized numeric columns.
    def extract_features(line):
        cat_vec = np.zeros(cat_len)  # new array for category features, init 0 for all elements
        step = 0
        for i, raw_feature in enumerate([line[0], line[1], line[2], line[8]]):
            dict_cate = mappings[i]           # mapping dict for this categorical column
            idx = dict_cate[raw_feature]      # position of this category within its block
            cat_vec[idx + step] = 1           # set the one-hot slot
            step = step + len(dict_cate)      # jump to the next column's block
        num_vec = np.array([float(raw_feature) for raw_feature in line[5:8]])
        return np.concatenate((cat_vec, num_vec))  # splice category and number vectors

    values_vec = extract_features(values)
    rst = round(predict_model.predict(values_vec), 2)
    r_m_s_l_e = round(1.4002, 2)
    m_a_e = round(2516004.8850, 2)
    rst_lst = [rst, r_m_s_l_e, m_a_e]
    print(rst_lst)
    return rst_lst
def load_model(self, sc, model_file):
    model = LinearRegressionModel.load(sc, model_file)
    return model
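# A minimal usage sketch for load_model above, assuming `helper` is an
# instance of the class defining it and the path holds a previously saved
# MLlib linear regression model; both names are hypothetical.
# model = helper.load_model(sc, "hdfs:///models/linear_model")
# print(model.predict([0.5, 1.2]))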
    return (county, LabeledPoint(values[-1], values[1:-1]))


if __name__ == '__main__':
    sc = SparkContext()
    data = sc.textFile(app.root_path + "/CSVs/test_cancer_final.csv")
    header = data.first()
    data = data.filter(lambda x: x != header)
    parsedData = data.map(parsePoint).map(lambda x: x[1])

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=10)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    print(valuesAndPreds.collect())
    MSE = valuesAndPreds \
        .map(lambda vp: (vp[0] - vp[1]) ** 2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))

    # Save and load model
    model.save(sc, app.root_path + "/Models/pythonLinearRegressionWithSGDModel_cancer")
    sameModel = LinearRegressionModel.load(
        sc, app.root_path + "/Models/pythonLinearRegressionWithSGDModel_cancer")
if __name__ == "__main__": sc = SparkContext(appName="PythonLinearRegressionWithSGDExample") # $example on$ # Load and parse the data def parsePoint(line): values = [float(x) for x in line.replace(',', ' ').split(' ')] return LabeledPoint(values[0], values[1:]) data = sc.textFile("data/mllib/ridge-data/lpsa.data") parsedData = data.map(parsePoint) # Build the model model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001) # Evaluate the model on training data valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) MSE = valuesAndPreds \ .map(lambda vp: (vp[0] - vp[1])**2) \ .reduce(lambda x, y: x + y) / valuesAndPreds.count() print("Mean Squared Error = " + str(MSE)) # Save and load model model.save(sc, "target/tmp/pythonLinearRegressionWithSGDModel") sameModel = LinearRegressionModel.load( sc, "target/tmp/pythonLinearRegressionWithSGDModel") # $example off$
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel

sc = SparkContext("local")


# Load and parse the data: column 2 (price) is the label, the rest are features.
# NB: non-numeric columns such as the raw date would need preprocessing before
# the float conversion below can succeed.
def parsePoint(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[2], values[0:2] + values[3:])


data = sc.textFile("/root/Desktop/dataset/kc_house_data.csv")
parsedData = data.map(parsePoint)

# Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

# Evaluate the model on training data
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = LinearRegressionModel.load(sc, "myModelPath")
if __name__ == "__main__": sc = SparkContext(appName="TicTacLinearRegressionExample") # Parse the data and create LabeledPoints def parsePoint(line): values = [x for x in line.split(' ')] # Last row contains the target data and rest of # the rows define the attributes for linear regression return LabeledPoint(values[9], values[0:8]) # Load the data data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt") parsedData = data.map(parsePoint) # Build the model using LinearRegression model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001) # Evaluate the model on training data predict = parsedData.map(lambda pd: (pd.label, model.predict(pd.features))) MSE = predict \ .map(lambda (v, p): (v - p)**2) \ .reduce(lambda x, y: x + y) / predict.count() # Print Mean Squared Error print("Mean Squared Error for Tic Tac Linear Regression = " + str(MSE)) # Save and load model model.save(sc, "target/tmp/pythonTicTacLinearRegression") sameModel = LinearRegressionModel.load(sc, "target/tmp/pythonTicTacLinearRegression")
sc = SparkContext(appName="TicTacLinearRegressionExample") # Parse the data and create LabeledPoints def parsePoint(line): values = [x for x in line.split(' ')] # Last row contains the target data and rest of # the rows define the attributes for linear regression return LabeledPoint(values[9], values[0:8]) # Load the data data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt") parsedData = data.map(parsePoint) # Build the model using LinearRegression model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001) # Evaluate the model on training data predict = parsedData.map(lambda pd: (pd.label, model.predict(pd.features))) MSE = predict \ .map(lambda (v, p): (v - p)**2) \ .reduce(lambda x, y: x + y) / predict.count() # Print Mean Squared Error print("Mean Squared Error for Tic Tac Linear Regression = " + str(MSE)) # Save and load model model.save(sc, "target/tmp/pythonTicTacLinearRegression") sameModel = LinearRegressionModel.load( sc, "target/tmp/pythonTicTacLinearRegression")
sc = SparkContext()
train_data = sc.textFile("train.csv")
test_data = sc.textFile("test.csv")
parsedTrainData = train_data.map(parsePoint).filter(lambda x: x is not None)
parsedTestData = test_data.map(parsePoint).filter(lambda x: x is not None)

# Project the training data onto its first two principal components.
mat = RowMatrix(train_data.map(parseVector).filter(lambda x: x is not None))
pc = mat.computePrincipalComponents(2)
projected = mat.multiply(pc)
x = [vector[0] for vector in projected.rows.collect()]
y = [vector[1] for vector in projected.rows.collect()]

# Load the previously saved models.
LinearModel = LinearRegressionModel.load(sc, "Linear")
RidgeModel = RidgeRegressionModel.load(sc, "Ridge")
LassoModel = LassoModel.load(sc, "Lasso")

valuesAndPredsLinearTrain = parsedTrainData.map(
    lambda p: (p.label, LinearModel.predict(p.features)))
valuesAndPredsLinearTest = parsedTestData.map(
    lambda p: (p.label, LinearModel.predict(p.features)))
valuesAndPredsRidgeTrain = parsedTrainData.map(
    lambda p: (p.label, RidgeModel.predict(p.features)))
valuesAndPredsRidgeTest = parsedTestData.map(
    lambda p: (p.label, RidgeModel.predict(p.features)))
valuesAndPredsLassoTrain = parsedTrainData.map(
    lambda p: (p.label, LassoModel.predict(p.features)))
#Section 7.5.2
from pyspark.mllib.evaluation import RegressionMetrics
validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
validMetrics.meanSquaredError

#Section 7.5.3
import operator
print(",".join([str(s) for s in sorted(enumerate([abs(x) for x in model.weights.toArray()]),
                                       key=operator.itemgetter(0))]))

#Section 7.5.4
model.save(sc, "ch07output/model")
from pyspark.mllib.regression import LinearRegressionModel
model = LinearRegressionModel.load(sc, "ch07output/model")

#Section 7.6.1
def iterateLRwSGD(iterNums, stepSizes, train, valid):
    from pyspark.mllib.regression import LinearRegressionWithSGD
    import math
    for numIter in iterNums:
        for step in stepSizes:
            alg = LinearRegressionWithSGD()
            model = alg.train(train, iterations=numIter, step=step, intercept=True)
            rescaledPredicts = train.map(lambda x: (float(model.predict(x.features)), x.label))
            validPredicts = valid.map(lambda x: (float(model.predict(x.features)), x.label))
            meanSquared = math.sqrt(rescaledPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())
            meanSquaredValid = math.sqrt(validPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())
            print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, meanSquared, meanSquaredValid))
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.replace(',', ' ').split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds \
        .map(lambda vp: (vp[0] - vp[1])**2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))

    # Save and load model
    model.save(sc, "target/tmp/pythonLinearRegressionWithSGDModel")
    sameModel = LinearRegressionModel.load(sc, "target/tmp/pythonLinearRegressionWithSGDModel")
    # $example off$
    return LabeledPoint(values[7], values[0:11])


#data_file = sc.textFile("/home/faiz89/Desktop/Eastman/2008.csv")
data_file = sc.textFile("../2008_small.csv")
header = data_file.first()
raw_data = data_file.filter(lambda x: x != header)
#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache()
model = LinearRegressionWithSGD.train(trainingData, iterations=1)
print('Training time consumed =', datetime.now() - startTime)

startTestTime = datetime.now()
testData.cache()
# Evaluate the model on the held-out test data
valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda vp: (vp[0] - vp[1]) ** 2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
print('Testing time consumed =', datetime.now() - startTestTime)
print('Total time:', datetime.now() - startTime)
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
from pyspark.sql.functions import *
from datetime import datetime, timedelta
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.feature import HashingTF

inputPath = '/user/ssambasi/SFPD_parquet'

conf = SparkConf().setAppName('Predict Alarming District')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

crimeDF = sqlContext.read.parquet(inputPath).cache()
htf = HashingTF(5000)

#Load the model
lrm = LinearRegressionModel.load(sc, '/user/ssambasi/sfo/CrimeCountPredictionModel')

#Load test data for demo
districtRDD = crimeDF.select('PdDistrict').distinct().rdd \
    .filter(lambda r: r[0] != '').map(lambda r: r[0]).cache()
startDate = datetime.now()
dateList = []
for dateIndex in range(0, 30):
    dateList.append(startDate + timedelta(days=dateIndex))
dateRDD = sc.parallelize(dateList).cache()
testDataRDD = districtRDD.cartesian(dateRDD).map(lambda (district, date): \
    ((district, date), LabeledPoint(1.0, htf.transform((district, date))))).cache()

#Predict Alarming District using the model
def GetMaXCount((district1, count1), (district2, count2)):
    if(count1 > count2):