    def load_parameters(self):
        self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                 file_name='amount_method')
        self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                file_name='trend_method')
        self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
        self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
        self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
        amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
        trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

        if self.amount_prediction_method == self.RANDOM_FOREST:
            amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
        else:
            amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

        if self.trend_prediction_method == self.RANDOM_FOREST:
            trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.SVM:
            trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
        else:
            trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

        return trend_model, amount_model
def prediction(hour, extern_temp):
    if hour >= 0 and hour < 6:

        #Predict Temperature
        tempModel1 = LinearRegressionModel.load(
            sc, "/home/dhruv/Desktop/IoT/Assignment-3/Final/model1")
        x = np.array([extern_temp])
        temp_data = tempModel1.predict(x)
        print temp_data

    elif hour >= 6 and hour < 12:

        #Predict Temperature
        tempModel2 = LinearRegressionModel.load(
            sc, "/home/dhruv/Desktop/IoT/Assignment-3/Final/model2")
        x = np.array([extern_temp])
        temp_data = tempModel2.predict(x)
        print temp_data

    elif hour >= 12 and hour < 18:

        #Predict Temperature
        tempModel3 = LinearRegressionModel.load(
            sc, "/home/dhruv/Desktop/IoT/Assignment-3/Final/model3")
        x = np.array([extern_temp])
        temp_data = tempModel3.predict(x)
        print temp_data

    elif hour >= 18 and hour < 24:

        #Predict Temperature
        tempModel4 = LinearRegressionModel.load(
            sc, "/home/dhruv/Desktop/IoT/Assignment-3/Final/model4")
        x = np.array([extern_temp])
        temp_data = tempModel4.predict(x)
        print temp_data
    f.write('%.2f' % temp_data)
    f.write('\n')
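# Consolidation sketch (not the original author's code): the four branches above differ
# only in which saved model they load, so the path can be derived from the 6-hour bucket.
# The globals sc, np and f, and the model1..model4 directories, are assumed to be the
# same ones used above.
def prediction_compact(hour, extern_temp):
    bucket = hour // 6 + 1  # hours 0-5 -> model1, 6-11 -> model2, 12-17 -> model3, 18-23 -> model4
    model = LinearRegressionModel.load(
        sc, "/home/dhruv/Desktop/IoT/Assignment-3/Final/model%d" % bucket)
    temp_data = model.predict(np.array([extern_temp]))
    print temp_data
    f.write('%.2f\n' % temp_data)
    return temp_data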
def main(sc):
    features_cr = sc.pickleFile('/tmp/features_saved')
    linear_model = LinearRegressionModel.load(sc, "/tmp/linear_model")
    # Getting the features ready for predicting
    numberFeatures = len(features_cr.first()) - 1
    mappings = [get_mapping(features_cr, i) for i in range(0, numberFeatures)]

    # Month Dictionary
    dictio_month = {}
    for i in range(12):
        dictio_month[i + 1] = i
    mappings[1] = dictio_month

    cat_len = sum(map(len, mappings))

    if len(sys.argv) == 2:
        dateZoneGroup = str(sys.argv[1]).split(',')
        zones = get_group_zone(dateZoneGroup[5], 'zone', features_cr)
        groups = get_group_zone(dateZoneGroup[6], 'group', features_cr)
        year_feat = int(dateZoneGroup[0])
        month_feat = int(dateZoneGroup[1])
        day_feat = int(dateZoneGroup[3])
        startDate = date(year_feat, month_feat, day_feat)
        endDate = date(year_feat, month_feat, day_feat)
        print(startDate)
    else:
        startDate = datetime.strptime(str(sys.argv[1]), event_date).date()
        endDate = datetime.strptime(str(sys.argv[2]), event_date).date()
        zones = get_group_zone(str(sys.argv[3]), 'zone', features_cr)
        groups = get_group_zone(str(sys.argv[4]), 'group', features_cr)

    dateRangeWF = list(date_range(startDate, endDate))

    # Making predictions:

    feat_vs_pred = list()
    featPredShow = list()
    for g in groups:
        for z in zones:
            for dR in dateRangeWF:
                featureLine = format_date(dR, z, g, mappings, cat_len)
                #print(featureLine)
                featureShow = format_show(dR, z, g)
                predLine = linear_model.predict(featureLine)
                feat_vs_pred.append(list(featureLine) + [predLine])
                featPredShow.append(list(featureShow) + [predLine])

    scFeaturesPred = sc.parallelize(featPredShow)
    [print(j) for j in scFeaturesPred.collect()]
def prediction():
    year=yearprediction
    stations = sc.textFile(output+"/stationtextformat")
    stations = stations.map(getdata).map(lambda x: (x[0], int(year), float(x[1]), float(x[2])))
    lat = stations.map(lambda x: (x[2])).cache()
    min_lat = lat.min()
    max_lat = lat.max()

    longtitude =  stations.map(lambda x: (x[3])).cache()
    min_long = longtitude.min()
    max_long = longtitude.max()

    max_ = [float('2050'), max_lat, max_long]
    min_ = [float('1990'), min_lat, min_long]

    stations = stations.map(lambda x: scalePoint(x, max_, min_)).cache()
    stationsDF = sqlContext.createDataFrame(stations)
    # load the model
    sameModel = LinearRegressionModel.load(sc, output+"/modelpath")
    # run the model
    stationidAndPreds = stations.map(lambda p : (p[0],  float(sameModel.predict(p[1:]))))
    # the result returns a predicted value for each station (stationId) in the given year
    resultRdd = stationidAndPreds.map(rescale)
    rddschema = resultRdd.map(lambda (a,b): Row(station= a, avg_prcp=b)).cache()
    stationidAndPredsDF = sqlContext.createDataFrame(rddschema)
    stationidAndPredsDF.registerTempTable("stationPrediction")
    getCountries()
    countriesText = sc.textFile(output+"/countries")
    countriesRdd = countriesText.map(getdata)
    countries = countriesRdd.map(lambda (a,b): Row(station= a, country=b)).cache()
    countriesDF = sqlContext.createDataFrame(countries)
    countriesDF.registerTempTable("StationTable")
    countriesDF.cache()
    shortenstations = sqlContext.sql("SELECT SUBSTR(station, 1, 2) As station,avg_prcp FROM stationPrediction")
    shortenstations.show()
    joinedresult = countriesDF.join(shortenstations).where(countriesDF.station == shortenstations.station).select(shortenstations.avg_prcp, countriesDF.country)
    joinedresult.registerTempTable("joinedresult")
    results = sqlContext.sql("SELECT country, Avg(avg_prcp) as avg_prcp FROM joinedresult GROUP BY country")
    results.registerTempTable("results")
    outrdd=results.repartition(40).rdd.map(lambda l: str(l.country)+","+str(l.avg_prcp)).coalesce(1)
    path = yearprediction
    outrdd.saveAsTextFile(output+'/prediction/'+path)
def CustomPredict(date_start, date_end, company):

# create spark context
    sc = SparkContext(appName="Model")
# create an api object
    api = NewsAPI.NewsAPI(date_start.month,date_start.day,date_start.year,date_end.month,date_end.day,date_end.year, company,'56283d7d6075b9d30773e1ceb440e1b2d029f438')
# load the prediction model for the company
    model = LinearRegressionModel.load(sc, company)
# getting data for the duration of time specified
    api.startGetData()
# get the sentiment average of the days
    l = api.getSentimentScore()
    mean_sent = np.mean(l)
    print ("\n\n\n\n\n" + str(mean_sent) + "\n\n\n\n\n")
# make the prediction using the loaded model
    pred = model.predict([mean_sent])
    print ("\n\n\n\n\n" + str(pred) + "\n\n\n\n\n")
# close the spark context
    sc.stop()
# return
    return pred
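# Hedged usage sketch (the dates and the 'AAPL' ticker are illustrative, not from the source):
# the company argument doubles as the path of the saved per-company model that
# LinearRegressionModel.load() reads above.
if __name__ == "__main__":
    from datetime import date
    print(CustomPredict(date(2016, 1, 4), date(2016, 1, 8), 'AAPL'))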
# Example 7
import ast
import json
import psycopg2
from pyspark import sql
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.mllib.regression import LinearRegressionModel


def main():

    spark = SparkSession.builder.appName("TRAFFIC").config("spark.executor.cores", "4").config("spark.executor.memory", "4g").getOrCreate()
    sc = spark.sparkContext
    mapping = sc.textFile("s3a://insighttraffic/ML_model/mappings").collect()[0]
    mapping = ast.literal_eval(str(mapping))

    models=[]
    for hour in range(0, 24):
        model = LinearRegressionModel.load(sc, "s3a://insighttraffic/ML_model/linear_model_log_"+str(hour))
        models.append(model)

    category_len = 154

    sqlContext = sql.SQLContext(sc)


    hadoop_conf=sc._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    hadoop_conf.set("fs.s3n.awsAccessKeyId", 'awsAccessKeyId')
    hadoop_conf.set("fs.s3n.awsSecretAccessKey", 'awsSecretAccessKey')

    # set microbatch interval as 10 seconds, this can be customized according to the project
    ssc = StreamingContext(sc,10)
    # directly receive the data under a certain topic
    kafkaStream = KafkaUtils.createDirectStream(ssc, ['data'], {"metadata.broker.list": 'Kafka-DNS:9092'})


    connection = psycopg2.connect(host = 'postgres-ip-address', database = 'postgres', user = '******', password = '******')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS realtimetraffic (sid text, location text, latitude double precision, longitude double precision,\
        direction text, lanes integer, roadtype text, highway text, current integer, historical double precision, level text, PRIMARY KEY (sid));')
    cursor.execute('SELECT AddGeometryColumn (%s,%s,%s,4326,%s,2);', ('public', 'realtimetraffic', 'geom', 'POINT'))


    #The inbound stream is a DStream
    dstream = kafkaStream.map(lambda (key, value): json.loads(value))
    dstream.foreachRDD(lambda rdd: update(rdd, models, mapping))
#!/usr/local/spark/python
import os

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel

sc = SparkContext("local")
features = open("/root/Desktop/features.LOL", "r")
feature = features.read().strip().split(' ')
i = map(lambda x: float(x), feature)

param = open("/root/Desktop/parameters.LOL", "r")
temp = param.readlines()
print "LOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOLOL"
supermodel = temp[2].strip()
Modelname = LinearRegressionModel.load(sc, supermodel)

print Modelname.predict(i), "ijfksfksnfknfn jncjnnfknsklfnsk"
r = str(Modelname.predict(i))
ss = "echo %s >/root/Desktop/predicted.txt" % r
os.system(ss)
# Example 9
from pyspark.mllib.regression import LinearRegressionWithSGD, LinearRegressionModel, LabeledPoint
from pyspark import SparkContext

def parsePoint(line):

	values=[float(x) for x in line.split(',')]
	
	return LabeledPoint(values[2],[values[0], values[1]])


sc=SparkContext()

model = LinearRegressionModel.load(sc, "/home/khaled/project/tmp/lin_reg_model")

data_test=sc.textFile("/home/khaled/project/data_gen/test.csv")
data_test_parsed=data_test.map(parsePoint)
data_test2=sc.textFile("/home/khaled/project/data_gen/test2.csv")
data_test_parsed2=data_test2.map(lambda x: x.split(','))
predics=data_test_parsed.map(lambda x :model.predict(x.features))
predics2=data_test_parsed2.map(lambda x :model.predict(x))
data_itr=predics.collect()
data_itr2=predics2.collect()
f=open("predictions.txt","w+")
f.write("sbah el khir \n")
for i in data_itr:
	f.write("the output consumption is: " + str(i) + "\n")
for i in data_itr2:
	f.write("the output consumption is: " + str(i) + "\n")
f.close()
# Example 10

for i in range(500):

    o.write(str(k[i][0]))
    o.write("&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp&nbsp")
    o.write(str(k[i][1]))
    o.write("<br>")
    o.write("\n")
o.write("</body>")
o.write("</html>")

os.system("rm -rf /root/Desktop/mpv2/mapredtest1/templates/valpred.html")
os.system(
    "cp /root/Desktop/valpred.html /root/Desktop/mpv2/mapredtest1/templates/valpred.html"
)
'''
trainerr=valuesAndPreds.filter(lambda(v,p):abs(v-p)>20000).count()/float(parsedData.count())
print valuesAndPreds.count(),"iuieuwieuiueieuiwuieuriuwieuieuiruwiurieuriue"
#print("Mean Squared Error = " + str(MSE)+"bkbkbbbbbbbbbbbbbbbbbbbbbbbbkkbkbkkkkkkkkkkkkkkkkkkkkkkkkkkkkkbkbkbkb")
print parsedtrainData.take(5),"hjjhjhjhjjjjjjjjjjjjjjjjjj"
print parsedtestData.take(1)[0].features,"lnlnlnnnlnnnnnnnnnnnnnnlnlnanslnlnslnlansdlnads"
print "answer bjbsbdjk=" , model.predict(parsedtestData.take(1)[0].features)
print parsedtestData.take(1)[0].label ,"lkmnhbgvyctexrwzea"
print parsedtrainData.take(5)
print trainerr,"rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr"
'''
# Save and load model
model.save(sc, supermodel)
sameModel = LinearRegressionModel.load(sc, supermodel)
# Example 11
def parsePoint(line):
    values = [np.float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[6], values[0:6])


data = sc.textFile("/user/cloudera/hw1/train_nohead.csv")
wholedata = sc.textFile("/user/cloudera/hw1/wholedata.csv")

parsedData = data.map(parsePoint)
parsedWholeData = wholedata.map(parsePoint)

#Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.1)

#Evaluate the model
valuesAndPreds = parsedWholeData.map(lambda p:
                                     (p.label, model.predict(p.features)))

RMSE = np.sqrt(
    valuesAndPreds \
 .map(lambda (v, p): (v - p)**2) \
 .reduce(lambda x, y: x + y) / valuesAndPreds.count()
)
print("linear regression output : \n")
print("RMSE = {0}\n".format(RMSE))

#save and load model
model.save(sc, "/user/cloudera/hw1/results/2015310884_linear")
sameModel = LinearRegressionModel.load(
    sc, "/user/cloudera/hw1/results/2015310884_linear")
# Example 12
data = [
    LabeledPoint(2.0, [1.0, 1.4]),
    LabeledPoint(4.0, [2.0, 1.9]),
    LabeledPoint(6.0, [3.0, 4.0])
]  # training set
lrm = LinearRegressionWithSGD.train(sc.parallelize(data),
                                    iterations=100,
                                    initialWeights=np.array([1.0, 1.0]))
print(lrm.predict(np.array([2.0, 1.0])))  # predict with the trained regression model

import os, tempfile
from pyspark.mllib.regression import LinearRegressionModel
from pyspark.mllib.linalg import SparseVector

path = tempfile.mkdtemp()
lrm.save(sc, path)  # save the model to disk
sameModel = LinearRegressionModel.load(sc, path)  # load the model back
print(sameModel.predict(SparseVector(2, {
    0: 100.0,
    1: 150
})))  # predict from a SparseVector; returns a single predicted value
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(SparseVector(2, {0: i, 1: j}))
print(sameModel.predict(sc.parallelize(test_set)).collect())  # predict many values; returns an RDD
print(sameModel.weights)  # print the model weights

# ----------------- Ridge regression ------------------

from pyspark.mllib.regression import RidgeRegressionWithSGD
    return (county, LabeledPoint(values[-1], values[1:-1]))


if __name__ == '__main__':
    sc = SparkContext()

    data = sc.textFile(app.root_path + "/CSVs/test_cancer_final.csv")
    header = data.first()
    data = data.filter(lambda x: x != header)

    parsedData = data.map(parsePoint).map(lambda x: x[1])

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=10)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    print(valuesAndPreds.collect())
    MSE = valuesAndPreds \
              .map(lambda vp: (vp[0] - vp[1]) ** 2) \
              .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))

    # Save and load model
    model.save(
        sc,
        app.root_path + "/Models/pythonLinearRegressionWithSGDModel_cancer")
    sameModel = LinearRegressionModel.load(
        sc,
        app.root_path + "/Models/pythonLinearRegressionWithSGDModel_cancer")
# Example 15
from pyspark.mllib.regression import LinearRegressionWithSGD as lrSGD
ourModelWithLinearRegression = lrSGD.train(data=regressionLabelPointTrainData,
                                           iterations=200,
                                           step=0.02,
                                           intercept=True)

ourModelWithLinearRegression.intercept
ourModelWithLinearRegression.weights

#Step 9-6-5. Saving the created model.

ourModelWithLinearRegression.save(sc, '/home/pysparkbook/ourModelWithLinearRegression')
from pyspark.mllib.regression import LinearRegressionModel as linearRegressModel

ourModelWithLinearRegressionReloaded = linearRegressModel.load(sc, '/home/pysparkbook/ourModelWithLinearRegression')
ourModelWithLinearRegressionReloaded.intercept
ourModelWithLinearRegressionReloaded.weights

#Step 9-6-6. Predicting the data using model.

actualDataandLinearRegressionPredictedData = regressionLabelPointTestData.map(lambda data : (float(data.label) , float(ourModelWithLinearRegression.predict(data.features))))
actualDataandLinearRegressionPredictedData.take(5)

#Step 9-6-7. Evaluating the model we have created.

from pyspark.mllib.evaluation import RegressionMetrics as rmtrcs
ourLinearRegressionModelMetrics = rmtrcs(actualDataandLinearRegressionPredictedData)
ourLinearRegressionModelMetrics.rootMeanSquaredError
ourLinearRegressionModelMetrics.r2
MSE = valuesAndPreds.map(lambda (v, p): (v - p) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

# Save and load model
def save(self, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel(
        _py2java(sc, self._coeff), self.intercept
    )
    java_model.save(sc._jsc.sc(), path)


@classmethod
def load(cls, sc, path):
    java_model = sc._jvm.org.apache.spark.mllib.regression.LinearRegressionModel.load(sc._jsc.sc(), path)
    weights = _java2py(sc, java_model.weights())
    intercept = java_model.intercept()
    model = LinearRegressionModel(weights, intercept)
    return model


# Save parameters i.e. min_ and max_ on disk
f = open(params, "w")
f.write(str(str(min_) + "," + str(max_)))
f.close()


model.save(sc, myModelPath)
sameModel = LinearRegressionModel.load(sc, myModelPath)
sample = testData.map(lambda p: p.features)
predictValues = sameModel.predict(sample)
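# Companion sketch (assumes the write format used above: str(min_) + "," + str(max_),
# where min_ and max_ are Python lists, and the same `params` file name). ast.literal_eval
# parses that string back into a (min_, max_) pair so new feature vectors can be scaled
# the same way before calling sameModel.predict().
import ast

with open(params) as fh:
    min_, max_ = ast.literal_eval(fh.read())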
def run_saved_model(pr_values, sc):
    values = list(pr_values)
    predict_model = LinearRegressionModel.load(sc, '/Users/xiaoru_zhu/PycharmProjects/HousingPriceDA/PricePrediction/model/LR.model')

    # Configure
    train_path = '/Users/xiaoru_zhu/PycharmProjects/HousingPriceDA/Dataset/train.csv'
    # Initialize RDD
    rdd_lines = sc.textFile(train_path)
    head = rdd_lines.first()
    rdd_lines = rdd_lines.filter(lambda ln: ln != head) \
                         .mapPartitions(lambda x: csv.reader(x)) \
                         .persist(StorageLevel(True, True, False, False, 1))  # MEMORY_AND_DISK
    # Prepare for normalization
    sub = []
    minimum = []
    for index in range(5, 8):
        max_ = float(rdd_lines.map(lambda attr: attr[index]).max(key=float))
        min_ = float(rdd_lines.map(lambda attr: attr[index]).min(key=float))
        subtract = max_ - min_
        minimum.append(min_)
        sub.append(subtract)

    # Normalization: (val - min) / (max - min), maps the numeric feature values into [0, 1] to reduce error
    def normalization(line):
        line[5] = (float(line[5]) - minimum[0]) / sub[0]
        line[6] = (float(line[6]) - minimum[1]) / sub[1]
        line[7] = (float(line[7]) - minimum[2]) / sub[2]
        return line

    values = normalization(values)


    # extract features from every category column and generate dict
    def be_mapped(rdd_arg, column):
        return rdd_arg.map(lambda attr: attr[column]) \
                      .distinct() \
                      .zipWithIndex() \
                      .collectAsMap()  # result : {'BATH BEACH': 0, 'BAY RIDGE': 1, 'BEDFORD STUYVESANT': 2, ...}

    mappings = [be_mapped(rdd_lines, i) for i in [0, 1, 2, 8]]  # collect dicts into a list
    print('category feature mapping dict:', mappings)
    cat_len = sum(map(len, [i for i in mappings]))  # total count of categorical feature values (sum + map)
    num_len = len(rdd_lines.first()[5:8])  # count of numeric features, indexes 5, 6, 7
    total_len = num_len + cat_len  # total feature count

    rdd_lines = rdd_lines.map(lambda attr: normalization(attr))


    # extract features from every category column and generate dict
    def be_mapped(rdd_arg, column):
        return rdd_arg.map(lambda attr: attr[column]) \
            .distinct() \
            .zipWithIndex() \
            .collectAsMap()  # result : {'BATH BEACH': 0, 'BAY RIDGE': 1, 'BEDFORD STUYVESANT': 2, ...}

    mappings = [be_mapped(rdd_lines, i) for i in [0, 1, 2, 8]]  # collect dicts into a list
    print('category feature mapping dict:', mappings)
    cat_len = sum(map(len, [i for i in mappings]))  # category feature numbers using sum + map function
    # num_len = len(rdd_lines.first()[5:8])  # number feature numbers,index = 5,6,7

    # Create eigenvectors(feature vectors) for linear regression
    def extract_features(line):
        cat_vec = np.zeros(cat_len)  # new array for category features, init 0 for all elements
        step = 0
        for i, raw_feature in enumerate([line[0], line[1], line[2], line[8]]):  # [(0,line[0]), (1,line[1]), ...) ]
            dict_cate = mappings[i]  # category feature mapping dict {'BATH BEACH': 0, 'BAY RIDGE': 1, 'xxx': 2, ...}
            idx = dict_cate[raw_feature]  # get value from dict
            cat_vec[idx + step] = 1  # set 1 for index in array
            step = step + len(dict_cate)  # jump to the next attribute area
        num_vec = np.array([float(raw_feature) for raw_feature in line[5:8]])
        return np.concatenate((cat_vec, num_vec))  # splice category and number vectors

    values_vec = extract_features(values)

    rst = predict_model.predict(values_vec)

    rst = round(rst, 2)
    r_m_s_l_e = round(1.4002, 2)
    m_a_e = round(2516004.8850, 2)
    rst_lst = [rst, r_m_s_l_e, m_a_e]
    print(rst_lst)
    return rst_lst
# Example 18
    def load_model(self, sc, model_file):
        model = LinearRegressionModel.load(sc, model_file)
        return model
# Example 19
# MAGIC %md The tuned model does better! (Note: Performance can vary because of randomness, but it should be better.)

# COMMAND ----------

print 'Tuned model with best alpha = %g' % bestAlpha
print '  Model intercept: %g' % tunedClf.intercept_
print '  Model coefficients:'
for i in range(len(featureNames)):
  print '    %g\t%s' % (tunedClf.coef_[i], featureNames[i])

# COMMAND ----------

# MAGIC %md ## 3. Converting between scikit-learn and MLlib models
# MAGIC 
# MAGIC It is often possible to convert between scikit-learn and MLlib models.  There is not built-in functionality yet, but we show how to do the conversion for linear models.  This can be useful to take advantage of each library's different sets of functionality.

# COMMAND ----------

# Convert the scikit-learn model into an equivalent MLlib model
from pyspark.mllib.regression import LinearRegressionModel
mllibModel = LinearRegressionModel(tunedClf.coef_, tunedClf.intercept_)
mllibModel

# COMMAND ----------

# Demonstrate that the models compute the same predictions
sklearnPredictions = tunedClf.predict(testFeatures)
mllibPredictions = numpy.array(map(lambda x: mllibModel.predict(x), testFeatures))
differences = sklearnPredictions - mllibPredictions
sumSquaredDifferences = sum(differences * differences)
print 'Total difference between scikit-learn and MLlib model predictions: %g' % sumSquaredDifferences
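# COMMAND ----------

# MAGIC %md A sketch of the reverse direction, which this excerpt does not show: wrap the MLlib model's weights and intercept in a scikit-learn `LinearRegression` by assigning the fitted attributes directly. This is a manual workaround rather than an official conversion API, and it assumes a plain linear model with no feature scaling.

# COMMAND ----------

# Build a scikit-learn LinearRegression from the MLlib model's parameters
from sklearn.linear_model import LinearRegression
sklearnFromMllib = LinearRegression()
sklearnFromMllib.coef_ = numpy.array(mllibModel.weights.toArray())
sklearnFromMllib.intercept_ = float(mllibModel.intercept)
# Its predictions should match the scikit-learn predictions computed above
sklearnRoundTrip = sklearnFromMllib.predict(testFeatures)
print 'Max abs difference vs original scikit-learn predictions: %g' % max(abs(sklearnRoundTrip - sklearnPredictions))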
# Example 20
# compute mean ndvi for comparison
meanNDVI = opsRDD.map(lambda x: x.label).mean()

##### use the MEAN NDVI as the prediction (for comparison)
meanRES = test.map(lambda x: (x.label,meanNDVI))
rmseVsMean = rmse(meanRES,numTest)
outString = "Simple Mean NDVI RMSE = " + str(rmseVsMean) + "\n\n"
fOut.write(outString)
######### MEAN NDVI

##### LINEAR REGRESSION WITH STOCHASTIC GRADIENT DESCENT
# if a model has already been trained, use it
# otherwise train a new one and save it
modelPath = os.path.expanduser('~/CloudRepair/MODELS/lrmCR')  # os.path.exists does not expand '~'
if os.path.exists(modelPath):
    lrm = LinearRegressionModel.load(sc, modelPath)
else:
    lrm = LinearRegressionWithSGD.train(training,
                                        iterations=10000,
                                        step=0.0000001,
                                        miniBatchFraction=0.10)
    lrm.save(sc, modelPath)

lrmPred = lrm.predict(test.map(lambda x: x.features))
lrmRES = test.map(lambda x: x.label).zip(lrmPred)
rmseLRM = rmse(lrmRES,numTest)
outString = "Linear Regression NDVI RMSE = " + str(rmseLRM) + "\n\n"
fOut.write(outString)
######### LINEAR REGRESSION WITH STOCHASTIC GRADIENT DESCENT

##### RANDOM FOREST optimization
# Example 21
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel

conf = SparkConf().setAppName(
    'Linear least squares, Lasso, and ridge regression').setMaster('local[2]')
sc = SparkContext(conf=conf)


# load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])


data = sc.textFile('../data/lpsa.data')
parseData = data.map(parsePoint)

# build the model
model = LinearRegressionWithSGD.train(parseData,
                                      iterations=100,
                                      step=0.0000001)

# evaluate the model on training data
valuesAndPreds = parseData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(
    lambda a, b: a + b) / valuesAndPreds.count()
print('mean squared error :' + str(MSE))

# save and load model
model.save(sc, '../model/pythonLinearRegressionWithSGDModel')
sameModel = LinearRegressionModel.load(
    sc, '../model/pythonLinearRegressionWithSGDModel')
sc.stop()
# Example 22
        df = sparkSession.createDataFrame([(time.strftime("%Y-%m-%d %H:%M:%S"), store_id, result)], ["timePredicted", "store_id", "value"])
        df.write.format("com.mongodb.spark.sql.DefaultSource").mode("append").save()

        return predicted
    else:
        print("No data received")

if __name__ == "__main__":
    sc = SparkContext(appName="NinoxStreaming")

    my_spark = SparkSession \
        .builder \
        .appName("Ninox") \
        .config("spark.mongodb.input.uri", "mongodb://172.254.0.4:27017/predictions.data") \
        .config("spark.mongodb.output.uri", "mongodb://172.254.0.4:27017/predictions.data") \
        .getOrCreate()

    ssc = StreamingContext(sc, 10)

    # Load model from HDFS
    model = LinearRegressionModel.load(sc, "hdfs://172.254.0.2:9000/user/root/models/first.model")

    # Create stream to get kafka messages
    directKafkaStream = KafkaUtils.createDirectStream(ssc, ["incomingData"], {"metadata.broker.list": "172.254.0.7:9092"})
    
    # Predict and save to mongo
    directKafkaStream.foreachRDD(lambda time, rdd: predict(rdd, model, my_spark, time))

    ssc.start()
    ssc.awaitTermination()
    sc.stop()
# Example 23
                type=p[7],
                velocity=int(p[8]),
                error=p[9],
                integration=int(p[10]),
                station=p[11]
            ))

        trafficDF = sqlContext.createDataFrame(trafficData)
        trafficDF.registerTempTable("traffic")

        query = sqlContext.sql("SELECT year, month, day, station, SUM(intensity) intensity  "
                                "FROM traffic "
                                "WHERE error='N' AND station = '28079004' "
                                "GROUP BY year, month, day, station "
                                "LIMIT 1")

        labelPoints = query.map(lambda line:[CommonFunctions.toWeekday(2000 + line[0], line[1], line[2]), CommonFunctions.clasification_intensity(line[4])])
        model = LinearRegressionModel.load(sc, dirTrainingModel)
        valueAir = model.predict(labelPoints.first())

        data = query.map(lambda p: Row(
            valueAir = valueAir,
            year=int(p[0]),
            month=int(p[1]),
            day=int(p[2]),
            station=p[3],
            intensity=p[4]
        ))

        print data.collect()
if __name__ == "__main__":

    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.replace(',', ' ').split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData,
                                          iterations=100,
                                          step=0.00000001)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p:
                                    (p.label, model.predict(p.features)))
    MSE = valuesAndPreds \
        .map(lambda vp: (vp[0] - vp[1])**2) \
        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))

    # Save and load model
    model.save(sc, "target/tmp/pythonLinearRegressionWithSGDModel")
    sameModel = LinearRegressionModel.load(
        sc, "target/tmp/pythonLinearRegressionWithSGDModel")
    # $example off$
import numpy as np
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel

sc = SparkContext("local")


# Load and parse the data
def parsePoint(line):
    #values = [float(x) for x in line.replace(',', ' ').split(' ')]
    values = line.split(',')
    return LabeledPoint(values[2:3], values[0:2] + values[3:])


data = sc.textFile("/root/Desktop/dataset/kc_house_data.csv")
parsedData = data.map(parsePoint)

# Build the model
model = LinearRegressionWithSGD.train(parsedData,
                                      iterations=100,
                                      step=0.00000001)

# Evaluate the model on training data
valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE) +
      "bkbkbbbbbbbbbbbbbbbbbbbbbbbbkkbkbkkkkkkkkkkkkkkkkkkkkkkkkkkkkkbkbkbkb")

# Save and load model
model.save(sc, "myModelPath")
sameModel = LinearRegressionModel.load(sc, "myModelPath")
if __name__ == "__main__":

    sc = SparkContext(appName="TicTacLinearRegressionExample")

    # Parse the data and create LabeledPoints
    def parsePoint(line):
        values = [x for x in line.split(' ')]
        # Last row contains the target data and rest of
        # the rows define the attributes for linear regression
        return LabeledPoint(values[9], values[0:8])

    # Load the data
    data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt")
    parsedData = data.map(parsePoint)

    # Build the model using LinearRegression
    model = LinearRegressionWithSGD.train(parsedData, iterations=100, step=0.00000001)

    # Evaluate the model on training data
    predict = parsedData.map(lambda pd: (pd.label, model.predict(pd.features)))
    MSE = predict \
        .map(lambda (v, p): (v - p)**2) \
        .reduce(lambda x, y: x + y) / predict.count()

    # Print Mean Squared Error
    print("Mean Squared Error for Tic Tac Linear Regression = " + str(MSE))

    # Save and load model
    model.save(sc, "target/tmp/pythonTicTacLinearRegression")
    sameModel = LinearRegressionModel.load(sc, "target/tmp/pythonTicTacLinearRegression")
    sc = SparkContext(appName="TicTacLinearRegressionExample")

    # Parse the data and create LabeledPoints
    def parsePoint(line):
        values = [x for x in line.split(' ')]
        # Last row contains the target data and rest of
        # the rows define the attributes for linear regression
        return LabeledPoint(values[9], values[0:8])

    # Load the data
    data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt")
    parsedData = data.map(parsePoint)

    # Build the model using LinearRegression
    model = LinearRegressionWithSGD.train(parsedData,
                                          iterations=100,
                                          step=0.00000001)

    # Evaluate the model on training data
    predict = parsedData.map(lambda pd: (pd.label, model.predict(pd.features)))
    MSE = predict \
        .map(lambda (v, p): (v - p)**2) \
        .reduce(lambda x, y: x + y) / predict.count()

    # Print Mean Squared Error
    print("Mean Squared Error for Tic Tac Linear Regression = " + str(MSE))

    # Save and load model
    model.save(sc, "target/tmp/pythonTicTacLinearRegression")
    sameModel = LinearRegressionModel.load(
        sc, "target/tmp/pythonTicTacLinearRegression")
#Section 7.5.2
from pyspark.mllib.evaluation import RegressionMetrics
validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
validMetrics.meanSquaredError

#Section 7.5.3
import operator
print(",".join([str(s) for s in sorted(enumerate([abs(x) for x in model.weights.toArray()]), key=operator.itemgetter(0))]))

#Section 7.5.4
model.save(sc, "ch07output/model")

from pyspark.mllib.regression import LinearRegressionModel
model = LinearRegressionModel.load(sc, "ch07output/model")


#Section 7.6.1
def iterateLRwSGD(iterNums, stepSizes, train, valid):
  from pyspark.mllib.regression import LinearRegressionWithSGD
  import math
  for numIter in iterNums:
    for step in stepSizes:
      alg = LinearRegressionWithSGD()
      model = alg.train(train, iterations=numIter, step=step, intercept=True)
      rescaledPredicts = train.map(lambda x: (float(model.predict(x.features)), x.label))
      validPredicts = valid.map(lambda x: (float(model.predict(x.features)), x.label))
      meanSquared = math.sqrt(rescaledPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
      meanSquaredValid = math.sqrt(validPredicts.map(lambda p: pow(p[0]-p[1],2)).mean())
      print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, meanSquared, meanSquaredValid))
sc = SparkContext()
train_data = sc.textFile("train.csv")
test_data = sc.textFile("test.csv")

parsedTrainData = train_data.map(parsePoint).filter(lambda x: x is not None)
parsedTestData = test_data.map(parsePoint).filter(lambda x: x is not None)

mat = RowMatrix(train_data.map(parseVector).filter(lambda x: x is not None))
pc = mat.computePrincipalComponents(2)
projected = mat.multiply(pc)

x = [vector[0] for vector in projected.rows.collect()]
y = [vector[1] for vector in projected.rows.collect()]

LinearModel = LinearRegressionModel.load(sc, "Linear")
RidgeModel = RidgeRegressionModel.load(sc, "Ridge")
LassoModel = LassoModel.load(sc, "Lasso")

valuesAndPredsLinearTrain = parsedTrainData.map(
    lambda p: (p.label, LinearModel.predict(p.features)))
valuesAndPredsLinearTest = parsedTestData.map(
    lambda p: (p.label, LinearModel.predict(p.features)))

valuesAndPredsRidgeTrain = parsedTrainData.map(
    lambda p: (p.label, RidgeModel.predict(p.features)))
valuesAndPredsRidgeTest = parsedTestData.map(
    lambda p: (p.label, RidgeModel.predict(p.features)))

valuesAndPredsLassoTrain = parsedTrainData.map(
    lambda p: (p.label, LassoModel.predict(p.features)))
	return LabeledPoint(values[7], values[0:11]) 

#data_file = sc.textFile("/home/faiz89/Desktop/Eastman/2008.csv")
data_file = sc.textFile("../2008_small.csv")
header = data_file.first ()
raw_data = data_file.filter (lambda x:x != header)

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = LinearRegressionWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()
testData.cache()
# Evaluating the model on training data
valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds \
    .map(lambda (v, p): (v - p)**2) \
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Total Time: '), (datetime.now() - startTime)

print("Mean Squared Error = " + str(MSE))
# Save and load model
model.save(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
# Example 33
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from datetime import datetime, timedelta
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel
from pyspark.mllib.feature import HashingTF

inputPath = '/user/ssambasi/SFPD_parquet'
conf = SparkConf().setAppName('Predict Alarming District')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

crimeDF =sqlContext.read.parquet(inputPath).cache()

htf = HashingTF(5000)

#Load the model
lrm = LinearRegressionModel.load(sc, '/user/ssambasi/sfo/CrimeCountPredictionModel')

#Load test data for demo
districtRDD = crimeDF.select('PdDistrict').distinct().rdd.filter(lambda r:r[0]!='').map(lambda r:r[0]).cache()
startDate = datetime.now()
dateList = []
for dateIndex in range(0,30):
    dateList.append(startDate + timedelta(days=dateIndex))
dateRDD = sc.parallelize(dateList).cache()
testDataRDD = districtRDD.cartesian(dateRDD).map(lambda (district,date): \
                ((district,date),LabeledPoint(1.0,htf.transform((district,date))))).cache()

#Predict Alarming District using the model

def GetMaXCount((district1,count1),(district2,count2)):
    if(count1>count2):