def extract_features(record):
    # (reconstructed head; the original snippet was truncated here)
    # Build a one-hot encoding of each categorical column, then append
    # the numeric fields.
    cat_vec = np.zeros(cat_len)
    i = 0
    step = 0
    for idx in cat_idx:
        m = mappings[i]
        cat_vec[step + m[record[idx]]] = 1
        i += 1
        step += len(m)
    num_vec = np.array([float(field) for field in record[5:6]])
    return np.concatenate((cat_vec, num_vec))


def extract_hp_label(record):
    return record[6]


def extract_acc_label(record):
    return record[5]


accData = records.map(
    lambda r: LabeledPoint(extract_acc_label(r), extract_features(r)))
hpData = records.map(
    lambda r: LabeledPoint(extract_hp_label(r), extract_features(r)))

acc_first_point = accData.first()
hp_first_point = hpData.first()

print "Raw data: " + str(first[0:])
print "Acceleration Label: " + str(acc_first_point.label)
print "Linear Model feature vector:\n" + str(acc_first_point.features)
print "Linear Model feature vector length: " + str(
    len(acc_first_point.features))

print "Raw data: " + str(first[0:])
print "Horsepower Label: " + str(hp_first_point.label)
print "Linear Model feature vector:\n" + str(hp_first_point.features)
Example #2
# MAGIC #### LabeledPoint

# COMMAND ----------

# MAGIC %md
# MAGIC  
# MAGIC In MLlib, labeled training instances are stored using the [LabeledPoint](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.regression.LabeledPoint) object.  Note that the features and label for a `LabeledPoint` are stored in the `features` and `label` attribute of the object.

# COMMAND ----------

from pyspark.mllib.regression import LabeledPoint
help(LabeledPoint)

# COMMAND ----------

labeledPoint = LabeledPoint(1992, [3.0, 5.5, 10.0])
print 'labeledPoint: {0}'.format(labeledPoint)

print '\nlabeledPoint.features: {0}'.format(labeledPoint.features)
# Notice that features are being stored as a DenseVector
print 'type(labeledPoint.features): {0}'.format(type(labeledPoint.features))

print '\nlabeledPoint.label: {0}'.format(labeledPoint.label)
print 'type(labeledPoint.label): {0}'.format(type(labeledPoint.label))


# COMMAND ----------

# View the differences between the class and an instantiated instance
set(dir(labeledPoint)) - set(dir(LabeledPoint))
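
# COMMAND ----------

# MAGIC %md
# MAGIC A brief aside (not part of the original notebook): `LabeledPoint` also accepts sparse features via `pyspark.mllib.linalg.SparseVector`, which is the natural choice for high-dimensional feature spaces.

# COMMAND ----------

from pyspark.mllib.linalg import SparseVector

# a 10-dimensional feature vector with nonzeros only at indices 2 and 5
sparsePoint = LabeledPoint(1.0, SparseVector(10, {2: 3.0, 5: 5.5}))
print 'sparsePoint: {0}'.format(sparsePoint)
print 'type(sparsePoint.features): {0}'.format(type(sparsePoint.features))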
Example #3
# Columns: Acceleration,Cylinders,Displacement,Horsepower,Manufacturer,Model,Model_Year,MPG,Origin,Weight
df1 = autoData.map(lambda line: Row(acc=line[1], cyl=line[2], dis=line[3],
                                    hp=line[4], mauf=line[5], year=line[7],
                                    orig=line[9], wt=line[10])).toDF()
>>> df1.show(3)
+----+---+---+---+-------------+-------+----+----+
| acc|cyl|dis| hp|         mauf|   orig|  wt|year|
+----+---+---+---+-------------+-------+----+----+
|  12|  8|307|130|chevrolet    |USA    |3504|  70|
|11.5|  8|350|165|buick        |USA    |3693|  70|
|  11|  8|318|150|plymouth     |USA    |3436|  70|
+----+---+---+---+-------------+-------+----+----+
only showing top 3 rows

#df.show(5)
# Linear regression will begin now
temp = df.map(lambda line: LabeledPoint(line[0], [line[1:]]))  # note: unused below
features = df.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
#features_transform.take(5)
## put label and features together
lab = df.map(lambda row: row[0])
transformedData = lab.zip(features_transform)
transformedData.take(5)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))
transformedData.take(5)
trainingData, testingData = transformedData.randomSplit([.9,.1],seed=123)
linearModel = LinearRegressionWithSGD.train(trainingData,10,.1)
# pull the coefficients
linearModel.weights
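# Hedged follow-up (not in the original snippet): the held-out split could be
# scored with the usual RDD pattern, assuming the Spark 1.x API used above.
valuesAndPreds = testingData.map(
    lambda p: (p.label, linearModel.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()
print 'Mean Squared Error on test split: {0}'.format(MSE)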
Example #4
    def test_classification(self):
        import os
        import tempfile
        from shutil import rmtree

        from pyspark.mllib.regression import LabeledPoint
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(),
                         dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd,
            numClasses=2,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(),
                         rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(),
                         gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Example #5
def parsePoint(line):
    values = [float(x) for x in line.split(",")]
    return LabeledPoint(values[0], values[1:16])
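# Usage sketch (assumption: a comma-separated file whose first column is the
# label and whose next 15 columns are numeric features):
# parsedData = sc.textFile('data.csv').map(parsePoint)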
Example #6
def labelData(data):
    return data.rdd.map(lambda row: LabeledPoint(row[-1], row[:-1]))
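# Usage sketch (assumption: `df` is a DataFrame whose last column is the
# label and whose remaining columns are numeric features):
# labeled_rdd = labelData(df)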
Example #7
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.tree import RandomForestModel
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.evaluation import MulticlassMetrics
from prettytable import PrettyTable

sc = SparkContext()
spark = SparkSession(sc)
inputDF = spark.read.csv('s3://cloud-proj2/ValidationDataset.csv',
                         header='true',
                         inferSchema='true',
                         sep=';')

datadf = inputDF.rdd.map(
    lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
model = RandomForestModel.load(sc, "s3://cloud-proj2/model_created.model")

predictions = model.predict(datadf.map(lambda x: x.features))

labels_and_predictions = datadf.map(lambda x: x.label).zip(predictions)
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
    datadf.count())

# note: MulticlassMetrics expects (prediction, label) pairs; the overall
# accuracy is order-insensitive, but per-class metrics would not be
metrics = MulticlassMetrics(labels_and_predictions)
f1 = metrics.fMeasure()
recall = metrics.recall()
precision = metrics.precision()

#evaluation values
print("Model accuracy: %.3f%%" % (acc * 100))
Example #8
    # getting the stop words
    sw = load_stopwords()

    cm = load_common_words()

    reference_table = create_hash_table(common_words=cm, stop_words=sw)

    # tokenizing the text
    rdd = rdd.map(lambda d:
                  {
                      'tokens': tokenize(text=d['text'], common_words=cm),
                      'label': d['label']
                  }).\
        map(lambda d: LabeledPoint(0 if d['label'] == 0 else 1,
                                   compute_tf(tokens=d['tokens'],
                                              reference_table=reference_table)))
    # training the logistic regression (train is a classmethod, so no
    # separate instantiation is needed)
    trained_logistic_regression = LogisticRegressionWithSGD.train(data=rdd)

    # storing the parameters in a json file
    trained_parameters = {
        'weights': trained_logistic_regression.weights.toArray().tolist(),
        'intercept': trained_logistic_regression.intercept
    }

    with open('model.json', 'w') as model_file:
        json.dump(trained_parameters, fp=model_file)
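# Hedged sketch (not in the original): the saved parameters could later
# rebuild the model without retraining. Note the constructor signature
# varies by Spark version; Spark 1.4+ also requires numFeatures/numClasses.
# import json
# from pyspark.mllib.linalg import Vectors
# from pyspark.mllib.classification import LogisticRegressionModel
# with open('model.json') as model_file:
#     params = json.load(model_file)
# restored = LogisticRegressionModel(Vectors.dense(params['weights']),
#                                    params['intercept'],
#                                    len(params['weights']), 2)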
Example #9
 ## one hot encoder for string-indexed categorical features
 df_train = onehot_encoder(df_train, features_categorical_indexed_train)
 features_categorical_indexed_vec_train = [
     s + "_StringIndexed_Vec" for s in features_categorical_train
 ]
 features_modeled_train = label_train + features_numerical_train + features_categorical_indexed_vec_train
 features_categorical_indexed_vec_index_train = columnindex(
     features_modeled_train, features_categorical_indexed_vec_train)
 ## select the one-hot-encoded categorical features along with the numerical features and the label to construct the modeling dataset
 df_train_modeling = df_train.select(features_modeled_train)
 ## df_train_modeling_rdd for mllib package
 df_train_modeling_rdd = df_train_modeling.rdd.map(
     lambda p: convert_sparsevec_to_vec_df(
         p, features_categorical_indexed_vec_index_train))
 df_train_modeling_rdd = df_train_modeling_rdd.map(
     lambda l: LabeledPoint(l[0], l[1:]))
 ################################################## 5: train random forest regression model
 ## random forest
 ## train model
 rfModel = RandomForest.trainRegressor(df_train_modeling_rdd,
                                       categoricalFeaturesInfo={},
                                       numTrees=100,
                                       featureSubsetStrategy="auto",
                                       impurity='variance',
                                       maxDepth=10,
                                       maxBins=32)
 # Predict on train data
 predictions = rfModel.predict(
     df_train_modeling_rdd.map(lambda l: l.features))
 ## Evaluation of the model
 predictionAndObservations = predictions.zip(
     df_train_modeling_rdd.map(lambda l: l.label))
 # (completed: the original snippet was truncated mid-call; zipping the
 # predictions with the observed labels is the usual evaluation pattern)
Example #10
cleaned = encoded2.select([
    c for c in encoded2.columns if c not in {
        ' day_of_week', 'category_predict', 'address', 'date',
        'description_ignore', 'pd_district', 'resolution', 'pd_district_Index'
    }
])

ignore = ['category']
assembler = VectorAssembler(
    inputCols=[x for x in cleaned.columns if x not in ignore],
    outputCol='features')

transformed = assembler.transform(cleaned)

data_transformed = transformed.select(
    col("category").alias("label"),
    col("features")).map(lambda row: LabeledPoint(row.label, row.features))

#**************************
# split the training set
train, test = data_transformed.randomSplit([0.7, 0.3], seed=2)

# Naive Bayes classifier with smoothing parameter lambda = 1.0
model = mllib_class.NaiveBayes.train(train, 1.0)
# (this step takes roughly 50 seconds)

# Make predictions and measure the error rate on the held-out test split
labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(
    test.count())  # (completed: the original snippet was truncated here)
print("Test error: " + str(testErr))
Example #11
 def run(df, *args, **kwargs):
     unisampled_df = unisample(df, fraction=fraction)
     labeled_data = df.map(lambda e: LabeledPoint(1, e)) \
         .union(unisampled_df.map(lambda e: LabeledPoint(0, e)))
     return model(labeled_data, *args, **kwargs)
Example #12
_ind = 1
name_dict = dict()
for name in last_names:
    name_dict[name] = _ind
    _ind += 1

# districts = train.rdd.flatMap(lambda x: x[12]).distinct().collect()
# _ind = 1
# district_dict = dict()
# for district in districts:
# 	district_dict[district] = _ind
# 	_ind += 1
# create features and labels
HDF = HashingTF(50)
train = train.rdd.map(
    lambda x: LabeledPoint(name_dict.get(x[3], 0), HDF.transform(x[12])))
test = test.rdd.map(
    lambda x: LabeledPoint(name_dict.get(x[3], 0), HDF.transform(x[12])))
val = val.rdd.map(
    lambda x: LabeledPoint(name_dict.get(x[3], 0), HDF.transform(x[12])))

with open('H4_15300180012_output.txt', 'w') as f:
    f.write('H4_15300180012_output_naive_bayes\n')

para = 1.0
with open('H4_15300180012_output.txt', 'a') as f:
    f.write('Smoothing parameter: {} \n'.format(para))

# Train a naive Bayes model.
model = NaiveBayes.train(train, para)
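
# Hedged continuation (not in the original): the validation split built
# above could be scored and the result appended to the same output file.
predictionAndLabel = val.map(lambda p: (model.predict(p.features), p.label))
accuracy = predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() \
    / float(val.count())
with open('H4_15300180012_output.txt', 'a') as f:
    f.write('Validation accuracy: {}\n'.format(accuracy))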
Example #13
# %%
# Fill in missing values.
# Strategy 1: fill every null in the last 8 features with 0.
df_train_filled = df_train.fillna(0)
#df_train_filled.show()

# %%
df_train_filled.write.options(
    header="true").csv("hdfs://node1:9000/user/root/exp4/procd_test_real.csv")

# %%
# Convert the data into the appropriate format
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
# First convert to an RDD
df_train_rdd = df_train_filled.rdd
# Reshape into (label, features) form
df_train_rdd = df_train_rdd.map(
    lambda line: LabeledPoint(0, Vectors.dense(line[3:])))

# %%
# Save in LibSVMFile format for convenient use in later training
from pyspark.mllib.util import MLUtils
MLUtils.saveAsLibSVMFile(df_train_rdd,
                         "hdfs://node1:9000/user/root/exp4/procd_test_real")

# %%
# Don't forget to stop the session
spark.stop()
Example #14
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

sc = SparkContext('local')
denseVec1 = LabeledPoint(1.0, Vectors.dense([-2.0, 5.0, 1.0]))
denseVec2 = LabeledPoint(0.0, Vectors.dense([2.0, 0.0, 1.0]))
vectors = [denseVec1, denseVec2]
dataset = sc.parallelize(vectors)
print(dataset)
model = SVMWithSGD.train(dataset, iterations=200, intercept=True)
print("weights: %s, intercept: %s" % (model.weights, model.intercept))

# weights: [-5.591575185933013,2.8941330995473336,0.0], intercept: -6.895635245995358
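# Hedged follow-up (not in the original): scoring a new point with the
# trained separator returns the predicted class (0 or 1):
# print(model.predict(Vectors.dense([-1.0, 4.0, 1.0])))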
Example #15
def generateDatas(lines):
    datas = []
    for item in lines:
        label, text_list = textParse(item.strip())
        datas.append(LabeledPoint(label, text_list))
    return datas
Example #16
                             key=lambda x:x[0])]

        (indices, values) = zip(*docVector)      # unzip
        label = float(dockey[6:])
        return label, indices, values

    vector = data.map( lambda (dockey, doc) : doc2vector(dockey, doc))

    vector.persist(StorageLevel.MEMORY_ONLY)
    d = vector.map( lambda (label, indices, values) : indices[-1] if indices else 0)\
              .reduce(lambda a,b:max(a,b)) + 1


#    print "###### Load svm file", filename
    #examples = MLUtils.loadLibSVMFile(sc, filename, numFeatures = numFeatures)
    examples = vector.map( lambda (label, indices, values) : LabeledPoint(label, Vectors.sparse(d, indices, values)))

    examples.cache()

    # FIXME: need randomSplit!
    training = examples.sample(False, 0.8, 2)
    test = examples.sample(False, 0.2, 2)

    numTraining = training.count()
    numTest = test.count()
    print " numTraining = %d, numTest = %d." % (numTraining, numTest)
    model = NaiveBayes.train(training, 1.0)

    model_share = sc.broadcast(model)
    predictionAndLabel = test.map( lambda x: (x.label, model_share.value.predict(x.features)))
#    prediction = model.predict(test.map( lambda x: x.features ))
Example #17
conf = SparkConf().setMaster('local').setAppName('pyAssign11_Multiple') 
sc = SparkContext(conf = conf) 

file_path = 'file:////home/joe/proj/lec11/small_car_data.csv'
raw_data = sc.textFile(file_path)
records = raw_data.map(lambda x: [cell.strip() for cell in x.split(',')])
records.cache()
first = records.first()
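
# get_mapping (like extract_label_acc, extract_label_hp, extract_features_dt
# and evaluate_final below) is assumed defined earlier in the excerpted
# script; a minimal sketch consistent with its use here:
def get_mapping(rdd, idx):
    # map each distinct value of column idx to a dense integer index
    return rdd.map(lambda fields: fields[idx]) \
              .distinct().zipWithIndex().collectAsMap()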

cat_idx = [2,5,7,9] #Cylinders, Manufacturer, Model_Year, Origin
num_idx = [3,10] #Displacement, Weight

mappings = [get_mapping(records, i) for i in cat_idx]
print 'mappings: ' + str(mappings)

data_dt_acc = records.map(
    lambda r: LabeledPoint(extract_label_acc(r), extract_features_dt(r)))
data_dt_hp = records.map(
    lambda r: LabeledPoint(extract_label_hp(r), extract_features_dt(r)))

first_point_dt = data_dt_acc.first()
print "Decision Tree feature vector: " + str(first_point_dt.features)
print "Decision Tree feature vector length: " + str(len(first_point_dt.features))

#def evaluate_final(description, data, maxDepth, maxBins):
evaluate_final('ACCELERATION', data_dt_acc, maxDepth=5, maxBins=32)
evaluate_final('HORSEPOWER', data_dt_hp, maxDepth=3, maxBins=8)



#===============================================================================
# #######################
# #Acceleration, depth = 5
Example #18
def run(jobNm,
        sc,
        sqlContext,
        inputFile,
        lPolygon,
        dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):

    if bWriteMonitor:
        import plotting

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    #Create monitoring plot and associated vectors
    mPX = range(7)
    mPY = [0.] * 7
    mSL = [
        "Initial Read", "Calculate IDF", "Partition for M.L.",
        "Create Training Vector", "Train Model", "Apply Model",
        "Prepare Output Data"
    ]
    mInd = 0

    t0 = time.time()
    #Read in data and filter out entries with no valid words
    t1 = time.time()
    print 'inputFile ', inputFile
    print 'inputPartitions ', inputPartitions
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile,
                                             inputPartitions)
    nGoodTweets = records.count()
    t2 = time.time()
    print "Number of good tweets:", nGoodTweets
    diff = t2 - t1
    print "Time to read in and filter nonscorable words", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Find the word document frequency for the corpus
    #this is used for an idf score used in feature vector formation
    t1 = time.time()
    revLookup = []
    lStop = []
    fDict = None
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        fDict = sc.textFile(dictFile).collect()
    else:
        # read from local file
        fDict = open(dictFile, "r")
    for line in fDict:
        terms = line.split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])

    nVecLen = len(revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to read dict:", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Split data into training and apply samples
    # training data is 2 parts, inside r.o.i., and a sample of the areas outside the r.o.i.
    t1 = time.time()
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    df1 = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    df1.registerTempTable("df1")
    nIn = df1.count()
    dfn1 = sqlContext.sql(
        "SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    dfn1.registerTempTable("dfn1")
    nOut = dfn1.count()
    modelDict = aggregatedComparison.exemplarDict(df1, revLookup)
    t2 = time.time()
    diff = t2 - t1
    print "Time to find in and out of ROI", diff
    print "N in:", nIn, ", N out:", nOut
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # Create training vectors from in region data, and sample of out region data
    t1 = time.time()
    #grouped = aggregatedComparison.createAggregatedLabledPoint(df1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, 1.0)
    #grouped2 = aggregatedComparison.createAggregatedLabledPoint(dfn1, False, fBinSize, bc_dIDF, True, bc_lStopWords, nGoodTweets, -1.0)
    #nSignal = float(grouped.count())
    #nBack = float(grouped2.count())
    groupedIn = df1.map(lambda x: (x.key, [
        LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [
        LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    scaleFactor = (10. * nIn) / float(nOut)
    (mlApply,
     groupedUse) = groupedOut.randomSplit([1 - scaleFactor, scaleFactor])
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(
            lambda x: aggregatedComparison.removeStopWords(x, lStop))
    nTotTrain = mlTrain.count()
    mlApply.cache()
    nApply = mlApply.count()
    t2 = time.time()
    print nTotTrain, "entries for training"
    diff = t2 - t1
    print "Time to get data ready for model by time", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # train model
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(mlTrain.map(lambda x: x[1][0]),
                                             categoricalFeaturesInfo={},
                                             numTrees=100,
                                             featureSubsetStrategy="auto",
                                             impurity="variance",
                                             maxDepth=4,
                                             maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/": modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    t2 = time.time()
    diff = t2 - t1
    print "Time to train model", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    # apply model
    t1 = time.time()
    predictions_Tree = model_Tree.predict(
        mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    t2 = time.time()
    diff = t2 - t1
    print "Time to apply model: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        mInd = mInd + 1
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    #Get the results
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(False, jobNm,
                                                 vecAndPredictions, sNum,
                                                 revLookup, writeFileOutput,
                                                 modelDict)
    t2 = time.time()
    diff = t2 - t1
    print "Time to create json objects for output: ", diff
    if bWriteMonitor:
        mPY[mInd] = diff
        plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    diff = time.time() - t0
    print "<----------BOOM GOES THE DYNOMITE!---------->"
    print "< total number of tweets:,", nGoodTweets
    print "< total process Time:", diff
    print "< total idf vector length:", nVecLen
    print "<------------------------------------------->"
    return resultSet
Example #19
def get_labeled_points_from_rdd(rdd):
    """
    returns a labeledpoint from the RDD provided
    """
    from pyspark.mllib.regression import LabeledPoint
    return rdd.map(lambda x: LabeledPoint(float(x[0]), x[1:]))
Example #20
def parseOHELine(line, OHEDict, numOHEFeats):
    p = line.split('\t')
    l = p[24]
    return LabeledPoint(l, oneHotEncoder(parseLine(line), OHEDict,
                                         numOHEFeats))
Example #21
 def labelData(data):
     return data.map(lambda row: LabeledPoint(row[2], row[3:]))
Example #22
def parseHashPoint(point, numBuckets):
    p = point.split('\t')
    return LabeledPoint(
        p[24],
        SparseVector(numBuckets,
                     hashFunction(numBuckets, parseLine(point), False)))
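# Usage sketch (assumptions: rawData is an RDD of tab-separated lines, and
# parseLine/hashFunction come from the same course exercise as
# parseHashPoint above):
# hashTrainData = rawData.map(lambda point: parseHashPoint(point, 2 ** 15))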
Example #23
    def test_regression(self):
        from numpy import array

        from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, \
            LassoWithSGD, RidgeRegressionWithSGD
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(-1.0, [0, -1]),
            LabeledPoint(1.0, [0, 1]),
            LabeledPoint(-1.0, [0, -2]),
            LabeledPoint(1.0, [0, 2])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        lasso_model = LassoWithSGD.train(rdd, iterations=10)
        self.assertTrue(lasso_model.predict(features[0]) <= 0)
        self.assertTrue(lasso_model.predict(features[1]) > 0)
        self.assertTrue(lasso_model.predict(features[2]) <= 0)
        self.assertTrue(lasso_model.predict(features[3]) > 0)

        rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(rr_model.predict(features[0]) <= 0)
        self.assertTrue(rr_model.predict(features[1]) > 0)
        self.assertTrue(rr_model.predict(features[2]) <= 0)
        self.assertTrue(rr_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
        dt_model = DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        try:
            LinearRegressionWithSGD.train(rdd,
                                          initialWeights=array([1.0, 1.0]),
                                          iterations=10)
            LassoWithSGD.train(rdd,
                               initialWeights=array([1.0, 1.0]),
                               iterations=10)
            RidgeRegressionWithSGD.train(rdd,
                                         initialWeights=array([1.0, 1.0]),
                                         iterations=10)
        except ValueError:
            self.fail()

        # Verify that maxBins is being passed through
        GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4,
            maxBins=32)
        with self.assertRaises(Exception) as cm:
            GradientBoostedTrees.trainRegressor(
                rdd,
                categoricalFeaturesInfo=categoricalFeaturesInfo,
                numIterations=4,
                maxBins=1)
Example #24
def main():
    conf = SparkConf().setAppName("Test")
    sc = SparkContext(conf=conf)

    # load tagged datasets as training data
    dataY0 = sc.wholeTextFiles('/home/xsran/IdeaProjects/hadoop1/data/Y-cut')
    dataN0 = sc.wholeTextFiles('/home/xsran/IdeaProjects/hadoop1/data/N-cut')

    # split text into words
    dataN = dataN0.map(lambda x: x[1].split(" "))
    dataY = dataY0.map(lambda x: x[1].split(" "))

    # merge the positive and negative into a single dataset
    dataA = dataY.union(dataN)

    # map words list into (word,1) tuple
    words = dataA.flatMap(lambda x: x).map(lambda x: (x, 1))
    # counting the number of words
    wordCount = words.reduceByKey(lambda x, y: x + y).map(
        lambda x: (x[1], x[0])).sortByKey(ascending=False)
    wordCount.cache()

    # saving this results
    # wordCount.map(lambda x:'%s,%s' % (x[1],x[0])).saveAsTextFile(dir+'wordCount')
    # wordCount.map(lambda x:(x[1],x[0])).saveAsTextFile(dir+'wordCount_rep')

    # filter this words list. Only keep the words with a certain frequency as features
    # feature_count: (features word, count)
    feature_count = wordCount.filter(lambda x: 150 < x[0] < 5000).map(
        lambda x: (x[1], x[0]))

    # count the word frequency in positive and negative case respectively.
    dataN1 = dataN0.flatMap(lambda x: [(w, 1) for w in set(x[1].split(" "))]
                            ).reduceByKey(lambda x, y: x + y)
    dataY1 = dataY0.flatMap(lambda x: [(w, 1) for w in set(x[1].split(" "))]
                            ).reduceByKey(lambda x, y: x + y)
    # dataA1: (word,(N num,Y num))
    dataA1 = dataN1.fullOuterJoin(dataY1).mapValues(
        lambda x: (x[0] if x[0] else 0, x[1] if x[1] else 0))

    fs = feature_count.map(lambda x: (x[0], 0))

    totalNnum = dataN0.count()
    totalYnum = dataY0.count()
    # only keep those words in the feature_count
    # dataA2:(word,(N num,Y num))
    dataA2 = dataA1.rightOuterJoin(fs).mapValues(lambda x: x[0]).filter(
        lambda x: x[1][0] != totalNnum and x[1][1] != totalYnum)

    # compute the chi-square values: for a 2x2 contingency table
    # [[a, b], [c, d]] with N = a + b + c + d,
    # chi2 = N * (a*d - b*c)**2 / ((a+b) * (c+d) * (a+c) * (b+d))
    dataA3 = dataA2.mapValues(lambda x: (x, (totalNnum - x[0],
                                             totalYnum - x[1]),
                                         totalNnum + totalYnum))
    dataX2 = dataA3.mapValues(
        lambda x: (float(x[0][0] * x[1][1] - x[0][1] * x[1][0]) ** 2 * x[2]) /
                  ((x[0][0] + x[0][1]) * (x[1][0] + x[1][1]) *
                   (x[0][0] + x[1][0]) * (x[0][1] + x[1][1])))
    # sorting
    dataX2 = dataX2.sortBy(lambda x: abs(x[1]), ascending=False)

    # only keep 100 features with highest chi square values
    # features: this variable only keep the 100 words.
    features = dataX2.map(lambda x: x[0]).collect()[:100]
    # features_x2: this variable record the chi square values of each features
    features_x2 = dataX2.collect()[:100]

    # broadcasting those data to spark's worker nodes.
    features = sc.broadcast(features)
    features_x2 = sc.broadcast(features_x2)

    # this function is used to extract features from a case
    def make_feature(doc):
        doc = doc.split(" ")
        f = []
        for i in features.value:
            f.append(doc.count(i))
        return f

    def make_feature2(doc):
        doc = doc.split(" ")
        f = []
        for k, v in features_x2.value:
            a = doc.count(k)
            a = v if a else 0
            f.append(a)
        return f

    # convert case into features
    fN = dataN0.mapValues(make_feature2)
    fY = dataY0.mapValues(make_feature2)

    # fN.repartition(1).map(lambda x:(x[0].split('/')[-1][:-4],x[1])).saveAsTextFile(dir+'VecN')
    # fY.repartition(1).map(lambda x:(x[0].split('/')[-1][:-4],x[1])).saveAsTextFile(dir+'VecY')

    fN = fN.map(lambda x: x[1])
    fY = fY.map(lambda x: x[1])

    # sc.stop()

    # convert features into LabeledPoint to train the model.
    fNtl = fN.map(lambda x: LabeledPoint(0, x))
    fYtl = fY.map(lambda x: LabeledPoint(1, x))

    # union the positive and negative data and train the NaiveBayes model.
    fTrain = fNtl.union(fYtl)
    bn = NaiveBayes.train(fTrain)

    # load the all untagged data
    inputs = [
        sc.wholeTextFiles('/home/xsran/tmp/BigData/data_c_' + str(i))
        for i in range(10)
    ]
    input = sc.union(inputs)
    # extracting features and use the NaiveBayes model to predict the case.
    predict = input.mapValues(make_feature2).mapValues(bn.predict)
    result = input.join(predict).filter(lambda x: x[1][1])

    # use Regular Expression to remove the Chinese white space.
    r = re.compile(u'[\s\u3000]+')
    r = sc.broadcast(r)
    result1 = result.mapValues(lambda x: r.value.sub(' ', x[0])).map(
        lambda x: (x[0] + ',' + x[1]).encode('utf8'))
    result1.cache()
    # result.saveAsTextFile('/home/xsran/tmp/BigData/result1')
    keys = result.map(lambda x: x[0])
    keys.repartition(1).saveAsTextFile('/home/xsran/tmp/BigData/target_keys')

    print '************************'
    print 'input', input.count()
    print 'target', result1.count()
Example #25
# (reconstructed: the snippet was truncated here; the old sklearn
# ShuffleSplit API matching this usage takes the sample count first)
ss = ShuffleSplit(len(Y),
                  n_iter=iter_number,
                  test_size=0.2,
                  random_state=0)
Err = 0.0
results = []
for train_index, test_index in ss:
    X_training, Y_training, X_test, Y_test = [], [], [], []
    for i in train_index:
        X_training.append(X[i])
        Y_training.append(Y[i])
    for i in test_index:
        X_test.append(X[i])
        Y_test.append(Y[i])

    parsedData = []
    for i in range(0, len(X_training)):
        parsedData.append(LabeledPoint(Y_training[i], X_training[i]))

    model = SVMWithSGD.train(sc.parallelize(parsedData))

    testErr = 0
    for i in range(0, len(X_test)):
        a = Y_test[i]
        b = model.predict(X_test[i])
        if a != b:
            testErr += 1

    Err += float(testErr) / float(len(X_test))

print("AVG test error: %.6f" % (Err / iter_number))
Example #26
    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % goodnessOfFitTestResult)

    mat = Matrices.dense(
        3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])  # a contingency matrix

    # conduct Pearson's independence test on the input contingency matrix
    independenceTestResult = Statistics.chiSqTest(mat)

    # summary of the test including the p-value, degrees of freedom,
    # test statistic, the method used, and the null hypothesis.
    print("%s\n" % independenceTestResult)

    obs = sc.parallelize([
        LabeledPoint(1.0, [1.0, 0.0, 3.0]),
        LabeledPoint(1.0, [1.0, 2.0, 0.0]),
        LabeledPoint(1.0, [-1.0, 0.0, -0.5])
    ])  # LabeledPoint(label, features)

    # The contingency table is constructed from an RDD of LabeledPoint and used to conduct
    # the independence test. Returns an array containing the ChiSquaredTestResult for every feature
    # against the label.
    featureTestResults = Statistics.chiSqTest(obs)

    for i, result in enumerate(featureTestResults):
        print("Column %d:\n%s" % (i + 1, result))
    # $example off$

    sc.stop()
Example #27
import datetime
from datetime import timedelta

from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest

# setup spark context and config
conf = SparkConf().setAppName("labeledPoints")
sc = SparkContext(conf=conf)

# get starting time
t1 = datetime.datetime.now()

# create an RDD
data = sc.textFile('file:///home/ubuntu/DATASETS/BIG_DATASETS/creditcard.csv')

# preprocess data: drop the quoted header row, split each line on commas,
# and use the quoted class column (e.g. '"0"') as the binary label
data = data.filter(lambda line: line[0] != '"')
data = data.map(lambda line: line.split(','))
data = data.map(lambda row: LabeledPoint(float(row[-1][1]), row[:-1]))

# split data into train-test set
train, test = data.randomSplit([70.0, 30.0])

# if needed, feel free to release memory
# data.unpersist()

# training model
model = RandomForest.trainClassifier(train,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=100,
                                     featureSubsetStrategy="auto",
                                     impurity='gini')

# calculate time needed
t2 = datetime.datetime.now()
time_difference = t2 - t1
time_difference_in_minutes = time_difference / timedelta(minutes=1)
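
# Presumably the elapsed time gets reported; a minimal addition:
print("training took %.2f minutes" % time_difference_in_minutes)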
Example #28
def transform_to_labeled_point(line):
    values = [float(x) for x in line.split(',')]
    #return values
    return LabeledPoint(values[0], values[1:])
Example #29
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])
Example #30
def parse_data(line):
    # assumes `line` is already an iterable of numeric fields
    # (e.g. a pre-split row), not a raw string
    values = [float(x) for x in line]
    return LabeledPoint(values[-1], values[:-1])