def prepareData(sc):

    print 'import training data'

    rawDataWithHeader = sc.textFile(Path + 'train.tsv')
    print rawDataWithHeader.take(10)
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x:x != header)
    rData = rawData.map(lambda x: x.replace("\"",""))
    lines = rData.map(lambda x: x.split("\t"))
    print lines.count()

    categoriesMap = lines.map(lambda fields:fields[3]).distinct().zipWithIndex().collectAsMap()
    print categoriesMap
    labelRDD = lines.map(lambda r: extractLabel(r))
    featureRDD = lines.map(lambda r: extractFeatures(r,categoriesMap,len(r)-1))
    # print featureRDD.take(1)
    stdScaler = StandardScaler(withMean=True,withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    # print ScalerFeatureRDD.take(1)
    labelPoint = labelRDD.zip(ScalerFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0],r[1]))
    # print labelPointRDD.take(1)
    (trainData, testData, validationData) = labelPointRDD.randomSplit([8, 1, 1])
    print trainData.count()
    print testData.count()
    print validationData.count()
    return (trainData, testData, validationData, categoriesMap)
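Note: every PrepareData/prepare_data variant in this collection calls label/feature helpers (extractLabel/extract_label, extractFeatures/extract_features) that the snippets do not include. Below is a minimal sketch of what such helpers could look like for the StumbleUpon train.tsv layout assumed above (label in the last field, page category in field 3, numeric features from field 4 onward, "?" standing for a missing value); the exact field handling is an assumption, not code from the original sources.

# Hedged sketch of the assumed helpers; field positions and "?" handling are assumptions.
import numpy as np

def convert_float(x):
    # train.tsv marks missing numeric values with "?"; treat them as 0.0 here
    return 0.0 if x == "?" else float(x)

def extract_label(fields):
    # the label is assumed to sit in the last field of each record
    return float(fields[-1])

def extract_features(fields, categoriesMap, featureEnd):
    # one-hot encode the category name found in field 3
    categoryFeatures = np.zeros(len(categoriesMap))
    categoryFeatures[categoriesMap[fields[3]]] = 1.0
    # numeric feature columns are assumed to start at field 4
    numericalFeatures = [convert_float(f) for f in fields[4:featureEnd]]
    return np.concatenate((categoryFeatures, numericalFeatures))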
def PrepareData(sc):
    #----------------------1. Import and transform the data-------------
    print("Importing data...")
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共計:" + str(lines.count()) + "筆")
    #----------------------2.建立訓練評估所需資料 RDD[LabeledPoint]-------------
    print "標準化之前:",
    categoriesMap = lines.map(lambda fields: fields[3]). \
                                        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ","),
    print ""
    print "標準化之後:",
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print(str(i) + ","),
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    #----------------------3. Randomly split the data into 3 parts and return-------------
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Data split - trainData:" + str(trainData.count()) + "   validationData:" +
          str(validationData.count()) + "   testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)  # return the data
def PrepareData(sc): 
    #----------------------1. Import and transform the data-------------
    print("Importing data...")
    rawDataWithHeader = sc.textFile(Path+"data/train.tsv")
    header = rawDataWithHeader.first() 
    rawData = rawDataWithHeader.filter(lambda x:x !=header)    
    rData=rawData.map(lambda x: x.replace("\"", ""))    
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")
    #----------------------2.建立训练评估所需数据 RDD[LabeledPoint]-------------
    print "标准化之前:",        
    categoriesMap = lines.map(lambda fields: fields[3]). \
                                        distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r:  extract_label(r))
    featureRDD = lines.map(lambda r:  extract_features(r,categoriesMap,len(r) - 1))
    for i in featureRDD.first():
        print (str(i)+","),
    print ""       
    
    print "标准化之后:",    
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD=stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print (str(i)+","),        
                
    labelpoint=labelRDD.zip(ScalerFeatureRDD)
    labelpointRDD=labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    
    #----------------------3. Randomly split the data into 3 parts and return-------------
    (trainData, validationData, testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("Data split - trainData:" + str(trainData.count()) +
              "   validationData:" + str(validationData.count()) +
              "   testData:" + str(testData.count()))
    return (trainData, validationData, testData, categoriesMap) # return the data
def prepare_data(sc):
    #----------------------1. Import and transform the data-------------
    print("Importing data...")
    raw_data_with_header = sc.textFile(os.path.join(PATH, 'data/train.tsv'))
    header = raw_data_with_header.first()
    raw_data = raw_data_with_header.filter(lambda x: x!=header)

    # strip the "" quotes and split each page record on \t into its fields
    lines_rdd = raw_data.\
        map(lambda x: x.replace("\"", "")).\
        map(lambda x: x.split('\t'))
    
    print("共计: {}项".format(lines_rdd.count()))
    #---------------------2.数据标准化----------------------- 
    # {新闻类别: 序号, }
    categories_map = lines_rdd.map(lambda fields: fields[3]).\
                        distinct().zipWithIndex().collectAsMap()
    label_rdd = lines_rdd.map(lambda r: get_label(r))
    features_rdd = lines_rdd.map(lambda r: get_features(r, categories_map, len(r)-1))


    scaler = StandardScaler(withMean=True, withStd=True).fit(features_rdd)
    stand_features = scaler.transform(features_rdd)
    #----------3. Build the RDD[LabeledPoint] needed for training and evaluation-------
    labeledpoint_rdd = label_rdd.zip(stand_features).map(lambda r: LabeledPoint(r[0], r[1]))
    #-----------4. Randomly split the data into 3 parts and return-------------
    (trainData, validationData, testData) = labeledpoint_rdd.randomSplit([0.8, 0.1, 0.1])
    print("将数据分trainData: {0}, validationData: {1}, testData: {2}".format(
        trainData.count(), validationData.count(), testData.count()
    ))

    return (trainData, validationData, testData, categories_map) #返回数据
Example #5
def PrepareData(sc):
    rawDataWithHeader = sc.textFile(Path + "data/train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("total " + str(lines.count()))
    print("=======before standare========")
    categoriesMap = lines.map(lambda fields: fields[3]) \
        .distinct() \
        .zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print(str(i) + ", ")
    print("=======after standare========")
    stdScale = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scaleFeatureRDD = stdScale.transform(featureRDD)
    for i in scaleFeatureRDD.first():
        print(str(i) + ",")
    labelPoint = labelRDD.zip(scaleFeatureRDD)
    labelPointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData,
     testData) = labelPointRDD.randomSplit([8, 1, 1])
    return (trainData, validationData, testData, categoriesMap)
Example #6
def PrepareData(sc):
    print("开始导入数据。。。")
    path = Path + "train.tsv"
    print(path)
    # use minPartitions=40 to split the data into 40 partitions, otherwise it throws an error
    rawDataWithHeader = sc.textFile(path, minPartitions=40)
    header = rawDataWithHeader.first()
    # drop the first line (the header)
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    # strip the quotes
    rData = rawData.map(lambda x: x.replace("\"", ""))
    # split fields on the tab character
    lines = rData.map(lambda x: x.split("\t"))
    print("总共有:", str(lines.count()))
    #----2。创建训练所需的RDD数据
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extractFeatures(r, categoriesMap,
                                                     len(r) - 1))
    print(featureRDD.first())
    #----3. Randomly split into 3 parts and return
    print("After standardization===:")
    stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
    scalerFeatureRDD = stdScaler.transform(featureRDD)
    print(scalerFeatureRDD.first())
    labelPoint = labelRDD.zip(scalerFeatureRDD)
    labelpointRDD = labelPoint.map(lambda r: LabeledPoint(r[0], r[1]))
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print("数据集划分为:trainData:", str(trainData.count()), "validationData:",
          str(validationData.count()), "testData:", str(testData.count()))
    return (trainData, validationData, testData, categoriesMap)
Example #7
 def test_model_setters(self):
     data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                      DenseVector([-1.0, -1.0, -1.0]))
Example #8
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example #9
 def test_model_transform(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([1.0, 2.0, 3.0]))
Example #10
def getScaledData(data):
    features = data.map(lambda x: x.features)
    label = data.map(lambda x: x.label)
    scaler = StandardScaler(withMean=True, withStd=True).fit(features)
    scaled = label\
     .zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))\
     .map(lambda x: LabeledPoint(x[0], x[1]))

    return scaled
Example #11
 def test_model_setters(self):
     data = [
         [1.0, 2.0, 3.0],
         [2.0, 3.0, 4.0],
         [3.0, 4.0, 5.0]
     ]
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertIsNotNone(model.setWithMean(True))
     self.assertIsNotNone(model.setWithStd(True))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]), DenseVector([-1.0, -1.0, -1.0]))
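A quick check of the expected values in the scaler tests above (added for clarity, not part of the test suite): for the rows [1,2,3], [2,3,4], [3,4,5] every column has a sample standard deviation of 1.0, so dividing by the standard deviation alone leaves a vector unchanged, and additionally subtracting the column means [2,3,4] maps [1,2,3] to [-1,-1,-1].

import numpy as np

data = np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]])
print(data.std(axis=0, ddof=1))                                   # [1. 1. 1.] -> withStd alone changes nothing here
print((data[0] - data.mean(axis=0)) / data.std(axis=0, ddof=1))   # [-1. -1. -1.] once withMean is enabled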
Example #12
    def norm_train(self, train_data):
        train_features = train_data.map(lambda lp: lp.features)
        self.normalizer = StandardScaler().fit(train_features)

        # TODO: This can't be efficient...
        #return train_data.map(lambda lp: lp.label).zip(self.norm(train_features)).map(lambda r: LabeledPoint(r[0], r[1]))
        labels = train_data.map(lambda lp: lp.label).collect()
        features = self.norm(train_features).collect()
        return get_df(zip(
            labels, features)).rdd.map(lambda r: LabeledPoint(r[0], r[1]))
Example #13
    def normalizer(self):
        """
        This function normalize the training data
  
        """
  
        if self._typeNorm == 'norm':
            #Normalize input features
            RDD_X = self._data.map(lambda x: x[1])
            self._scaler = StandardScaler(withMean=True, withStd=True).fit(RDD_X)
            RDD_X_norm = self._scaler.transform(RDD_X)
            RDD_Y = self._data.map(lambda x: x[0])
            RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
        else:
            #Normalize input features
            RDD_X = self._data.map(lambda x: x[1])
            self._scaler = StandardScaler(withMean=True, withStd=False).fit(RDD_X)
            RDD_X_norm = self._scaler.transform(RDD_X)
            if self._typeMVA == 'PCA':
                RDD_Y = self._data.map(lambda x: x[0])
                RDD_Y_norm = StandardScaler(withMean=True, withStd=False).fit(RDD_Y).transform(RDD_Y)
            else:
                RDD_Y_norm = self._data.map(lambda x: x[0])

        # Create a new RDD of LabeledPoint data using the normalized features
        self._normdata = RDD_Y_norm.zip(RDD_X_norm)
Example #14
def get_std_scaler(labeledpoints):
    std = StandardScaler()
    train_features = labeledpoints.map(lambda lp: lp.features)

    scaler_model = std.fit(train_features)
    transformed_features = scaler_model.transform(train_features)

    transformed_label_features = \
        zip(labeledpoints.map(lambda lp: lp.label).collect(), transformed_features.collect())

    return to_labeled_points(transformed_label_features), scaler_model
Example #15
def TrainLRModel(trainData, iterations, step,
                 miniBatchFraction):  # Logistic Regression
    srcFeatures = trainData.map(lambda line: line.features)
    print srcFeatures.first()
    scaler = StandardScaler(withMean=True, withStd=True).fit(srcFeatures)
    srcLabel = trainData.map(lambda line: line.label)
    scaledFeature = scaler.transform(srcFeatures)
    print scaledFeature.first()
    scaledData = srcLabel.zip(scaledFeature)
    trainData = scaledData.map(
        lambda (label, features): LabeledPoint(label, features))
    model = LogisticRegressionWithSGD.train(data = trainData, iterations = iterations, step = step, \
                                            miniBatchFraction = miniBatchFraction)
    return model
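An illustrative call of TrainLRModel, just to show the expected argument order; the hyper-parameter values are placeholders, not values from the original project.

# Hypothetical usage; the argument values are illustrative only.
model = TrainLRModel(trainData, iterations=100, step=1.0, miniBatchFraction=1.0)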
def training(model_directory, libsvm, scaler):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()
    if scaler == '1':
        label = training_rdd.map(lambda x: x.label)
        features = training_rdd.map(lambda x: x.features)

        scaler1 = StandardScaler().fit(features)
        data1 = label.zip(scaler1.transform(features))
        # convert into labeled point
        data2 = data1.map(lambda x: LabeledPoint(x[0], x[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(data2)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)
    model_logistic.save(sc, model_directory)
Example #17
def PrepareData(sc):
    '''
    Prepare the data
    :param sc:
    :return: (trainData, validationData, testData, categoriesMap)
    '''
    print('======================= Preparing data =======================')
    # ----------------------------- 1. Import and transform the data -----------------------------
    print('========== [PrepareData] >>>> Importing train.tsv data....')
    rawDataWithHeader = sc.textFile(Path + u'data/stumbleupon/train-100.tsv')
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace('\"', ''))
    lines = rData.map(lambda x: x.split('\t'))
    print('========== [PrepareData] >>>> Total: ' + str(lines.count()) + ' records')
    # ----------------------------- 2. Build the RDD[LabeledPoint] needed for training and evaluation -----------------------------
    # categoriesMap = lines.map(lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    # labelpointRDD = lines.map(lambda r: LabeledPoint(extract_label(r), extract_features(r, categoriesMap, -1)))
    print('========== [PrepareData] >>>> Before standardization:'),
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    for i in featureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    print('')
    print('========== [PrepareData] >>>> After standardization:'),
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(
        featureRDD
    )  # Build the standardization scaler: the numeric feature fields use different units and differ widely in magnitude, so they must be standardized before they can be compared. withMean=False keeps the sparse representation instead of producing dense mean-centered output.
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    for i in ScalerFeatureRDD.first():
        print('\t\t' + str(i) + '(' + str(type(i)) + '),'),
    labelpoint = labelRDD.zip(
        ScalerFeatureRDD)  # zip the labels with the standardized feature fields to build the labeled points
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))
    # ----------------------------- 3. Randomly split the data into 3 parts and return -----------------------------
    (trainData, validationData,
     testData) = labelpointRDD.randomSplit([8, 1, 1])
    print('========== [PrepareData] >>>> Randomly split the data into three parts - trainData: ' +
          str(trainData.count()) + ' records, validationData: ' +
          str(validationData.count()) + ' records, testData: ' +
          str(testData.count()) + ' records')
    # ----------------------------- 4. Return the tuple -----------------------------
    return (trainData, validationData, testData, categoriesMap)
Example #18
    def fit(self, dataset):
        """
        Computes the mean and standard deviation of a dataset; these will later be used to standardize data.

        :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`

        """
        if isinstance(dataset, LabeledDataSet):
            dataset = dataset.features
        if isinstance(dataset, pyspark.rdd.RDD):
            standarizer = StdSc(self.flag_mean, self.flag_std)
            self.model = standarizer.fit(dataset)
        else:
            if type(dataset) is not np.ndarray:
                dataset = np.array(dataset)
            if self.flag_mean is True:
                self.mean = dataset.mean(axis=0)
            if self.flag_std is True:
                self.std = dataset.std(axis=0, ddof=1)
        return
Example #19
    def fit(self, dataset):
        """
        Computes the mean and standard deviation of a dataset; these will later be used to standardize data.

        :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`

        """
        if isinstance(dataset, LabeledDataSet):
            dataset = dataset.features
        if isinstance(dataset, pyspark.rdd.RDD):
            standarizer = StdSc(self.flag_mean, self.flag_std)
            self.model = standarizer.fit(dataset)
        else:
            if type(dataset) is not np.ndarray:
                dataset = np.array(dataset)
            if self.flag_mean is True:
                self.mean = dataset.mean(axis=0)
            if self.flag_std is True:
                self.std = dataset.std(axis=0, ddof=1)
        return
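The two identical fit() methods above only compute and store the statistics; the matching transform() is not shown in this collection. A minimal sketch of what it might look like, reusing the attributes set in fit() (self.model for the RDD branch, self.mean/self.std plus the flags for the NumPy branch); this is an assumption about the class, not its actual implementation.

    def transform(self, dataset):
        # Hedged sketch of a standardization step matching fit(); not the original code.
        if isinstance(dataset, LabeledDataSet):
            dataset = dataset.features
        if isinstance(dataset, pyspark.rdd.RDD):
            # delegate to the fitted pyspark StandardScalerModel
            return self.model.transform(dataset)
        if type(dataset) is not np.ndarray:
            dataset = np.array(dataset)
        if self.flag_mean is True:
            dataset = dataset - self.mean
        if self.flag_std is True:
            dataset = dataset / self.std
        return dataset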
def PrepareData(sc):
    #---------------------1. Import and transform the data---------------------
    global Path
    if sc.master[:5] == "local" or sc.master[:5] == "spark":
        Path = "file:/Users/johnnie/pythonwork/workspace/PythonProject/data/"
    else:
        Path = "hdfs://localhost:9000/user/hduser/test/data/"

    print("开始导入数据...")
    rawDataWithHeader = sc.textFile(Path + "train.tsv")
    header = rawDataWithHeader.first()
    rawData = rawDataWithHeader.filter(lambda x: x != header)
    rData = rawData.map(lambda x: x.replace("\"", ""))
    lines = rData.map(lambda x: x.split("\t"))
    print("共计:" + str(lines.count()) + "项")

    #---------------------2. Build the RDD[LabeledPoint] needed for training and evaluation---------------------
    print("Before standardization:")
    categoriesMap = lines.map(
        lambda fields: fields[3]).distinct().zipWithIndex().collectAsMap()
    labelRDD = lines.map(lambda r: extract_label(r))
    featureRDD = lines.map(lambda r: extract_features(r, categoriesMap,
                                                      len(r) - 1))
    print(featureRDD.first())
    print("\n")
    print("标准化之后:")
    stdScaler = StandardScaler(withMean=False, withStd=True).fit(featureRDD)
    ScalerFeatureRDD = stdScaler.transform(featureRDD)
    print(ScalerFeatureRDD.first())
    labelpoint = labelRDD.zip(ScalerFeatureRDD)
    # r[0] is the label
    # r[1] is the features
    labelpointRDD = labelpoint.map(lambda r: LabeledPoint(r[0], r[1]))

    #---------------------3. Randomly split the data into 3 parts and return---------------------
    trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])
    print("将数据分trainData: " + str(trainData.count()) + " validationData: " +
          str(validationData.count()) + " testData: " + str(testData.count()))

    return trainData, validationData, testData, categoriesMap
    def extract_features(self, feat='tfidf', **kwargs):
        """
        Converts each subtitle into its TF/TFIDF representation.
        Normalizes if necessary.

        Parameters
        --------
        Feat: 'tf' or 'tfidf'.
        kwargs: num_features, minDocFreq, or other arguments to be passed
        to the MLLib objects.

        Returns
        --------
        RDD of features with key.
        """

        # transform BOW into TF vectors
        num_features = kwargs.get('num_features', 10000)
        htf = HashingTF(num_features)
        feat_rdd = self.RDD.mapValues(htf.transform).cache()

        # transform TF vectors into IDF vectors
        if feat == 'tfidf':
            keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
            minDocFreq = kwargs.get('minDocFreq', 2)
            idf = IDF(minDocFreq=minDocFreq)
            idf_model = idf.fit(tf_vecs)
            idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(idf_rdd)

        if self.model_type == 'log_reg':
            normalizer = StandardScaler(withMean=True, withStd=True)
            keys, vecs = feat_rdd.keys(), feat_rdd.values()
            norm_model = normalizer.fit(vecs)
            norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
            feat_rdd = keys.zip(norm_rdd)

        return feat_rdd
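A hypothetical call of the extract_features method above, showing how the documented keyword arguments would be passed; `pipeline` stands in for an instance of the (unshown) class this method belongs to.

# Illustrative usage only; `pipeline` is a hypothetical instance with self.RDD and self.model_type set.
feat_rdd = pipeline.extract_features(feat='tfidf', num_features=2**14, minDocFreq=3)
print(feat_rdd.take(1))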
Example #22
class StandardScalerNormalizer:
    def __init__(self):
        self.normalizer = None

    def norm_train(self, train_data):
        train_features = train_data.map(lambda lp: lp.features)
        self.normalizer = StandardScaler().fit(train_features)

        # TODO: This can't be efficient...
        #return train_data.map(lambda lp: lp.label).zip(self.norm(train_features)).map(lambda r: LabeledPoint(r[0], r[1]))
        labels = train_data.map(lambda lp: lp.label).collect()
        features = self.norm(train_features).collect()
        return get_df(zip(
            labels, features)).rdd.map(lambda r: LabeledPoint(r[0], r[1]))

    def norm(self, data):
        return self.normalizer.transform(data)

    def __str__(self):
        return 'StandardScaler'
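The TODO inside norm_train() points at a fully distributed alternative. Below is a sketch of that zip-based variant; like the commented-out line, it assumes the label RDD and the transformed feature RDD stay partition-aligned so zip() is valid.

    def norm_train_zipped(self, train_data):
        # Hedged alternative to norm_train(): avoids collecting labels and features to the driver.
        train_features = train_data.map(lambda lp: lp.features)
        self.normalizer = StandardScaler().fit(train_features)
        return train_data.map(lambda lp: lp.label) \
            .zip(self.norm(train_features)) \
            .map(lambda r: LabeledPoint(r[0], r[1]))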
    def __init__(self):
        Dataset.__init__(self)

        trainDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_train.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.split(' ')[:-1])
        labels = train.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(_[0], _[1]))

        testDirectory = HDFS_DIRECTORY + 'rotated_checkerboard2x2_test.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(_[0], _[1]))
    def __init__(self):
        Dataset.__init__(self)

        # preparing the Data (Train and Test) : formatting and scaling then making it an RDD of LabeledPoints

        trainDirectory = HDFS_DIRECTORY + 'checkerboard2x2_train.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.split(' ')[:-1])
        labels = train.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features))\
            .map(lambda _: LabeledPoint(_[0], _[1]))

        testDirectory = HDFS_DIRECTORY + 'checkerboard2x2_test.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features))\
            .map(lambda _: LabeledPoint(_[0], _[1]))
        ''' this block is for testing '''
    def __init__(self):
        Dataset.__init__(self)

        trainDirectory = HDFS_DIRECTORY + 'striatum_train_mini.txt'
        train = sc.textFile(trainDirectory)
        features = train.map(lambda _: _.strip().split(' ')[:-1])
        labels = train.map(lambda _: _.strip().split(' ')[-1])
        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.trainSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))

        testDirectory = HDFS_DIRECTORY + 'striatum_test_mini.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.split(' ')[:-1])
        labels = test.map(lambda _: _.split(' ')[-1])

        # AN ISSUE HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
        # in original LAL code they scaled testset with the scaler fitted from TRAINING set, but why?

        scaler = StandardScaler(withMean=True, withStd=True).fit(features)
        self.testSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))
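On the "AN ISSUE HERE" question above: fitting a second scaler on the test set lets test-set statistics influence preprocessing, whereas reusing the scaler fitted on the training set keeps the test data strictly held out - presumably why the original LAL code did it that way. A sketch of the test block rewritten to reuse the train-fitted scaler (an alternative, not the code of this class):

        # Hedged alternative: keep `scaler` as fitted on the training features above
        # and apply it to the test features instead of refitting on the test set.
        testDirectory = HDFS_DIRECTORY + 'striatum_test_mini.txt'
        test = sc.textFile(testDirectory)
        features = test.map(lambda _: _.strip().split(' ')[:-1])
        labels = test.map(lambda _: _.strip().split(' ')[-1])
        self.testSet = labels.zip(scaler.transform(features)) \
            .map(lambda _: LabeledPoint(0 if _[0] == '-1' else 1, _[1]))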
Example #26
# 24 = mode
# 27 = tempo
# 28 = time_signature

allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data)))
allData.take(3)

# label data

# only uses one feature for now
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]]))
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)]))

labels = allData.map(lambda (tr, (rocks, data)): rocks)
features = allData.map(lambda (tr, (rocks, data)): data)
std = StandardScaler(True, True).fit(features)
scaledFeatures = std.transform(features)

labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data))

# uses all extracted
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data]))

labeledData.take(3)

# make sample sizes equal
labeledRock = labeledData.filter(lambda p: p.label == 1.0)
labeledRock.count()
labeledRock.map(lambda p: p.features[0]).mean()
nrock = labeledRock.count()
Example #27
    parts = line.strip().split("::")
    return (int(parts[0]) - 1, int(parts[1]) - 1, float(parts[2]))


#load in input file
path = sys.argv[1]

#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)

#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean=False, withStd=True).fit(
    features)  #becomes dense if using withMean. may run out of memory locally

#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(
    scaler.transform(features))  #use this line if having memory issues

#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])

#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect()) / num_folds
                 )  #parameterize this value as num_folds (in loop as well)

#train/validate 10 times on each k
Example #28
df = sqlContext.createDataFrame(dictList)
df.show()
pdf = df.toPandas()

table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean)
print table.values
# For Testing
#df.show()
#df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show()
df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi')
#df.show()
temp = df.map(lambda line:LabeledPoint(line[0], [line[1:]]))

# Scale the data
features = df.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print features_transform.take(5)

lab = df.map(lambda row: row[0])

transformedData = lab.zip(features_transform)

transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))

trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
Example #29
print(model.predict(array([8.0, 0.0])))

#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()

vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]

dataset = sc.parallelize(vs)

#all false, do nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print r

print("\n")

#subtracts the mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print r

print("\n")
def main(argv):

	verbose = False

	dbpath = '/root/data/AdditionalFiles/'
	tagstring = 'rock'
	usealldata = False

	holdout = 0.1
	model_iterations = 100
	model_step = 1.0
	model_intercept = True

	# possible types logistic and svm
	model_type = 'logistic'

	try:
		opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"])
	except getopt.GetoptError:
		print 'rockTag.py -d <data path> -t <tag string>'
		sys.exit(2)
	for opt, arg in opts:
		if opt == '-h':
			print('rockTag.py -d <data path> -t <tag string>')
			sys.exit()
		elif opt in ("-v", "--verbose"):
			verbose = True
		elif opt in ("-d", "--datapath"):
			dbpath = arg
		elif opt in ("-t", "--tagstring"):
			tagstring = str(arg).lower()
		elif opt in ("-a", "--alldata"):
			usealldata = True
		elif opt in ("-m", "--model"):
			if str(arg).lower() in ['logistic','svm']:
				model_type = str(arg).lower()
			else:
				print('valid models are logistic and svm')
				sys.exit()
		elif opt in ("-s", "--step"):
			model_step = float(arg)
		elif opt in ("-i", "--iterations"):
			model_iterations = int(arg)
		elif opt in ("-o", "--holdout"):
			holdout = float(arg)
			if holdout <= 0 or holdout >= 1:
				print('holdout must be greater than 0 and less than 1')
		elif opt in ("-c", "--intercept"):
			model_intercept = True

	if verbose:
		print('data path: ' + dbpath)
		print('tag string: ' + tagstring)

	labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)

	# scale features
	std = StandardScaler(True, True).fit(features)
	features = std.transform(features)

	# make labeled data
	labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
	if verbose: labeledData.take(3)

	# rebalance samples
	equalSampleData = rebalanceSample(labeledData, verbose=verbose)

	# split data
	trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
	if verbose: trainData.map(lambda p: (p.label, p.features)).take(3)

	# train model
	if model_type == 'logistic':
		model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
	elif model_type == 'svm':
		model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)

	evalString = evaluateModel(model, testData)
	print(evalString)
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # Without converting the features into dense vectors, transformation with zero mean will raise
    # exception on sparse vector.
    # data2 will be unit variance and zero mean.
    data2 = label.zip(
        scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    # step 1 - create spark context
    conf = SparkConf().setAppName("KMeans-Content")\
       .set("spark.executor.memory","1g")
    sc = SparkContext(conf=conf)


    # step 2 - load in input file
    data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
    labels = data.map(lambda x:x.label)
    features = data.map(lambda x:x.features)

  
    # step 3 - standarize the data with unit values and 0 mean
    scaler = StandardScaler(withMean=False,withStd=True).fit(features)

    data2 = labels.zip(scaler.transform(features))

    numFeatures = len(data2.values().take(10)[0])
    print "Type of data2: ",type(data2) #RDD
    print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
    print "Sample: ",data2.values().take(1)[0]

    # splitting up the data to training, validation and testing models.
    train,val,test = data2.randomSplit([.80,.10,.10])


    print "Training Dataset Size:",train.count()
    print "Validation Dataset size:",val.count()
    print "Test Dataset Size:",test.count()
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )

def parsePoint(data):
	#return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
	return LabeledPoint(data[0],data[1:])

# load the data from Cassandra into a data frame and drop the rows with NA values
data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF()

data=data.filter("year>0").na.drop()
print data.count()


# Scale the features with Standard Scaler
data2=data.map(lambda x: [x.song_hotttnesss, x.loudness,x.year, x.sentiment,x.tempo,x.unique_words])#Convert each sql.row to an array
scaler = StandardScaler(withMean=True, withStd=True).fit(data2)  # fit a scaler on every column
scaledData = scaler.transform(data2)# transform our data

# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)

# # Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000,regParam=1.0,regType="l2",intercept=True)

# Evaluate the model on training data
print ("intercept",model.intercept)
print zip(["loudness","year","sentiment","tempo","unique_words"],model.weights)

sc.stop()
Example #34
## and source plots(Uniform, Gaussian). In case of Gaussian they look alike while 
## uncorrelated Uniform needs a rotation to get there. By removing correlation
## in the gaussian case, we have achieved independence between variables.
## If the source variables are gaussian ICA is not required and PCA is sufficient.
    
    
# Code for PCA and whitening the dataset.

from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets
# create the standardizer model for standardizing the dataset

X_rdd = sc.parallelize(X).map(lambda x:Vectors.dense(x) )
scaler = StandardScaler(withMean = True, withStd = False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)


#create the IndexedRowMatrix from rdd
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# compute the svd factorization of the matrix. First the number of columns and second a boolean stating whether 
# to compute U or not. 
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k not k * n(as in sklearn)

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # data2 will be unit variance and zero mean.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)
# This should be the maximum possible time
max_time = 23 * 3600 + 59 * 60 + 59
#max_time = 16 * 60
low = 0
high = 15 * 60
modelList = []

while low < max_time: # Temp should run once
	timeseries = df.filter(lambda x: low < x.timestamp < high)	

	#if timeseries.count() > 0:
	features = timeseries.map(lambda row: row[1:])
		#print "Possible points"
		#print features.collect()

	model = StandardScaler().fit(features)
	features_t = model.transform(features)
	
	label = timeseries.map(lambda row: row[0])
	labeled_data = label.zip(features_t)

	final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1]))
	
	model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True)
		#model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True)
		#model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True)
	modelList.append(model)
		

		#print ""
		#print "Model1 weights " + str(model.weights)
Example #37
    sc = SparkContext(conf=conf)
    sc.setLogLevel("warn")
    user_map = load_user_map(sc)
    # load the training data
    train_data = load_train_data(sc)
    # attach the user profile information to the training data
    train_data_user_info = set_train_user_info(train_data, user_map)
    # user_id  merchant_id age_range gender label
    train_data_user_info.cache()
    stand_train_data_user_info = train_data_user_info.map(
        lambda user: user[0:4])
    stand_train_data_user_info_label = train_data_user_info.map(
        lambda user: user[4])

    # standardize the training data
    std_scaler = StandardScaler(True, True).fit(stand_train_data_user_info)
    stand_train_data_user_info = std_scaler.transform(
        stand_train_data_user_info)

    train_data_user_info = stand_train_data_user_info_label.zip(
        stand_train_data_user_info)
    # build the labeled point data
    train_data_user_info = build_point(train_data_user_info)
    numIterations = 100

    train_data_user_info.cache()
    # train the model
    model = SVMWithSGD.train(train_data_user_info, numIterations)
    #model = DecisionTree.trainClassifier(train_data_user_info,numIterations,2,{})

    # load the test data
Example #38
# Example 11-9: Scaling vectors in Python

from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors

vectors = [Vectors.dense([-2.0, 5.0, 1.0]), Vectors.dense([2.0, 0.0, 1.0])]
dataset = sc.parallelize(vectors)
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(dataset)
result = model.transform(dataset)

# Result: [-0.7071, 0.7071, 0.0], [0.7071, -0.7071, 0.0]
    parts = line.strip().split("::")
    return (int(parts[0])-1, int(parts[1])-1, float(parts[2]))

#load in input file
path = sys.argv[1]

#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)

labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)


#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features)  #data needs to be dense (zeros included)
scaler = StandardScaler(withMean = False, withStd = True).fit(features)  #becomes dense if using withMean. may run out of memory locally

#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(scaler.transform(features))   #use this line if having memory issues

#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])

#get size of chunks for 10-fold cross-validation
num_folds = 10
partitionSize = (len(data.collect())/num_folds)   #parameterize this value as num_folds (in loop as well)

#train/validate 10 times on each k
i = 0
j = partitionSize
# print the top line of each RDD to confirm that the transformation was successful
weighted = ep.transform(vecrdd)

print weighted.take(1)
print vecrdd.take(1)

# call the colStats method of the Statistics object on vecrdd and print the
# mean, variance, and number of non-zero values
stats = Statistics.colStats(vecrdd)

print stats.mean()
print stats.variance()
print stats.numNonzeros()

# instantiate a StandardScaler object and set withMean and withStd to 'True'
ss = StandardScaler(withMean=True, withStd=True)

# call the fit method of the StandardScaler object to create a StandardScalerModel
model = ss.fit(vecrdd)

# call the transform method of the StandardScalerModel to center and scale the data
# in vecrdd RDD
scaled = model.transform(vecrdd)

# call colStats method of the Statistics object and print the mean, variance,
# and number of non-zero values to confirm that vecrdd was scaled and centered
scaledStats = Statistics.colStats(scaled)

print scaledStats.mean()
print scaledStats.variance()
print scaledStats.numNonzeros()
Example #41
def main():
    appName = "BadOrGood;zl"
    
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)

    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
                    # .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
                    # .repartition(10)
    
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    
    
    #standardizer for train and test data
    model = StandardScaler(True, True) \
            .fit( AllDataRawrdd \
                  .map( lambda _: Vectors.dense(_['feature']) ) 
            )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    AllDataRawrdd = labels \
                    .zip(featureTransformed) \
                    .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
  
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
  
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)

    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
Example #42
    print("Loading RAW data...")
    raw_data = sc.textFile(data_file)

    labels = raw_data.map(lambda line: line.strip().split(",")[-1])

    # Prepare data for clustering input
    # the data contains non-numeric features, we want to exclude them since
    # k-means works with numeric features. These are the first three and the last
    # column in each data row
    print("Parsing dataset...")
    parsed_data = raw_data.map(parse_interaction)
    parsed_data_values = parsed_data.values().cache()

    # Standardize data
    print("Standardizing data...")
    standardizer = StandardScaler(True, True)
    standardizer_model = standardizer.fit(parsed_data_values)
    standardized_data_values = standardizer_model.transform(parsed_data_values)

    # Evaluate values of k from 5 to 40
    print(
        "Calculating total in within cluster distance for different k values (10 to %(max_k)d):"
        % {"max_k": max_k})
    scores = map(lambda k: clustering_score(standardized_data_values, k),
                 range(10, max_k + 1, 10))

    # Obtain min score k
    min_k = min(scores, key=lambda x: x[2])[0]
    print("Best k value is %(best_k)d" % {"best_k": min_k})

    # Use the best model to assign a cluster to each datum
Example #43
    label_counts = labels.countByValue()
    sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
    for label, count in sorted_labels.items():
        print label, count

    # Prepare data for clustering input
    # the data contains non-numeric features, we want to exclude them since
    # k-means works with numeric features. These are the first three and the last
    # column in each data row
    print "Parsing dataset..."
    parsed_data = raw_data.map(parse_interaction)
    parsed_data_values = parsed_data.values().cache()

    # Standardize data
    print "Standardizing data..."
    standardizer = StandardScaler(True, True)
    standardizer_model = standardizer.fit(parsed_data_values)
    standardized_data_values = standardizer_model.transform(parsed_data_values)

    # Evaluate values of k from 5 to 40
    print "Calculating total in within cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k}
    scores = map(lambda k: clustering_score(standardized_data_values, k), range(10,max_k+1,10))

    # Obtain min score k
    min_k = min(scores, key=lambda x: x[2])[0]
    print "Best k value is %(best_k)d" % {"best_k": min_k}

    # Use the best model to assign a cluster to each datum
    # We use here standardized data - it is more appropriate for exploratory purposes
    print "Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k}
    best_model = min(scores, key=lambda x: x[2])[1]
#Section 7.4.4
from pyspark.mllib.regression import LabeledPoint
def toLabeledPoint(x):
  a = x.toArray()
  return LabeledPoint(a[-1], Vectors.dense(a[0:-1]))

housingData = housingVals.map(toLabeledPoint)

#Section 7.4.5
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]

#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)
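A hedged sketch of a quick evaluation step at this point (added for illustration, not part of the original listing): score the held-out validScaled split with RMSE.

#(added) RMSE on the validation split
import math
validPredicts = validScaled.map(lambda x: (float(model.predict(x.features)), x.label))
RMSE = math.sqrt(validPredicts.map(lambda p: pow(p[0] - p[1], 2)).mean())
print(RMSE)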

#Section 7.5.1

#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()

vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]

dataset = sc.parallelize(vs)

#all false, do nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect(): print r

print("\n")

#subtracts the mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect(): print r

print("\n")

#divides the length of vector
def norm(features):
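    # Note: with withMean=False and withStd=False the fitted scaler leaves the features unchanged.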
    scaler = StandardScaler(withMean=False, withStd=False).fit(features)
    return scaler.transform(features)
Example #47
# Size:
shape = reader.shape(flist[0])
shape['n_samples_per_file'] = shape['n_samples']
shape['n_samples'] = shape['n_samples'] * len(flist)
print "Will load a dataset of size:\n\t", shape

rdd_data = sc.parallelize(flist).flatMap(reader('TEMP'))
first = rdd_data.first()

# In[Scaling]:

# Compute scaling parameters:
from pyspark.mllib.feature import StandardScaler, StandardScalerModel

scaler = StandardScaler(withMean=True, withStd=True).fit(rdd_data)

sample_mean = scaler.call('mean')

# Effectively scale the dataset:
rdd_norm = scaler.transform(rdd_data)

# In[Reduction]:

# Compute PCA new dimensions:
from pyspark.mllib.feature import PCA as PCAmllib

Neof = 20
reducer = PCAmllib(Neof).fit(rdd_norm)
# print type(reducer)