Example #1
0
    def train(self, df):
        """Fit the isolation-forest anomaly model on *df*.

        Builds feature vectors, standard-scales them, optionally folds in
        categorical features, then trains an ``IForest`` estimator.  The
        fitted scaler and forest are stored on ``self.scaler_model`` and
        ``self.iforest_model`` respectively.
        """
        df = self.build_features_vectors(df)

        # Standardise the assembled feature-vector column.
        scaler = (StandardScaler()
                  .setInputCol(self.features_values_column)
                  .setOutputCol(self.features_values_scaled)
                  .setWithMean(self.scaler_with_mean)
                  .setWithStd(self.scaler_with_std))
        self.scaler_model = scaler.fit(df)

        # Persist the scaled frame: it is scanned again by the forest fit.
        storage = StorageLevelFactory.get_storage_level(self.storage_level)
        df = self.scaler_model.transform(df).persist(storage)

        if self.categorical_features:
            self._create_indexes(df)
            # NOTE(review): return value is discarded here, matching the
            # original code — presumably _add_categorical_features mutates
            # state rather than returning a new DataFrame; confirm.
            self._add_categorical_features(df, self.features_values_scaled)

        forest = IForest(
            featuresCol=self.features_values_scaled,
            predictionCol=self.prediction_column,
            # anomalyScore=self.score_column,
            numTrees=self.num_trees,
            maxSamples=self.max_samples,
            maxFeatures=self.max_features,
            maxDepth=self.max_depth,
            contamination=self.contamination,
            bootstrap=self.bootstrap,
            approxQuantileRelativeError=(
                self.approximate_quantile_relative_error),
            # numCategoricalFeatures=len(self.categorical_features)
        )
        forest.setSeed(self.seed)

        # The decision threshold is supplied as a fit-time param map.
        self.iforest_model = forest.fit(df, {'threshold': self.threshold})
        df.unpersist()
Example #2
0
# Impute the two bedroom columns (median strategy, 414 treated as the
# missing-value marker), then drop the raw originals.
imputer.setInputCols(["total_bedrooms", "bedrooms_per_room"])
imputer.setOutputCols(["out_total_bedrooms", "out_bedrooms_per_room"])
imputer.setStrategy('median').setMissingValue(414)
imputer_model = imputer.fit(renamedHousing)
imputedHousing = imputer_model.transform(renamedHousing)
imputedHousing = imputedHousing.drop('total_bedrooms').drop('bedrooms_per_room')

# Sanity check: every column should now be free of nulls.
for col_name in imputedHousing.columns:
    null_count = imputedHousing.filter(imputedHousing[col_name].isNull()).count()
    print(col_name, " has null values : ", null_count)

# Assemble all numeric columns into a single 'features' vector column.
colNum_to_scale = [
    'longitude', 'latitude', 'housing_median_age', 'total_rooms',
    'population', 'households', 'median_income', 'rooms_per_household',
    'population_per_household', 'out_total_bedrooms', 'out_bedrooms_per_room',
]
va = VectorAssembler(inputCols=colNum_to_scale, outputCol='features')
featuredHousing = va.transform(imputedHousing)
featuredHousing.show()

# Standard-scale the assembled vector (zero mean, unit variance).
scaler = StandardScaler(withMean=True, withStd=True,
                        inputCol="features", outputCol="scaled_features")
scaledHousing = scaler.fit(featuredHousing).transform(featuredHousing)
scaledHousing.select('scaled_features').show()

# 3-2: inspect the categorical column before encoding.
# (original author's note, translated: "don't quite understand this part?")
distinct = renamedHousing.select('ocean_proximity').distinct().collect()
print(distinct)
renamedHousing.agg(countDistinct("ocean_proximity")).show()

# Map the string category to a numeric index...
indexer = StringIndexer(inputCol='ocean_proximity',
                        outputCol='idx_ocean_proximity')
idxHousing = indexer.fit(renamedHousing).transform(renamedHousing)
idxHousing.show()

# ...then one-hot encode the index into a sparse vector column.
encoder = OneHotEncoder(inputCol='idx_ocean_proximity',
                        outputCol='one_hot_ocean_proximity')
ohHousing = encoder.fit(idxHousing).transform(idxHousing)
ohHousing.show()