Example #1
    # Assumed imports (not shown in this example): from pyspark.ml import Pipeline;
    # from pyspark.ml import feature as fea; from pyspark.ml.classification import LogisticRegression;
    # from pyspark.ml.tuning import ParamGridBuilder, CrossValidator;
    # from pyspark.ml.evaluation import BinaryClassificationEvaluator
    def buildModel(self, save_pipe_path=None):
        df=self.getModelData()

        label_index=fea.StringIndexer(inputCol='user_type',outputCol='label')
        reTokenizer=fea.RegexTokenizer(inputCol='appnames',outputCol='appname_token',pattern=',')
        cnt_vector=fea.CountVectorizer(inputCol='appname_token',outputCol='appname_vector')
        vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol="feature")
        scaler=fea.StandardScaler(inputCol='feature',outputCol='features')

        if not save_pipe_path:
            lr=LogisticRegression()
            grid=ParamGridBuilder().addGrid(lr.elasticNetParam,[0,1]).build()
            evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR")

            pipeline = Pipeline(stages=[label_index,reTokenizer, cnt_vector,vecAssembler,scaler])
            pipe = pipeline.fit(df)
            pipe_out=pipe.transform(df)

            cv=CrossValidator(estimator=lr,estimatorParamMaps=grid,evaluator=evaluator)
            model=cv.fit(pipe_out)

            print(evaluator.evaluate(model.transform(pipe_out)))
            print('Best Param (elasticNetParam):', model.bestModel._java_obj.getElasticNetParam())

            predict_result=model.transform(pipe_out).select('probability','label').toPandas()
            predict_result.to_csv('/home/chenchen/data/predict_result1.csv',index=False)
        else:
            lr=LogisticRegression(elasticNetParam=1.0)

            pipeline=Pipeline(stages=[label_index,reTokenizer, cnt_vector,vecAssembler,scaler,lr])
            model=pipeline.fit(df)

            model.save(save_pipe_path)
            print('pipe saved')
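A minimal usage sketch for the saved pipeline (not part of the original example; the path and new_df are hypothetical, and new_df is assumed to contain the 'user_type' and 'appnames' columns used by the stages):

from pyspark.ml import PipelineModel

loaded_pipe = PipelineModel.load('/tmp/user_type_pipe')    # hypothetical path passed earlier as save_pipe_path
scored = loaded_pipe.transform(new_df)                     # indexing, tokenizing, vectorizing, scaling, LR
scored.select('probability', 'prediction').show(5)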
Example #2
# 'normalizer' is assumed to be pyspark.ml.feature imported under that alias.
def std_scaler(X, inputCol="features", outputCol="resFeatures"):
    scaler = normalizer.StandardScaler(inputCol=inputCol, outputCol=outputCol,
                                       withStd=True, withMean=False)
    scalerModel = scaler.fit(X)
    scaledData = scalerModel.transform(X)

    return scaledData
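A possible call for the helper above, assuming assembled_df (a hypothetical name) already carries a 'features' vector column:

scaled_df = std_scaler(assembled_df)                 # adds a standardized 'resFeatures' column
scaled_df.select('features', 'resFeatures').show(3)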
Example #3
# Assumes: from pyspark.sql.functions import udf; from pyspark.ml import feature;
# from pyspark.ml import linalg as ml_linalg; _to_dense is defined elsewhere (a sketch follows below).
def _scale_data_frame(df, vector=None):
    if vector:
        df = df.withColumn(vector, udf(_to_dense, ml_linalg.VectorUDT())(vector))
        scale = feature.StandardScaler(
            withMean=True, withStd=True,
            inputCol=vector, outputCol='std_vector')
        model = scale.fit(df)
        return (model
            .transform(df)
            .select([i for i in df.columns if i != vector] + [scale.getOutputCol()])
            .withColumnRenamed(existing=scale.getOutputCol(), new=vector))
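The _to_dense helper referenced above is not shown in the example; a minimal sketch of one plausible implementation, assuming it densifies a single pyspark.ml.linalg vector (Example #4 below uses a similarly named method over several columns, which would need a different signature):

# hypothetical helper -- accepts a SparseVector or DenseVector, returns a DenseVector
from pyspark.ml import linalg as ml_linalg

def _to_dense(vector):
    return ml_linalg.DenseVector(vector.toArray())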
Example #4
    def _vector_scale(self, df):
        to_dense_udf = F.udf(self._to_dense, linalg.VectorUDT())
        feature_str = 'features'

        vector_df = df.withColumn(colName=feature_str,
                                  col=to_dense_udf(*self._list_feature))
        if self._bool_standardize:
            scaling_model = features.StandardScaler(
                inputCol=feature_str,
                outputCol="scaled_features",
                withMean=True,
                withStd=True).fit(vector_df)
        else:
            # withMean=False and withStd=False leaves the vectors unchanged, so this
            # branch effectively just copies 'features' into 'scaled_features'.
            scaling_model = features.StandardScaler(
                inputCol=feature_str,
                outputCol="scaled_features",
                withMean=False,
                withStd=False).fit(vector_df)

        scaled_df = scaling_model.transform(vector_df)

        return scaled_df
Example #5
# (the start of this example is truncated; these lines are the tail of a
#  feature.VectorAssembler(inputCols=[...], outputCol='features') call)
    'EUMEAT', 'EUPRPMEL', 'TUACTIVITY_N', 'tuactdur24', 'tewhere', 'TESEX'
],
                                    outputCol='features')

# COMMAND ----------

vecIntercept = feature.VectorAssembler(inputCols=[], outputCol='emptyFeatures')

# COMMAND ----------

# MAGIC %md
# MAGIC Scaling stage to scale features from Vector Assembler

# COMMAND ----------

scaled = feature.StandardScaler(inputCol='features',
                                outputCol='scaledFeatures')

# COMMAND ----------

# MAGIC %md
# MAGIC Three Linear Regression pipeline stages (stages 2 and 3 are sketched after stage 1 below):
# MAGIC 1 - LR with just the intercept
# MAGIC 2 - LR with all features, unscaled
# MAGIC 3 - LR with all features and the scaling stage

# COMMAND ----------

regIntercept = regression.LinearRegression(labelCol='ERBMI',
                                           featuresCol='emptyFeatures')

# COMMAND ----------
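# The notebook only shows the intercept-only regression above; a minimal sketch of the
# other two stages described in the markdown cell (the names regUnscaled and regScaled
# are assumptions, not part of the original notebook):
regUnscaled = regression.LinearRegression(labelCol='ERBMI',
                                          featuresCol='features')
regScaled = regression.LinearRegression(labelCol='ERBMI',
                                        featuresCol=scaled.getOutputCol())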
Example #6
# fit a statsmodels OLS model on a subset of the columns and inspect the summary
X_Opt = X[:, [1, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
regressor_OLS.summary()


# choose the feature columns
feature_cols = [df_train.columns[1], df_train.columns[2], df_train.columns[3], df_train.columns[4]]
#feature_cols = df.columns[1:]
assembler = feature.VectorAssembler(inputCols=feature_cols, outputCol='features')
df_train = assembler.setHandleInvalid("skip").transform(df_train)
df_train = df_train.withColumnRenamed('Survived', 'label')
df_train = df_train.select('features', 'label')

# scaling
scaler = feature.StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
scaler = scaler.fit(df_train)
df_train = scaler.transform(df_train)
df_train = df_train.drop('features').withColumnRenamed('scaledFeatures','features')



# TEST
# read the test data
df_test = spark.read.csv("test.csv",header=True, inferSchema=True)
df_test = df_test.drop('Name', 'Ticket', 'Cabin')

# convert categorical to numeric
# count missing values per column
chn = []
for col in df_test.columns:
    count = df_test.filter('{} is null'.format(col)).count()
Example #7
feature_cols = [
    x for x in encoded.columns if x not in {'userId', 'userChurnFlag'}
]
assembler = smf.VectorAssembler(inputCols=feature_cols, outputCol='features')
encoded = assembler.transform(encoded)
encoded = encoded.drop(*feature_cols)
encoded = encoded.withColumnRenamed('userChurnFlag', 'label')
encoded = encoded.persist()

encoded_sample = encoded.limit(1000).toPandas()

# Split out validation dataset
train, val = encoded.randomSplit([3.0, 1.0], seed=42)

# Set up pipeline for model training/evaluation
scaler = smf.StandardScaler(withStd=True,
                            withMean=False,
                            inputCol='features',
                            outputCol='scaledFeatures')

# Use PCA to reduce dimensionality of scaled vectors
reducer = smf.PCA(k=10,
                  inputCol=scaler.getOutputCol(),
                  outputCol='selectedFeatures')

# Use a classifier to generate the final predictions
classifier = smc.GBTClassifier(labelCol='label',
                               featuresCol=reducer.getOutputCol(),
                               predictionCol='predictedLabel')

# Combine all steps in a pipeline
pipeline = sm.Pipeline(stages=[scaler, reducer, classifier])
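A possible way to fit and evaluate this pipeline on the train/val split created above (the F1 metric and evaluator choice are assumptions, not part of the original example):

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

model = pipeline.fit(train)                          # fit scaler + PCA + GBT on the training split
predictions = model.transform(val)                   # score the held-out validation split

evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='predictedLabel',
                                              metricName='f1')
print('Validation F1:', evaluator.evaluate(predictions))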
Example #8
# 'nomial_features' is assumed to be a list of categorical column names defined earlier
indexers = [
    ft.StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in nomial_features
]
encoders = [
    ft.OneHotEncoder(inputCol=indexer.getOutputCol(),
                     outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]
assembler_onehot = ft.VectorAssembler(
    inputCols=[encoder.getOutputCol() for encoder in encoders],
    outputCol="onehot_features")

# assemble and scale the numeric columns
assembler_numeric = ft.VectorAssembler(inputCols=numeric_features,
                                       outputCol="numeric_features")
std_scaler = ft.StandardScaler(inputCol="numeric_features",
                               outputCol="numeric_features_scaled")

assembler_final = ft.VectorAssembler(
    inputCols=['onehot_features', 'numeric_features_scaled'],
    outputCol="final_features")

pca_model = ft.PCA(k=6, inputCol="final_features", outputCol="pca_features")

pipeline = Pipeline(stages=indexers + encoders + [
    assembler_onehot, assembler_numeric, std_scaler, assembler_final, pca_model
])
preprocess_model = pipeline.fit(df)
scaledData = preprocess_model.transform(df)

# save and load the model
from pyspark.ml import PipelineModel
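The example stops at the import; a minimal sketch of the save/load step it sets up, assuming a hypothetical path:

preprocess_model.save('/tmp/preprocess_pipeline')               # persist the fitted PipelineModel
reloaded_model = PipelineModel.load('/tmp/preprocess_pipeline')
reloaded_model.transform(df).select('pca_features').show(5)     # same output as scaledData above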