Example #1
    def buildModel(self,save_pipe_path=None):
        df=self.getModelData()

        label_index=fea.StringIndexer(inputCol='user_type',outputCol='label')
        reTokenizer=fea.RegexTokenizer(inputCol='appnames',outputCol='appname_token',pattern=',')
        cnt_vector=fea.CountVectorizer(inputCol='appname_token',outputCol='appname_vector')
        vecAssembler = fea.VectorAssembler(inputCols=['appname_vector'], outputCol="feature")
        scaler=fea.StandardScaler(inputCol='feature',outputCol='features')

        if not save_pipe_path:
            lr=LogisticRegression()
            grid=ParamGridBuilder().addGrid(lr.elasticNetParam,[0,1]).build()
            evaluator=BinaryClassificationEvaluator(metricName="areaUnderPR")

            pipeline = Pipeline(stages=[label_index,reTokenizer, cnt_vector,vecAssembler,scaler])
            pipe = pipeline.fit(df)
            pipe_out=pipe.transform(df)

            cv=CrossValidator(estimator=lr,estimatorParamMaps=grid,evaluator=evaluator)
            model=cv.fit(pipe_out)

            print(evaluator.evaluate(model.transform(pipe_out)))
            print('Best Param (elasticNetParam): ', model.bestModel._java_obj.getElasticNetParam())

            predict_result=model.transform(pipe_out).select('probability','label').toPandas()
            predict_result.to_csv('/home/chenchen/data/predict_result1.csv',index=False)
        else:
            lr=LogisticRegression(elasticNetParam=1.0)

            pipeline=Pipeline(stages=[label_index,reTokenizer, cnt_vector,vecAssembler,scaler,lr])
            model=pipeline.fit(df)

            model.save(save_pipe_path)
            print('pipe saved')
Example #2
def classification_ml():
	if False:
		spark = SparkSession.builder.appName('classification-ml') \
			.config('spark.jars.packages', 'org.xerial:sqlite-jdbc:3.23.1') \
			.getOrCreate()

		df = spark.read \
			.format('jdbc') \
			.option('url', 'jdbc:sqlite:iris.db') \
			.option('driver', 'org.sqlite.JDBC') \
			.option('dbtable', 'iris') \
			.load()
	else:
		spark = SparkSession.builder.appName('classification-ml').getOrCreate()
		df = spark.read.option('header', 'true').option('inferSchema', 'true').format('csv').load('dataset/iris.csv')
	spark.sparkContext.setLogLevel('WARN')
	df.show()

	labels = [
		('index', types.IntegerType()),
		('a1', types.FloatType()),
		('a2', types.FloatType()),
		('a3', types.FloatType()),
		('a4', types.FloatType()),
		('id', types.StringType()),
		('label', types.StringType())
	]

	stringIndexer = ml_feature.StringIndexer(inputCol='label', outputCol='label_int')
	featuresCreator = ml_feature.VectorAssembler(inputCols=[col[0] for col in labels[1:5]], outputCol='features')

	# Create a model.
	logistic = ml_classification.LogisticRegression(featuresCol=featuresCreator.getOutputCol(), labelCol=stringIndexer.getOutputCol(), maxIter=10, regParam=0.01)

	# Create a pipeline.
	pipeline = Pipeline(stages=[stringIndexer, featuresCreator, logistic])

	# Split the dataset into training and testing datasets.
	df_train, df_test = df.randomSplit([0.7, 0.3], seed=666)

	# Run the pipeline and estimate the model.
	model = pipeline.fit(df_train)
	test_result = model.transform(df_test)  # Dataframe.

	#print(test_result.take(1))
	#test_result.show(5, truncate=True, vertical=False)
	test_result.show(truncate=False)

	# Save and load.
	lr_path = './lr'
	logistic.write().overwrite().save(lr_path)
	lr2 = ml_classification.LogisticRegression.load(lr_path)
	print('Param =', lr2.getRegParam())

	model_path = './lr_model'
	model.write().overwrite().save(model_path)
	model2 = PipelineModel.load(model_path)
	print('Stages =', model.stages)
	print(model.stages[2].coefficientMatrix == model2.stages[2].coefficientMatrix)
	print(model.stages[2].interceptVector == model2.stages[2].interceptVector)
Example #3
    def exercise_in_machine_learning(self):
        self.static_data_frame.printSchema()

        prepped_data_frame = self.static_data_frame.na.fill(0). \
            withColumn("day_of_week", functions.date_format(functions.col("InvoiceDate"), "EEEE")).coalesce(5)

        train_data_frame = prepped_data_frame.where(
            "InvoiceDate < '2011-03-01'")
        test_data_frame = prepped_data_frame.where(
            "InvoiceDate >= '2011-03-01'")

        print(f"TRAINING items: {train_data_frame.count()}")
        print(f"TEST DATA items: {test_data_frame.count()}")

        transformation_pipeline = Pipeline().setStages([
            feature.StringIndexer().setInputCol("day_of_week").setOutputCol(
                "day_of_week_index"),
            feature.OneHotEncoder().setInputCol(
                "day_of_week_index").setOutputCol("day_of_week_encoded"),
            feature.VectorAssembler().setInputCols(
                ["UnitPrice", "Quantity",
                 "day_of_week_encoded"]).setOutputCol("features"),
        ])

        fitted_pipeline = transformation_pipeline.fit(train_data_frame)
        transformed_training = fitted_pipeline.transform(train_data_frame)
        # transformed_training.cache()

        kmeans = clustering.KMeans().setK(2).setSeed(2)
        km_model = kmeans.fit(transformed_training)
        print(f"Training cost: {km_model.summary.trainingCost}")

        transformed_test = fitted_pipeline.transform(test_data_frame)
        transformed_test.summary().show()
Example #4
def feature_extract(train_t):
    stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')

    sw_remover1 = spark_ft.StopWordsRemover(inputCol='ntokens1',
                                            outputCol='clean_tokens1',
                                            stopWords=stopWords)

    text2vec1 = spark_ft.Word2Vec(vectorSize=50,
                                  minCount=1,
                                  seed=123,
                                  inputCol='ntokens1',
                                  outputCol='text_vec1',
                                  windowSize=1,
                                  maxSentenceLength=100)

    assembler1 = spark_ft.VectorAssembler(inputCols=['text_vec1'],
                                          outputCol='features1')

    sw_remover2 = spark_ft.StopWordsRemover(inputCol='ntokens2',
                                            outputCol='clean_tokens2',
                                            stopWords=stopWords)

    text2vec2 = spark_ft.Word2Vec(vectorSize=50,
                                  minCount=1,
                                  seed=123,
                                  inputCol='ntokens2',
                                  outputCol='text_vec2',
                                  windowSize=1,
                                  maxSentenceLength=100)

    assembler2 = spark_ft.VectorAssembler(inputCols=['text_vec2'],
                                          outputCol='features2')

    feature_pipeline = Pipeline(stages=[
        sw_remover1, text2vec1, assembler1, sw_remover2, text2vec2, assembler2
    ])

    feature_model = feature_pipeline.fit(train_t)

    train_featurized = feature_model.transform(train_t).persist()
    tA = train_featurized.select('text_vec1').collect()
    tA_array = np.array(tA)
    tB = train_featurized.select('text_vec2').collect()
    tB_array = np.array(tB)

    return tA_array, tB_array
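
# Usage sketch (an assumption, not part of the original snippet): feature_extract
# expects a DataFrame with array<string> columns `ntokens1` and `ntokens2`, plus an
# active SparkSession named `spark` and the imports used above.
toy_pairs = spark.createDataFrame(
    [(['great', 'fast', 'delivery'], ['slow', 'late', 'delivery'])],
    ['ntokens1', 'ntokens2'])
vecs_a, vecs_b = feature_extract(toy_pairs)
print(vecs_a.shape, vecs_b.shape)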
Example #5
def compute_corr(df, columns, method="pearson"):
    assembler = feature.VectorAssembler(inputCols=columns, 
        outputCol="featuresCorrelation")
    corr_featurized_df = assembler.transform(df)
    corr_df = stat.Correlation.corr(corr_featurized_df, "featuresCorrelation", method)
    corr_matrix = corr_df.first()[0].toArray()
    corr_pddf = pd.DataFrame(corr_matrix, columns=columns, index=columns)
    return corr_pddf
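
# Usage sketch (an assumption, not part of the original snippet): assumes an active
# SparkSession named `spark` plus the aliases used above
# (pyspark.ml.feature as feature, pyspark.ml.stat as stat, pandas as pd).
toy_df = spark.createDataFrame(
    [(1.0, 2.0, 0.5), (2.0, 4.1, 0.3), (3.0, 6.2, 0.9)], ['a', 'b', 'c'])
print(compute_corr(toy_df, ['a', 'b', 'c']))              # 3x3 Pearson matrix
print(compute_corr(toy_df, ['a', 'b', 'c'], 'spearman'))  # Spearman variant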
Example #6
def df(spark):
    df = spark.read.parquet('df')
    pred_cols = [
        x for x in df.columns if x not in ['features', 'label', 'response']
    ]
    assembler = mlf.VectorAssembler(inputCols=pred_cols, outputCol='features')
    df = assembler.transform(df)  # type pyspark.sql.DataFrame
    df.cache()
    return df
Example #7
def test_stackedml_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    ml = pl | (classification.LogisticRegression(),) | feature.VectorAssembler() | \
        classification.\
        RandomForestClassifier()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #8
def train_evaluate(train_data, test_data):
    # Convert the categorical text feature into a numeric index
    stringIndexer = ft.StringIndexer(inputCol='alchemy_category',
                                     outputCol="alchemy_category_Index")

    encoder = ft.OneHotEncoder(dropLast=False,
                               inputCol='alchemy_category_Index',
                               outputCol="alchemy_category_IndexVec")

    assemblerInputs = ['alchemy_category_IndexVec'] + train_data.columns[4:-1]
    assembler = ft.VectorAssembler(inputCols=assemblerInputs,
                                   outputCol="features")

    # dt = cl.DecisionTreeClassifier(labelCol="label",
    #                             featuresCol="features")
    rf = cl.RandomForestClassifier(labelCol="label", featuresCol="features")

    evaluator = ev.BinaryClassificationEvaluator(
        rawPredictionCol="probability",
        labelCol='label',
        metricName='areaUnderROC')

    grid_search = tune.ParamGridBuilder()\
        .addGrid(rf.impurity, ["gini", "entropy"])\
        .addGrid(rf.maxDepth, [5, 10, 15])\
        .addGrid(rf.maxBins, [10, 15, 20])\
        .addGrid(rf.numTrees, [10, 20, 30])\
        .build()

    rf_cv = tune.CrossValidator(estimator=rf,
                                estimatorParamMaps=grid_search,
                                evaluator=evaluator,
                                numFolds=5)

    # rf_tvs = tune.TrainValidationSplit(
    #     estimator=rf,
    #     estimatorParamMaps=grid_search,
    #     evaluator=evaluator,
    #     trainRatio=0.7
    # )
    pipeline = Pipeline(stages=[stringIndexer, encoder, assembler, rf_cv])
    cv_pipeline_model = pipeline.fit(train_data)

    best_model = cv_pipeline_model.stages[-1]
    best_parm = get_best_param(best_model)

    AUC, AP = evaluate_model(cv_pipeline_model, test_data)

    return AUC, AP, best_parm, cv_pipeline_model
Example #9
def test_multi_model_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    models = (classification.LogisticRegression(),
              classification.RandomForestClassifier(),
              classification.LogisticRegression().setElasticNetParam(0.2),
              classification.GBTClassifier())
    ml = pl | models | feature.VectorAssembler().setOutputCol('final_features') | \
        classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #10
def n_gram(df, input_col, n=2):
    """
    Converts the input array of strings inside of a Spark DF into an array of n-grams.
    :param df: Pyspark dataframe to analyze
    :param input_col: Column to analyze.
    :param n: number of elements per n-gram >=1.
    :return: Spark DataFrame with n-grams calculated.
    """

    is_dataframe(df)

    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
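
# Usage sketch (an assumption, not part of the original snippet): it only works if the
# custom `|` pipeline composition used throughout these examples is monkey-patched onto
# the Spark ML stages, and it assumes an active SparkSession named `spark`.
sentences_df = spark.createDataFrame([('this is a test',), ('this is another test',)],
                                     ['sentence'])
grams_df, grams_model = n_gram(sentences_df, input_col='sentence', n=2)
grams_df.select('features').show(truncate=False)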
Example #11
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
def main(path_data, path_parameters, dir_models):
    logger = logging.getLogger(__name__)
    spark = (
        pyspark.sql.SparkSession
            .builder
            .appName("Python Spark Random Forest model training")
            .enableHiveSupport()
            .getOrCreate()
    )

    logger.info("Reading parquet data and splitting into test and train datasets")
    data_df = spark.read.parquet(path_data)
    splits = data_df.randomSplit([0.7, 0.3])
    training_df = splits[0]
    validation_df = splits[1]

    logger.info("Constructing pipeline for prediction model")
    with open(path_parameters) as json_file:
        parameters = json.load(json_file)
    feature_columns = parameters['feature_columns']
    rf_params = parameters['rf_params']
    assembler = feature.VectorAssembler(
        inputCols=feature_columns,
        outputCol="features")

    rf = classification.RandomForestClassifier(
        labelCol="churn", **rf_params)

    rf_pipeline = pipeline.Pipeline(stages=[assembler, rf])
    logger.info("Training prediction model")
    pipeline_model = rf_pipeline.fit(training_df)

    logger.info("Calculating model metrics")
    train_predictions_df = pipeline_model.transform(training_df)
    validation_predictions_df = pipeline_model.transform(validation_df)

    accuracy_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="accuracy", labelCol="churn", predictionCol="prediction")

    precision_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedPrecision", labelCol="churn", predictionCol="prediction")

    recall_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="weightedRecall", labelCol="churn", predictionCol="prediction")

    f1_evaluator = evaluation.MulticlassClassificationEvaluator(
        metricName="f1", labelCol="churn", predictionCol="prediction")

    auroc_evaluator = evaluation.BinaryClassificationEvaluator(metricName='areaUnderROC', labelCol="churn")

    logger.info("Saving model and metrics data")
    train_metrics = {
        "accuracy": accuracy_evaluator.evaluate(train_predictions_df),
        "precision": precision_evaluator.evaluate(train_predictions_df),
        "recall": recall_evaluator.evaluate(train_predictions_df),
        "f1": f1_evaluator.evaluate(train_predictions_df),
        "auroc": auroc_evaluator.evaluate(train_predictions_df)
    }
    validation_metrics = {
        "accuracy": accuracy_evaluator.evaluate(validation_predictions_df),
        "precision": precision_evaluator.evaluate(validation_predictions_df),
        "recall": recall_evaluator.evaluate(validation_predictions_df),
        "f1": f1_evaluator.evaluate(validation_predictions_df),
        "auroc": auroc_evaluator.evaluate(validation_predictions_df)
    }

    rf_model = pipeline_model.stages[-1]
    model_params = rf_model.extractParamMap()
    model_description = {
        "name": "Random Forest",
        "params": {param.name: value for param, value in model_params.items()},
    }

    dir_model = pathlib.Path(dir_models)
    dir_model.mkdir(parents=True, exist_ok=True)

    path_pipeline_model = pathlib.Path(dir_model).joinpath("pipeline_model")
    path_train_metrics = pathlib.Path(dir_model).joinpath("metrics_train.json")
    path_validation_metrics = pathlib.Path(dir_model).joinpath("metrics_validation.json")
    path_model_description = pathlib.Path(dir_model).joinpath("model_description.json")

    pipeline_model.save(str(path_pipeline_model))
    with open(path_train_metrics, "w") as f:
        json.dump(train_metrics, f)
    with open(path_validation_metrics, "w") as f:
        json.dump(validation_metrics, f)
    with open(path_model_description, "w") as f:
        json.dump(model_description, f)
def _transform_data(sparse_features, continuous_features, data: DataFrame,
                    feature_dict_path: str, sc: SparkContext):
    """
    Transform the data into the TFRecord-style layout required by DeepFM.
    Output rows look like: trace_id, feature_index, feature_values
    :param sparse_features:
    :param continuous_features:
    :param data:
    :param feature_dict_path:
    :param sc:
    :return:
    """
    def _get_feature_value_index_udf(broadcast_feature_dict, feature_name):
        feature_dict = broadcast_feature_dict.value

        def _get_feature_value_index_wrapper(feature_value):
            # Sparse (categorical) features: return the index of the value
            if feature_name in sparse_features:
                if str(feature_value) in feature_dict[feature_name].keys():
                    return int(feature_dict[feature_name][str(feature_value)])
                else:
                    return int(
                        feature_dict[feature_name][str(UNKNOWN_VALUE_KEY)])
            # Continuous features: a single index
            else:
                return feature_dict[feature_name]

        return F.udf(lambda x: _get_feature_value_index_wrapper(x))

    broadcast_feature_dict = \
        sc.broadcast(json.loads(utils.read_from_hdfs(sc, feature_dict_path)))

    features = broadcast_feature_dict.value.keys()  # the dict is unordered, so obtain the feature list once here

    for col in features:
        data = data \
            .withColumn("feature_index_" + col,
                        _get_feature_value_index_udf(
                            broadcast_feature_dict, col)(data[col]).cast("float"))
        if col in continuous_features:
            data = data.withColumn("feature_value_" + col,
                                   data[col].cast("float"))
        else:
            data = data.withColumn("feature_value_" + col,
                                   F.lit(1).cast("float"))

    data = data.cache()
    print("[INFO] transformed features: ")
    data.show(5, False)
    # vectorAssembler
    feature_index_vector_assembler = feature.VectorAssembler(
        inputCols=["feature_index_" + f for f in features],
        outputCol="feature_index")
    feature_value_vector_assembler = feature.VectorAssembler(
        inputCols=["feature_value_" + f for f in features],
        outputCol="feature_value")
    data = feature_index_vector_assembler.transform(data)
    data = feature_value_vector_assembler.transform(data)
    data = data.select("trace_id", "feature_index", "feature_value", "label")

    return features, data
Example #14
df_conv01 = convDf(df01)

#prepare for ml
df_prepped01 = prep(df_conv01)
df_prepped02 = df02.select("name").distinct()

#function to apply labels
df_labeled = get_labels(df_prepped01, df_prepped02)
df_labeled = df_labeled.na.drop().drop("version_idx")
cols_for_ml = df_prepped01.drop("name").drop("version_idx").schema.names

#pipeline stages
#index the label
labelIndexer = mlf.StringIndexer(inputCol="Label", outputCol="Label_idx")
#vectorise the input
toVec = mlf.VectorAssembler(inputCols=cols_for_ml, outputCol="Features")
#classify
classifier = DecisionTreeClassifier(labelCol="Label_idx",
                                    featuresCol="Features",
                                    maxDepth=10,
                                    maxBins=200)

#create pipeline of the stages and use it to train and test
pipeline = ml.Pipeline(stages=[labelIndexer, toVec, classifier])
train, test = df_labeled.randomSplit([0.7, 0.3], seed=12345)
df_pip = pipeline.fit(train)
predicted = df_pip.transform(test)
#print result
predicted.select("name", "Label_idx", "prediction", "rawPrediction",
                 "probability").show(30, False)
Example #15
def main(spark):
    n = len(sys.argv) - 1
    if n < 1:
        print('\nParameters are needed!!\n')
        sys.exit()
    else:
        i = sys.argv[1]
        batch = sys.argv[2]
        end_date = sys.argv[3]
        end_date_1w = sys.argv[4]
        end_date_2w = sys.argv[5]
        input_train_data_table = sys.argv[6]
        input_predict_data_table = sys.argv[7]
        output_cd3_score_table = sys.argv[8]
        output_train_result_table = sys.argv[9]
        output_predict_result_table = sys.argv[10]
        predict_date = sys.argv[11]

    spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
    spark.sql("set spark.sql.hive.mergeFiles=true")
    spark.sql("set hive.exec.orc.split.strategy=BI")
    spark.sql("set mapred.job.priority = HIGH")
    spark.sql("set hive.default.fileformat=Orc")
    spark.sql("set hive.exec.parallel=true")
    spark.sql("set hive.auto.convert.join=true")
    spark.sql("set hive.merge.mapfiles = true")
    spark.sql("set hive.merge.mapredfiles = true")
    spark.sql("set hive.merge.size.per.task = 256000000")
    spark.sql("set hive.merge.smallfiles.avgsize=128000000")
    spark.sql("set hive.merge.orcfile.stripe.level=false")
    spark.sql("set hive.exec.dynamic.partition=true")
    spark.sql("set hive.exec.max.dynamic.partitions=1000000")
    spark.sql("set hive.exec.max.dynamic.partitions.pernode=1000000")
    spark.sql("set hive.exec.max.created.files=1000000")
    spark.sql("set mapreduce.job.counters.limit=10000")
    spark.sql("set mapred.output.compress=true")
    spark.sql("set hive.exec.compress.output=true")
    spark.sql("set spark.shuffle.service.enabled = true")
    spark.sql("set spark.sql.broadcastTimeout = 10000")

    # All third-level categories (cd3) to be computed
    cd3_list_df = spark.sql("""
                          select 
                              item_third_cate_cd
                          from 
                              """ + input_predict_data_table + """
                          group by 
                              item_third_cate_cd
                          order by 
                              split(item_third_cate_cd,'')[length(item_third_cate_cd)-1], split(item_third_cate_cd,'')[length(item_third_cate_cd)-2], item_third_cate_cd
                          """)

    # Categories not yet finished computing
    # cd3_list_df = spark.sql("""
    # select
    # item_third_cate_cd
    # from
    # (
    # select
    # a.item_third_cate_cd,label
    # from
    # (
    # select
    # item_third_cate_cd
    # from
    # """ + input_predict_data_table + """
    # group by
    # item_third_cate_cd
    # )a
    # left JOIN
    # (select item_third_cate_cd, 1 as label from """ + output_cd3_score_table + """ group by item_third_cate_cd)b
    # on
    # a.item_third_cate_cd=b.item_third_cate_cd
    # )t
    # where
    # label is null
    # order by
    # split(item_third_cate_cd,'')[length(item_third_cate_cd)-1], split(item_third_cate_cd,'')[length(item_third_cate_cd)-2], item_third_cate_cd
    # """)

    # Categories that should have produced output but did not
    # cd3_list_df = spark.sql("""
    # select
    # t1.item_third_cate_cd
    # from
    # (	select
    # item_third_cate_cd
    # from
    # app.app_vdp_ai_sink_search_old_model_cd3_score
    # where
    # sku_count > 0)t1
    # left join
    # (select item_third_cate_cd,1 as index from app.app_vdp_ai_sink_search_old_model_predict_result group by item_third_cate_cd)t2
    # on t1.item_third_cate_cd=t2.item_third_cate_cd
    # where index is null or index=''
    # order by t1.item_third_cate_cd
    # """)

    cd3_list = cd3_list_df.rdd.map(lambda row: row[0]).collect()
    cd3_list_batch = get_scope_id_batch(int(i), int(batch), cd3_list)

    for cd3 in cd3_list_batch:
        print('\ncd3 = {} starting computation\n'.format(cd3))

        try:
            ### Check whether this category has already been processed
            if_finish = spark.sql("select * from " + output_cd3_score_table +
                                  " where item_third_cate_cd = '" + cd3 + "'")
            if if_finish.count() > 0:
                print('already finished')
                continue

            ### Build training and prediction samples

            # Positive samples from the current week
            data_now = spark.sql(
                """
                      select * 
                      from """ + input_train_data_table + """ 
                      where end_date = '""" + end_date +
                """' and label > 0 and item_third_cate_cd = '""" + cd3 + """'
            """)

            # Positive samples unique to one week earlier
            data_1w = spark.sql(
                """
                        select 
                            a.*
                        from 
                            (
                            select 
                                *
                            from 
                                """ + input_train_data_table + """ 
                            where 
                                end_date = '""" + end_date_1w +
                """' and label > 0 and item_third_cate_cd = '""" + cd3 + """'
                            )a
                        left join 
                            (
                            select 
                                item_sku_id,1 as index
                            from 
                                """ + input_train_data_table + """ 
                            where 
                                end_date = '""" + end_date +
                """' and label > 0 and item_third_cate_cd = '""" + cd3 + """'
                            )b 
                        on 
                            a.item_sku_id=b.item_sku_id
                        where 
                            index is null or index = ''
                        """)

            # Positive samples unique to two weeks earlier
            data_2w = spark.sql(
                """
                        select 
                            a.*
                        from 
                            (
                            select 
                                *
                            from 
                                """ + input_train_data_table + """ 
                            where 
                                end_date = '""" + end_date_2w +
                """' and label > 0 and item_third_cate_cd = '""" + cd3 + """'
                            )a
                        left join 
                            (
                            select 
                                item_sku_id,1 as index
                            from 
                                """ + input_train_data_table + """ 
                            where 
                                end_date = '""" + end_date +
                """' and label > 0 and item_third_cate_cd = '""" + cd3 + """'
                            )b 
                        on 
                            a.item_sku_id=b.item_sku_id
                        where 
                            index is null or index = ''
                        """)

            # Union the positive samples
            data = data_now.union(data_1w).union(data_2w)
            data_filter = data.filter("otc_days >= 0").filter(
                "sku_status_cd = 3001").filter("label <= 1")
            data_filter.cache()
            data_count = data_filter.count()

            # Build the samples to be predicted
            data_test = spark.sql("select * from " + input_predict_data_table +
                                  " where item_third_cate_cd = '" + cd3 + "'")
            data_test.cache()
            data_test_count = data_test.count()
            data_test = data_test.repartition(
                get_best_partition(data_test_count))

            # Check whether training positives or prediction samples are missing
            if data_count == 0 or data_test_count == 0:
                print('No train data or no predict data')
                spark.sql("""
                              insert overwrite table """ +
                          output_cd3_score_table + """ 
                              partition(dt='""" + predict_date +
                          """',item_third_cate_cd='""" + cd3 + """') 
                              values ({0},{1},{2},{3},{4},{5})
                          """.format(0, -1, -1, -1.0, -1.0, -1.0))
                continue

            # Add negative samples
            data_neg = spark.sql(
                """
                      select * 
                      from """ + input_train_data_table + """
                      where end_date = '""" + end_date_1w +
                """' and label = 0 and item_third_cate_cd = '""" + cd3 + """'
                      """)
            data_neg.cache()
            data_neg_count = data_neg.count()
            neg_sample_ratio = min(data_count / data_neg_count,
                                   1.0) if data_neg_count > 0 else 0.0
            data_neg_sample = data_neg.sample(neg_sample_ratio, seed=66)

            # Union the positive and negative samples
            data_union = data_filter.union(data_neg_sample).orderBy(
                func.rand(seed=66))

            # Join the sku embedding features
            sku_vec = spark.sql(
                "select * from tmp.tmp_qzl_sink_search_08_sku2vec_features")
            vec_size = len(sku_vec.columns) - 1
            data_union_sku2vec = data_union.join(sku_vec,
                                                 on='item_sku_id',
                                                 how='left')

            ### Train the model

            # Feature grouping
            # Non-feature columns
            features_useless = [
                'item_first_cate_name', 'item_second_cate_cd',
                'item_second_cate_name', 'item_third_cate_cd',
                'item_third_cate_name', 'barndname_full', 'sku_name',
                'item_sku_id', 'uv_value_label', 'first_into_otc_tm',
                'end_date', 'sku_status_cd', 'red_price',
                'red_price_level_rank'
            ]
            # Categorical features
            features_catagory = ['item_first_cate_cd']
            # Embedding features
            features_embedding = ['sku_vec_' + str(i) for i in range(vec_size)]
            # Numerical features
            features_numerical = [
                f for f in data_union_sku2vec.columns if f not in ['label'] +
                features_useless + features_catagory + features_embedding
            ]

            # Handle missing values
            fillna_value = {c: -1 for c in features_numerical}
            fillna_value.update({c: -10 for c in features_embedding})
            data_union_sku2vec_fillna = data_union_sku2vec.fillna(fillna_value)

            # Data preprocessing
            featuresCreator = ft.VectorAssembler(inputCols=features_numerical +
                                                 features_embedding,
                                                 outputCol='features')
            pipeline = Pipeline(stages=[featuresCreator])
            data_transformer = pipeline.fit(data_union_sku2vec_fillna)
            data_transformed = data_transformer.transform(
                data_union_sku2vec_fillna)
            data_transformed.cache()
            data_union_count = data_transformed.count()
            data_filter.unpersist()
            data_neg.unpersist()

            p_num = get_best_partition(data_union_count)
            data_transformed = data_transformed.repartition(p_num)

            # Start training
            best_depth = get_best_depth(data_union_count)
            best_iter = get_best_iter(data_union_count)
            gbdt = GBTRegressor(featuresCol='features',labelCol='label',predictionCol='prediction',lossType='squared',seed=66,maxMemoryInMB=2048,cacheNodeIds=True, \
                                maxDepth=best_depth,maxIter=best_iter,featureSubsetStrategy='0.8',subsamplingRate=0.8,stepSize=0.01)
            gbdt_model = gbdt.fit(data_transformed)

            # Model evaluation
            evaluator = RegressionEvaluator(predictionCol='prediction',
                                            labelCol='label',
                                            metricName='rmse')
            gbdt_pred = gbdt_model.transform(data_transformed)
            train_rmse = evaluator.evaluate(
                gbdt_pred, {evaluator.metricName: 'rmse'})  # training-set RMSE
            # Correlation between label and prediction on the training set
            corr_result = gbdt_pred.corr('label', 'prediction')
            train_corr = corr_result if not np.isnan(corr_result) else 1.0
            # Overlap ratio between the top 50% of skus by label and by prediction on the training set
            data_pred_df = gbdt_pred.select(
                ['item_sku_id', 'label', 'prediction']).toPandas()
            top_n = max(int(data_union_count * 0.5), 1)
            sku_label_top = data_pred_df.sort_values(
                by=['label'],
                ascending=False)['item_sku_id'].values.tolist()[:top_n]
            sku_pred_top = data_pred_df.sort_values(
                by=['prediction'],
                ascending=False)['item_sku_id'].values.tolist()[:top_n]
            top_cover_ratio = len(set(sku_label_top)
                                  & set(sku_pred_top)) / top_n

            ### Predict conversion rates for candidate products

            # Prepare the prediction samples
            data_test_sku2vec = data_test.join(sku_vec,
                                               on='item_sku_id',
                                               how='left')
            fillna_value_test = {c: -1 for c in features_numerical}
            fillna_value_test.update({c: -10 for c in features_embedding})
            data_test_fillna = data_test_sku2vec.fillna(fillna_value_test)
            data_transformer_test = pipeline.fit(data_test_fillna)
            data_transformed_test = data_transformer_test.transform(
                data_test_fillna)
            data_transformed_test.cache()
            data_test.unpersist()

            # Obtain and write prediction results for the candidate product pool
            gbdt_pred_test = gbdt_model.transform(data_transformed_test)
            features_result = [
                'item_sku_id', 'prediction', 'red_price',
                'red_price_level_rank', 'otc_days'
            ]
            result = gbdt_pred_test.select(features_result).withColumn(
                'new_old',
                func.when(func.col('otc_days') < 90, 'new').otherwise('old'))
            result.createOrReplaceTempView("result_df")
            spark.sql("""
                     insert overwrite table """ + output_predict_result_table +
                      """ 
                     partition(dt='""" + predict_date +
                      """',item_third_cate_cd='""" + cd3 + """') 
                     select * from result_df
            """)

            # Write prediction results for the training samples
            features_result_train = ['item_sku_id', 'label', 'prediction']
            train_result = gbdt_pred.select(features_result_train)
            train_result.createOrReplaceTempView("train_result_df")
            spark.sql("""
                     insert overwrite table """ + output_train_result_table +
                      """ 
                     partition(dt='""" + predict_date +
                      """',item_third_cate_cd='""" + cd3 + """') 
                     select * from train_result_df
            """)

            # Write validation metrics for this category's model
            spark.sql("""
                          insert overwrite table """ + output_cd3_score_table +
                      """ 
                          partition(dt='""" + predict_date +
                      """',item_third_cate_cd='""" + cd3 + """') 
                          values ({0},{1},{2},{3},{4},{5})
                      """.format(data_union_count, best_depth, best_iter,
                                 train_rmse, train_corr, top_cover_ratio))

            data_transformed.unpersist()
            data_transformed_test.unpersist()

        except Exception as e:
            print('Error:', e)
            continue
Example #16
    # miss_pool.toPandas().to_csv('/tmp/xieyulong/miss_{}.csv'.format(time.time()),index=False)

    ##static_variance:
    data_rdd = data.rdd.map(lambda row: [x for x in row])
    mllib_st = st.Statistics.colStats(data_rdd)
    for col, m, v in zip(data.columns, mllib_st.mean(), mllib_st.variance()):
        print('{0}: \t{1:.2f} \t{2:.2f}'.format(col, m, np.sqrt(v)))

    ##static_corr:

    ##train_model:
    fea_pool = data.columns
    fea_pool.remove('y')

    ##featureCreator:
    featureCreator = ft.VectorAssembler(inputCols=fea_pool,
                                        outputCol='features')

    ##weightCol:
    data = data.withColumn('weight',
                           fn.when(data['y'] == 1, 1.0).otherwise(0.02))

    train, test = data.randomSplit([0.7, 0.3], seed=1234)  #42
    lr_model = cl.LogisticRegression(
        # maxIter=10,
        # regParam=0.01,
        elasticNetParam=0,
        family='binomial',
        threshold=0.5,
        weightCol='weight',
        labelCol='y')
# COMMAND ----------

test.count()

# COMMAND ----------

# MAGIC %md
# MAGIC 2 Vector Assembler Pipeline stages
# MAGIC 1 - With all features
# MAGIC 2 - With just the intercept

# COMMAND ----------

vecScaled = feature.VectorAssembler(inputCols=[
    'ERTPREAT', 'ERTSEAT', 'EUDIETSODA', 'EUEXERCISE', 'TEAGE', 'EEINCOME1',
    'EUEXFREQ', 'EUFASTFD', 'EUFFYDAY', 'EUFDSIT', 'EUGENHTH', 'EUGROSHP',
    'EUMEAT', 'EUPRPMEL', 'TUACTIVITY_N', 'tuactdur24', 'tewhere', 'TESEX'
],
                                    outputCol='features')

# COMMAND ----------

vecIntercept = feature.VectorAssembler(inputCols=[], outputCol='emptyFeatures')

# COMMAND ----------

# MAGIC %md
# MAGIC Scaling stage to scale features from Vector Assembler

# COMMAND ----------

scaled = feature.StandardScaler(inputCol='features',
Example #18
    carr_indexer = features.StringIndexer(inputCol="carrier",
                                          outputCol="carrier_index")
    # Create a OneHotEncoder
    carr_encoder = features.OneHotEncoder(inputCol="carrier_index",
                                          outputCol="carrier_fact")

    # Create a StringIndexer
    dest_indexer = features.StringIndexer(inputCol="dest",
                                          outputCol="dest_index")
    # Create a OneHotEncoder
    dest_encoder = features.OneHotEncoder(inputCol="dest_index",
                                          outputCol="dest_fact")

    # Make a VectorAssembler
    vec_assembler = features.VectorAssembler(inputCols=[
        "month", "air_time", "carrier_fact", "dest_fact", "plane_age"
    ],
                                             outputCol="features")

    # Make the pipeline
    flights_pipe = Pipeline(stages=[
        dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler
    ])
    # Fit and transform the data
    piped_data = flights_pipe.fit(model_data).transform(model_data)
    # Split the data into training and test sets
    training, test = piped_data.randomSplit([.6, .4])

    # Create a LogisticRegression Estimator
    lr = LogisticRegression()
    # Create a BinaryClassificationEvaluator
    evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")
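
    # Sketch of the next step (an assumption; the original snippet ends here): fit the
    # estimator and score the held-out split, assuming piped_data carries the binary
    # `label` column expected by the defaults of LogisticRegression and the evaluator.
    lr_model = lr.fit(training)
    print("test AUC:", evaluator.evaluate(lr_model.transform(test)))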
Example #19
# COMMAND ----------

trainingdf = store_num_data_ind_enc.filter(store_num_data_ind_enc.Date_Date <'2014-04-28')

# COMMAND ----------

validationdf = store_num_data_ind_enc.filter(store_num_data_ind_enc.Date_Date >= "2014-04-28").filter(store_num_data_ind_enc.Date_Date <= "2014-10-31")

# COMMAND ----------

testdf = store_num_data_ind_enc.filter(store_num_data_ind_enc.Date_Date >= "2014-10-31")

# COMMAND ----------

va = feature.VectorAssembler(inputCols=['typeNumVec','ClusterVec', 'StoreNumVec',"ItemFamilyNumVec","MonthVec","DayVec","DOWNum"], outputCol='features')
lr = regression.LinearRegression(featuresCol='features', labelCol='sum(Units)', regParam=0.5, elasticNetParam=0.3, fitIntercept= True)
pipe = Pipeline(stages=[va, lr])
model = pipe.fit(trainingdf)

# COMMAND ----------

# MAGIC %md *** Calculating RMSE***

# COMMAND ----------

rmse = (fn.avg((fn.col('sum(Units)') - fn.col('prediction'))**2))**.5

# COMMAND ----------

rmse1 = (fn.avg((fn.col('sum(sum(Units))') - fn.col('sum(prediction)'))**2))**.5
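
# COMMAND ----------

# Usage sketch (an assumption, not in the original notebook): the `rmse` expression
# above is a global aggregate, so it can be evaluated directly over the validation
# predictions.
model.transform(validationdf).select(rmse.alias('validation_rmse')).show()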
Example #20
 ]
 schema = typ.StructType([
         typ.StructField(e[0], e[1], False) for e in labels        
 ])
 
 data = spark.read.csv("file:///home/hadoop/zhcao/workspace/spark_test/watermelon.csv",
                       header = True,
                       schema = schema)
 data.createOrReplaceTempView("data_temp")
 data.printSchema()
 data.cache()
 data.show()
 
 #
 featuresCreator = ft.VectorAssembler(
         inputCols = ['VIB1', 'VIB2'],
         outputCol = 'features'
 )
 
 pca = ft.PCA(k = 1,
              inputCol = 'features',
              outputCol = 'pca_features'
             )
 
 pipeline = Pipeline(stages = [
         featuresCreator,
         pca
 ])
 
 model = pipeline.fit(data)
 
 result = model.transform(data)
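
 # Sketch (an assumption, not part of the original): inspect the projected features and
 # the proportion of variance captured by the single principal component.
 result.select('features', 'pca_features').show(truncate=False)
 print(model.stages[-1].explainedVariance)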
Example #21
def skl_predict(spark): 

    print (1111)
    
    
    data = [(list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 1])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0])),
        (list([-0.7016797, 1.22524766, -0.7123829, -0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101,-0.6565101, 0]))
        ]
    labels = ['_1', '_2', '_3', '_4','_5','_6','_7','_8','_9','_10','_11','_12','_13','_14','_15','_16','_17','_18','_19','_20','_21','_22','_23','_24','_25','_26','_27','_28','_29','_30', 'INFANT_ALIVE_AT_REPORT']
    df = spark.createDataFrame(data, schema = labels)

    # df = df.withColumn( "age", df['age']+1 ) 
    df.show()
    # df.select("age").distinct().show() 
    # df.count()

    # Concatenate several columns into one
    from pyspark.sql.functions import split, explode, concat, concat_ws
    df_concat = df.withColumn("_concat", concat(df['_1'], df['_2'], df['_3'], df['_4']))
    print ('df_concat>>>>>>>>>>>>>>>>>>>')
    df_concat.show()


    # Assemble all of the feature columns into a single vector (excluding the label column)
    featuresCreator = ft.VectorAssembler(inputCols=[col for col in labels if col != 'INFANT_ALIVE_AT_REPORT'],
                                         outputCol='features')


    # Create the estimator
    import pyspark.ml.classification as cl
    logistic = cl.LogisticRegression(
        maxIter=10, 
        regParam=0.01, 
        labelCol='INFANT_ALIVE_AT_REPORT')
    print ('logistic:', logistic)


    # Create a pipeline
    from pyspark.ml import Pipeline
    
    pipeline = Pipeline(stages=[
            featuresCreator, 
            logistic
        ])

    # fit 
    births_train, births_test = df.randomSplit([0.7, 0.3], seed=666)

    print ('births_train', births_train)
    print ( 'births_test', births_test )

    # Run the pipeline and evaluate the model.
    model = pipeline.fit(births_train)
    test_model = model.transform(births_test)

    print ('test_model:', test_model) 

    
    test_model.take(1)

    print ('test_model.take(1):', test_model.take(1))






    '''
Example #22
display(reviewAnalysisDF.select('review_stars').distinct())

# In[44]:

get_ipython().run_line_magic('fs', 'ls tmp/reviewAnalysisDf')

# ### 3: Vector Assembly
# Once we are through with the encoder creation step, it is time to assemble the encoders and all the input and output columns into a final vector generator that will be passed as input to the machine learning pipeline.

# In[46]:

import pyspark.ml.feature as ft

featuresCreator = ft.VectorAssembler(inputCols=[
    'cool', 'funny', 'useful', 'is_open', 'business_review_count',
    'business_stars', 'average_stars', 'fans', 'user_review_count',
    'sentiment_score'
],
                                     outputCol='features')

# ### 4: Estimator Creation
# This is the step where we select the machine learning model that we wish to utilize. We create an Estimator object that contains the machine learning model along with the hyperparameters that need to be passed to it. In this case, we are using LogisticRegression.

# In[48]:

import pyspark.ml.classification as cl

logistic_regression_model = cl.LogisticRegression(maxIter=10,
                                                  regParam=0.01,
                                                  labelCol='review_stars',
                                                  family='multinomial')
print(type(logistic_regression_model))
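
# Sketch of the next step (an assumption, not part of the original snippet): the feature
# assembler and the estimator would typically be combined into a Pipeline and fit on the
# review data.
from pyspark.ml import Pipeline

review_pipeline = Pipeline(stages=[featuresCreator, logistic_regression_model])
# review_model = review_pipeline.fit(reviewAnalysisDF)  # hypothetical fit call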
import os
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.ml.feature as mlf

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

sys.path.append(os.path.abspath('../'))
import importlib

spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.read.parquet('testing/df')
non_pred_cols = ['label', 'response', 'features']
pred_cols = [x for x in df.columns if x not in non_pred_cols]
assembler = mlf.VectorAssembler(inputCols=pred_cols, outputCol='features')
df = assembler.transform(df)
df.cache()


sys.path.append(os.path.abspath('../'))
import importlib
import propensity_matching



importlib.reload(propensity_matching)
from propensity_matching.estimator import PropensityEstimator
estimator = PropensityEstimator()
model, df2 = estimator.fit(df)
df3, match_info = model.transform(df2)
                                       how='inner')
# So far, data_matrix contains Row(date, grid_square, topic_distributions, complaint_count).

# Get weekday from date.

get_weekday_udf = functions.udf(lambda d: d.weekday(),
                                returnType=types.IntegerType())
data_matrix = data_matrix.withColumn('weekday',
                                     get_weekday_udf(data_matrix['date']))

# Assemble the feature vectors.

weekday_one_hot_encoder = feature.OneHotEncoder(inputCol='weekday',
                                                outputCol='weekday_vector')
feature_vector_assembler = feature.VectorAssembler(
    inputCols=['weekday_vector', 'topic_distribution'],
    outputCol='final_feature_vector')
feature_assembly_pipeline = (ml.Pipeline(
    stages=[weekday_one_hot_encoder, feature_vector_assembler]).fit(
        data_matrix))

data_matrix = (feature_assembly_pipeline.transform(data_matrix).select(
    'date', 'grid_square', 'final_feature_vector', 'complaint_count'))

LOGGER.debug(
    str(data_matrix.count()) + " rows like " + str(data_matrix.take(1)))

#logistic_regression = classification.LogisticRegression(
#    maxIter=10, regParam=0.3, elasticNetParam=0.8,
#    featuresCol='final_feature_vector', labelCol='complaint_count',
#    probabilityCol='predicted_probability')
Example #25
          ('CIG_2_TRI', typ.IntegerType()), ('CIG_3_TRI', typ.IntegerType()),
          ('MOTHER_HEIGHT_IN', typ.IntegerType()),
          ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
          ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
          ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
          ('DIABETES_PRE', typ.IntegerType()),
          ('DIABETES_GEST', typ.IntegerType()),
          ('HYP_TENS_PRE', typ.IntegerType()),
          ('HYP_TENS_GEST', typ.IntegerType()),
          ('PREV_BIRTH_PRETERM', typ.IntegerType())]

births_transformed = "file:///home/yuty/yangzz/births_transformed.csv"
schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
births = spark.read.csv(births_transformed, header=True, schema=schema)
featuresCreator = ft.VectorAssembler(
    inputCols=[col[0] for col in labels[1:]],
    outputCol='features').transform(births).select('features').collect()

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import BisectingKMeans

data = [(Vectors.dense([10, 10]), ), (Vectors.dense([3.0, 5.0]), ),
        (Vectors.dense([0.0, 0.0]), ), (Vectors.dense([1.0, 1.0]), ),
        (Vectors.dense([9.0, 8.0]), ), (Vectors.dense([8.0, 9.0]), )]
df = spark.createDataFrame(data, ["features"])
bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
model = bkm.fit(df)
centers = model.clusterCenters()
len(centers)
model.computeCost(df)
model.hasSummary
    def _prep_data(self, df: DataFrame):
        r"""
        remove highly collinear features, bin the features, and
        reduce the dimensionality if necessary and in that order

        Parameters
        ----------
        df : pyspark.sql.DataFrame
        self.fit_data_prep_args : dict
            arguments around preparing the data to be fit
            default args are
            default_fit_data_prep_args = {
                'class_balance': 1,
                'train_prop': .8,
                'bin_features':True,
                'remove_redundant_features':True,
            }

            'class balance' is ratio of control_candidates : treatment
            to train the model on

           train_prop is the proportion of the population (post-rebalance)
           that is in the training set

           'bin_features' can be bool, dict, or absent.
            if you do not want to bin them here, they MUST be binned
            prior. Unbinned features will undermine validity of outcome.
            if bin_features is absent or True, bin_features will be run
            with default args. If it is a dict, it will be passed as
            kwargs to bin_features. see utils.bin_features for arg details

            'remove_redundant_features' can be bool, dict or absent
            True or absent will run remove redundant features with default
            args. A dict will be passed as kwargs instead.
            see utils.remove_redundant_features for arg details


        Returns
        -------
        df : pyspark.sql.DataFrame
            prepared dataframe


        Raises
        ------
        UncaughtExceptions

        See Also
        --------
        remove_redundant_features
        bin_features
        reduce_dimensionality
        """

        features_col = self.probability_estimator.getOrDefault('featuresCol')
        label_col = self.probability_estimator.getOrDefault('labelCol')

        if ('remove_redundant_features' not in self.fit_data_prep_args) | (
                self.fit_data_prep_args['remove_redundant_features'] is True):
            logging.getLogger(__name__).info(
                "removing redundant features with default args")
            df, pred_cols = remove_redundant_features(
                df=df, features_col=features_col)
        elif isinstance(self.fit_data_prep_args['remove_redundant_features'],
                        dict):
            logging.getLogger(__name__).info(
                "removing redundant features with specified args")
            df, pred_cols = remove_redundant_features(
                df=df, **self.fit_data_prep_args['remove_redundant_features'])
        elif self.fit_data_prep_args['remove_redundant_features'] is False:
            logging.getLogger(__name__).info("not removing redundant features")
        else:
            logging.getLogger(__name__).critical(
                "illegal arg for remove_redundant_features")
            raise ValueError(
                'illegal argument for "remove_redundant_features" in fit_data_prep_args'
            )

        if ('bin_features' not in self.fit_data_prep_args) | (
                self.fit_data_prep_args['bin_features'] is True):
            logging.getLogger(__name__).info(
                "binning features with default args")
            df, pred_cols = bin_features(df=df, features_col=features_col)
        elif isinstance(self.fit_data_prep_args['bin_features'], dict):
            logging.getLogger(__name__).info(
                "binning features with specified args")
            df, pred_cols = bin_features(
                df=df, **self.fit_data_prep_args['bin_features'])
        elif self.fit_data_prep_args['bin_features'] is False:
            logging.getLogger(__name__).info("not binning features")
        else:
            logging.getLogger(__name__).critical(
                "illegal arg for bin_features")
            raise ValueError(
                'illegal argument for "bin_features" in fit_data_prep_args')

        # leakage note: evaluation of informativeness of predictors includes test set
        # not ideal but minimal impact and is expedient for architecture right now.

        # num cols is limited by size of training set. To get it we must first rebalance, and multiply by train prop.
        # reduce dim on whole pop df, then apply the same transform to the rebalanced df
        self.df = df
        self._rebalance_df()
        ncols = int(
            (self.rebalanced_df.where(F.col(label_col) == 1).count() *
             self.fit_data_prep_args['train_prop']) // SAMPLES_PER_FEATURE)
        red_dim_args = {
            'df': self.df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': ncols
        }
        logging.getLogger(__name__).info("reducing dimensionality of df")
        self.df, pred_cols = reduce_dimensionality(args=red_dim_args)

        assembler = mlf.VectorAssembler(inputCols=pred_cols,
                                        outputCol=features_col)
        self.rebalanced_df = assembler.transform(
            self.rebalanced_df.drop(features_col))

        return True
Example #27
births = births.withColumn('BIRTH_PLACE_INT',
                           births['BIRTH_PLACE'].cast(typ.IntegerType()))

# In[4]:

encoder = ft.OneHotEncoder(inputCol='BIRTH_PLACE_INT',
                           outputCol='BIRTH_PLACE_VEC')

# In[5]:


featuresCreator = ft.VectorAssembler(
    inputCols=[
        col[0]
        for col
        in labels[2:]] + \
    [encoder.getOutputCol()],
    outputCol='features'
)

# In[6]:

import pyspark.ml.classification as cl

# In[7]:

logistic = cl.LogisticRegression(maxIter=10,
                                 regParam=0.01,
                                 labelCol='INFANT_ALIVE_AT_REPORT')

# In[8]:
Example #28
            inputCols=['land_condition_index', 'foundation_type_index', 'roof_type_index', 'ground_floor_type_index', 'position_index'], \
            outputCols=['land_condition_vec', 'foundation_type_vec', 'roof_type_vec', 'ground_floor_type_vec', 'position_vec'])
data_sample = encoder.fit(data_sample).transform(data_sample)

# Use Binarizer to turn the 4-class target y into a binary variable:
# classes 0, 1 and 2 are merged into class 0 (no rebuilding needed), and class 3 becomes class 1 (rebuilding needed).
data_sample = data_sample.withColumn('y_double',
                                     data_sample['y'].cast(typ.DoubleType()))
binarizer = ft.Binarizer(threshold=2, inputCol="y_double", outputCol="label")
data_sample = binarizer.transform(data_sample)
data_sample.take(1)

# Use VectorAssembler to build the feature vector
featuresCreator = ft.VectorAssembler(
    inputCols=['floors_before', 'floors_after', 'age', 'area', 'height_before', 'height_after', \
               'land_condition_vec', 'foundation_type_vec', 'roof_type_vec', 'ground_floor_type_vec', 'position_vec'],
    outputCol='features'
)

# Use VectorIndexer to automatically identify categorical features, with at most 5 categories
indexer = ft.VectorIndexer(inputCol="features",
                           outputCol="indexed",
                           maxCategories=5)

# Split into training and test sets
data_train, data_test = data_sample.randomSplit([0.8, 0.2], seed=42)

############################### Descriptive statistics ###############################

data_sample.printSchema()  # inspect the data schema
Example #29
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
airlineCleanDF = sqlCtx.createDataFrame(airlineCleanDFP)

# In[10]:

training, validation, testing = airlineCleanDF.randomSplit([0.6, 0.3, 0.1],
                                                           seed=0)

# In[9]:

# 2. Linear regression with avg_overall
vaAvgOverall = feature.VectorAssembler(inputCols=[
    'overall_ratingf', 'seat_comfort_ratingf', 'cabin_staff_ratingf',
    'food_beverages_ratingf', 'inflight_entertainment_ratingf',
    'ground_service_ratingf', 'wifi_connectivity_ratingf',
    'value_money_ratingf'
],
                                       outputCol='features')
lrAvgOverall = regression.LinearRegression(featuresCol='features',
                                           labelCol='recommendedi')
pipelineAvgOverall = Pipeline(stages=[vaAvgOverall, lrAvgOverall])
pipeline_modelAvgOverall = pipelineAvgOverall.fit(training)
pipeline_modelAvgOverall.transform(validation).select(
    fn.avg((fn.col('prediction') -
            fn.col('recommendedi'))**2).alias('MSE_Avg_Overall')).show()

# In[12]:

# 2. Linear regression with avg_overall
vaAvgOverall = feature.VectorAssembler(inputCols=[
Example #30
regressor_OLS.summary()


X_Opt = X[:,[1,4,5,6]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
regressor_OLS.summary()

X_Opt = X[:,[1,4,5]]
regressor_OLS = sm.OLS(endog=y, exog=X_Opt).fit()
regressor_OLS.summary()


#choice feature cols
feature_cols = [df_train.columns[1], df_train.columns[2], df_train.columns[3], df_train.columns[4]]
#feature_cols = df.columns[1:]
assembler = feature.VectorAssembler(inputCols=feature_cols, outputCol='features')
df_train = assembler.setHandleInvalid("skip").transform(df_train)
df_train = df_train.withColumnRenamed('Survived', 'label')
df_train = df_train.select('features', 'label')

# scaling
scaler = feature.StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
scaler = scaler.fit(df_train)
df_train = scaler.transform(df_train)
df_train = df_train.drop('features').withColumnRenamed('scaledFeatures','features')



# TEST
# reading data_train