Example #1
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sql_sc = SQLContext(sc)

data = [(2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,0,1,2)]
#print('param', str(sys.argv[1]))
#csv_df = sql_sc.read.format("csv").option("header","true").load("hdfs:///project_data/pets/train/train.csv")
#image_path = str(sys.argv[1])
#image_DF = dl.readImages(image_path)
#image_DF.printSchema()
#image_DF.show()

#df=pd.DataFrame(data=datainput)
# NOTE: the original header listed 18 names for a 20-value row; 'Age' and 'AdoptionSpeed'
# (the two remaining numeric PetFinder columns) are assumed here so the schema matches the data.
header1 = ['Type','Age','Breed1','Breed2','Gender','Color1','Color2','Color3','MaturitySize','FurLength','Vaccinated','Dewormed','Sterilized','Health','Quantity','Fee','State','VideoAmt','PhotoAmt','AdoptionSpeed']
#feature = VectorAssembler(inputCols=input_cols,outputCol="features")
#feature_vector= feature.transform(df)
df1 = spark.createDataFrame(data,header1)
df1.show(n=2)
df1.first()
df1.count()
df1.printSchema()
df1.show()
df3 = VectorAssembler(inputCols=['Type','Breed1','Breed2','Gender','Color1','Color2','Color3','MaturitySize','FurLength','Vaccinated','Dewormed','Sterilized','Health','Quantity','Fee','State','VideoAmt','PhotoAmt'],outputCol='Features').transform(df1)
df3.show()
##lr_test = DecisionTreeClassificationModel.load("treemodelofcsv")

#p_lr_test = PipelineModel()
#tested_lr_test = p_lr_test.transform(image_DF)
#predict_value = tested_lr_test.select('prediction').head()[0]
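# A minimal sketch of the scoring step hinted at by the commented-out lines above. It
# assumes a DecisionTreeClassificationModel was previously saved to "treemodelofcsv" and
# that it was trained with the default featuresCol="features"; both are assumptions rather
# than facts taken from this snippet.
from pyspark.ml.classification import DecisionTreeClassificationModel

tree_model = DecisionTreeClassificationModel.load("treemodelofcsv")
scored = tree_model.transform(df3.withColumnRenamed('Features', 'features'))
predict_value = scored.select('prediction').head()[0]
print('prediction:', predict_value)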


    def annotate_pval_dataset(self, cur_df):
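        """Build the labeled training/testing splits for the p-value-based annotation.

        Assembles the per-observation feature columns of ``cur_df`` into a vector,
        joins the demographic features, splits the instances into training and
        testing sets, labels each (ID, TIME_OBS) instance using the top-k
        interventions ranked by p-value for the target discharge diagnosis, and
        caches both splits as parquet (the try/except below short-circuits to the
        cached copies when they already exist).
        """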
        import pyspark
        try:
            tr_inst = self.spark.read.parquet(self.training_temp_dir)
            te_inst = self.spark.read.parquet(self.testing_temp_dir)
            return tr_inst, te_inst
        except pyspark.sql.utils.AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")
            self.logger.debug("NOTEXISTS ANNOTATE_FILE")
            self.logger.debug("RUN_PROCESS")
        except Exception:
            self.logger.info("TEST_PURPOSE")

        from pyspark.ml.feature import VectorAssembler
        postfix = self.postfix.format(self.sel_top)
        obs_df = cur_df

        cur_cols = obs_df.columns
        for i in self.non_feature_column:
            cur_cols.remove(i)
            self.logger.debug("feature_columns")
        cur_cols = sorted(cur_cols)
        self.logger.debug(cur_cols)
        import json

        with open(self.json_feature_dump_loc, "w") as fp:
            json.dump({"non_demo_features": cur_cols}, fp)

        obs_df = VectorAssembler(
            inputCols=cur_cols, outputCol="features_imputed").transform(obs_df)

        cur_time_list = obs_df.select("ID", "TIME_SPAN")
        of_annotated = obs_df
        of_excl_training = dict()

        demo_feature = self.add_demo()

        of_annotated = VectorAssembler(
            inputCols=["features_imputed", "demo_feature"],
            outputCol="features").transform(
                of_annotated.join(demo_feature, "ID"))

        of_annotated.show()

        from pyspark.sql.functions import col, lit, when
        self.logger.debug("ANNOTATED")

        cur_test_ids = self.get_target_test_id()
        self.logger.debug(cur_test_ids)
        # TODO CHECK why I put 'why 0' comment over here?
        self.logger.debug(len(cur_test_ids))
        tr_inst, te_inst = self.cur_annotator.prep_TR_TE(
            of_annotated, test_id_list=cur_test_ids)

        self.logger.debug("IDS")
        self.logger.debug("TR_DISTINCT_IDS:{0}, TE_DISTINCT_IDS:{1}".format(
            tr_inst.select("ID").distinct().count(),
            te_inst.select("ID").distinct().count()))

        self.logger.debug("TR_TE_CNT:{0}_{1}".format(tr_inst.count(),
                                                     te_inst.count()))

        train_data_ID = tr_inst.select("ID").distinct().rdd.flatMap(
            list).collect()

        testing_data_ID = te_inst.select("ID").distinct().rdd.flatMap(
            list).collect()

        self.action_df.show()

        train_action_df = self.action_df.where(
            col("ID").isin(train_data_ID)).persist()

        self.logger.debug(train_action_df.select("ID").distinct().count())

        train_terminal_outcome = self.terminal_outcome.where(
            col("ID").isin(train_data_ID)).persist()

        self.logger.debug(
            train_terminal_outcome.select("ID").distinct().count())

        intv_w_p_val = self.identify_relevant_action(
            train_action_df, train_terminal_outcome,
            tr_inst.select("ID").distinct().count())
        intv_w_p_val.join(
            self.def_df.where(col("SOURCE").isin(["CPT", "MED", "PROC"])),
            self.itemid).orderBy("p_val").show(100, truncate=False)

        from pyspark.sql.functions import sum, rand, max, lit
        from pyspark.ml.feature import VectorAssembler
        cur_annot_topk = self.sel_top

        self.action_df.show()
        self.terminal_outcome.show()

        annot_df = self.action_df.join(self.terminal_outcome, "ID").persist()
        annot_df.show()
        pos_inst_dict = dict()
        from pyspark.sql.functions import count
        for cur_of in [self.target_disch_col]:
            # For debug purpose, pass if target_of is not identified
            self.logger.debug(cur_of)
            intv_w_p_val.where("DISCH_DX == '{0}'".format(cur_of)).orderBy(
                col("p_val").cast("double")).show(50, truncate=False)
            target_annot_criteria = intv_w_p_val.where(
                "DISCH_DX == '{0}'".format(cur_of)).orderBy(
                    col("p_val").cast("double")).limit(cur_annot_topk)
            target_annot_criteria.write.save(self.annot_intv_dir.format(
                cur_of, cur_annot_topk),
                                             mode="overwrite")
            target_annot_criteria = target_annot_criteria.select(
                self.itemid).rdd.flatMap(list).collect()
            if len(target_annot_criteria) == 0:
                self.logger.info(
                    "NO TERMINAL DX {0} identified from pts".format(cur_of))
                pos_inst_dict[cur_of] = None
                continue
            self.logger.debug(target_annot_criteria)
            self.logger.debug(len(target_annot_criteria))
            self.logger.debug("selected intv!!")
            self.def_df.where(col(
                self.itemid).isin(target_annot_criteria)).show(cur_annot_topk,
                                                               truncate=False)
            pos_inst_dict[cur_of] = annot_df.where((col(self.itemid).isin(target_annot_criteria)) & (col("DISCH_DX") == cur_of))\
                .select("ID", col("TIME_OBS").cast("date").alias("TIME_OBS"), lit("1").cast("double").alias("{0}_label".format(cur_of)))\
                .distinct().persist()
            pos_inst_dict[cur_of].groupBy("{0}_label".format(cur_of)).agg(
                count("*")).show()
            from pyspark.sql.functions import broadcast

            true_inst = annot_df.where(
                (col(self.itemid).isin(target_annot_criteria))
                & (col("DISCH_DX") == cur_of))
            excl_id = annot_df.withColumn("IS_TARGET_OF",when(col("DISCH_DX") ==cur_of,lit("1").cast("double")).otherwise(lit("0").cast("double")))\
                .withColumn("IS_REL_INTV", when(col(self.itemid).isin(target_annot_criteria), lit("1").cast("double")).otherwise(lit("0").cast("double")))\
                .groupBy("ID").agg(sum("IS_TARGET_OF").alias("SUM_IS_TARGET_OF"),sum("IS_REL_INTV").alias("SUM_IS_REL_INTV"))\
                .where("(SUM_IS_TARGET_OF <> 0) AND (SUM_IS_REL_INTV == 0)").select("ID").distinct().rdd.flatMap(list).collect()
            self.logger.debug("NUM_PTS_EXCLUDED:{0}".format(len(excl_id)))
            self.logger.debug("TRAINING_INST_COUNT:{0}".format(
                tr_inst.count()))
            tr_inst = tr_inst.withColumn("TIME_OBS",col("TIME_SPAN.TIME_TO").cast("date"))\
                .withColumn("{0}_excl".format(cur_of), col("ID").isin(excl_id).cast("double")).repartition("ID","TIME_OBS")\
                .join(broadcast(pos_inst_dict[cur_of]),["ID","TIME_OBS"],"left_outer").fillna(value=0.0,subset=["{0}_label".format(cur_of)]).persist()
            print(tr_inst.count())
            tr_inst.groupBy("{0}_label".format(cur_of),
                            "{0}_excl".format(cur_of)).agg(count("*")).show()
            te_inst = te_inst.withColumn("TIME_OBS",col("TIME_SPAN.TIME_TO").cast("date"))\
                .withColumn("{0}_excl".format(cur_of), col("ID").isin(excl_id).cast("double")).repartition("ID","TIME_OBS")\
                .join(broadcast(pos_inst_dict[cur_of]),["ID","TIME_OBS"],"left_outer").fillna(value=0.0, subset=["{0}_label".format(cur_of)]).persist()
            print(te_inst.count())
            te_inst.groupBy("{0}_label".format(cur_of),
                            "{0}_excl".format(cur_of)).agg(count("*")).show()

            tr_inst.groupBy("ID").agg(
                max("{0}_label".format(cur_of)).alias(
                    "{0}_label".format(cur_of)),
                max("{0}_excl".format(cur_of)).alias(
                    "{0}_excl".format(cur_of))).groupBy(
                        "{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()
            te_inst.groupBy("ID").agg(
                max("{0}_label".format(cur_of)).alias(
                    "{0}_label".format(cur_of)),
                max("{0}_excl".format(cur_of)).alias(
                    "{0}_excl".format(cur_of))).groupBy(
                        "{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()

        tr_inst.write.save(self.training_temp_dir, mode="overwrite")
        te_inst.write.save(self.testing_temp_dir, mode="overwrite")

        tr_inst = self.spark.read.parquet(self.training_temp_dir)
        te_inst = self.spark.read.parquet(self.testing_temp_dir)
        #te_inst.show()

        return (tr_inst, te_inst)
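# A minimal downstream sketch, not part of the original class: the returned splits carry a
# "features" vector plus "<target>_label" / "<target>_excl" columns, so a binary classifier
# can be fit directly on the training split. `annotator` and `obs_df` are hypothetical
# stand-ins for an instance of the class above and its observation DataFrame.
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col

tr_inst, te_inst = annotator.annotate_pval_dataset(obs_df)  # hypothetical instance and input
target = annotator.target_disch_col
lr = LogisticRegression(featuresCol="features", labelCol="{0}_label".format(target))
lr_model = lr.fit(tr_inst.where(col("{0}_excl".format(target)) == 0.0))
lr_model.transform(te_inst).select("ID", "prediction", "probability").show()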
Example #3
# NOTE: the original snippet starts mid-statement; the StringIndexer call and the `df2`
# input DataFrame below are reconstructed assumptions based on the lines that follow
# (df2 is assumed to already hold the id/bone_length/rotting_flesh/hair_length/has_soul/
# color/type columns referenced later).
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

df3 = StringIndexer(inputCol='color',
                    outputCol='color1').fit(df2).transform(df2)
df3.show()
df3.printSchema()
df3 = OneHotEncoder(inputCol='color1', outputCol='color2',
                    dropLast=False).transform(df3)
df3.printSchema()
df4 = StringIndexer(inputCol='type', outputCol='type1').fit(df2).transform(df3)
df4.show()
df4.printSchema()

# Vector assembler
df5 = VectorAssembler(inputCols=[
    'id', 'bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color2'
],
                      outputCol='Features').transform(df4)
df5.show(truncate=False)
df5.printSchema()
# --------------------------------------------------------------------------

# data processing complete---
# 6 .Model building
training = df5
#training.show(truncate=False,n=5)
from pyspark.ml.classification import RandomForestClassifier
rf_clf = RandomForestClassifier(featuresCol='Features',
                                labelCol='type1',
                                numTrees=86,
                                maxDepth=10)
model22 = rf_clf.fit(training)
model22.getNumTrees
#model22.numFeatures
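# A minimal follow-up sketch (not part of the original example): score the assembled frame
# and report accuracy with pyspark.ml's evaluator. Evaluating on the training data, as done
# here for brevity, overstates real performance; a held-out split would be the usual choice.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model22.transform(training)
evaluator = MulticlassClassificationEvaluator(labelCol='type1',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print('training accuracy:', evaluator.evaluate(predictions))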
Example #4
from pyspark import SparkContext, SparkConf
from os import environ
from environment import spark

from pyspark.ml.feature import VectorAssembler

# environ['JAVA_HOME'] = 'D:\Program Files\Java\jdk1.8.0_181'
# environ['HADOOP_HOME'] = 'D:\hadoop-3.1.2'
# environ['SPARK_HOME'] = 'D:\spark-2.4.3-bin-hadoop2.7\spark-2.4.3-bin-hadoop2.7'
#
# conf = SparkConf() \
#     .setAppName("demo") \
#     .setMaster("spark://192.168.30.247:7077") \
#     .set("spark.driver.host", "192.168.30.109") \
#     .set("spark.cores.max", "4") \
#     .set("spark.executor.memory", "512m")
# sc = SparkContext(conf=conf)
#
# data = sc.parallelize(['a', 'a', 'd', 'd', 'b', 'c', 'd', 'e', 'f', 'g'])
# pairs = data.map(lambda s: (s, 1))
# counts = pairs.reduceByKey(lambda a, b: a + b)
# print(counts.sortByKey().collect())

df = spark.createDataFrame([(1, 11, 19)], ["id", "name", "age"])
ddf = VectorAssembler(inputCols=["name", "age"],
                      outputCol='features').transform(df)
ddf.show()
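# For the single row above, the assembled column should be a dense vector of the two numeric
# inputs, i.e. [11.0, 19.0]; this check just makes that explicit.
print(ddf.select('features').first()[0])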
    def gen_lr_sort_model(self):
        self.spark_session.sql("use portal")
        # User-article click behavior
        sql = "select user_id, article_id, channel_id, click_flag from t_user_behaviour"
        user_article_click_df = self.spark_session.sql(sql)
        user_article_click_df.show()
 
        # Fetch user profiles: basic info and preference data
        sql = "select split(user_id, ':')[1] user_id, basic_info.gender, basic_info.age, preference_info " \
              "from t_user_profile"
        user_profile_df = self.spark_session.sql(sql)
        user_profile_df.show()
 
        user_article_click_df = user_article_click_df.join(user_profile_df, on=["user_id"], how="left")
 
        # Extract a keyword-weight vector feature for the article's channel
        def extract_channel_keyword_feature(partition):
            from pyspark.ml.linalg import Vectors
            for row in partition:
                try:
                    weights = sorted([row.preference_info[key] for key in row.preference_info.keys()
                                     if key.split(':')[0] == row.channel_id], reverse=True)[:10]
                except Exception as e:
                    print(e)
                    weights = [0.0] * 10
                yield row.article_id, row.channel_id, row.user_id, int(row.gender), int(row.age), \
                    Vectors.dense(weights if weights else [0.0] * 10), row.click_flag
 
        user_article_click_df = user_article_click_df.rdd.mapPartitions(extract_channel_keyword_feature) \
            .toDF(["article_id", "channel_id", "user_id", "gender", "age", "channel_weights", "click_flag"])
        user_article_click_df.show()
 
        # Fetch article profiles
        article_profile_df = self.spark_session.sql("select * from t_article_profile")
 
        # Extract the article keyword-weight vector feature
        def extract_feature(partition):
            from pyspark.ml.linalg import Vectors
            for row in partition:
                try:
                    weights = sorted(row.keywords.values(), reverse=True)[:10]
                except Exception as e:
                    print(e)
                    weights = [0.0] * 10
                yield row.article_id, Vectors.dense(weights if weights else [0.0] * 10)
        article_profile_df = article_profile_df.rdd.mapPartitions(extract_feature).toDF(["article_id", "article_weights"])
        article_profile_df.show()
 
        user_article_click_df = user_article_click_df.join(article_profile_df, on=["article_id"], how="inner")
        user_article_click_df.show()
 
        # Fetch article embedding vectors
        article_vector_df = self.spark_session.sql("select article_id, vector from t_article_vector")
 
        def array_to_vector(partition):
            from pyspark.ml.linalg import Vectors
            for row in partition:
                yield row.article_id, Vectors.dense(row.vector)
        article_vector_df = article_vector_df.rdd.mapPartitions(array_to_vector).toDF(["article_id", "article_vector"])
        article_vector_df.show()
 
        user_article_click_df = user_article_click_df.join(article_vector_df, on=["article_id"], how="inner")
        user_article_click_df.show()
 
        # Assemble all features into a single vector
        from pyspark.ml.feature import VectorAssembler
        input_cols = ["channel_id", "gender", "age", "channel_weights", "article_weights", "article_vector"]
        user_article_click_df = VectorAssembler().setInputCols(input_cols) \
                                                 .setOutputCol("features") \
                                                 .transform(user_article_click_df)
        user_article_click_df.show()
 
        # Logistic Regression
        from pyspark.ml.classification import LogisticRegression
        logistic_regression = LogisticRegression()
        logistic_regression_model = logistic_regression.setFeaturesCol("features") \
                                                       .setLabelCol("click_flag")\
                                                       .fit(user_article_click_df)
        logistic_regression_model.write().overwrite().save(
            "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
 
        from pyspark.ml.classification import LogisticRegressionModel
        logistic_regression_model = LogisticRegressionModel.load(
            "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
        logistic_regression_result = logistic_regression_model.transform(user_article_click_df)
        logistic_regression_result.select(["click_flag", "probability", "prediction"]).show()
 
        # ROC
        def vector_to_double(row):
            # BinaryClassificationMetrics expects (score, label) pairs, so the positive-class
            # probability comes first.
            return float(row.probability[1]), float(row.click_flag)
        score_labels = logistic_regression_result.select(["click_flag", "probability"]).rdd.map(vector_to_double)
        score_labels.collect()
        from pyspark.mllib.evaluation import BinaryClassificationMetrics
        binary_classification_metrics = BinaryClassificationMetrics(scoreAndLabels=score_labels)
        area_under_roc = binary_classification_metrics.areaUnderROC
        print(area_under_roc)
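        # An alternative sketch, not in the original code: the DataFrame-based evaluator from
        # pyspark.ml computes the same AUC without dropping to the RDD API; column names
        # match those used above.
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability",
                                                  labelCol="click_flag",
                                                  metricName="areaUnderROC")
        print(evaluator.evaluate(logistic_regression_result))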