from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler

sql_sc = SQLContext(sc)

# Single hand-crafted sample row (PetFinder-style features).
# NOTE: createDataFrame requires exactly one value per column in header1.
data = [(2, 3, 299, 0, 1, 1, 7, 0, 1, 1, 2, 2, 2, 1, 1, 100, 41326, 0, 1, 2)]
#print('param', str(sys.argv[1]))
#csv_df = sql_sc.read.format("csv").option("header","true").load("hdfs:///project_data/pets/train/train.csv")
#image_path = str(sys.argv[1])
#image_DF = dl.readImages(image_path)
#image_DF.printSchema()
#image_DF.show()
#df = pd.DataFrame(data=datainput)

header1 = ['Type', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3',
           'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized',
           'Health', 'Quantity', 'Fee', 'State', 'VideoAmt', 'PhotoAmt']
#feature = VectorAssembler(inputCols=input_cols, outputCol="features")
#feature_vector = feature.transform(df)

df1 = spark.createDataFrame(data, header1)
df1.show(n=2)
df1.first()
df1.count()
df1.printSchema()
df1.show()

# Assemble all raw columns into a single feature vector.
df3 = VectorAssembler(inputCols=header1, outputCol='Features').transform(df1)
df3.show()

##lr_test = DecisionTreeClassificationModel.load("treemodelofcsv")
#p_lr_test = PipelineModel()
#tested_lr_test = p_lr_test.transform(image_DF)
#predict_value = tested_lr_test.select('prediction').head()[0]
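# Hedged sketch (not in the original, which only hints at this in the commented-out
# lines above): loading a previously saved decision-tree model and scoring the
# assembled row. It assumes a model was saved earlier at "treemodelofcsv" (the path
# referenced in the comments) with a featuresCol matching 'Features'; otherwise the
# load or transform will fail.
from pyspark.ml.classification import DecisionTreeClassificationModel

tree_model = DecisionTreeClassificationModel.load("treemodelofcsv")
predict_value = tree_model.transform(df3).select('prediction').head()[0]
print(predict_value)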
def annotate_pval_dataset(self, cur_df):
    import pyspark
    try:
        # Reuse previously annotated train/test sets if they already exist.
        tr_inst = self.spark.read.parquet(self.training_temp_dir)
        te_inst = self.spark.read.parquet(self.testing_temp_dir)
        return tr_inst, te_inst
    except pyspark.sql.utils.AnalysisException as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        self.logger.info(message)
        self.logger.info("PROCESS")
        self.logger.debug("NOTEXISTS ANNOTATE_FILE")
        self.logger.debug("RUN_PROCESS")
    except:
        self.logger.info("TEST_PURPOSE")

    from pyspark.ml.feature import VectorAssembler
    postfix = self.postfix.format(self.sel_top)
    obs_df = cur_df
    cur_cols = obs_df.columns
    for i in self.non_feature_column:
        cur_cols.remove(i)
    self.logger.debug("feature_columns")
    cur_cols = sorted(cur_cols)
    self.logger.debug(cur_cols)

    import json
    json.dump({"non_demo_features": cur_cols},
              open(self.json_feature_dump_loc, "w"))

    obs_df = VectorAssembler(
        inputCols=cur_cols, outputCol="features_imputed").transform(obs_df)
    cur_time_list = obs_df.select("ID", "TIME_SPAN")
    of_annotated = obs_df
    of_excl_training = dict()

    # Append demographic features and assemble the final feature vector.
    demo_feature = self.add_demo()
    of_annotated = VectorAssembler(
        inputCols=["features_imputed", "demo_feature"],
        outputCol="features").transform(of_annotated.join(demo_feature, "ID"))
    of_annotated.show()

    from pyspark.sql.functions import col, lit, when
    self.logger.debug("ANNOTATED")
    cur_test_ids = self.get_target_test_id()
    self.logger.debug(cur_test_ids)
    # TODO CHECK why I put 'why 0' comment over here?
    self.logger.debug(len(cur_test_ids))

    tr_inst, te_inst = self.cur_annotator.prep_TR_TE(
        of_annotated, test_id_list=cur_test_ids)
    self.logger.debug("IDS")
    self.logger.debug(
        tr_inst.select("ID").distinct().count(),
        te_inst.select("ID").distinct().count())
    self.logger.debug("TR_TE_CNT:{0}_{1}".format(tr_inst.count(),
                                                 te_inst.count()))
    train_data_ID = tr_inst.select("ID").distinct().rdd.flatMap(list).collect()
    testing_data_ID = te_inst.select("ID").distinct().rdd.flatMap(list).collect()

    self.action_df.show()
    train_action_df = self.action_df.where(
        col("ID").isin(train_data_ID)).persist()
    self.logger.debug(train_action_df.select("ID").distinct().count())
    train_terminal_outcome = self.terminal_outcome.where(
        col("ID").isin(train_data_ID)).persist()
    self.logger.debug(train_terminal_outcome.select("ID").distinct().count())

    # Rank interventions by p-value using the training cohort only.
    intv_w_p_val = self.identify_relevant_action(
        train_action_df, train_terminal_outcome,
        tr_inst.select("ID").distinct().count())
    intv_w_p_val.join(
        self.def_df.where(col("SOURCE").isin(["CPT", "MED", "PROC"])),
        self.itemid).orderBy("p_val").show(100, truncate=False)

    from pyspark.sql.functions import sum, rand, max, lit
    from pyspark.ml.feature import VectorAssembler
    cur_annot_topk = self.sel_top
    self.action_df.show()
    self.terminal_outcome.show()
    annot_df = self.action_df.join(self.terminal_outcome, "ID").persist()
    annot_df.show()
    pos_inst_dict = dict()

    from pyspark.sql.functions import count
    for cur_of in [self.target_disch_col]:
        # For debug purpose, pass if target_of is not identified
        self.logger.debug(cur_of)
        intv_w_p_val.where("DISCH_DX == '{0}'".format(cur_of)).orderBy(
            col("p_val").cast("double")).show(50, truncate=False)
        target_annot_criteria = intv_w_p_val.where(
            "DISCH_DX == '{0}'".format(cur_of)).orderBy(
                col("p_val").cast("double")).limit(cur_annot_topk)
        target_annot_criteria.write.save(
            self.annot_intv_dir.format(cur_of, cur_annot_topk),
            mode="overwrite")
        target_annot_criteria = target_annot_criteria.select(
            self.itemid).rdd.flatMap(list).collect()
        if len(target_annot_criteria) == 0:
            self.logger.info(
                "NO TERMINAL DX {0} identified from pts".format(cur_of))
            pos_inst_dict[cur_of] = None
            continue
        self.logger.debug(target_annot_criteria)
        self.logger.debug(len(target_annot_criteria))
        self.logger.debug("selected intv!!")
        self.def_df.where(
            col(self.itemid).isin(target_annot_criteria)).show(
                cur_annot_topk, truncate=False)

        # Positive instances: observations with both a relevant intervention
        # and the target discharge diagnosis.
        pos_inst_dict[cur_of] = annot_df.where(
            (col(self.itemid).isin(target_annot_criteria))
            & (col("DISCH_DX") == cur_of))\
            .select("ID",
                    col("TIME_OBS").cast("date").alias("TIME_OBS"),
                    lit("1").cast("double").alias("{0}_label".format(cur_of)))\
            .distinct().persist()
        pos_inst_dict[cur_of].groupBy("{0}_label".format(cur_of)).agg(
            count("*")).show()

        from pyspark.sql.functions import broadcast
        true_inst = annot_df.where(
            (col(self.itemid).isin(target_annot_criteria))
            & (col("DISCH_DX") == cur_of))

        # Exclude patients who have the target outcome but none of the
        # relevant interventions.
        excl_id = annot_df.withColumn(
            "IS_TARGET_OF",
            when(col("DISCH_DX") == cur_of,
                 lit("1").cast("double")).otherwise(lit("0").cast("double")))\
            .withColumn(
                "IS_REL_INTV",
                when(col(self.itemid).isin(target_annot_criteria),
                     lit("1").cast("double")).otherwise(
                         lit("0").cast("double")))\
            .groupBy("ID").agg(
                sum("IS_TARGET_OF").alias("SUM_IS_TARGET_OF"),
                sum("IS_REL_INTV").alias("SUM_IS_REL_INTV"))\
            .where("(SUM_IS_TARGET_OF <> 0) AND (SUM_IS_REL_INTV == 0)")\
            .select("ID").distinct().rdd.flatMap(list).collect()
        self.logger.debug("NUM_PTS_EXCLUDED:{0}".format(len(excl_id)))
        self.logger.debug("TRAINING_INST_COUNT:{0}".format(tr_inst.count()))

        tr_inst = tr_inst.withColumn(
            "TIME_OBS", col("TIME_SPAN.TIME_TO").cast("date"))\
            .withColumn("{0}_excl".format(cur_of),
                        col("ID").isin(excl_id).cast("double"))\
            .repartition("ID", "TIME_OBS")\
            .join(broadcast(pos_inst_dict[cur_of]), ["ID", "TIME_OBS"],
                  "left_outer")\
            .fillna(value=0.0, subset=["{0}_label".format(cur_of)]).persist()
        print(tr_inst.count())
        tr_inst.groupBy("{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()

        te_inst = te_inst.withColumn(
            "TIME_OBS", col("TIME_SPAN.TIME_TO").cast("date"))\
            .withColumn("{0}_excl".format(cur_of),
                        col("ID").isin(excl_id).cast("double"))\
            .repartition("ID", "TIME_OBS")\
            .join(broadcast(pos_inst_dict[cur_of]), ["ID", "TIME_OBS"],
                  "left_outer")\
            .fillna(value=0.0, subset=["{0}_label".format(cur_of)]).persist()
        print(te_inst.count())
        te_inst.groupBy("{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()

        tr_inst.groupBy("ID").agg(
            max("{0}_label".format(cur_of)).alias("{0}_label".format(cur_of)),
            max("{0}_excl".format(cur_of)).alias("{0}_excl".format(cur_of)))\
            .groupBy("{0}_label".format(cur_of),
                     "{0}_excl".format(cur_of)).agg(count("*")).show()
        te_inst.groupBy("ID").agg(
            max("{0}_label".format(cur_of)).alias("{0}_label".format(cur_of)),
            max("{0}_excl".format(cur_of)).alias("{0}_excl".format(cur_of)))\
            .groupBy("{0}_label".format(cur_of),
                     "{0}_excl".format(cur_of)).agg(count("*")).show()

    # Materialize the annotated sets and reload them from parquet so
    # downstream steps read from disk rather than the cached lineage.
    tr_inst.write.save(self.training_temp_dir, mode="overwrite")
    te_inst.write.save(self.testing_temp_dir, mode="overwrite")
    tr_inst = self.spark.read.parquet(self.training_temp_dir)
    te_inst = self.spark.read.parquet(self.testing_temp_dir)
    #te_inst.show()
    return (tr_inst, te_inst)
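# Hedged usage sketch (not part of the original class): one way the returned
# train/test frames could feed a downstream classifier. The "features" column and
# the "{target}_label" / "{target}_excl" naming follow the method above; the
# estimator, its parameters, and the calling object name `annotator` are
# illustrative assumptions only.
from pyspark.sql.functions import col
from pyspark.ml.classification import LogisticRegression

tr_inst, te_inst = annotator.annotate_pval_dataset(obs_df)
target = annotator.target_disch_col
lr = LogisticRegression(featuresCol="features",
                        labelCol="{0}_label".format(target))
lr_model = lr.fit(tr_inst.where(col("{0}_excl".format(target)) == 0.0))
lr_model.transform(te_inst).select("ID", "probability", "prediction").show()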
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# StringIndexer: encode the categorical colour column as a numeric index
# (the input column name 'color' is assumed; the original line is truncated).
df3 = StringIndexer(inputCol='color',
                    outputCol='color1').fit(df2).transform(df2)
df3.show()
df3.printSchema()

# One-hot encode the indexed colour. NOTE: on Spark 3.x OneHotEncoder is an
# Estimator, so .fit(df3).transform(df3) is required instead of .transform(df3).
df3 = OneHotEncoder(inputCol='color1', outputCol='color2',
                    dropLast=False).transform(df3)
df3.printSchema()

# Index the label column ('type' -> numeric 'type1').
df4 = StringIndexer(inputCol='type', outputCol='type1').fit(df2).transform(df3)
df4.show()
df4.printSchema()

# Vector assembler
df5 = VectorAssembler(inputCols=[
    'id', 'bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color2'
], outputCol='Features').transform(df4)
df5.show(truncate=False)
df5.printSchema()

# --------------------------------------------------------------------------
# data processing complete ---
# 6. Model building
training = df5
#training.show(truncate=False, n=5)

from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol='Features', labelCol='type1',
                            numTrees=86, maxDepth=10)
model22 = rf.fit(training)
print(model22.getNumTrees)
#model22.numFeatures
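# Hedged follow-up sketch (not in the original): score the training frame with the
# fitted forest and compute accuracy. Column names match the model above; evaluating
# on the training set itself is only a quick sanity check, not a proper validation.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model22.transform(training)
evaluator = MulticlassClassificationEvaluator(labelCol='type1',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print(evaluator.evaluate(predictions))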
from pyspark import SparkContext, SparkConf
from os import environ
from environment import spark
from pyspark.ml.feature import VectorAssembler

# environ['JAVA_HOME'] = 'D:\Program Files\Java\jdk1.8.0_181'
# environ['HADOOP_HOME'] = 'D:\hadoop-3.1.2'
# environ['SPARK_HOME'] = 'D:\spark-2.4.3-bin-hadoop2.7\spark-2.4.3-bin-hadoop2.7'
#
# conf = SparkConf() \
#     .setAppName("demo") \
#     .setMaster("spark://192.168.30.247:7077") \
#     .set("spark.driver.host", "192.168.30.109") \
#     .set("spark.cores.max", "4") \
#     .set("spark.executor.memory", "512m")
# sc = SparkContext(conf=conf)
#
# data = sc.parallelize(['a', 'a', 'd', 'd', 'b', 'c', 'd', 'e', 'f', 'g'])
# pairs = data.map(lambda s: (s, 1))
# counts = pairs.reduceByKey(lambda a, b: a + b)
# print(counts.sortByKey().collect())

df = spark.createDataFrame([(1, 11, 19)], ["id", "name", "age"])
ddf = VectorAssembler(inputCols=["name", "age"],
                      outputCol='features').transform(df)
ddf.show()
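# For reference, ddf.show() above should print roughly the following (vector
# rendering may differ slightly across Spark versions): the two numeric input
# columns are packed into one dense vector.
#
# +---+----+---+-----------+
# | id|name|age|   features|
# +---+----+---+-----------+
# |  1|  11| 19|[11.0,19.0]|
# +---+----+---+-----------+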
def gen_lr_sort_model(self):
    self.spark_session.sql("use portal")

    # User-article click behaviour
    sql = "select user_id, article_id, channel_id, click_flag from t_user_behaviour"
    user_article_click_df = self.spark_session.sql(sql)
    user_article_click_df.show()

    # Fetch user profiles: basic info (gender, age) plus preference data
    sql = "select split(user_id, ':')[1] user_id, basic_info.gender, basic_info.age, preference_info " \
          "from t_user_profile"
    user_profile_df = self.spark_session.sql(sql)
    user_profile_df.show()
    user_article_click_df = user_article_click_df.join(user_profile_df, on=["user_id"], how="left")

    # Extract the keyword-weight vector for the channel the article belongs to
    def extract_channel_keyword_feature(partition):
        from pyspark.ml.linalg import Vectors
        for row in partition:
            try:
                weights = sorted([row.preference_info[key] for key in row.preference_info.keys()
                                  if key.split(':')[0] == row.channel_id], reverse=True)[:10]
            except Exception as e:
                print(e)
                weights = [0.0] * 10
            yield row.article_id, row.channel_id, row.user_id, int(row.gender), int(row.age), \
                Vectors.dense(weights if weights else [0.0] * 10), row.click_flag

    user_article_click_df = user_article_click_df.rdd.mapPartitions(extract_channel_keyword_feature) \
        .toDF(["article_id", "channel_id", "user_id", "gender", "age", "channel_weights", "click_flag"])
    user_article_click_df.show()

    # Fetch article profiles
    article_profile_df = self.spark_session.sql("select * from t_article_profile")

    # Extract the article keyword-weight vector
    def extract_feature(partition):
        from pyspark.ml.linalg import Vectors
        for row in partition:
            try:
                weights = sorted(row.keywords.values(), reverse=True)[:10]
            except Exception as e:
                print(e)
                weights = [0.0] * 10
            yield row.article_id, Vectors.dense(weights if weights else [0.0] * 10)

    article_profile_df = article_profile_df.rdd.mapPartitions(extract_feature) \
        .toDF(["article_id", "article_weights"])
    article_profile_df.show()
    user_article_click_df = user_article_click_df.join(article_profile_df, on=["article_id"], how="inner")
    user_article_click_df.show()

    # Fetch article embedding vectors
    article_vector_df = self.spark_session.sql("select article_id, vector from t_article_vector")

    def array_to_vector(partition):
        from pyspark.ml.linalg import Vectors
        for row in partition:
            yield row.article_id, Vectors.dense(row.vector)

    article_vector_df = article_vector_df.rdd.mapPartitions(array_to_vector) \
        .toDF(["article_id", "article_vector"])
    article_vector_df.show()
    user_article_click_df = user_article_click_df.join(article_vector_df, on=["article_id"], how="inner")
    user_article_click_df.show()

    # Assemble all features into a single vector column
    from pyspark.ml.feature import VectorAssembler
    input_cols = ["channel_id", "gender", "age", "channel_weights", "article_weights", "article_vector"]
    user_article_click_df = VectorAssembler().setInputCols(input_cols) \
        .setOutputCol("features") \
        .transform(user_article_click_df)
    user_article_click_df.show()

    # Logistic Regression
    from pyspark.ml.classification import LogisticRegression
    logistic_regression = LogisticRegression()
    logistic_regression_model = logistic_regression.setFeaturesCol("features") \
        .setLabelCol("click_flag") \
        .fit(user_article_click_df)
    logistic_regression_model.write().overwrite().save(
        "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")

    from pyspark.ml.classification import LogisticRegressionModel
    logistic_regression_model = LogisticRegressionModel.load(
        "hdfs://192.168.0.1:9000/user/models/logistic_regression/lr.model")
    logistic_regression_result = logistic_regression_model.transform(user_article_click_df)
    logistic_regression_result.select(["click_flag", "probability", "prediction"]).show()

    # ROC: BinaryClassificationMetrics expects (score, label) pairs, so emit the
    # positive-class probability first, then the label.
    def vector_to_double(row):
        return float(row.probability[1]), float(row.click_flag)

    score_labels = logistic_regression_result.select(["click_flag", "probability"]).rdd.map(vector_to_double)
    score_labels.collect()

    from pyspark.mllib.evaluation import BinaryClassificationMetrics
    binary_classification_metrics = BinaryClassificationMetrics(scoreAndLabels=score_labels)
    area_under_roc = binary_classification_metrics.areaUnderROC
    print(area_under_roc)
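# Hedged alternative (not in the original): the DataFrame-based evaluator computes
# the same AUC directly from the LogisticRegression output without converting to an
# RDD. Assumes logistic_regression_result from the method above still carries the
# model's default "rawPrediction" column.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="click_flag",
                                          rawPredictionCol="rawPrediction",
                                          metricName="areaUnderROC")
print(evaluator.evaluate(logistic_regression_result))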