def predict_prob(lrModelPath, test_data):
    """Score ``test_data`` with a saved model and show rows containing nulls.

    The model at ``lrModelPath`` is loaded and applied to ``test_data``.
    Element 1 of the ``probability`` vector is sliced into ``prob_1`` and
    cast to the string column ``prob_1_str``.  Intermediate frames are
    previewed on stdout, and finally up to 100 rows in which ``_c0``,
    ``_c1``, ``_c2`` or ``prob_1_str`` is null are shown.

    Args:
        lrModelPath: Path handed to ``LogisticRegressionModel.load``.
        test_data: DataFrame to score; the model output must contain the
            columns ``_c0``, ``_c1``, ``_c2`` and ``probability``.
    """
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)

    # DataFrame.show() prints the table itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    result.show(5)

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    prob_1.show(5)

    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    result_prob1.show(5)

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str'),
    ])
    print('*************** new_result_prob1 **************')
    new_result_prob1.show(10)
    # Printing the DataFrame object itself displays its column names/types.
    print(new_result_prob1)

    # find null rows
    final_null_rows = new_result_prob1.filter(
        new_result_prob1._c0.isNull()
        | new_result_prob1._c1.isNull()
        | new_result_prob1._c2.isNull()
        | new_result_prob1.prob_1_str.isNull()
    )
    print('########### find null rows #############')
    final_null_rows.show(100)
def test_vector_slicer(self):
    """Convert a VectorSlicer to ONNX and compare its output with Spark's."""

    def _as_float32_matrix(vector_series):
        # One row per vector, one column per vector element.
        return vector_series.apply(
            lambda vec: pandas.Series(vec.toArray())).values.astype(numpy.float32)

    rows = [
        (Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
        (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
        (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), ),
    ]
    data = self.spark.createDataFrame(rows, ["features"])
    model = VectorSlicer(inputCol="features", outputCol="sliced", indices=[1, 4])

    # The ONNX input width is taken from the first row's vector.
    feature_count = data.first()[0].array.size
    input_types = [('features', FloatTensorType([None, feature_count]))]
    model_onnx = convert_sparkml(model, 'Sparkml VectorSlicer', input_types)
    self.assertIsNotNone(model_onnx)

    # run the model
    predicted = model.transform(data)
    expected = _as_float32_matrix(predicted.toPandas().sliced)
    data_np = _as_float32_matrix(data.toPandas().features)

    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlVectorSlicer")
    onnx_model_path = paths[-1]
    output, _ = run_onnx_model(['sliced'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def test_list_int(self):
    """Check that ``indices`` given in various numeric containers become ints."""
    candidates = [
        [1.0, 2.0],
        np.array([1.0, 2.0]),
        DenseVector([1.0, 2.0]),
        SparseVector(2, {0: 1.0, 1: 2.0}),
        xrange(1, 3),
        (1.0, 2.0),
        pyarray.array('d', [1.0, 2.0]),
    ]
    for indices in candidates:
        vs = VectorSlicer(indices=indices)
        self.assertListEqual(vs.getIndices(), [1, 2])
        # Exact type comparison on purpose: every element must be a plain int.
        self.assertTrue(all(type(v) == int for v in vs.getIndices()))

    # Non-numeric entries are rejected.
    with self.assertRaises(TypeError):
        VectorSlicer(indices=["a", "b"])
def slice_win_source_to(source, destination):
    """Keep elements 50..75 of the vector columns ``f0``..``f7`` and merge them.

    Reads ``datasets/<source>``, replaces every ``f<j>`` column by its slice
    ``f_sl<j>``, assembles the eight slices into the single vector column
    ``f`` and overwrites ``datasets/<destination>`` with the result.

    Args:
        source: Parquet name inside the ``datasets`` directory to read.
        destination: Parquet name inside the ``datasets`` directory to write.
    """
    df_w = spark.read.parquet(os.path.join("datasets", source))

    kept = list(range(50, 76))
    sliced_cols = []
    for j in range(8):
        in_col = "f" + str(j)
        out_col = "f_sl" + str(j)
        slicer = VectorSlicer(inputCol=in_col, outputCol=out_col, indices=kept)
        df_w = slicer.transform(df_w).drop(in_col)
        sliced_cols.append(out_col)

    assembler = VectorAssembler(inputCols=sliced_cols, outputCol="f")
    df_w = assembler.transform(df_w)

    df_w.write.mode("overwrite").parquet(os.path.join("datasets", destination))
    df_w.printSchema()
def predict_prob(lrModelPath, test_data):
    """Score ``test_data`` and write a 1000-row sample of the result as CSV.

    The model at ``lrModelPath`` is loaded and applied to ``test_data``.
    Element 1 of the ``probability`` vector is sliced into ``prob_1`` and
    cast to the string column ``prob_1_str``.  Intermediate frames are
    previewed on stdout, then the first 1000 rows are written to
    ``file:///opt/int_group/head_1kw_test``.

    Args:
        lrModelPath: Path handed to ``LogisticRegressionModel.load``.
        test_data: DataFrame to score; the model output must contain the
            columns ``_c0``, ``_c1``, ``_c2`` and ``probability``.
    """
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)

    # DataFrame.show() prints the table itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    result.show(5)

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    prob_1.show(5)

    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    result_prob1.show(5)

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str'),
    ])
    print('*************** new_result_prob1 **************')
    new_result_prob1.show(5)
    # Expected to print:
    # DataFrame[_c0: string, _c1: string, _c2: string, prob_1_str: string]
    print(new_result_prob1)

    # limit() keeps the sample as a DataFrame instead of collecting the rows
    # to the driver with head() and rebuilding a DataFrame from them, which
    # also fails when the result is empty (no schema can be inferred).
    new_result_prob1.limit(1000).write.csv('file:///opt/int_group/head_1kw_test')

    # NOTE(review): the original notes record this error next to the
    # toPandas()-based export attempts; which call triggered it is not
    # identifiable from here:
    #   Exception: Python in worker has different version 2.7 than that in
    #   driver 3.6, PySpark cannot run with different minor versions. Please
    #   check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON
    #   are correctly set.
    # NOTE(review): the source snippet ends inside an unterminated ''' block,
    # so anything that followed it is not visible here.
def df_train_test():
    """Build DCT features for the train/test vector datasets and save them.

    The columns ``vx0``..``vx9`` are assembled into one vector, passed
    through a forward DCT, truncated to the first 40000 coefficients and
    scaled with ``StandardScaler(withStd=True, withMean=False)`` into ``vx``.
    The pipeline is fitted on the train set only and applied to both sets.
    Inputs and intermediate columns are dropped before the results are
    written to ``datasets/{train,test}.vector.dct.parquet``.
    """
    df_train = spark.read.parquet(os.path.join("datasets", "train.vector.parquet"))
    df_test = spark.read.parquet(os.path.join("datasets", "test.vector.parquet"))

    raw_cols = ["vx" + str(i) for i in range(10)]
    stages = [
        VectorAssembler(inputCols=raw_cols, outputCol="vx_t1"),
        DCT(inverse=False, inputCol="vx_t1", outputCol="vx_t2"),
        VectorSlicer(inputCol="vx_t2", outputCol="vx_t3", indices=list(range(40000))),
        StandardScaler(inputCol="vx_t3", outputCol="vx", withStd=True, withMean=False),
    ]
    p_model = Pipeline(stages=stages).fit(df_train)

    # Only the scaled output column "vx" is kept from the pipeline.
    discard = raw_cols + ["vx_t1", "vx_t2", "vx_t3"]
    df_train = p_model.transform(df_train).drop(*discard)
    df_test = p_model.transform(df_test).drop(*discard)

    df_train.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.vector.dct.parquet"))
    df_test.write.mode("overwrite").parquet(
        os.path.join("datasets", "test.vector.dct.parquet"))

    df_train.printSchema()
    df_test.printSchema()
def _fit(self, dataset):
    """Fit the estimator and return a VectorSlicer over the top features.

    Features whose ``cumulative_feature_importances`` is below ``self.pct``
    are kept.  Their sorted indices are stored on ``self.indices`` and used
    to configure the returned ``VectorSlicer``.

    Args:
        dataset: Data passed to ``self.estimator.fit``.

    Returns:
        A ``VectorSlicer`` reading ``self.inputCol`` and writing
        ``self.outputCol``.
    """
    fitted = self.estimator.fit(dataset)
    importances = fitted._call_java("featureImportances")
    # presumably a pandas DataFrame (it is indexed with .loc/.values) --
    # verify against _get_top_features
    ranked = self._get_top_features(importances)

    below_cutoff = ranked['cumulative_feature_importances'] < self.pct
    selected = ranked.loc[below_cutoff, 'indices'].values
    selected.sort()
    self.indices = selected

    # .item() is called on every entry to hand plain Python values to Spark.
    return VectorSlicer(inputCol=self.inputCol,
                        outputCol=self.outputCol,
                        indices=[idx.item() for idx in self.indices])
def _fit(self, dataset):
    """Select features from per-fold importances and return a VectorSlicer.

    A random column (seeded with ``self._seed``) is appended under the name
    ``self._rnd``.  For every fold produced by ``self._cv`` the estimator is
    fitted on the rows *not* matching the fold's test condition, and the
    top features of that model are collected.  ``self._set_threshold``
    turns the collected results into the final ``self.indices``.

    Args:
        dataset: DataFrame to fit on.

    Returns:
        A ``VectorSlicer`` reading ``self.inputCol`` and writing
        ``self.outputCol``.
    """
    df = dataset.select("*", rand(self._seed).alias(self._rnd))

    feature_importances = []
    for test_instances in self._cv(df):
        # Train on the complement of the fold; the helper column is removed
        # before fitting.
        train = df.filter(~test_instances).drop(self._rnd)
        fold_model = self.estimator.fit(train)
        feature_importances.append(
            RandomForestCartSelector._get_top_features(fold_model))

    self.indices = self._set_threshold(feature_importances)
    # .item() is called on every entry to hand plain Python values to Spark.
    plain_indices = [idx.item() for idx in self.indices]
    return VectorSlicer(inputCol=self.inputCol,
                        outputCol=self.outputCol,
                        indices=plain_indices)
def predict_prob(lrModelPath, test_data):
    """Score ``test_data`` with a saved model and append the result as CSV.

    The model at ``lrModelPath`` is loaded and applied to ``test_data``.
    Element 1 of the ``probability`` vector is sliced into ``prob_1`` and
    cast to the string column ``prob_1_str``.  Intermediate frames are
    previewed on stdout.  Rows containing any null are dropped and the rest
    is appended to ``/opt/int_group/hanmo.wang/3key_all_v3``.

    Args:
        lrModelPath: Path handed to ``LogisticRegressionModel.load``.
        test_data: DataFrame to score; the model output must contain the
            columns ``_c0``, ``_c1``, ``_c2`` and ``probability``.
    """
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)

    # DataFrame.show() prints the table itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    result.show(5)

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    prob_1.show(5)

    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    result_prob1.show(5)

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str'),
    ])
    print('*************** new_result_prob1 **************')
    new_result_prob1.show(10)
    # Printing the DataFrame object itself displays its column names/types.
    print(new_result_prob1)

    # Drop every row that has a null in any column before exporting.
    new_result_prob1 = new_result_prob1.na.drop()
    new_result_prob1.write.csv('/opt/int_group/hanmo.wang/3key_all_v3',
                               nullValue=None, mode='append')
def predict_prob(lrModel, test_data):
    """Score ``test_data`` and return ids with the class-1 probability.

    Element 1 of the model's ``probability`` vector is sliced into
    ``prob_1`` and cast to the string column ``prob_1_str``.  Intermediate
    frames are previewed on stdout.

    Args:
        lrModel: Fitted model exposing ``transform``; its output must
            contain the columns ``id``, ``name``, ``cell`` and
            ``probability``.
        test_data: DataFrame to score.

    Returns:
        DataFrame with the columns ``id``, ``name``, ``cell`` and
        ``prob_1_str``.
    """
    predictions = lrModel.transform(test_data)

    # DataFrame.show() prints the table itself and returns None, so it must
    # not be wrapped in print() (that would emit an extra "None" line).
    result = predictions.select(['id', 'name', 'cell', 'probability'])
    print('*************** result **************')
    result.show(5)

    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    prob_1.show(5)

    result_prob1 = prob_1.select(['id', 'name', 'cell', 'prob_1'])
    print('*************** result_prob1 **************')
    result_prob1.show(5)

    new_result_prob1 = result_prob1.select([
        'id', 'name', 'cell',
        result_prob1['prob_1'].cast('string').alias('prob_1_str'),
    ])
    print('*************** new_result_prob1 **************')
    new_result_prob1.show(5)
    # Printing the DataFrame object itself displays its column names/types.
    print(new_result_prob1)
    return new_result_prob1
def _fit(self, dataset):
    """Fit the tree estimator and return a VectorSlicer over chosen features.

    The estimator is fitted on ``dataset`` and its feature importances are
    ranked through ``ExtractFeatureImp``.  Depending on ``selectorType``
    either the first ``numTopFeatures`` entries or all entries whose score
    exceeds ``threshold`` are kept.

    Args:
        dataset: DataFrame used to fit the estimator.

    Returns:
        A ``VectorSlicer`` reading the estimator's features column and
        writing ``outputCol`` with the selected indices.

    Raises:
        NameError: If the estimator is not a DecisionTree, RandomForest or
            GBT classifier/regressor, or if ``selectorType`` is neither
            ``"numTopFeatures"`` nor ``"threshold"``.
    """
    est = self.getOrDefault(self.estimator)
    nfeatures = self.getOrDefault(self.numTopFeatures)
    threshold = self.getOrDefault(self.threshold)
    selectorType = self.getOrDefault(self.selectorType)
    outputCol = self.getOrDefault(self.outputCol)

    supported_estimators = (
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'RandomForestClassifier', 'RandomForestRegressor',
        'GBTClassifier', 'GBTRegressor',
    )
    if est.__class__.__name__ not in supported_estimators:
        raise NameError(
            "Estimator must be either DecisionTree, RandomForest or GBT Model"
        )
    # Reject a bad selector before paying for the model fit.
    if selectorType not in ("numTopFeatures", "threshold"):
        raise NameError("Invalid selectorType")

    # Fit classifier & extract feature importance
    mod = est.fit(dataset)
    dataset2 = mod.transform(dataset)
    varlist = ExtractFeatureImp(mod.featureImportances, dataset2,
                                est.getFeaturesCol())

    if selectorType == "numTopFeatures":
        varidx = list(varlist['idx'][0:nfeatures])
    else:
        varidx = list(varlist[varlist['score'] > threshold]['idx'])

    # Extract relevant columns
    return VectorSlicer(inputCol=est.getFeaturesCol(),
                        outputCol=outputCol,
                        indices=varidx)
def _evaluate(self, dataset, metric="AP"):
    """Compute a ranking metric for a scored dataset.

    The rows are sorted by the sliced ``1_prob`` column in descending
    order; the ``label`` and ``prediction`` columns are then collected to
    the driver in that order and passed to the chosen metric.

    Args:
        dataset: DataFrame with at least the columns ``probability``,
            ``label`` and ``prediction``.
        metric: ``"AP"`` (``average_precision_score``) or ``"P100"``
            (local ``precision`` helper with ``k=100``).

    Returns:
        The metric value; ``0`` when ``metric`` is neither ``"AP"`` nor
        ``"P100"``.
    """
    def precision(y_true, y_scores, k):
        # Size of the intersection between the distinct values of y_true and
        # the distinct values among the first k entries of y_scores, over k.
        # NOTE(review): this compares sets of *values*, not ranked
        # positions -- confirm this is the intended precision@k.
        act_set = set(y_true)
        pred_set = set(y_scores[:k])
        result = len(act_set & pred_set) / float(k)
        return result

    # NOTE(review): recall() is defined but not called in this method.
    def recall(y_true, y_scores, k):
        act_set = set(y_true)
        pred_set = set(y_scores[:k])
        result = len(act_set & pred_set) / float(len(act_set))
        return result

    # Slice element 0 and element 1 of "probability" into separate columns.
    # "0_prob" is added but not read again in this method.
    neg_slicer = VectorSlicer(inputCol="probability", outputCol="0_prob", indices=[0])
    pos_slicer = VectorSlicer(inputCol="probability", outputCol="1_prob", indices=[1])
    output_stg1 = neg_slicer.transform(dataset)
    output = pos_slicer.transform(output_stg1)
    Ranked_prediction = output.sort(col("1_prob").desc())
    # Both collect() calls bring the full column to the driver.
    y_true = Ranked_prediction.select("label").rdd.flatMap(
        lambda x: x).collect()
    # NOTE(review): the scores come from "prediction", not from the sliced
    # probability column -- confirm that is intended.
    y_scores = Ranked_prediction.select("prediction").rdd.flatMap(
        lambda x: x).collect()
    score = 0
    if metric == "AP":
        # presumably sklearn.metrics.average_precision_score -- verify import
        score = average_precision_score(y_true, y_scores)
    elif metric == "P100":
        score = precision(y_true, y_scores, 100)
    return score
### Feature slicer: extracts original features from the feature vector
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

# One sparse and one dense vector with the same values.
user_rows = [
    Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3})),
    Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0])),
]
df = spark.createDataFrame(user_rows)

# Keep only element 1 of every "userFeatures" vector.
slicer = VectorSlicer(
    inputCol="userFeatures",
    outputCol="features",
    indices=[1],
)
output = slicer.transform(df)
output.select("userFeatures", "features").show()

# COMMAND ----------

### RFormula: selects the columns specified by an R model formula
from pyspark.ml.feature import RFormula

dataset = spark.createDataFrame(
    [
        (7, "US", 18, 1.0),
        (8, "CA", 12, 0.0),
        (9, "NZ", 15, 0.0),
    ],
    ["id", "country", "hour", "clicked"],
)
#
from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorSlicer
# VectorSlicer is a DataFrame-based (pyspark.ml) transformer and needs
# pyspark.ml.linalg vectors; pyspark.mllib.linalg vectors are a different
# column type.
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSlicerExample")\
        .getOrCreate()

    # $example on$
    # One sparse and one dense vector holding the same values.
    df = spark.createDataFrame([
        Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3}),),
        Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]),)])

    # Keep only element 1 of every "userFeatures" vector.
    slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

    output = slicer.transform(df)

    output.select("userFeatures", "features").show()
    # $example off$

    spark.stop()
# COMMAND ---------- # MAGIC %md # MAGIC After comparing the results of four tested models, we can see that we are getting quite satisfying results with Random Forest, even for the small number of trees. Random Forest is performing much better than the other tested algorithms. In that manner, we will try to select only the most relevant features, and try to lower the dimensionality of the problem. <br> # MAGIC After evaluating the results of Random Forest, we have identified the importance of the used features: # MAGIC SparseVector(17, {0: 0.0589, 1: 0.0454, 2: 0.028, 3: 0.0834, 4: 0.1145, 5: 0.0469, 6: 0.0257, 7: 0.019, 8: 0.0159, 9: 0.003, 10: 0.0059, 11: 0.0009, 12: 0.0524, 13: 0.0211, 14: 0.0021, 15: 0.2103, 16: 0.2664}).<br> # MAGIC Here we are going to lower the dimensionality of the problem based on feature importance, and using Vector Slicer Feature Selecion. Let's test how our datasets behave with less features, and try to make more robust model while using smaller number of features. # COMMAND ---------- from pyspark.ml.feature import VectorSlicer from pyspark.ml.linalg import Vectors from pyspark.sql.types import Row slicer = VectorSlicer(inputCol="features", outputCol="selectedFeatures").setIndices([3, 4, 15, 16]) # We are using the same datasets as for the other algorithms output = slicer.transform(transformed) otestData = slicer.transform(testData) otrainData = slicer.transform(trainingData) # Let's make our model rf = RandomForestClassifier(labelCol="label", featuresCol="selectedFeatures", numTrees=10) # Chain indexers and forest in a Pipeline pipeline = Pipeline(stages=[labelIndexer, rf]) # Train model. This also runs the indexers.
labelCol="cust_age") result_df = chisq_selector.fit(scaled_df).transform(scaled_df) result_df.select("selected_features").display() # COMMAND ---------- # MAGIC %md Feature Selection using VectorSclicer # COMMAND ---------- from pyspark.ml.feature import VectorSlicer vec_slicer = VectorSlicer(inputCol="scaled_features", outputCol="selected_features", indices=[1]) result_df = vec_slicer.transform(scaled_df) result_df.select("scaled_features", "selected_features").display() # COMMAND ---------- # MAGIC %md ###Delta Lake as Feature Store # COMMAND ---------- spark.sql("CREATE DATABASE IF NOT EXISTS feature_store ") (result_df.write.format("delta").mode("overwrite").option( "location", "/FileStore/shared_uploads/delta/retail_features.delta").saveAsTable(
from keras.utils.training_utils import multi_gpu_model
from keras.models import Sequential
from keras.layers import (Dense, Dropout, Activation, LSTM, GRU, CuDNNGRU, RNN,
                          ConvLSTM2D, Conv1D, Reshape, MaxPooling1D,
                          SimpleRNNCell, Flatten)
from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
import config
from sklearn.metrics import mean_absolute_error
import pandas
import csv
from pyspark.ml.feature import VectorSlicer

spark = config.get_config()

# Features: keep every second element of the first 26 entries of "f"
# (13 values) as "fsl".
df_f = spark.read.parquet(os.path.join("datasets", "train.vector.fbin.2.parquet"))
slicer = VectorSlicer(inputCol='f', outputCol="fsl", indices=list(range(0, 26, 2)))
df_f = slicer.transform(df_f).drop('f')
df_f = df_f.drop("_c0")

# Targets: "seg" is renamed to "seg2" so the join key does not clash.
df_y = spark.read.parquet(os.path.join("datasets", "train.target.parquet"))
df_y = df_y.selectExpr("seg AS seg2", "y as label")

# Join features and targets on the integer-cast segment id.
df_train = df_f
df_train = df_train.join(
    df_y,
    df_train.seg.cast(IntegerType()) == df_y.seg2.cast(IntegerType()),
).drop("seg2")
df_train.printSchema()

n_dim = 13  #26 99 119 99+14 20+26 14+26

# vect_cols = ["f"]
# vectorAssembler = VectorAssembler(inputCols=vect_cols, outputCol="features")
pd.DataFrame(Data.take(5), columns=Data.columns)

# 30 % test / 70 % train split with a fixed seed.
testset, trainset = Data.randomSplit([0.3, 0.7], seed=25)
print("Training Dataset Count: " + str(trainset.count()))
print("Test Dataset Count: " + str(testset.count()))

### GENERALIZED LINEAR REGRESSION FOR FEATURE SELECTION
from pyspark.ml.regression import GeneralizedLinearRegression

glr = GeneralizedLinearRegression(predictionCol="Predicted_median", labelCol="label",
                                  featuresCol="features", family="binomial",
                                  link="logit", maxIter=10, regParam=0.01)
# NOTE(review): the GLR is fitted on the full `Data`, test rows included --
# confirm this is intended for the p-value based selection.
model = glr.fit(Data)
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("P Values: " + str(summary.pValues))

# Removing all the columns that had a p-value above 0.05
vs = VectorSlicer(inputCol="features", outputCol="selected_features",
                  indices=[0, 2, 9, 18, 21, 23, 24, 26, 27, 28, 31, 32, 37, 41])
Training_set = vs.transform(trainset)
Test_set = vs.transform(testset)

#### LOGISTIC REGRESSION
# Train on the sliced column; reading "features" here would silently ignore
# the feature selection done above.
logReg = LogisticRegression(predictionCol="Predicted_median", labelCol="label",
                            featuresCol="selected_features", maxIter=20,
                            regParam=0.01, elasticNetParam=0.8, family="binomial")
logReg_model = logReg.fit(Training_set)
trainingSummary = logReg_model.summary
roc = trainingSummary.roc.toPandas()
print('Training set ROC: ' + str(trainingSummary.areaUnderROC))

predictions = logReg_model.transform(Test_set)
predictions.select('features', 'label', 'rawPrediction', 'Predicted_median', 'probability').show(10)

evaluator = BinaryClassificationEvaluator()
print("Test_SET (Area Under ROC): " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))
def slice_source_to(source, destination):
    """Replace the vector column ``f`` by its elements 50..75.

    Reads ``datasets/<source>``, moves ``f`` aside as ``f_t1`` (dropping
    ``_c0`` as well), slices it back into ``f`` and overwrites
    ``datasets/<destination>`` with the result.

    Args:
        source: Parquet name inside the ``datasets`` directory to read.
        destination: Parquet name inside the ``datasets`` directory to write.
    """
    in_path = os.path.join("datasets", source)
    out_path = os.path.join("datasets", destination)

    slicer = VectorSlicer(inputCol="f_t1", outputCol="f",
                          indices=list(range(50, 76)))

    df_v = spark.read.parquet(in_path)
    # The original "f" has to leave the frame first so that the slicer can
    # write its output under the same name.
    df_v = df_v.selectExpr("*", "f AS f_t1").drop("_c0", "f")
    df_v = slicer.transform(df_v).drop("f_t1")
    df_v.write.mode("overwrite").parquet(out_path)
# Convert the daily feature table to libsvm text: "<field0> <field1>".
rdd = sc.textFile("/hive/warehouse/wlcredit.db/t_credit_feature_merge/ds=" + today + "_cms1234_anf")
rdd1 = rdd.map(lambda x: x.split("\001")[0] + " " + x.split("\001")[1])
rdd1.saveAsTextFile("/user/wrt/credit/allexample.libsvm")

data_svm_sql = sqlContext.read.format("libsvm").load("/user/wrt/credit/allexample.libsvm")
data_svm = data_svm_sql.map(lambda row: LabeledPoint(int(row.label), row.features))

# Per-feature column statistics.
features = data_svm.map(lambda x: x.features)
stat = Statistics.colStats(features)
coverage = (stat.numNonzeros() / stat.count()).tolist()
std = numpy.sqrt(stat.variance()).tolist()
features_nums = data_svm.map(lambda x: x.features.size).take(1)[0]
features_arr = range(0, features_nums)

# Materialise as lists: the results are consumed more than once below, and
# under Python 3 zip()/map()/filter() return one-shot iterators.
# NOTE(review): `re` shadows the stdlib module name; kept because later code
# in the file may read this variable.
re = list(zip(zip(coverage, std), features_arr))
# Keep features that are non-zero in at least 0.5 % of the rows.  `std` is
# carried in the tuples but not used by the filter.
filteredIndexes = [m[1] for m in re if m[0][0] >= 0.005]

slicer = VectorSlicer(inputCol="features", outputCol="featuresFiltered", indices=filteredIndexes)
output_df = slicer.transform(data_svm_sql)
data_svm_filtered = output_df.select("label", "featuresFiltered")
data_svm_labelpoint = data_svm_filtered.map(lambda row: LabeledPoint(int(row.label), row.featuresFiltered))
MLUtils.saveAsLibSVMFile(data_svm_labelpoint, "/user/wrt/credit/allexample_filter.libsvm")

# Rewrite the libsvm lines as "<label without decimals>\001<features>".
rdd_r = sc.textFile("/user/wrt/credit/allexample_filter.libsvm")\
    .map(lambda x: x.split()[0].split('.')[0] + '\001' + ' '.join(x.split()[1:]))
rdd_r.saveAsTextFile("/user/wrt/credit/allexample_filter_telindex_features")

# Map every kept feature name to its new 1-based position.
feature_raw = sc.textFile("/hive/warehouse/wlcredit.db/t_wrt_credit_all_features_name/ds=" + today + "_cms1234_anf")\
    .map(lambda x: valid_jsontxt(x.split("\t")[0])).collect()
fea_all_index = []
j = 1
for i in filteredIndexes:
    fea_all_index.append(feature_raw[i] + "\t" + str(j))
    j += 1
sc.parallelize(fea_all_index).saveAsTextFile('/user/wrt/temp/filter_feature_name')
labelCol=targetName) gridGlmnet = ParamGridBuilder().baseOn( [glmnet.labelCol, targetName], [glmnet.elasticNetParam, 0.0]).addGrid( glmnet.regParam, [0.0, 0.025, 0.5, 0.1, 0.2, 0.4]).build() cvGlmnet = CrossValidator(estimator=glmnet, estimatorParamMaps=gridGlmnet, evaluator=modelEvaluator) GLMNET = cvGlmnet.fit(ClassData) ### Define variables with non-zero coefficients coef = GLMNET.bestModel.coefficients idxs = [idx for idx, x in enumerate(coef.toArray()) if not x == 0.0] ### Define function, which extracts variables with non-zero coefficients glmnetChoose = VectorSlicer(inputCol='features', outputCol='glmnetFeatures', indices=idxs) def glmnetSelect(df, vecSlc, withIdxCol=False): if withIdxCol: return vecSlc.transform(df).select( idVars + [targetName, 'indexed' + targetName, 'glmnetFeatures']).withColumnRenamed( 'glmnetFeatures', 'features') else: return vecSlc.transform(df).select( idVars + [targetName, 'glmnetFeatures']).withColumnRenamed( 'glmnetFeatures', 'features')
def vec_dct_to_win():
    """Build windowed DCT features for the train and test vector datasets.

    For each of 8 windows ``j`` the columns ``vx<j>``, ``vx<j+1>`` and
    ``vx<j+2>`` are assembled into one vector, passed through a forward DCT,
    truncated to the first 12000 coefficients and scaled with
    ``StandardScaler(withStd=True, withMean=False)`` into ``fn_w<j>``.
    Each pipeline is fitted on the train set only and applied to both sets.
    Results go to ``datasets/{train,test}.win.vector.dct.parquet``.
    """
    df_train = spark.read.parquet(os.path.join("datasets", "train.vector.parquet"))
    df_test = spark.read.parquet(os.path.join("datasets", "test.vector.parquet"))

    for j in range(8):
        win = str(j)
        assembled, spectrum, sliced = "vx_w" + win, "fr_w" + win, "fs_w" + win
        pipeline = Pipeline(stages=[
            VectorAssembler(inputCols=["vx" + str(j + i) for i in range(3)],
                            outputCol=assembled),
            DCT(inverse=False, inputCol=assembled, outputCol=spectrum),
            VectorSlicer(inputCol=spectrum, outputCol=sliced,
                         indices=list(range(12000))),
            StandardScaler(inputCol=sliced, outputCol="fn_w" + win,
                           withStd=True, withMean=False),
        ])
        pw_model = pipeline.fit(df_train)

        # "vx<j>" is not part of any later window, so it is dropped together
        # with the intermediate columns.
        consumed = ["vx" + win, assembled, spectrum, sliced]
        df_train = pw_model.transform(df_train).drop(*consumed)
        df_test = pw_model.transform(df_test).drop(*consumed)

    df_train.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.win.vector.dct.parquet"))
    df_test.write.mode("overwrite").parquet(
        os.path.join("datasets", "test.win.vector.dct.parquet"))
    df_train.printSchema()
    df_test.printSchema()


def bin_win_source_to(source, destination):
    """Reduce each ``fn_w<j>`` vector to 99 overlapping-bin magnitudes ``f<j>``.

    Bin ``i`` covers elements ``[i*120, i*120+240)`` of the vector; its value
    is the sum of absolute values in that range divided by 120.  The binned
    vectors are joined back on the integer-cast ``seg`` column, the source
    columns ``fn_w<j>`` as well as ``vx8``/``vx9`` are dropped, and the
    result overwrites ``datasets/<destination>``.

    Args:
        source: Parquet name inside the ``datasets`` directory to read.
        destination: Parquet name inside the ``datasets`` directory to write.
    """
    def _bin_magnitudes(vector):
        # Computed in one pass per row instead of chaining one RDD map per
        # bin; the values are the same.
        values = vector.toArray()
        return [float(sum(abs(values[i * 120:i * 120 + 240])) / 120)
                for i in range(0, 99)]

    df_w = spark.read.parquet(os.path.join("datasets", source))
    for j in range(8):
        src_col = "fn_w" + str(j)
        # src_col is bound as a default argument so the lambda does not
        # depend on the loop variable at execution time.
        rdd_w = df_w.rdd.map(
            lambda row, src_col=src_col: Row(
                seg=row.seg, f=Vectors.dense(_bin_magnitudes(row[src_col]))))
        df_tmp = rdd_w.toDF().selectExpr("seg AS seg2", "f AS f" + str(j))
        df_w = df_w.join(
            df_tmp,
            df_w.seg.cast(IntegerType()) == df_tmp.seg2.cast(IntegerType()),
        ).drop("seg2").drop(src_col)

    df_w = df_w.drop("vx8").drop("vx9")
    df_w.write.mode("overwrite").parquet(os.path.join("datasets", destination))


def slice_win_source_to(source, destination):
    """Keep elements 50..75 of the vector columns ``f0``..``f7`` and merge them.

    Every ``f<j>`` column is replaced by its slice ``f_sl<j>``; the eight
    slices are then assembled into the single vector column ``f`` and the
    result overwrites ``datasets/<destination>``.

    Args:
        source: Parquet name inside the ``datasets`` directory to read.
        destination: Parquet name inside the ``datasets`` directory to write.
    """
    df_w = spark.read.parquet(os.path.join("datasets", source))
    kept = list(range(50, 76))
    for j in range(8):
        slicer = VectorSlicer(inputCol="f" + str(j), outputCol="f_sl" + str(j),
                              indices=kept)
        df_w = slicer.transform(df_w).drop("f" + str(j))

    cols = ["f_sl" + str(i) for i in range(8)]
    assembler = VectorAssembler(inputCols=cols, outputCol="f")
    df_w = assembler.transform(df_w)
    df_w.write.mode("overwrite").parquet(os.path.join("datasets", destination))
    df_w.printSchema()


def label_win_to():
    """Collect eight target values per segment into one row and save them.

    For every ``seg`` the ``y`` values at ``no`` = 45000, 60000, ..., 150000
    become the columns ``y0``..``y7``.  The table is written to
    ``datasets/train.win.target.parquet`` and shown on stdout.
    """
    df_train = spark.read.parquet(os.path.join("datasets", "train.parquet"))
    df_train.createOrReplaceTempView("data")
    df_target = spark.sql("""
        SELECT d0.seg, d0.y AS y0, d1.y AS y1, d2.y AS y2, d3.y AS y3,
               d4.y AS y4, d5.y AS y5, d6.y AS y6, d7.y AS y7
        FROM data AS d0
        INNER JOIN data AS d1 ON d1.no = 60000 AND d1.seg = d0.seg
        INNER JOIN data AS d2 ON d2.no = 75000 AND d2.seg = d0.seg
        INNER JOIN data AS d3 ON d3.no = 90000 AND d3.seg = d0.seg
        INNER JOIN data AS d4 ON d4.no = 105000 AND d4.seg = d0.seg
        INNER JOIN data AS d5 ON d5.no = 120000 AND d5.seg = d0.seg
        INNER JOIN data AS d6 ON d6.no = 135000 AND d6.seg = d0.seg
        INNER JOIN data AS d7 ON d7.no = 150000 AND d7.seg = d0.seg
        WHERE d0.no = 45000
        ORDER BY d0.seg
        """)
    df_target.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.win.target.parquet"))
    df_target.show()


def stats_feature_win_to():
    """Aggregate per-1000-sample statistics into scaled vectors per segment.

    For every ``seg`` and every block ``seq = INT(no/1000)`` the mean, the
    2nd and 98th percentile of ``x`` and the 95th percentile of ``ABS(x)``
    are computed.  Per segment the values are gathered, ordered by ``seq``,
    into the vectors ``stx1``..``stx4``, which are then scaled with
    ``StandardScaler(withStd=True, withMean=False)`` into
    ``stxn1``..``stxn4`` and written to
    ``datasets/train.win.stat.parquet``.
    """
    df_train = spark.read.parquet(os.path.join("datasets", "train.parquet"))
    df_train.createOrReplaceTempView("data")
    df_stat = spark.sql("""
        SELECT seg, INT(no/1000) AS seq, AVG(x) AS x_avg,
               PERCENTILE(x, 0.02) AS x_p02, PERCENTILE(x, 0.98) AS x_p98,
               PERCENTILE(ABS(x), 0.95) AS xa_p95
        FROM data
        GROUP BY seg, INT(no/1000)
        ORDER BY seg, INT(no/1000)
        """)

    def _to_stat_vectors(item):
        # reduceByKey may merge partial lists in any order (list
        # concatenation is not commutative), so the samples are put back
        # into seq order explicitly before the vectors are built.
        seg, samples = item
        ordered = sorted(samples, key=lambda sample: sample[0])
        return Row(seg=seg,
                   stx1=Vectors.dense([s[1] for s in ordered]),
                   stx2=Vectors.dense([s[2] for s in ordered]),
                   stx3=Vectors.dense([s[3] for s in ordered]),
                   stx4=Vectors.dense([s[4] for s in ordered]))

    rdd_temp = df_stat.rdd \
        .map(lambda row: (row.seg, [(row.seq, row.x_avg, row.x_p02,
                                     row.x_p98, row.xa_p95)])) \
        .reduceByKey(lambda a, b: a + b) \
        .map(_to_stat_vectors)

    df_agg = rdd_temp.toDF(["seg", "stx1", "stx2", "stx3", "stx4"])
    # NOTE(review): segment 4194 is excluded; the reason is not visible here.
    df_agg = df_agg.select("*").where("seg != 4194")

    # Scale each statistic vector and drop its unscaled source column.
    for k in range(1, 5):
        raw_col = "stx" + str(k)
        scaler = StandardScaler(inputCol=raw_col, outputCol="stxn" + str(k),
                                withStd=True, withMean=False)
        scalerModel = scaler.fit(df_agg)
        df_agg = scalerModel.transform(df_agg).drop(raw_col)

    df_agg.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.win.stat.parquet"))


vec_dct_to_win()
print("vec_dct_to_win finish!!!!!!!!!!")
bin_win_source_to("train.win.vector.dct.parquet", "train.win.vector.fbin.parquet")
print("bin_win_source_to train finish!!!!")
bin_win_source_to("test.win.vector.dct.parquet", "test.win.vector.fbin.parquet")
print("bin_win_source_to test finish!!!!")
slice_win_source_to("train.win.vector.fbin.parquet", "train.win.vector.fbin.2.parquet")
print("slice_win_source_to train finish!!!!!!!")
slice_win_source_to("test.win.vector.fbin.parquet", "test.win.vector.fbin.2.parquet")
print("slice_win_source_to test finish!!!!!!!")
label_win_to()
print("label_win_to finish!!!!!!!")
stats_feature_win_to()
print("stats_feature_win_to finish!!!!!!!!!!!")