def prepare_df_for_prediction(self, dataframe: DataFrame, label_to_predict,
                              categorical_features, continuous_features):
    # stages = self.build_pipeline_stages(categorical_features, continuous_features)
    # pipelined_dataframe = self.pipeline_dataframe(stages, dataframe)
    hasher = FeatureHasher(
        inputCols=[*categorical_features, *continuous_features],
        outputCol='features')
    featurized = hasher.transform(dataframe)
    label_features = featurized.select(label_to_predict, 'features').withColumnRenamed(
        label_to_predict, 'label')
    return label_features
def ml_transformer(df, feature_all, response_feature):
    """Preprocess the data for the logistic regression model."""
    # list.remove() mutates in place and returns None, so build a new list instead.
    feature_only = [c for c in feature_all if c != response_feature]
    hasher = FeatureHasher(inputCols=feature_only, outputCol="features")
    df_featurized = hasher.transform(df)
    df_train, df_test = df_featurized.randomSplit([0.8, 0.2], seed=12345)
    # Weight each class by the share of the opposite class to counter imbalance.
    df_size = float(df_train.select(response_feature).count())
    num_positives = df_train.select(response_feature).where('{} == 1'.format(response_feature)).count()
    num_negatives = df_train.select(response_feature).where('{} == 0'.format(response_feature)).count()
    balance_ratio = 1 - num_positives / df_size
    df_train = df_train.withColumn(
        "classWeights",
        when(df_train[response_feature] == 1, balance_ratio).otherwise(1 - balance_ratio))
    return df_train, df_test
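# A minimal usage sketch (assumption, not from the original source): the "classWeights"
# column produced above is the kind of column LogisticRegression's weightCol parameter
# expects. The names df, feature_all, and the label column "label" are illustrative only.
from pyspark.ml.classification import LogisticRegression

df_train, df_test = ml_transformer(df, feature_all, "label")
lr = LogisticRegression(featuresCol="features", labelCol="label",
                        weightCol="classWeights", maxIter=20)
lr_model = lr.fit(df_train)
predictions = lr_model.transform(df_test)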
def featurehasher(request):
    print("into featurehasher")
    value = "featurehasher"
    file_id = request.GET['fileid']
    print(file_id)
    spark = sparkSession(request)
    print("Created Spark Session")
    spark.sql('use hivedb')
    formFile = get_object_or_404(CSVFile, id=file_id)
    filePath = BASE_DIR + '\\' + str(formFile.file)
    filename = filePath
    projectid = formFile.project_fk.id
    csvpath = filename
    # Take the last path segment and strip the extension to get the CSV name.
    temp = csvpath.split('/')[-1]
    csvname = temp.split('.')[0]
    fid = str(file_id)
    pid = str(projectid)
    tablename = csvname + '_' + fid + '_' + pid
    print(tablename)
    datapreprocess = tablename + '_prerocessing'
    # spark.sql("select * from " + str(datapreprocess)).show()
    df = spark.table(datapreprocess)
    header = df.columns
    print(header)
    hasher = FeatureHasher(inputCols=header, outputCol="features")
    featurized = hasher.transform(df)
    featurized.show(truncate=False)
    dff = featurized
    scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                            withStd=True, withMean=False)
    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(dff)
    # Normalize each feature to have unit standard deviation.
    scaledData = scalerModel.transform(dff)
    scaledData.show(truncate=False)
    return JsonResponse({"success": True}, status=200)
def create_featureHasher(input_col: str, nq: int) -> FeatureHasher:
    """
    Create a FeatureHasher for a single column.

    The output column is named input_col + "_encoded". Note that FeatureHasher
    hashes values into a fixed-size vector rather than building an explicit
    one-hot encoding; nq sets the dimensionality of that vector.

    Parameters
    ----------
    input_col: str
        Name of the input column
    nq: int
        Number of hash buckets (passed as numFeatures)

    Return
    ------
    FeatureHasher
    """
    output_col = input_col + "_encoded"
    return FeatureHasher(numFeatures=nq, inputCols=[input_col], outputCol=output_col)
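# A minimal usage sketch (assumption, not from the original source), assuming an
# existing Spark DataFrame df with a string column "city"; the column name and
# bucket count are illustrative only.
hasher = create_featureHasher("city", 8)
df_hashed = hasher.transform(df)  # adds a "city_encoded" vector column
df_hashed.select("city", "city_encoded").show(truncate=False)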
result = model.transform(df)
result.show(truncate=False)

# COMMAND ----------

### Feature hashing
from pyspark.ml.feature import FeatureHasher

dataset = spark.createDataFrame([(2.2, True, "1", "foo"),
                                 (3.3, False, "2", "bar"),
                                 (4.4, False, "3", "baz"),
                                 (5.5, False, "4", "foo")],
                                ["real", "bool", "stringNum", "string"])
hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                       outputCol="features")
featurized = hasher.transform(dataset)
featurized.show(truncate=False)

# COMMAND ----------

#### Feature transformer (transforming sentences into words)
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

sentenceDataFrame = spark.createDataFrame(
    [(0, "Hi I heard about Spark"),
     (1, "I wish Java could use case classes"),
     (2, "Logistic,regression,models,are,neat")],
    ["id", "sentence"])
df.cache()
total_detections = df.select("HasDetections").where(df.HasDetections == 1).count()
print("Total Rows: {0}".format(df.count()))
print("Total HasDetections: {0}".format(total_detections))

print("****** Crosstabulations ******")
df.crosstab("HasDetections", "SkuEdition").show(truncate=False)
df.crosstab("HasDetections", "ProductName").show(truncate=False)
df.crosstab("HasDetections", "AVProductsEnabled").show(truncate=False)
df.crosstab("HasDetections", "IsBeta").show(truncate=False)
df.crosstab("HasDetections", "Platform").show(truncate=False)
df.crosstab("HasDetections", "Census_DeviceFamily").show(truncate=False)
df.crosstab("HasDetections", "Census_OSInstallTypeName").show(truncate=False)

all_columns = df.columns
label_col = ["HasDetections"]
meta_cols = ["MachineIdentifier"]
#feature_cols = ["SkuEdition", "ProductName", "AVProductsEnabled", "IsBeta", "Platform", "Census_DeviceFamily", "Census_OSInstallTypeName"]
feature_cols = list(set(all_columns) - set(label_col) - set(meta_cols))
ordered_cols = list(label_col + meta_cols + feature_cols)

hasher = FeatureHasher(numFeatures=len(feature_cols), inputCols=feature_cols,
                       outputCol="features", categoricalCols=feature_cols)

# Sample half the rows, then keep only the ordered columns before hashing
# (selecting from the sampled frame so the sample is not discarded).
df_features = df.sample(fraction=0.50, seed=3)
df_features = df_features.select(*ordered_cols)
df_features = hasher.transform(df_features)
chi_test = ChiSquareTest.test(df_features, "features", "HasDetections")
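# A minimal follow-up sketch (assumption, not from the original source): the DataFrame
# returned by ChiSquareTest.test has a single row holding pValues, degreesOfFreedom,
# and statistics for the hashed feature vector.
chi_row = chi_test.head()
print("pValues: {0}".format(chi_row.pValues))
print("degreesOfFreedom: {0}".format(chi_row.degreesOfFreedom))
print("statistics: {0}".format(chi_row.statistics))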
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import FeatureHasher
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("FeatureHasherExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame([
        (2.2, True, "1", "foo"),
        (3.3, False, "2", "bar"),
        (4.4, False, "3", "baz"),
        (5.5, False, "4", "foo")
    ], ["real", "bool", "stringNum", "string"])

    hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                           outputCol="features")

    featurized = hasher.transform(dataset)
    featurized.show(truncate=False)
    # $example off$

    spark.stop()
drop_cols = list(
    set(df.columns) - set(meta_cols) - set(continuous_cols) -
    set(categorical_cols) - set(feature_cols))
df = df.drop(*drop_cols)
df.cache()

print("Creating Splits")
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(feature_cols)))
print("Selected Features: {0}".format(feature_cols))

print("Building Pipeline")
categorical_hasher = FeatureHasher(inputCols=categorical_cols,
                                   outputCol="categorical_features",
                                   categoricalCols=categorical_cols)
continuous_vector = VectorAssembler(inputCols=continuous_cols,
                                    outputCol="continuous_vector")
scaler = MinMaxScaler(min=0.0, max=1.0,
                      inputCol=continuous_vector.getOutputCol(),
                      outputCol="continuous_features")
features = VectorAssembler(inputCols=feature_cols, outputCol="features")
classifier = LogisticRegression(regParam=0.01, maxIter=100, aggregationDepth=2,
                                fitIntercept=True, family="binomial",
                                elasticNetParam=0.0)
one_rest = OneVsRest(classifier=classifier,
df = df.withColumnRenamed("click", "label")
df_train, df_test = df.randomSplit([0.7, 0.3], 42)
df_train.cache()
df_test.cache()

categorical = df_train.columns
categorical.remove('label')
print(categorical)

from pyspark.ml.feature import FeatureHasher

hasher = FeatureHasher(numFeatures=10000, inputCols=categorical, outputCol="features")
hasher.transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20, regParam=0.000, elasticNetParam=0.000)
stages = [hasher, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)
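# A minimal continuation sketch (assumption, not from the original source): fit the
# hashing + logistic regression pipeline on the training split and score the held-out
# split with area under the ROC curve.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

pipeline_model = pipeline.fit(df_train)
predictions = pipeline_model.transform(df_test)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
print("AUC: {0}".format(evaluator.evaluate(predictions)))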
]
drop_cols = list(set(df.columns) - set(selected_cols) - set(meta_cols))

print("Performing Cleanup")
df = df.drop(*drop_cols)
df.cache()

print("Creating Splits")
train, test = df.randomSplit([0.7, 0.3])

print("Selected Features Count: {0}".format(len(selected_cols)))
print("Selected Features: {0}".format(selected_cols))

print("Building Pipeline")
hasher = FeatureHasher(numFeatures=1024, inputCols=selected_cols,
                       outputCol="features", categoricalCols=selected_cols)
forest = RandomForestClassifier(featuresCol="features", labelCol="HasDetections",
                                predictionCol="prediction",
                                probabilityCol="probability")
pipeline = Pipeline(stages=[hasher, forest])
evaluator = MulticlassClassificationEvaluator(labelCol="HasDetections",
                                              predictionCol="prediction",
                                              metricName="accuracy")

print("Configuring Validation")
params = ParamGridBuilder() \
    .addGrid(hasher.numFeatures, [1024]) \
    .addGrid(forest.maxDepth, [30]) \
ratio = 1.0
counts = train.select(f'_c{label_idx}').groupBy(
    f'_c{label_idx}').count().collect()
higher_bound = counts[1][1]
threshold = int(ratio * float(counts[0][1]) / counts[1][1] * higher_bound)
# Randomly tag negative rows so only a subsample of them survives the filter below.
rand_gen = lambda x: randint(0, higher_bound) if x == 0 else -1
udf_rand_gen = udf(rand_gen, IntegerType())
train = train.withColumn('rand_idx', udf_rand_gen('_c0'))
train_subsample = train.filter(train['rand_idx'] < threshold)
train_subsample = train_subsample.drop('rand_idx')
train_subsample.select(f'_c{label_idx}').groupBy(
    f'_c{label_idx}').count().show(n=5)

# Build the pipeline
hasher = FeatureHasher(numFeatures=262144,
                       inputCols=real_features + category_features,
                       outputCol='features',
                       categoricalCols=category_features)
lr = LogisticRegression(featuresCol='features', labelCol=f'_c{label_idx}')
pipeline = Pipeline(stages=[hasher, lr])

model = pipeline.fit(train_subsample)
print(model.stages[-1].coefficients)

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(labelCol=f'_c{label_idx}',
                                              metricName='f1')
f1 = evaluator.evaluate(predictions)
print(f'f1 = {f1}')
from pyspark.ml.feature import FeatureHasher
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
from pyspark.sql import Column
import pyspark.sql.functions as F

data = [(0, 2.0, True, "1", "foo"),
        (1, 3.0, False, "2", "bar"),
        (0, 2.0, True, "1", "foo"),
        (1, 3.0, False, "2", "bar")]
cols = ["label", "real", "bool", "stringNum", "string"]
feature_cols = ["real", "bool", "stringNum", "string"]

x = spark.createDataFrame(data, cols)
h = FeatureHasher(numFeatures=4, inputCols=feature_cols,
                  outputCol="features", categoricalCols=feature_cols)
x_df = h.transform(x)
x_df.show(truncate=False)

s = ChiSqSelector(numTopFeatures=2, labelCol="label",
                  featuresCol="features", outputCol="selectedFeatures")
m = s.fit(x_df)
m_df = m.transform(x_df)
m_df.show(truncate=False)

s_df = m_df.select(*(m_df.columns[column_index]
                     for column_index in m.selectedFeatures))
s_df.show(truncate=False)
m.selectedFeatures
"AMT_PAYMENT").cache() # joining to the master table df_train = df_train.join(df_prev_app_means, ['SK_ID_CURR'], how='left').cache() df_train = df_train.join(df_payment_means, ['SK_ID_CURR'], how='left').cache() # filling nan values df_train = df_train.na.fill(0).cache() logger.info("# Rows:" + str(df_train.count())) logger.info("# Cols:" + str(len(df_train.columns))) labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(df_train) labeled = labelIndexer.transform(df_train) hasher = FeatureHasher(inputCols=[ column for column in list(set(df_train.columns)) if column != 'label' ], outputCol="indexedFeatures", numFeatures=len([ column for column in list(set(df_train.columns)) if column != 'label' ])) featurized = hasher.transform(df_train) # Split the data into training and test sets (30% held out for testing) trainingData, testData = df_train.randomSplit([0.7, 0.3], seed=1234) # Train a RandomForest model. rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=20, maxDepth=15) # Chain indexers and forest in a Pipeline