def prepare_df_for_prediction(self, dataframe: DataFrame, label_to_predict,
                              categorical_features, continuous_features):
    """Hash all features into a single vector column and rename the label."""
    # An explicit indexer/encoder pipeline could be built instead (see
    # build_pipeline_stages), but FeatureHasher needs no fitting step.
    hasher = FeatureHasher(
        inputCols=[*categorical_features, *continuous_features],
        outputCol='features')
    featurized = hasher.transform(dataframe)
    label_features = (featurized
                      .select(label_to_predict, 'features')
                      .withColumnRenamed(label_to_predict, 'label'))
    return label_features
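# A hypothetical usage sketch; `preparer`, the column names, and the label
# below are illustrative, not from the original code.
from pyspark.ml.classification import LogisticRegression

prepared = preparer.prepare_df_for_prediction(
    raw_df, label_to_predict='churned',
    categorical_features=['plan', 'region'],
    continuous_features=['age', 'tenure'])
lr = LogisticRegression(featuresCol='features', labelCol='label')
lr_model = lr.fit(prepared)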
Example #2
from pyspark.ml.feature import FeatureHasher
from pyspark.sql.functions import col, when


def ml_transformer(df, feature_all, response_feature):
    '''
    Preprocess the data for the logistic regression model: hash the
    predictor columns, split train/test, and add a class-weight column
    to compensate for label imbalance.
    '''
    # list.remove() mutates in place and returns None, so build a new list.
    feature_only = [c for c in feature_all if c != response_feature]
    hasher = FeatureHasher(inputCols=feature_only, outputCol="features")
    df_featurized = hasher.transform(df)
    df_train, df_test = df_featurized.randomSplit([0.8, 0.2], seed=12345)
    df_size = float(df_train.count())
    num_positives = df_train.where('{} == 1'.format(response_feature)).count()
    balance_ratio = 1 - num_positives / df_size
    # Weight the rare positive class up and the common negative class down.
    df_train = df_train.withColumn(
        "classWeights",
        when(col(response_feature) == 1, balance_ratio).otherwise(1 - balance_ratio))
    return df_train, df_test
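# Sketch only: the classWeights column is intended for the model's weightCol.
# The "label" response name here is illustrative.
from pyspark.ml.classification import LogisticRegression

train, test = ml_transformer(df, df.columns, "label")
lr = LogisticRegression(labelCol="label", featuresCol="features",
                        weightCol="classWeights")
lr_model = lr.fit(train)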
Example #3
def featurehasher(request):
    print("into featurehasher")
    value = "featurehasher"
    file_id = request.GET['fileid']
    print(file_id)
    spark = sparkSession(request)
    print("Created Spark Session")
    spark.sql('use hivedb')
    formFile = get_object_or_404(CSVFile, id=file_id)
    filePath = BASE_DIR + '\\' + str(formFile.file)
    projectid = formFile.project_fk.id

    # Derive the Hive table name from the CSV file name (strip path and extension).
    csvname = filePath.replace('\\', '/').split('/')[-1].rsplit('.', 1)[0]
    tablename = csvname + '_' + str(file_id) + '_' + str(projectid)
    print(tablename)
    # Note: the suffix below must match the name used by the preprocessing step.
    datapreprocess = tablename + '_prerocessing'

    #spark.sql("select * from "+str(datapreprocess)+"").show()
    df = spark.table(datapreprocess)
    header = df.columns
    print(header)
    hasher = FeatureHasher(inputCols=header, outputCol="features")

    featurized = hasher.transform(df)
    featurized.show(truncate=False)
    dff = featurized

    # withMean=False keeps the hashed vectors sparse; centering would densify them.
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaledFeatures",
                            withStd=True,
                            withMean=False)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(dff)

    # Normalize each feature to have unit standard deviation.
    scaledData = scalerModel.transform(dff)
    scaledData.show(truncate=False)

    return JsonResponse({"success": True}, status=200)
# COMMAND ----------

### Feature hashing

from pyspark.ml.feature import FeatureHasher

dataset = spark.createDataFrame([(2.2, True, "1", "foo"),
                                 (3.3, False, "2", "bar"),
                                 (4.4, False, "3", "baz"),
                                 (5.5, False, "4", "foo")],
                                ["real", "bool", "stringNum", "string"])

hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                       outputCol="features")

featurized = hasher.transform(dataset)
featurized.show(truncate=False)
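# FeatureHasher hashes into 2^18 = 262,144 buckets by default; the width is
# tunable via the numFeatures parameter. A quick check on the hasher above:
print(hasher.getNumFeatures())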

# COMMAND ----------

#### Feature transformer (transforming sentences into words)
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

sentenceDataFrame = spark.createDataFrame(
    [(0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"),
     (2, "Logistic,regression,models,are,neat")], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words",
                                pattern="\\W")

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.withColumn("tokens", countTokens(col("words"))) \
    .select("sentence", "words", "tokens").show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.withColumn("tokens", countTokens(col("words"))) \
    .select("sentence", "words", "tokens").show(truncate=False)
Example #5
df.cache()

total_detections = df.select("HasDetections").where(df.HasDetections==1).count()

print("Total Rows: {0}".format(df.count()))
print("Total HasDetections: {0}".format(total_detections))
print("******    Crosstabulations   ******")
df.crosstab("HasDetections", "SkuEdition").show(truncate=False)
df.crosstab("HasDetections", "ProductName").show(truncate=False)
df.crosstab("HasDetections", "AVProductsEnabled").show(truncate=False)
df.crosstab("HasDetections", "IsBeta").show(truncate=False)
df.crosstab("HasDetections", "Platform").show(truncate=False)
df.crosstab("HasDetections", "Census_DeviceFamily").show(truncate=False)
df.crosstab("HasDetections", "Census_OSInstallTypeName").show(truncate=False)

all_columns = df.columns
label_col = ["HasDetections"]
meta_cols = ["MachineIdentifier"]
#feature_cols = ["SkuEdition", "ProductName", "AVProductsEnabled", "IsBeta", "Platform", "Census_DeviceFamily", "Census_OSInstallTypeName"]
feature_cols = list(set(all_columns) - set(label_col) - set(meta_cols))
ordered_cols = list(label_col + meta_cols + feature_cols)

hasher = FeatureHasher(numFeatures=len(feature_cols),
                       inputCols=feature_cols,
                       outputCol="features",
                       categoricalCols=feature_cols)
# Sample first, then project, so the 50% sample is actually used downstream.
df_features = df.sample(fraction=0.50, seed=3).select(*ordered_cols)
df_features = hasher.transform(df_features)

from pyspark.ml.stat import ChiSquareTest

chi_test = ChiSquareTest.test(df_features, "features", "HasDetections")
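# Inspect the test result: a single row with pValues, degreesOfFreedom, statistics.
result = chi_test.head()
print("pValues:", result.pValues)
print("statistics:", result.statistics)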

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import FeatureHasher
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("FeatureHasherExample")\
        .getOrCreate()

    # $example on$
    dataset = spark.createDataFrame([
        (2.2, True, "1", "foo"),
        (3.3, False, "2", "bar"),
        (4.4, False, "3", "baz"),
        (5.5, False, "4", "foo")
    ], ["real", "bool", "stringNum", "string"])

    hasher = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                           outputCol="features")

    featurized = hasher.transform(dataset)
    featurized.show(truncate=False)
    # $example off$

    spark.stop()
Example #7
from pyspark.sql import SparkSession
from pyspark.ml.feature import FeatureHasher

spark = SparkSession.builder.appName("hash").getOrCreate()
data_set = spark.createDataFrame([(2.2, True, "1", "foo"),
                                  (3.3, False, "2", "bar"),
                                  (4.4, False, "3", "baz"),
                                  (5.5, False, "4", "foo")],
                                 ["real", "bool", "stringNum", "string"])

hash_er = FeatureHasher(inputCols=["real", "bool", "stringNum", "string"],
                        outputCol="feature")
features = hash_er.transform(data_set)
print(type(features))
features.show(truncate=False)
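# The hashed column holds SparseVectors; a quick peek at the first row's encoding.
vec = features.select("feature").first()["feature"]
print(vec.size, vec.indices, vec.values)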
Example #8
df_train.cache()

df_test.cache()

categorical = df_train.columns
categorical.remove('label')
print(categorical)

from pyspark.ml.feature import FeatureHasher

hasher = FeatureHasher(numFeatures=10000,
                       inputCols=categorical,
                       outputCol="features")

hasher.transform(df_train).select("features").show()

from pyspark.ml.classification import LogisticRegression

classifier = LogisticRegression(maxIter=20,
                                regParam=0.000,
                                elasticNetParam=0.000)

stages = [hasher, classifier]

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)

model = pipeline.fit(df_train)
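# A hedged evaluation sketch, assuming df_test carries the same columns as df_train.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(df_test)
evaluator = BinaryClassificationEvaluator(labelCol="label")
print("Test AUC:", evaluator.evaluate(predictions))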
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import ChiSqSelector, FeatureHasher
from pyspark.ml.linalg import Vectors

data = [(0, 2.0, True, "1", "foo"), (1, 3.0, False, "2", "bar"),
        (0, 2.0, True, "1", "foo"), (1, 3.0, False, "2", "bar")]
cols = ["label", "real", "bool", "stringNum", "string"]
feature_cols = ["real", "bool", "stringNum", "string"]
x = spark.createDataFrame(data, cols)
h = FeatureHasher(numFeatures=4,
                  inputCols=feature_cols,
                  outputCol="features",
                  categoricalCols=feature_cols)
x_df = h.transform(x)
x_df.show(truncate=False)

s = ChiSqSelector(numTopFeatures=2,
                  labelCol="label",
                  featuresCol="features",
                  outputCol="selectedFeatures")
m = s.fit(x_df)
m_df = m.transform(x_df)
m_df.show(truncate=False)
# Caution: selectedFeatures are indices into the hashed feature vector, not
# into m_df.columns, so they cannot be mapped back to column names in general.
print(m.selectedFeatures)
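# ChiSqSelector is deprecated as of Spark 3.1; UnivariateFeatureSelector is the
# suggested replacement. A sketch configured for the same chi-squared selection:
from pyspark.ml.feature import UnivariateFeatureSelector

u = UnivariateFeatureSelector(featuresCol="features", labelCol="label",
                              outputCol="selectedFeatures",
                              selectionMode="numTopFeatures")
u.setFeatureType("categorical").setLabelType("categorical")
u.setSelectionThreshold(2)
u.fit(x_df).transform(x_df).show(truncate=False)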
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, FeatureHasher
from pyspark.ml.classification import RandomForestClassifier

df_train = df_train.na.fill(0).cache()

logger.info("# Rows: " + str(df_train.count()))
logger.info("# Cols: " + str(len(df_train.columns)))

labelIndexer = StringIndexer(inputCol="label",
                             outputCol="indexedLabel").fit(df_train)

feature_cols = [column for column in df_train.columns if column != 'label']
hasher = FeatureHasher(inputCols=feature_cols,
                       outputCol="indexedFeatures",
                       numFeatures=len(feature_cols))

# Split the data into training and test sets (30% held out for testing)
trainingData, testData = df_train.randomSplit([0.7, 0.3], seed=1234)

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            numTrees=20,
                            maxDepth=15)

# Chain indexers and forest in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, hasher, rf])

model = pipeline.fit(trainingData)
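# A hedged evaluation sketch for the fitted pipeline on the held-out testData.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy:", evaluator.evaluate(predictions))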
Example #11
all_columns = df.columns
label_col = ["HasDetections"]
meta_cols = ["MachineIdentifier"]
#feature_cols = ["SkuEdition", "ProductName", "AVProductsEnabled", "IsBeta", "Platform", "Census_DeviceFamily", "Census_OSInstallTypeName"]
feature_cols = list(set(all_columns) - set(label_col) - set(meta_cols))
ordered_cols = list(label_col + meta_cols + feature_cols)

hasher = FeatureHasher(numFeatures=len(feature_cols),
                       inputCols=feature_cols,
                       outputCol="features",
                       categoricalCols=feature_cols)
# Sample first, then project, so the 50% sample is actually used downstream.
df_features = df.sample(fraction=0.50, seed=3).select(*ordered_cols)
df_features = hasher.transform(df_features)

#chi_test = ChiSquareTest.test(df_features.limit(10000), "features", "HasDetections")

from pyspark.ml.feature import ChiSqSelector

selector = ChiSqSelector(numTopFeatures=25,
                         labelCol="HasDetections",
                         featuresCol="features",
                         outputCol="selectedFeatures")
model = selector.fit(df_features)
model_df = model.transform(df_features)
print("******    ChiSquare Selected Features   ******")
# Note: selectedFeatures indexes into the hashed vector; mapping back to
# feature_cols is only a heuristic that assumes no hash collisions.
print([feature_cols[i] for i in model.selectedFeatures])
