# COMMAND ----------

# MAGIC %md The ML package needs the label and the feature vector to be added as columns to the input DataFrame, so we set up a pipeline of transformers to extract them. Each categorical column is first mapped to a column of numeric indices with `StringIndexer`, and the indexed categories are then converted into one-hot encoded variables with at most a single one-value; these binary vectors are appended to the end of each row. Encoding categorical features this way allows decision trees to treat them appropriately, improving performance. Finally, we use another `StringIndexer` to encode our labels into label indices.

# COMMAND ----------

categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Using the slightly older OneHotEncoder (instead of OneHotEncoderEstimator) for compatibility reasons when operationalizing within the DSVM
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                            outputCol=categoricalCol + "classVec")
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]
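
# COMMAND ----------

# MAGIC %md The stages above are only declared here; once the feature assembler built below is appended, they are typically run all at once in a `Pipeline`. A minimal sketch (not part of the original notebook, assuming `df` is the input DataFrame):

# COMMAND ----------

from pyspark.ml import Pipeline

pipeline = Pipeline(stages=stages)       # indexers, encoders, and the label indexer
pipelineModel = pipeline.fit(df)         # fits every Estimator stage (e.g. each StringIndexer)
preppedDF = pipelineModel.transform(df)  # appends the indexed/encoded columns and "label"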

# COMMAND ----------

# MAGIC %md Now we need to use the `VectorAssembler` to combine all the feature columns into a single vector column. This includes our numeric columns as well as the one-hot encoded binary vector columns.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
numericCols = [
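# A sketch of how this cell likely continues (assuming `numericCols` is completed with the
# dataset's numeric feature column names; not from the original notebook):
#   assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
#   assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
#   stages += [assembler]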
Example #2
data = sc.textFile('C:/Users/akshaykumar.kore/Downloads/data/adults.csv').map(
    lambda line: line.split(","))
#data=sc.read.csv("C:/Users/akshaykumar.kore/Downloads/data/adult1.csv", header=True, mode="DROPMALFORMED")

data = data.toDF()
data.show()
data = data.na.fill(0)

categoricalColumns = ["_2", "_4", "_6", "_7", "_8", "_9", "_10", "_14"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # OneHotEncoder to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index",
                            outputCol=categoricalCol + "classVec")
    # Add stages.
    stages += [stringIndexer, encoder]

# Convert the label into a numeric index
label_stringIdx = StringIndexer(inputCol="_15", outputCol="label")
stages += [label_stringIdx]
numericCols = ["_1", "_3", "_5", "_11", "_12", "_13"]

categorical = []
numerical = []

for col in numericCols:
    data = data.withColumn(col + "index",
                           data[col].cast(DoubleType())).drop(col)
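
# A hedged sketch of the remaining steps (not in the original snippet): assemble the
# one-hot vectors and the casted numeric columns, then run every stage in a Pipeline.
# Assumes VectorAssembler and Pipeline are imported from pyspark.ml.feature / pyspark.ml.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + [c + "index" for c in numericCols]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]
pipeline = Pipeline(stages=stages)
dataset = pipeline.fit(data).transform(data)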
Example #3
if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("OneHotEncoderExample") \
        .getOrCreate()

    # First create a DataFrame with a single categorical feature column. Note that before
    # OneHotEncoder can be applied, the DataFrame must be run through StringIndexer to turn
    # the raw string labels into numeric indices:
    df = spark.createDataFrame([
        (0, "a"),
        (1, "b"),
        (2, "c"),
        (3, "a"),
        (4, "a"),
        (5, "c")
    ], ["id", "category"])

    stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    # Create an OneHotEncoder to encode the indexed DataFrame. The encoded binary features
    # are stored as sparse vectors, ordered the same way as the StringIndexer indices.
    # Note that the last category ("b") is encoded as an all-zero vector; if "b" should get
    # its own binary feature as well, pass dropLast=False (setDropLast(false) in Scala)
    # when creating the OneHotEncoder.
    encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
    encoded = encoder.transform(indexed)
    encoded.show()

    spark.stop()
Example #4
        dummies_col[i],
        when((col(dummies_col[i]).isin(
            dummy_info['factor_selected'][dummies_col[i]])),
             col(dummies_col[i])).otherwise(replacement))

air = air.withColumn(Y_name[0], (air[Y_name[0]] > 0).cast('Int'))

# Index the string values of multiple columns
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c))
    for c in dummies_col
]
# One-hot encode the indexed values of multiple columns
encoders = [
    OneHotEncoder(dropLast=True,
                  inputCol=indexer.getOutputCol(),
                  outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]

# Vectorizing encoded values

assembler = VectorAssembler(inputCols=['ActualElapsedTime', 'Distance'] +
                            [encoder.getOutputCol() for encoder in encoders],
                            outputCol="features")

pipeline = Pipeline(stages=indexers + encoders + [assembler])
model = pipeline.fit(air)
transformed = model.transform(air)
air2 = transformed[['ArrDelay', 'features']]
Example #5

# Filtering:
from pyspark.ml.feature import SQLTransformer
filterer = SQLTransformer(statement="SELECT * FROM __THIS__ WHERE cancelled == 0")

# Generate the inputs:
extractor = SQLTransformer(statement="SELECT *, review IS NOT NULL AS reviewed FROM __THIS__")

# Index the `vehicle_color` field:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="vehicle_color", outputCol="vehicle_color_indexed")

# Create dummy variables for the categorical `vehicle_color_indexed` column:
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCol="vehicle_color_indexed", outputCol="vehicle_color_encoded")

# Select the features
from pyspark.ml.feature import VectorAssembler
features = ["reviewed", "vehicle_year", "vehicle_color_encoded", "CloudCover"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Specify the estimator (i.e., the classification algorithm):
from pyspark.ml.classification import RandomForestClassifier
classifier = RandomForestClassifier(featuresCol="features", labelCol="star_rating")
print(classifier.explainParams())

# Specify the values for the hyperparameter grid:
from pyspark.ml.tuning import ParamGridBuilder
maxDepthList = [5, 10, 20]
numTreesList = [20, 50, 100]
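
# A hedged sketch (not in the original snippet) of building the grid from the lists above,
# using the ParamGridBuilder imported earlier:
paramGrid = ParamGridBuilder() \
    .addGrid(classifier.maxDepth, maxDepthList) \
    .addGrid(classifier.numTrees, numTreesList) \
    .build()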
Example #6
# The second step is to encode this numeric column as a one-hot vector using a `OneHotEncoder`. This works exactly the same way as the StringIndexer by creating an Estimator and then a Transformer. The end result is a column that encodes your categorical feature as a vector that's suitable for machine learning routines!
#

# #### Carrier
# In this exercise you'll create a `StringIndexer` and a `OneHotEncoder` to encode the carrier column. To do this, you'll call the class constructors with the arguments `inputCol` and `outputCol`. The `inputCol` is the name of the column you want to index or encode, and the `outputCol` is the name of the new column that the Transformer should create.

# In[99]:

from pyspark.ml.feature import StringIndexer, OneHotEncoder

# In[100]:

#Create a StringIndexer
carr_indexer = StringIndexer(inputCol='carrier', outputCol='carrier_index')
#Create a OneHotEncoder
carr_encoder = OneHotEncoder(inputCol='carrier_index', outputCol='carr_fact')

# #### Destination

# In[101]:

# encode the dest column just like you did above
dest_indexer = StringIndexer(inputCol='dest', outputCol='dest_index')
dest_encoder = OneHotEncoder(inputCol='dest_index', outputCol='dest_fact')

# #### Assemble a Vector
# The last step in the Pipeline is to combine all of the columns containing our features into a single column. The pyspark.ml.feature submodule contains a class called VectorAssembler. This Transformer takes all of the columns you specify and combines them into a new vector column.

# In[102]:

from pyspark.ml.feature import VectorAssembler
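
# A hedged sketch of the assembler (the exercise's full input list is not shown here;
# any numeric feature columns would normally be included alongside the encoded vectors):
vec_assembler = VectorAssembler(inputCols=["carr_fact", "dest_fact"], outputCol="features")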
Example #7
# MAGIC Then, we can apply the `OneHotEncoder` to the output of the StringIndexer [Python](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder)/[Scala](https://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.OneHotEncoder).

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer

categoricalCols = [
    field for (field, dataType) in trainDF.dtypes if dataType == "string"
]
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

stringIndexer = StringIndexer(inputCols=categoricalCols,
                              outputCols=indexOutputCols,
                              handleInvalid="skip")
oheEncoder = OneHotEncoder(inputCols=indexOutputCols, outputCols=oheOutputCols)

# COMMAND ----------

# MAGIC %md
# MAGIC Now we can combine our OHE categorical features with our numeric features.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler

numericCols = [
    field for (field, dataType) in trainDF.dtypes
    if ((dataType == "double") & (field != "price"))
]
assemblerInputs = oheOutputCols + numericCols
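
# A likely next step (a sketch, not from the original notebook): build the assembler
# from the combined one-hot and numeric inputs.
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")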
Example #8
    def encoder(self,
                raw,
                test,
                features,
                multi_value_category_feature,
                number_features=None):

        logger.info(' schema %s', raw.schema)
        logger.info(' columns %s', raw.schema.names)
        logger.info('features: %s', features)
        logger.info('multi_value_category_feature: %s',
                    multi_value_category_feature)
        logger.info(f'input number feature:{number_features}')
        logger.info(f'input features :{features}')
        self._category_feature = [
            i for i in features
            if raw.agg(functions.countDistinct(raw[i]).alias('cnt')).collect()
            [0].cnt > 1
        ]
        logger.info('drop unique feature %s',
                    set(features) - set(self._category_feature))
        if number_features:
            self._number_features = [
                number_feature for number_feature in number_features
                if number_feature in self._category_feature
            ]
        else:
            self._number_features = []
        logger.info(f'number feature:{self._number_features}')

        self._category_feature = list(
            (set(self._category_feature) - set(self._number_features)))
        logger.info(f'one_hot feature:{self._category_feature}')

        # Split the raw data first and only then apply the feature transformations, so that
        # training does not use any one-hot feature information from the test set.
        # The reason is that our algorithm ...
        train, test = raw, test
        train_count, test_count = train.count(), test.count()
        logger.info('[%s] data statics: total = %d, train = %d, test = %d',
                    self._job_id, train_count + test_count, train_count,
                    test_count)

        # one hot encoding
        string_indexers = [
            StringIndexer(inputCol=c,
                          outputCol="{}_idx".format(c),
                          handleInvalid='keep') for c in self._category_feature
        ]
        encoders = [
            OneHotEncoder(inputCol="{}_idx".format(c),
                          outputCol="{}_vec".format(c),
                          dropLast=True) for c in self._category_feature
        ]

        if multi_value_category_feature:
            self._multi_value_category_feature = multi_value_category_feature
            multi_enc = [
                MultiCategoryEncoder(inputCol=i, outputCol=f'{i}_vec')
                for i in self._multi_value_category_feature
            ]
        else:
            multi_enc = None

        vec_cols = ['{}_vec'.format(c) for c in self.get_feature_names()]

        if len(self._number_features) > 0:
            vec_cols += self._number_features

        assembler = VectorAssembler(inputCols=vec_cols, outputCol='feature')

        if multi_value_category_feature:
            stages = string_indexers + encoders + multi_enc + [assembler]
        else:
            stages = string_indexers + encoders + [assembler]

        pipeline = Pipeline(stages=stages)

        self._model = pipeline.fit(train)

        self._extract_vocabulary()

        train_res = self._model.transform(train)

        # Use the transformation model fitted on the training set to transform the test set.
        test_res = self._model.transform(test)
        print(test_res)

        v = train_res.select('feature').take(1)[0].feature
        logger.info('feature vector size = %d', v.size)
        if v.size != self.feature_dim():
            raise RuntimeError(
                f'feature vector size not match,'
                f' real({v.size}) != calc({self.feature_dim()})')

        return train_res, test_res
Example #9
                       sf.when(sf.col('tip_amount') > 0, 1).otherwise(0))
taxi = taxi.drop('tip_amount')
taxi.show(5)
taxi.columns

categorical_columns = taxi.columns[:-1]
categorical_columns
stringindexer_stages = [
    StringIndexer(inputCol=c, outputCol='stringindexed_' + c)
    for c in categorical_columns
]
# encode label column and add it to stringindexer stages
stringindexer_stages += [StringIndexer(inputCol='tip', outputCol='label')]

onehotencoder_stages = [
    OneHotEncoder(inputCol='stringindexed_' + c, outputCol='onehot_' + c)
    for c in categorical_columns
]

feature_columns = ['onehot_' + c for c in categorical_columns]
vectorassembler_stage = VectorAssembler(inputCols=feature_columns,
                                        outputCol='features')

all_stages = stringindexer_stages + onehotencoder_stages + [
    vectorassembler_stage
]
pipeline = Pipeline(stages=all_stages)

pipeline_model = pipeline.fit(taxi)

final_columns = feature_columns + ['features', 'label']
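
# A hedged sketch of the final projection (not part of the original snippet):
taxi_transformed = pipeline_model.transform(taxi).select(final_columns)
taxi_transformed.show(5)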
Example #10
c16I = StringIndexer(inputCol="C16", outputCol="iC16", handleInvalid="skip")
c18I = StringIndexer(inputCol="C18", outputCol="iC18", handleInvalid="skip")
c19I = StringIndexer(inputCol="C19", outputCol="iC19", handleInvalid="skip")
c21I = StringIndexer(inputCol="C21", outputCol="iC21", handleInvalid="skip")
appcatI = StringIndexer(inputCol="app_category",
                        outputCol="i_app_category",
                        handleInvalid="skip")
devtypeI = StringIndexer(inputCol="device_type",
                         outputCol="i_device_type",
                         handleInvalid="skip")
sitecatI = StringIndexer(inputCol="site_category",
                         outputCol="i_site_category",
                         handleInvalid="skip")

#OneHotEncoder applied after the StringIndexer to form a binary vector for each column
c1E = OneHotEncoder(inputCol="iC1", outputCol="C1Vector")
c15E = OneHotEncoder(inputCol="iC15", outputCol="C15Vector")
c16E = OneHotEncoder(inputCol="iC16", outputCol="C16Vector")
c18E = OneHotEncoder(inputCol="iC18", outputCol="C18Vector")
c19E = OneHotEncoder(inputCol="iC19", outputCol="C19Vector")
c21E = OneHotEncoder(inputCol="iC21", outputCol="C21Vector")
appcatE = OneHotEncoder(inputCol="i_app_category",
                        outputCol="i_app_category_Vector")
devtypeE = OneHotEncoder(inputCol="i_device_type",
                         outputCol="i_device_type_Vector")
sitecatE = OneHotEncoder(inputCol="i_site_category",
                         outputCol="i_site_category_Vector")

#Vector assembler
fAssembler = VectorAssembler(inputCols=[
    "C1Vector", "C15Vector", "C16Vector", "C18Vector", "C19Vector",
Example #11
    mushrooms = spark.read.csv(sys.argv[1], header=True)
    myColumnList = mushrooms.columns[0:20]
    # mushrooms = mushrooms.sample(0.9)
    mushrooms = mushrooms[myColumnList]

    in_cols = mushrooms.schema.names[1:]

    str_indexers = [
        StringIndexer(inputCol=c, outputCol=c + '_idx') for c in in_cols
    ]
    # a list of StringIndexer objects to convert strings to integer indices
    # each indexer is responsible for converting one feature column

    onehot_encoders = [
        OneHotEncoder(dropLast=False,
                      inputCol=c + '_idx',
                      outputCol=c + '_onehot') for c in in_cols
    ]
    # a list of OneHotEncoder objects to convert integer indices of cat levels to one-hot encoded columns
    # each encoder is responsible for encoding one feature column

    onehot_cols = [c + '_onehot' for c in in_cols]

    feat_assembler = VectorAssembler(inputCols=onehot_cols,
                                     outputCol='features')
    # a VectorAssembler object that assembles all the one-hot encoded columns into one column,
    # each row of which is a vector of all the numbers in those one-hot columns.
    # e.g.
    # +-----+-----+-----+-----+---------------------+
    # |cat_0|cat_1|cat_2|cat_3|             features|
    # +-----+-----+-----+-----+---------------------+
Example #12
        .getOrCreate()

#load data in csv format with header
rawData = spark.read.load("./hour.csv",format="csv",header=True)
rawData.count()#17379
data=rawData
#casual+registered=cnt
rawData=rawData.drop("casual","registered")#drop columns
rawData=rawData.withColumnRenamed("cnt","label")#rename columns
cat_features=rawData.columns[2:10]

for col in cat_features:
    #must give a new column name
    indexer = StringIndexer(inputCol=col, outputCol=col+"_indexed",handleInvalid='error')
    indexed = indexer.fit(rawData).transform(rawData)
    encoder = OneHotEncoder(inputCol=col+"_indexed", outputCol=col+"Vec")
    rawData = encoder.transform(indexed)


#cast columns to float
for col in rawData.columns[2:15]:
    rawData=rawData.withColumn(col,rawData[col].cast(FloatType()))

#convert date to date format and extract week day
from pyspark.sql.functions import date_format
rawData=rawData.withColumn("dteday",rawData["dteday"].cast(DateType()))
rawData=rawData.withColumn('dteday', date_format('dteday', 'u'))

isweekend=udf(lambda x:1.0 if int(x) > 5 else 0.0,FloatType())
rawData=rawData.withColumn("isWeekend",isweekend("dteday"))#whether it is weekend
rawData=rawData.drop("dteday")
def make_regr_model(data, sc, model_path, model_name, target, ml_model='default', save=True):

    t0 = time()
    # Stages for the pipeline
    stages = []

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Identify categorical and numerical variables
    catCols = [x for (x, dataType) in trainingData.dtypes if ((dataType == "string") | (dataType == "boolean"))]

    numCols = [x for (x, dataType) in trainingData.dtypes if (((dataType == "int") | (dataType == "bigint")
                                                                 | (dataType == "float") | (dataType == "double"))
               & (x != "target"))]

    # OneHotEncode categorical variables
    indexers = [StringIndexer(inputCol=column, outputCol=column + "-index", handleInvalid="keep") for column in catCols]

    encoder = OneHotEncoder(
        inputCols=[indexer.getOutputCol() for indexer in indexers],
        outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
    )
    assembler_cat = VectorAssembler(
        inputCols=encoder.getOutputCols(),
        outputCol="categorical-features",
        handleInvalid="skip"
    )

    stages += indexers
    stages += [encoder, assembler_cat]



    assembler_num = VectorAssembler(
        inputCols=numCols,
        outputCol="numerical-features",
        handleInvalid="skip"
    )

    # Standardize numerical variables
    scaler = StandardScaler(inputCol="numerical-features", outputCol="numerical-features_scaled")

    # Combine all features in one vector
    assembler_all = VectorAssembler(
        inputCols=['categorical-features', 'numerical-features_scaled'],
        outputCol='features',
        handleInvalid="skip"
    )

    stages += [assembler_num, scaler, assembler_all]

    # Train a RandomForest model.
    if ml_model == 'default':
        rf = RandomForestRegressor(labelCol="target", featuresCol="features")
    else:
        rf = ml_model

    stages += [rf]

    # Chain indexers and forest in a Pipeline
    pipeline = Pipeline(stages=stages)

    # Train model.  This also runs the indexers.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    #predictions.select("prediction", "target", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="target", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("RMSE = %g" % (0.0 + rmse))

    if save:
        # Final model saving and statistics writing
        tt = time() - t0
        timestamp = int(time())
        model.write().overwrite().save(model_path)

        cluster = Cluster(['127.0.0.1'], "9042")
        session = cluster.connect("models")
        query = ("INSERT INTO %s (model_name, timestamp, target, learning_time, model_path, stat)") % ("models_statistics")
        query = query + " VALUES (%s, %s, %s, %s, %s, %s)"
        session.execute(query, (model_name, timestamp, target, tt, model_path, rmse))
        session.shutdown()
        cluster.shutdown()

        # Stop spark session
        sc.stop()

    if not save:
        return model, sc
Example #14
data = data.filter(lambda row: row != header)
schema = data.map(lambda x: Row(id=x[0], make=x[1], vdps=x[2], label=x[3]))
df = sqlContext.createDataFrame(schema)

# string indexer for our categorical features
# this indexes each categorical feature and we will
# save them in a data frame that maps the make name to its index
# for persistence purposes
indexer = StringIndexer(inputCol="make", outputCol="makeIDX")
df = indexer.fit(df).transform(df)
make_idx_mappings = df.select('make', 'makeIDX').distinct().show()

# one hot encoder
# this will convert the indexed strings to sparse one hot vectors
# think of this as dummy feature creation
encoder = OneHotEncoder(inputCol="makeIDX", outputCol="make_sparse_vect")
df = encoder.transform(df)

# spark models expect to see a feature vector and a prediction column
# so we need to put all our features into a vector, in this case
# the sparse vector and vdp count, we also have to do some
# data type transformations from string to double
df = df.withColumn("vdp_int", df["vdps"].cast("double"))
df = df.withColumn("label_int", df["label"].cast("double"))
assembler = VectorAssembler(inputCols=["make_sparse_vect", "vdp_int"],
                            outputCol='features')
df = assembler.transform(df)

# make the model
# the step size and iterations are touchy, so results might be funky
gbt = GBTRegressor(maxIter=100,
Example #15
# features in this form, so we are going to convert `vehicle_color` into a set
# of dummy variables. First we use
# [StringIndexer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.StringIndexer)
# to convert the string codes into numeric codes:
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer(inputCol="vehicle_color", outputCol="vehicle_color_ix")
indexer_model = indexer.fit(engineered1)
list(enumerate(indexer_model.labels))
indexed = indexer_model.transform(engineered1)
indexed.select("vehicle_color", "vehicle_color_ix").show(5)

# Then we use
# [OneHotEncoder](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.OneHotEncoder)
# to generate the dummy variables:
from pyspark.ml.feature import OneHotEncoder
encoder = OneHotEncoder(inputCol="vehicle_color_ix",
                        outputCol="vehicle_color_cd")
encoded = encoder.transform(indexed)
encoded.select("vehicle_color", "vehicle_color_ix", "vehicle_color_cd").show(5)

# **Note:** `vehicle_color_cd` is stored as a `SparseVector`.

# We can (manually) select the features and labels:
selected = encoded.select("reviewed", "vehicle_year", "vehicle_color_cd",
                          "star_rating", "high_rating")
features = ["reviewed", "vehicle_year", "vehicle_color_cd"]

# MLlib expects the features to be stored in a single column,
# so we use the
# [VectorAssembler](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler)
# class to pack them into a single vector column:
from pyspark.ml.feature import VectorAssembler
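
# A hedged sketch of the assembly step, following the pattern above. It assumes
# `vehicle_year` is already numeric; casting the boolean `reviewed` column to double
# is an added assumption, not part of the original snippet.
selected = selected.withColumn("reviewed", selected["reviewed"].cast("double"))
assembler = VectorAssembler(inputCols=features, outputCol="features")
assembled = assembler.transform(selected)
assembled.select("features", "star_rating", "high_rating").show(5)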
Example #16
    # df.printSchema()

    # indexers
    jobIndexer = StringIndexer(inputCol='Job', outputCol='JobIndex')
    genderIndexer = StringIndexer(inputCol='Gender', outputCol='GenderIndex')

    # age splits
    ageSplits = [0.0, 5.0, 12.0, 18.0, 30.0, 60.0, 120.0, float('inf')]
    ageBucketizer = Bucketizer(splits=ageSplits,
                               inputCol="Age",
                               outputCol="bucketedAge")

    # OneHotEncoder
    jobEncoder = OneHotEncoder(dropLast=False,
                               inputCol="JobIndex",
                               outputCol="JobVec")
    genderEncoder = OneHotEncoder(dropLast=False,
                                  inputCol="GenderIndex",
                                  outputCol="GenderVec")

    # boxcox transformer
    ageBoxCox = BoxCoxTransformer(inputCol='Age',
                                  outputCol='AgeT',
                                  alpha=0.54442)
    weightBoxCox = BoxCoxTransformer(inputCol='Weight',
                                     outputCol='WeightT',
                                     alpha=0.15431)
    heightBoxCox = BoxCoxTransformer(inputCol='Height',
                                     outputCol='HeightT',
                                     alpha=0.9695)
col_list = []                                                                     
skew_list = []
for i in range(len(skewness_list)):
  col_list.append(str(skewness_list[i]).replace('(',')').replace('=',')').split(')')[2])
  skew_list.append(str(skewness_list[i]).replace('(',')').replace('=',')').split(')')[4])
large_skew = []
for item in range(len(skew_list)):
  if float(skew_list[item]) > 0.75 or float(skew_list[item])< -0.75 :
    large_skew.append(col_list[item])
for item in large_skew:
  df = df.withColumn(item, log(df[item]+1))                                      # apply log function on columns with large skewness

df_str = df.select('Id')                                                         # one hot encoding 
for item in string_col:
  stringIndexer = StringIndexer(inputCol=item, outputCol= item + ' index' ).fit(df).transform(df)
  encoder = OneHotEncoder(inputCol= item + ' index', outputCol=item + ' onehot').transform(stringIndexer).select('Id',item + ' onehot')
  df = df.drop(item)
  df_str = df_str.join(encoder,'Id')                                             # the output of one-hot encoding is a vector;
                                                                                 # unlike R or Python, which expect the input to be a matrix with
                                                                                 # many columns, each row of an MLlib features input is a single vector.
df = df.join(df_str,'Id','inner')
df_price = df.select('Id','SalePrice')
df_variable = df.drop('SalePrice')

assembler = VectorAssembler(inputCols = df_variable.columns, outputCol = 'features')  # Assemble all vectors together as input
output = assembler.transform(df)
input_data = output.select('SalePrice','features')
input_data = input_data.selectExpr("SalePrice as label",'features as features')


from pyspark.ml import Pipeline
def encoder(features):
    encoders = [
        OneHotEncoder(inputCol="idx_{0}".format(x),
                      outputCol="enc_{0}".format(x)) for x in features
    ]
    return encoders
Example #19
idxIn = spark.createDataFrame([(Vectors.dense(1, 2, 3), 1),
                               (Vectors.dense(2, 5, 6), 2),
                               (Vectors.dense(1, 8, 9), 3)
                               ]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer
lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()

# COMMAND ----------

from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
from pyspark.sql.functions import udf
def make_win_col(pslot, radwin):
    if (pslot < 50) ^ (radwin == 'False'):
        return 1
    else:
        return 0

myAtomicUDF = udf(make_win_col, IntegerType())

pmpurch = pmatch_purch.withColumn('win', myAtomicUDF(pmatch_purch.player_slot, pmatch_purch.radiant_win))

final_pmpurch= pmpurch.drop('radiant_win','player_slot','match_id')

print(final_pmpurch.count())

onehotenc = OneHotEncoder(inputCol='item_id', outputCol="itemid-onehot", dropLast=False)
one_hot_df = onehotenc.transform(final_pmpurch).drop('item_id')
one_hot_df = one_hot_df.withColumnRenamed("itemid-onehot", 'item_id')

va = VectorAssembler(outputCol='features', inputCols=sorted(one_hot_df.columns)[0:-1])
explanatory_df = va.transform(one_hot_df).select('features', one_hot_df['win'].alias('label'))

lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(explanatory_df)

print(lrmodel.coefficients)
print(lrmodel.intercept)

############################################
###### ATTEMPT TO DO A RANDOM FOREST ######
#####    FAILED DUE TO NUMBER OF     ######
Example #21
    store = Store.create(args.work_dir)

    # Download MNIST dataset
    data_url = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/mnist.bz2'
    libsvm_path = os.path.join(args.data_dir, 'mnist.bz2')
    if not os.path.exists(libsvm_path):
        subprocess.check_output(['wget', data_url, '-O', libsvm_path])

    # Load dataset into a Spark DataFrame
    df = spark.read.format('libsvm') \
        .option('numFeatures', '784') \
        .load(libsvm_path)

    # One-hot encode labels into SparseVectors
    encoder = OneHotEncoder(inputCols=['label'],
                            outputCols=['label_vec'],
                            dropLast=False)
    model = encoder.fit(df)
    train_df = model.transform(df)

    # Train/test split
    train_df, test_df = train_df.randomSplit([0.9, 0.1])

    # Define the PyTorch model without any Horovod-specific parameters
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
            self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
            self.conv2_drop = nn.Dropout2d()
            self.fc1 = nn.Linear(320, 50)
Example #22
# In[101]:

df_model = df_ORG
# stringIndexer1 = StringIndexer(inputCol="Origin", outputCol="originIndex")
# model_stringIndexer = stringIndexer1.fit(df_model)
# indexedOrigin = model_stringIndexer.transform(df_model)
# encoder1 = OneHotEncoder(dropLast=False, inputCol="originIndex", outputCol="originVec")
# df_model = encoder1.transform(indexedOrigin)

# In[ ]:

stringIndexer2 = StringIndexer(inputCol="Dest", outputCol="destIndex")
model_stringIndexer = stringIndexer2.fit(df_model)
indexedDest = model_stringIndexer.transform(df_model)
encoder2 = OneHotEncoder(dropLast=False,
                         inputCol="destIndex",
                         outputCol="destVec")
df_model = encoder2.transform(indexedDest)

# We use a __labeled point__ to associate a local feature vector with a label/response. In MLlib, labeled points are used by supervised learning algorithms, and the label is stored as a double. For binary classification, a label should be either 0 (negative) or 1 (positive).

# In[105]:

assembler = VectorAssembler(inputCols=[
    'Year', 'Month', 'DayofMonth', 'DayOfWeek', 'Hour', 'Distance', 'destVec'
],
                            outputCol="features")
output = assembler.transform(df_model)
airlineRDD = output.rdd.map(
    lambda row: LabeledPoint([0, 1][row['DepDelayed']], row['features']))
def main(argv):

    parquet_path = argv[1]
    tf_path = argv[2]
    target = argv[3]

    spark = SparkSession.builder.config("spark.driver.memory", "32g").config("spark.executor.memory", "32g")\
        .config("spark.driver.maxResultSize", "20g").getOrCreate()

    with open("transform_spark.txt", "w") as file:
        file.write("spark context" + str(spark.sparkContext))
        file.write("===SeessionID===")
        file.write(str(id(spark)))

    print(spark)
    df = spark.read.option("header", "true") \
        .option("inferSchema", "true") \
        .parquet(parquet_path)
    df.repartition(10)

    # DATA TYPE SUMMARY
    data_types = defaultdict(list)
    for entry in df.schema.fields:
        data_types[str(entry.dataType)].append(entry.name)

    # NUMERIC PIPELINE
    numeric_features = data_types["DoubleType"] + data_types["IntegerType"]
    if target in numeric_features:
        numeric_features.remove(target)

    for c in data_types["IntegerType"]:
        df = df.withColumn(c, df[c].cast("double"))

    imputer = Imputer(inputCols=numeric_features, outputCols=[num + "_imputed" for num in numeric_features])
    numeric_imputed = VectorAssembler(inputCols=imputer.getOutputCols(), outputCol="imputed")
    scaler = StandardScaler(inputCol="imputed", outputCol="scaled")
    num_assembler = VectorAssembler(inputCols=["scaled"], outputCol="numeric")
    num_pipeline = Pipeline(stages=[imputer, numeric_imputed, scaler] + [num_assembler])
    df = num_pipeline.fit(df).transform(df)

    # CATEGORY PIPELINE
    category_features = [var for var in data_types["StringType"] if var != target]
    cat_missing = {}
    for var in category_features:
        cat_missing[var] = "unknown"  # Impute category features
    df = df.fillna(cat_missing)
    useful_category_features = []

    for var in category_features:
        # Drop if distinct values in a category column is greater than 15% of sample number.
        if df.select(var).distinct().count() < 0.15 * df.count():
            useful_category_features.append(var)

    indexers = [StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c), handleInvalid='skip')
                for c in useful_category_features]

    encoders = [
        OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol()))
        for indexer in indexers]

    cat_assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders],
                                    outputCol="category")
    cat_pipeline = Pipeline(stages=indexers + encoders + [cat_assembler])
    df = cat_pipeline.fit(df).transform(df)


    # Integrate features
    features_processed = VectorAssembler(inputCols=["category", "numeric"], outputCol="features")
    tot_pipeline = Pipeline(stages=[features_processed])
    processed = tot_pipeline.fit(df).transform(df)
    processed.write.mode("overwrite").parquet(tf_path)

    feature_info = {"numeric_features": numeric_features, "category_features": category_features}

    with open("./feature_info.pickle", "wb") as handle:
        pickle.dump(feature_info, handle)
Example #24
#1st decimal is an area of 11.1km
#2nd decimal is an area of 1.1km
#3rd decimal is 110 meters - USING three decimal places
#4th is 11 meters
pandas_df['x_sim'] = pandas_df['x'].str[0:8]  #non-negative data
pandas_df['x'] = pandas_df['x'].str[0:8]  #non-negative
pandas_df['y_sim'] = pandas_df['y'].str[0:6]

#encode the police dept as a feature
from pyspark.ml.feature import OneHotEncoder, StringIndexer
stringIndexer = StringIndexer(inputCol="pd_district",
                              outputCol="pd_district_Index")
model = stringIndexer.fit(data_df)
indexed = model.transform(data_df)
encoder = OneHotEncoder(dropLast=False,
                        inputCol="pd_district_Index",
                        outputCol="pd")
encoded = encoder.transform(indexed)

#encode the dependent variable - category_predict
classifyIndexer = StringIndexer(inputCol="category_predict",
                                outputCol="category")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)

#keep the following columns: x, y, hour, day, month, year, dayofweek, week, x_sim, y_sim
#drop the following
cleaned = encoded2.select([
    c for c in encoded2.columns if c not in {
        ' day_of_week', 'category_predict', 'address', 'date',
        'description_ignore', 'pd_district', 'resolution', 'pd_district_Index'
cols = df.columns
#df.printSchema()
print(cols)

#8

from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
categoricalColumns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
    'poutcome'
]
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                            outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
label_stringIdx = StringIndexer(inputCol='deposit', outputCol='label')
stages += [label_stringIdx]
numericCols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

#9
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedCols = ['label', 'features'] + cols
df = df.select(selectedCols)
artifact_location = run.info.artifact_uri

# COMMAND ----------

azRun = Run(exp, run_id)

# COMMAND ----------

df = (spark.read.format("csv").option("inferSchema", "True").option(
    "header",
    "True").load("/databricks-datasets/bikeSharing/data-001/day.csv"))
# split data
train_df, test_df = df.randomSplit([0.7, 0.3])

# One Hot Encoding
mnth_encoder = OneHotEncoder(inputCol="mnth", outputCol="encoded_mnth")
weekday_encoder = OneHotEncoder(inputCol="weekday",
                                outputCol="encoded_weekday")

# set the training variables we want to use
train_cols = ['encoded_mnth', 'encoded_weekday', 'temp', 'hum']

# convert cols to a single features col
assembler = VectorAssembler(inputCols=train_cols, outputCol="features")

# Set linear regression model
lr = LinearRegression(featuresCol="features", labelCol="cnt")

# Create pipeline
pipeline = Pipeline(stages=[mnth_encoder, weekday_encoder, assembler, lr])
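
# A hedged continuation (not in the original snippet): fit the pipeline on the training
# split and score the held-out split.
pipeline_model = pipeline.fit(train_df)
predictions = pipeline_model.transform(test_df)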
Example #27
df.summary().show()
df.groupBy('Platform').count().show()
df.groupBy('Country').count().show()
df.groupBy('Status').count().show()
df.groupBy('Country').mean().show()
df.groupBy('Platform').mean().show()
df.groupBy('Status').mean().show()

from pyspark.ml.feature import StringIndexer, OneHotEncoder

platform_indexer = StringIndexer(inputCol='Platform',
                                 outputCol='Platform_Num').fit(df)
df = platform_indexer.transform(df)
df.show(5)

platform_encoder = OneHotEncoder(inputCol='Platform_Num',
                                 outputCol='Platform_Vector').fit(df)
df = platform_encoder.transform(df)
df.show(5)

df.groupBy('Platform').count().orderBy('count', ascending=False).show(5)
df.groupBy('Platform_Num').count().orderBy('count', ascending=False).show(5)
df.groupBy('Platform_Vector').count().orderBy('count', ascending=False).show(5)

platform_encoder = OneHotEncoder(inputCol='Platform_Num',
                                 outputCol='Platform_Vector').fit(df)
df = platform_encoder.transform(df)
df.show(5)

country_indexer = StringIndexer(inputCol='Country',
                                outputCol='Country_Num').fit(df)
df = country_indexer.transform(df)
Example #28
from pyspark import SparkContext, SparkConf
import numpy

#conf = SparkConf().setAppName('MyFirstStandaloneApp').setMaster('local')
conf = SparkConf().setAppName('MyFirstStandaloneApp')
sc = SparkContext(conf=conf)

spark = SparkSession.builder.appName('EventCode').getOrCreate()
train = spark.read.csv('hdfs://tmp/DA-12-v3.csv',
                       inferSchema=True,
                       header=True)

Actor1_indexer = StringIndexer(
    inputCol='Actor1Code',
    outputCol='Actor1Code_indexer').setHandleInvalid("keep")
Actor1Code_encoder = OneHotEncoder(inputCol='Actor1Code_indexer',
                                   outputCol='Actor1CodeVec')

Actor2Code_indexer = StringIndexer(
    inputCol='Actor2Code',
    outputCol='Actor2Code_indexer').setHandleInvalid("keep")
Actor2Code_encoder = OneHotEncoder(inputCol='Actor2Code_indexer',
                                   outputCol='Actor2CodeVec')

Actor1Country_indexer = StringIndexer(
    inputCol='Actor1Geo_CountryCode',
    outputCol='Actor1Country_indexer').setHandleInvalid("keep")
Actor1Country_encoder = OneHotEncoder(inputCol='Actor1Country_indexer',
                                      outputCol='Actor1CountryVec')

Actor2Country_indexer = StringIndexer(
    inputCol='Actor2Geo_CountryCode',
Example #29
                                StringIndexer)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName("titanic").getOrCreate()

df = spark.read.csv("./files/titanic.csv", inferSchema=True, header=True)
df.show()

my_cols = df.select(
    ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
final_data = my_cols.na.drop()

gender_idx = StringIndexer(inputCol='Sex', outputCol='SexIndex')
gender_enc = OneHotEncoder(inputCol='SexIndex', outputCol='SexVec')

embark_idx = StringIndexer(inputCol='Embarked', outputCol='EmbarkIndex')
embark_enc = OneHotEncoder(inputCol='EmbarkIndex', outputCol='EmbarkVec')

assembler = VectorAssembler(inputCols=[
    'Pclass', 'SexVec', 'EmbarkVec', 'Age', 'SibSp', 'Parch', 'Fare'
],
                            outputCol='features')

log_reg = LogisticRegression(featuresCol='features', labelCol='Survived')

pipeline = Pipeline(stages=[
    gender_idx, embark_idx, gender_enc, embark_enc, assembler, log_reg
])
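
# A hedged continuation (the split ratio is an assumption): fit on a train split and
# evaluate AUC on the held-out split with the BinaryClassificationEvaluator imported above.
train_data, test_data = final_data.randomSplit([0.7, 0.3])
fit_model = pipeline.fit(train_data)
results = fit_model.transform(test_data)
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Survived')
print('AUC:', evaluator.evaluate(results))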
Example #30
cols = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
]

data_cols = data.select(cols)
data_cols.show()
final_data = data_cols.na.drop()

# Transform the categorical columns into numbers
gender_indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')

# A B C
# 0 1 2
# One hot encode ----> this is mapping everything into [1, 0, 0] [0, 1, 0] etc.
gender_encoder = OneHotEncoder(
    inputCol='SexIndex', outputCol='SexVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

embark_indexer = StringIndexer(inputCol='Embarked', outputCol='EmbarkedIndex')
embark_encoder = OneHotEncoder(
    inputCol='EmbarkedIndex', outputCol='EmbarkedVec'
)  # ---> each entry will be converted to a vector A = [1, 0] B = [0, 1]

new_cols = ['Pclass', 'SexVec', 'Age', 'SibSp', 'Parch', 'Fare', 'EmbarkedVec']
assembler = VectorAssembler(inputCols=new_cols, outputCol='features')

logreg_titanic = LogisticRegression(featuresCol='features',
                                    labelCol='Survived')

pipeline = Pipeline(stages=[
    gender_indexer, embark_indexer, gender_encoder, embark_encoder, assembler,