print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])


#****************************************************************
#*********************CROSS VALIDATION: 80%/20%******************
#*******************Model: DecisionTreeClassifier*****************
#*****************************************************************

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
#final_data.printSchema()
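
# The banner above announces cross validation, but the snippet stops after defining
# the evaluator. A minimal sketch (an assumption, not the original code) of how the
# pieces would be wired together, reusing the pipeline, evaluator and the
# dfTrain/dfTest split from above:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

grid = ParamGridBuilder().addGrid(dt.maxDepth, [10, 20]).build()
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)
cv_model = cv.fit(dfTrain)
print(evaluator.evaluate(cv_model.transform(dfTest)))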

# ## Machine Learning Pipeline

# In[66]:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, VectorIndexer
from pyspark.sql.functions import col
stages = []
for stringCols in string_cols:
    stringIndexer = StringIndexer(inputCol=stringCols,
                                  outputCol=stringCols + 'Index',
                                  handleInvalid='skip')
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                            outputCol=stringCols + "stringEnc")
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='base_plan_id',
                                outputCol='label',
                                handleInvalid='skip')
stages += [label_stringIdx]
assemblerInputs = [c + "stringEnc" for c in string_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# In[67]:

import time
start_time = time.time()
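
# The cell breaks off after starting the timer. A minimal sketch (an assumption) of
# the step that would normally follow: build and apply the Pipeline, then report the
# elapsed time. The source DataFrame name df is hypothetical here.
pipeline = Pipeline(stages=stages)
prepared_df = pipeline.fit(df).transform(df)
print("Pipeline fit/transform took %.1f seconds" % (time.time() - start_time))
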
Example #3

features=dfBigram.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(schema)

print "Features from bigrams created"

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"


dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol=string_indexer.getOutputCol(), maxDepth=10)


from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')


from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
grid=(ParamGridBuilder()
     .baseOn([evaluator.metricName,'precision'])
     .addGrid(dt.maxDepth, [10,20])
     .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid,evaluator=evaluator)

from time import time
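
# The example stops right after importing time. A minimal sketch (an assumption) of
# fitting the cross-validator defined above on the indexed bigram features and
# timing it; the variable name t0 is hypothetical.
t0 = time()
cv_model = cv.fit(featIndexed)
print("Cross validation took %.1f seconds" % (time() - t0))
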
Example #4

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
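
# The example ends once the grid is built. A minimal sketch (an assumption) of the
# cross-validation step that would follow, reusing the pipeline, grid and evaluator
# defined above:
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)
cv_model = cv.fit(dfTrain)
print(evaluator.evaluate(cv_model.transform(dfTest)))
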
Example #5

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])

print "Random split is done"

tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words')
hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(),
                       outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

grid = (ParamGridBuilder()
        .baseOn([evaluator.metricName, 'precision'])
        .addGrid(dt.maxDepth, [10, 20])
        .build())
cv = CrossValidator(estimator=pipeline,
                    estimatorParamMaps=grid,
                    evaluator=evaluator)

print "Grid is build"
def DataPreparation():
    spark = SparkSession.builder.appName('SistemaDeDeteccion').master(
        "local[*]").getOrCreate()  # Create the Spark session
    data = spark.read.csv("Burnout_Data.csv", header=True,
                          inferSchema=True)  # Load the dataset
    data = data.select('Tiempo_PlazaActual', 'EstadoCivil', 'Burnout_Antes',
                       'Hora_Social', 'Horas_Cuidados', 'Calorias', 'Peso',
                       'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias',
                       'Sales_Social', 'Edad', 'Estado_Animo',
                       'Tiempo_Vida_Laboral', 'Hijos', 'Lectura',
                       'Hora_Gratificante', 'Horas_Activ_Fisica')
    # Keep only the columns with importance p>1 according to the component analysis
    cols = data.columns  # Store the column names in a variable

    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler  # Import the classes needed to convert categorical data
    # into values the algorithms can handle, i.e. transform them into numbers
    categoricalColumns = [
        'Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias', 'Sales_Social',
        'Edad', 'Estado_Animo', 'Lectura', 'EstadoCivil'
    ]
    stages = []  # This list stores each of the steps, to be applied later in the Pipeline
    for categoricalCol in categoricalColumns:  # Index each of the categorical variables in the list
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        # Once indexed, we use OneHotEncoderEstimator, which assigns a number to each value of the categorical variable
        stages += [stringIndexer.setHandleInvalid("keep"), encoder]
        # Store this step in stages, telling the indexer to keep any invalid values

    label_stringIdx = StringIndexer(
        inputCol="Burnout_Antes", outputCol="label"
    )  # Index the variable we want to predict, Burnout_Antes, whose values
    # are TRUE and FALSE, as the label
    stages += [label_stringIdx.setHandleInvalid("keep")]
    # Store this step in stages, telling the indexer to keep any invalid values

    numericCols = [
        'Tiempo_PlazaActual', 'Hora_Social', 'Horas_Cuidados', 'Calorias',
        'Peso', 'Tiempo_Vida_Laboral', 'Hijos', 'Hora_Gratificante',
        'Horas_Activ_Fisica'
    ]
    # With the categorical variables converted to numbers, we can build a vector joining them with the numeric variables.
    assemblerInputs = [c + "classVec"
                       for c in categoricalColumns] + numericCols
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")
    # This step produces the "features" column, a vector that holds the numeric and categorical variables.
    stages += [assembler.setHandleInvalid("keep")]
    # Store this step in stages, telling the assembler to keep any invalid values

    from pyspark.ml import Pipeline
    pipeline = Pipeline(stages=stages)
    # Initialize our Pipeline and pass it the list of steps to run, stored in the stages variable.
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    # Fit and apply the model, i.e. the data preprocessing.
    path = 'modelo_Pipeline'
    os.mkdir(path)
    pipelineModel.save(os.path.join(path, 'Pipeline'))
    # Save this model, because to predict we need to apply the same model to the new data
    selectedCols = ['label', 'features'] + cols
    data = data.select(selectedCols)
    # Select the label and features columns, plus cols, which holds the column names from before preprocessing

    train, test = data.randomSplit([0.7, 0.3])
    # For training and testing we use randomSplit to divide the dataset into 70% training and 30% testing
    print("Training Dataset Count: " + str(train.count()))
    print("Test Dataset Count: " + str(test.count()))
    # Print the number of rows in each split and return them for use by the algorithms.
    return train, test
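
# A minimal usage sketch (an assumption, not part of the original function): feed the
# returned splits to a classifier trained on the assembled features. The choice of
# LogisticRegression here is illustrative.
from pyspark.ml.classification import LogisticRegression

train, test = DataPreparation()
lr_model = LogisticRegression(featuresCol='features', labelCol='label').fit(train)
lr_model.transform(test).select('label', 'prediction').show(5)
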
Example #7

def data_processing(df):
    '''
    :param df: a PySpark DataFrame
    :return: a preprocessed DataFrame that has been cleaned, indexed and assembled
    '''
    df.createOrReplaceTempView("data")

    processed_data = spark.sql("""
    select
        host_id,
        price,
        bathrooms,
        bedrooms,
        room_type,
        property_type,
        case when host_is_superhost = True
            then 1.0
            else 0.0
        end as host_is_superhost,
        accommodates,
        cancellation_policy,
        minimum_nights,
        maximum_nights,
        availability_30,
        availability_60,
        availability_90,
        availability_365,
        case when security_deposit is null
            then 0.0
            else security_deposit
        end as security_deposit,
        case when number_of_reviews is null
            then 0.0
            else number_of_reviews
        end as number_of_reviews,
        case when extra_people is null
            then 0.0
            else extra_people
        end as extra_people,
        case when instant_bookable = True
            then 1.0
            else 0.0
        end as instant_bookable,
        case when cleaning_fee is null
            then 0.0
            else cleaning_fee
        end as cleaning_fee,
        case when review_scores_rating is null
            then 0.0
            else review_scores_rating
        end as review_scores_rating,
        case when review_scores_accuracy is null
            then 0.0
            else review_scores_accuracy
        end as review_scores_accuracy,
        case when review_scores_cleanliness is null
            then 0.0
            else review_scores_cleanliness
        end as review_scores_cleanliness,
        case when review_scores_checkin is null
            then 0.0
            else review_scores_checkin
        end as review_scores_checkin,
        case when review_scores_communication is null
            then 0.0
            else review_scores_communication
        end as review_scores_communication,
        case when review_scores_location is null
            then 0.0
            else review_scores_location
        end as review_scores_location,
        case when review_scores_value is null
            then 0.0
            else review_scores_value
        end as review_scores_value,
        case when square_feet is not null and square_feet > 100
            then square_feet
            when (square_feet is null or square_feet <=100) and (bedrooms is null or bedrooms = 0)
            then 350.0
            else 380 * bedrooms
        end as square_feet,
        case when bathrooms >= 2
            then 1.0
            else 0.0
        end as n_bathrooms_more_than_two,
        case when amenity_wifi = True
            then 1.0
            else 0.0
        end as amenity_wifi,
        case when amenity_heating = True
            then 1.0
            else 0.0
        end as amenity_heating,
        case when amenity_essentials = True
            then 1.0
            else 0.0
        end as amenity_essentials,
        case when amenity_kitchen = True
            then 1.0
            else 0.0
        end as amenity_kitchen,
        case when amenity_tv = True
            then 1.0
            else 0.0
        end as amenity_tv,
        case when amenity_smoke_detector = True
            then 1.0
            else 0.0
        end as amenity_smoke_detector,
        case when amenity_washer = True
            then 1.0
            else 0.0
        end as amenity_washer,
        case when amenity_hangers = True
            then 1.0
            else 0.0
        end as amenity_hangers,
        case when amenity_laptop_friendly_workspace = True
            then 1.0
            else 0.0
        end as amenity_laptop_friendly_workspace,
        case when amenity_iron = True
            then 1.0
            else 0.0
        end as amenity_iron,
        case when amenity_shampoo = True
            then 1.0
            else 0.0
        end as amenity_shampoo,
        case when amenity_hair_dryer = True
            then 1.0
            else 0.0
        end as amenity_hair_dryer,
        case when amenity_family_kid_friendly = True
            then 1.0
            else 0.0
        end as amenity_family_kid_friendly,
        case when amenity_dryer = True
            then 1.0
            else 0.0
        end as amenity_dryer,
        case when amenity_fire_extinguisher = True
            then 1.0
            else 0.0
        end as amenity_fire_extinguisher,
        case when amenity_hot_water = True
            then 1.0
            else 0.0
        end as amenity_hot_water,
        case when amenity_internet = True
            then 1.0
            else 0.0
        end as amenity_internet,
        case when amenity_cable_tv = True
            then 1.0
            else 0.0
        end as amenity_cable_tv,
        case when amenity_carbon_monoxide_detector = True
            then 1.0
            else 0.0
        end as amenity_carbon_monoxide_detector,
        case when amenity_first_aid_kit = True
            then 1.0
            else 0.0
        end as amenity_first_aid_kit,
        case when amenity_host_greets_you = True
            then 1.0
            else 0.0
        end as amenity_host_greets_you,
        case when amenity_translation_missing_en_hosting_amenity_50 = True
            then 1.0
            else 0.0
        end as amenity_translation_missing_en_hosting_amenity_50,
        case when amenity_private_entrance = True
            then 1.0
            else 0.0
        end as amenity_private_entrance,
        case when amenity_bed_linens = True
            then 1.0
            else 0.0
        end as amenity_bed_linens,
        case when amenity_refrigerator = True
            then 1.0
            else 0.0
        end as amenity_refrigerator
    from data
    where bedrooms is not null
    """)

    processed_data = processed_data.na.drop()

    cat_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, StringType)
    ]
    num_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, IntegerType)
    ]
    decimal_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, DecimalType)
    ]
    double_cols = [
        f.name for f in processed_data.schema.fields
        if isinstance(f.dataType, DoubleType)
    ]
    num_features = num_cols + decimal_cols + double_cols
    dataset_imputed = processed_data.persist()

    stages = []
    for x in cat_cols:
        cats_indexer = StringIndexer(inputCol=x, outputCol=x + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[cats_indexer.getOutputCol()], outputCols=[x + "encode"])
        stages += [cats_indexer, encoder]

    assembler_inputs = [c + "encode" for c in cat_cols] + num_features
    assembler = VectorAssembler(inputCols=assembler_inputs,
                                outputCol="features")
    stages += [assembler]
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(dataset_imputed)
    df = pipeline_model.transform(dataset_imputed)

    return df
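
# A minimal usage sketch (an assumption): run the preprocessing and inspect the
# assembled feature vectors. The variable name listings_df is hypothetical.
prepared = data_processing(listings_df)
prepared.select("features", "price").show(5, truncate=False)
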
Example #8

from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

# Convert the relevant categorical columns into one-hot encoded vectors
indexer1 = StringIndexer(inputCol="EntityCode",
                         outputCol="EntityCodeIdx").setHandleInvalid("skip")
indexer2 = StringIndexer(inputCol="Zip",
                         outputCol="ZipIdx").setHandleInvalid("skip")
indexer3 = StringIndexer(inputCol="ProviderType",
                         outputCol="ProviderTypeIdx").setHandleInvalid("skip")

#gather all indexers as inputs to the One Hot Encoder
inputs = [
    indexer1.getOutputCol(),
    indexer2.getOutputCol(),
    indexer3.getOutputCol()
]

#create the one hot encoder
encoder_outputs = ["EntityCodeVec", "ZipVec", "ProviderTypeVec"]
encoder = OneHotEncoderEstimator(inputCols=inputs, outputCols=encoder_outputs)

#run it through a pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, encoder])
encodedData = pipeline.fit(df_physicians_all).transform(df_physicians_all)

# COMMAND ----------

encodedData.select(encoder_outputs).show(5)
Example #9

def main(spark, data_file, model_file):
    '''Main routine for supervised training

    Parameters
    ----------
    spark : SparkSession object

    data_file : string, path to the parquet file to load

    model_file : string, path to store the serialized model file
    '''

    ###
    # TODO: YOUR CODE GOES HERE
    ###

    # Read data
    df = spark.read.parquet(data_file)

    # Take 1/10 data without replacement
    df = df.sample(False, 0.1, seed = 0)

    # Vectorize selected features
    features = ['mfcc_' + '%.2d' % i for i in range(20)]
    assembler = VectorAssembler(inputCols=features, outputCol="vectorized_features")

    # Standardize the features
    scaler = StandardScaler(inputCol="vectorized_features", outputCol="scaled_features", withStd=True, withMean=False)

    # Transform string target variable into numerical
    indexer = StringIndexer(inputCol="genre", outputCol="label", handleInvalid = "skip")

    # Build logistic regression
    lr = LogisticRegression(maxIter=20, featuresCol = scaler.getOutputCol(), labelCol=indexer.getOutputCol())

    # Build a pipeline
    pipeline = Pipeline(stages = [assembler, scaler, indexer, lr])

    # Build parameter grid and cross validation
    paramGrid = ParamGridBuilder().addGrid(lr.elasticNetParam,[0.1,0.3,0.5,0.8]).addGrid(lr.regParam, [0.1,0.08,0.05,0.02,0.01]).build()

    crossval = CrossValidator(estimator = pipeline, estimatorParamMaps = paramGrid, evaluator = MulticlassClassificationEvaluator(), numFolds = 5)

    # Save model
    cvModel = crossval.fit(df)
    cvModel.bestModel.write().overwrite().save(model_file)
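
    # A minimal sketch (an assumption, not part of the original routine) of loading the
    # serialized model back for scoring; PipelineModel.load is the counterpart of the
    # .save call above.
    from pyspark.ml import PipelineModel
    reloaded = PipelineModel.load(model_file)
    reloaded.transform(df).select("label", "prediction").show(5)
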
# COMMAND ----------

# MAGIC %md The ML package needs the label and the feature vector to be added as columns to the input dataframe. We set up a pipeline that passes the data through transformers to extract the features and the label. We index each categorical column with the `StringIndexer`, mapping it to a column of numeric indices, then convert the indexed categories into one-hot encoded variables with at most a single one-value. These binary vectors are appended to the end of each row. Encoding categorical features this way lets decision trees treat them appropriately, improving performance. Finally, we use the `StringIndexer` again to encode our labels into label indices.

# COMMAND ----------

categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"]
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Using the slightly older OneHotEncoder (instead of OneHotEncoderEstimator) for compatibility reasons when operationalizing within the DSVM
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=categoricalCol + "classVec")
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]

# COMMAND ----------

# MAGIC %md Now we need to use the `VectorAssembler` to combine all the feature columns into a single vector column. This includes our numeric columns as well as the one-hot encoded binary vector columns.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
numericCols = ["Month", "DayofMonth", "CRSDepHour", "DayOfWeek", "WindSpeed", "SeaLevelPressure", "HourlyPrecip"]
Example #11

dataset_df = sqlContext.read.csv('salaries.csv', header='true', inferSchema='true')
# initializing stages of the main transformation pipeline
stages = []
# list of categorical features for subsequent one-hot encoding
cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]
# removing the column with the ID field
dataset_df = dataset_df.drop('_c0')
# binning numeric features with the local binner udf function (adjust for the current dataset if needed)
dataset_df = dataset_df.withColumn('sincephd_bin', binner(dataset_df['sincephd']))
dataset_df = dataset_df.withColumn('service_bin', binner(dataset_df['service']))
dataset_df = dataset_df.withColumn('model_type', sf.lit(0))
dataset_df = dataset_df.drop('sincephd', 'service')
# one-hot encoding the categorical features
for feature in cat_features:
    string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index")
    encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()], outputCols=[feature + "_vec"])
    encoder.setDropLast(False)
    stages += [string_indexer, encoder]
assembler_inputs = [feature + "_vec" for feature in cat_features]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
stages += [assembler]
assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
stages += [assembler_final]
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(dataset_df)
dataset_transformed = pipeline_model.transform(dataset_df)
df_transform_fin = dataset_transformed.select('features', label, 'model_type').toPandas()
train, test = train_test_split(df_transform_fin, test_size=0.3, random_state=0)
train_df = sqlContext.createDataFrame(train)
test_df = sqlContext.createDataFrame(test)
decode_dict = {}
Example #12

    def add_pyspark_features(self,
                             transform_type='countvectorizer',
                             pca=False,
                             pca_k=500,
                             chi_sqr=False,
                             chi_feature_num=500):
        '''
        Add built in pyspark feature transformations using pyspark's
        Pipeline.

        Input:
        -------
        transform_type : str (Determines how to transform the reviews
                               - 'countvectorizer'
                               - 'bigram'
                               - 'tfidf'
                               - 'word2vec')
        pca : boolean (Determines whether to run PCA on the transformed review.)
        pca_k : int (Number of features you want to reduce to.)
        chi_sqr : boolean (Determines whether to apply chi-squared feature selection.)
        chi_feature_num : int (Number of top features to keep when chi_sqr is True.)

        Output:
        -------
        None
        '''

        # Set up stages.
        stages = []

        # Tokenize reviews into vectors of words.
        regexTokenizer = RegexTokenizer(inputCol="reviewText",
                                        outputCol="words",
                                        pattern="\\W")
        # Add to stages.
        stages += [regexTokenizer]

        # Remove stopwords from the word vectors.
        add_stopwords = ['the', 'a', 'to']
        stopwordsRemover = StopWordsRemover(
            inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
        # Add to stages.
        stages += [stopwordsRemover]

        # Using CountVectorizer as our review transformation.
        if transform_type == 'countvectorizer':
            # Create count vectors from the filtered bag of words.
            countVectors = CountVectorizer(inputCol="filtered",
                                           outputCol="review_vector",
                                           vocabSize=5000,
                                           minDF=5)
            # Add to stages.
            stages += [countVectors]

        # Using TFIDF as our review transformation.
        if transform_type == 'tfidf':
            # Create TF-IDF features from the filtered words
            hashingTF = HashingTF(inputCol="filtered",
                                  outputCol="rawFeatures",
                                  numFeatures=5000)
            idf = IDF(inputCol="rawFeatures",
                      outputCol="review_vector",
                      minDocFreq=5)
            # Add to stages
            stages += [hashingTF, idf]

        # Using bigrams as our review transformation.
        if transform_type == 'bigram':

            # Single grams.
            unigram = NGram(n=1, inputCol='words', outputCol='unigrams')
            stages += [unigram]

            # Add n-grams to feature set.
            bigrams = NGram(n=2, inputCol="words", outputCol="bigrams")
            stages += [bigrams]

            # Vectorize unigrams
            unigrams_vector = CountVectorizer(inputCol="unigrams",
                                              outputCol="unigrams_vector",
                                              vocabSize=2500)
            stages += [unigrams_vector]

            bigrams_vector = CountVectorizer(inputCol="bigrams",
                                             outputCol="bigrams_vector",
                                             vocabSize=2500)
            stages += [bigrams_vector]

            # Vector assemble the unigrams and the bigrams
            ngrams = VectorAssembler(
                inputCols=['unigrams_vector', 'bigrams_vector'],
                outputCol='review_vector')
            stages += [ngrams]

        # Using word2vec as our review transformation.
        if transform_type == 'word2vec':
            word2vec = Word2Vec(vectorSize=5000,
                                minCount=0,
                                inputCol="words",
                                outputCol="review_vector")
            stages += [word2vec]

        # Use PCA if user wants to use it.
        if pca:
            pca = PCA(k=pca_k,
                      inputCol="review_vector",
                      outputCol="pcaFeatures")
            stages += [pca]

        # Perform one hot encoding on all categorical variables.
        categorical_cols = ['reviewerID']

        for col in categorical_cols:
            # Map each categorical value to an index (number).
            stringIndexer = StringIndexer(inputCol=col,
                                          outputCol=col + "_Index")
            # Use OneHotEncoder to convert categorical variables
            # into binary SparseVectors. Similar to pd.get_dummies()
            encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                                    outputCol=col + "_classVec")
            # Add to stages
            stages += [stringIndexer, encoder]

        # Numeric columns
        numericCols = ['overall_transform']

        # Get columns that we want from before the Spark pipeline.
        prev_features = [  #'neg',
            #'neu',
            #'pos',
            #'compound',
            'sentence_cnt',
            'word_cnt',
            'punctuation_cnt',
            'capital_cnt',
            'upper_word_cnt',
            'avg_word_cnt',
            'avg_punc_cnt',
            'avg_capital_cnt',
            'avg_upper_cnt'
        ]

        # Vector assemble all features into one column called features.
        assemblerInputs = ['review_vector'] + numericCols + prev_features

        # Add pca to features if user wants.
        if pca:
            assemblerInputs += ['pcaFeatures']
            assemblerInputs.remove('review_vector')

        assembler = VectorAssembler(inputCols=assemblerInputs,
                                    outputCol="unstandard_features")
        stages += [assembler]

        # Do Chi-Squared Feature Reduction if wanted.
        if chi_sqr:
            chi_selector = ChiSqSelector(numTopFeatures=chi_feature_num,
                                         featuresCol="unstandard_features",
                                         outputCol="chi_features",
                                         labelCol="label")
            stages += [chi_selector]

            scaler = StandardScaler(inputCol="chi_features",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=False)
            stages += [scaler]
        else:
            scaler = StandardScaler(inputCol="unstandard_features",
                                    outputCol="features",
                                    withStd=True,
                                    withMean=False)
            stages += [scaler]

        # Initialize the pipeline with the stages that were set.
        pipeline = Pipeline(stages=stages)

        # Fit the pipeline to training documents.
        pipelineFit = pipeline.fit(self.df)
        self.df = pipelineFit.transform(self.df)
    label = 'salary'
    numerical_cols = [
        'age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
        'hours_per_week'
    ]
    categorical_cols = [
        "workclass", "education", "marital_status", "occupation",
        "relationship", "race", "sex", "native_country"
    ]
    stages = []

    # One hot encode categorical cols
    for cname in categorical_cols:
        string_idxer = StringIndexer(inputCol=cname, outputCol=cname + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[string_idxer.getOutputCol()],
            outputCols=[cname + 'classVec'])
        stages += [string_idxer, encoder]

    # Convert labels (salary) to 0 and 1
    label_idxer = StringIndexer(inputCol="salary", outputCol="label")
    stages += [label_idxer]

    # Standardize numerical cols
    numerical_assembler = VectorAssembler(inputCols=numerical_cols,
                                          outputCol='numFeatures')
    scaler = StandardScaler(inputCol='numFeatures',
                            outputCol='norm_cols',
                            withStd=True,
                            withMean=True)
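
    # The snippet breaks off here; a minimal sketch (an assumption) of the assembly step
    # that would usually follow, combining the scaled numeric vector with the encoded
    # categorical columns into a single features column:
    stages += [numerical_assembler, scaler]
    assemblerInputs = [c + 'classVec' for c in categorical_cols] + ['norm_cols']
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
    stages += [assembler]
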
Example #14

        return when(col(x) != "", col(x)).otherwise(impute)

    def impute_1pc(x, larger_than_1pc):
        return when(col(x).isin(list(larger_than_1pc)[0]),
                    col(x)).otherwise('less_than_1pc')

    def log_transformation(x):
        return when(col(x) < 0, col(x)).otherwise(log1p(col(x)))

    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler, ChiSqSelector
    stages = []
    for categoricalCol in categorical_features_select:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + 'Index')
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        stages += [stringIndexer, encoder]

    assemblerInputs = [c + "classVec"
                       for c in categorical_features_select] + numeric_features
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")
    stages += [assembler]
    scaler = StandardScaler(inputCol='features',
                            outputCol='selected_features',
                            withStd=True,
                            withMean=True)
    stages += [scaler]
    # selector = ChiSqSelector(numTopFeatures=50, featuresCol="scaled_features",
    #                          outputCol="selected_features", labelCol="label")
Example #15

print "Text is cleaned"

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Random split is done"

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",  
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(),maxIter=30, regParam=0.01)

pipeline = Pipeline(stages=[tokenizerNoSw,
                            hashing_tf,
                            idf,
                            string_indexer,
                            dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# grid=(ParamGridBuilder()
#      .baseOn([evaluator.metricName,'precision'])
#      .addGrid(dt.maxDepth, [10,20])
#      .build())

#cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,evaluator=evaluator)
def flight(input, output):

    import pyspark
    from pyspark.sql import SparkSession
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession
    import pyspark.sql.functions as F
    from pyspark.sql import SQLContext
    from pyspark.sql.types import IntegerType, StringType
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
    from pyspark.ml.evaluation import BinaryClassificationEvaluator, ClusteringEvaluator
    from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
    from pyspark.ml.clustering import KMeans

    # Dropping the null values from the dataset
    def drop_nan_values_spark(df):
        df_drop_nan_values_spark = df.na.drop()
        return df_drop_nan_values_spark

    # Merging the labels into Binary values to perform Binary Classification
    def merge_labels_spark(df):
        df_merge_labels_spark = df.select(df.CASE_SUBMITTED_YEAR, df.EMPLOYER_NAME, df.SOC_NAME, df.FULL_TIME_POSITION, df.PREVAILING_WAGE,
               df.WORKSITE_STATE, F.when(df.CASE_STATUS == "WITHDRAWN", "DENIED") \
                    .when(df.CASE_STATUS == "CERTIFIEDWITHDRAWN", "CERTIFIED") \
                    .otherwise(df.CASE_STATUS).alias("CASE_STATUS"))
        return df_merge_labels_spark

    # Dividing the wages into the ranges
    def prevailing_wage_spark(df):
        df_prevailing_wage_spark = df.select(df.CASE_SUBMITTED_YEAR, df.EMPLOYER_NAME, df.SOC_NAME, df.FULL_TIME_POSITION,
                F.when(df.PREVAILING_WAGE <= 20000, "0-20000") \
                .when((df.PREVAILING_WAGE > 20000) & (df.PREVAILING_WAGE <= 50000), "20000-50000") \
                .when((df.PREVAILING_WAGE > 50000) & (df.PREVAILING_WAGE <= 120000), "50000-120000") \
                .when((df.PREVAILING_WAGE > 120000) & (df.PREVAILING_WAGE <= 250000), "120000-250000") \
                .otherwise(">250000").alias("WAGE_RANGE"), df.WORKSITE_STATE, df.CASE_STATUS)
        return df_prevailing_wage_spark

    # Changing the individual field to its Industry field
    def classify_employer_spark(df):
        df_classify_employer_spark = df.select(df.CASE_SUBMITTED_YEAR, df.EMPLOYER_NAME,
                F.when((df.SOC_NAME == "COMPUTER OCCUPATION") | (df.SOC_NAME == "GRAPHIC DESIGNERS") |
                  (df.SOC_NAME == "ANALYSTS"), "IT INDUSTRY") \
                .when((df.SOC_NAME == "ACCOUNTANTS") | (df.SOC_NAME == "BUSINESS OPERATIONS SPECIALIST") |
                  (df.SOC_NAME == "CHIEF EXECUTIVES") | (df.SOC_NAME == "CURATORS") |
                  (df.SOC_NAME == "EVENT PLANNERS") | (df.SOC_NAME == "FIRST LINE SUPERVISORS") |
                  (df.SOC_NAME == "HUMAN RESOURCES") | (df.SOC_NAME == "IT MANAGERS") |
                  (df.SOC_NAME == "MANAGEMENT") | (df.SOC_NAME == "MANAGERS") |
                  (df.SOC_NAME == "PUBLIC RELATIONS"), "MANAGEMENT") \
                .when((df.SOC_NAME == "ACTUARIES") | (df.SOC_NAME == "FINANCE"), "FINANCE") \
                .when((df.SOC_NAME == "AGRICULTURE") | (df.SOC_NAME == "ANIMAL HUSBANDARY") |
                  (df.SOC_NAME == "FOOD PREPARATION WORKERS"), "FOOD AND AGRICULTURE") \
                .when((df.SOC_NAME == "COACHES AND SCOUTS") | (df.SOC_NAME == "COUNSELORS") |
                  (df.SOC_NAME == "EDUCATION")| (df.SOC_NAME == "FITNESS TRAINERS") |
                  (df.SOC_NAME == "INTERPRETERS AND TRANSLATORS") | (df.SOC_NAME == "LIBRARIANS") |
                  (df.SOC_NAME == "LOGISTICIANS") | (df.SOC_NAME == "SURVEYORS") |
                  (df.SOC_NAME == "WRITERS EDITORS AND AUTHORS"), "EDUCATION") \
                .when((df.SOC_NAME == "SALES AND RELATED WORKERS") | (df.SOC_NAME == "MARKETING"), "MARKETING") \
                .when((df.SOC_NAME == "DOCTORS") | (df.SOC_NAME == "SCIENTIST") |
                  (df.SOC_NAME == "INTERNIST"), "ADVANCED SCIENCES") \
                .when((df.SOC_NAME == "COMMUNICATIONS") | (df.SOC_NAME == "ENGINEERS") |
                  (df.SOC_NAME == "LAB TECHNICIANS") | (df.SOC_NAME == "CONSTRUCTION") |
                  (df.SOC_NAME == "ARCHITECTURE") | (df.SOC_NAME == "MECHANICS"), "ENGINEERING AND ARCHITECTURE") \
                .otherwise("ARTISTS AND ENTERTAINMENT").alias("INDUSTRY"), df.FULL_TIME_POSITION, df.WAGE_RANGE, df.WORKSITE_STATE, df.CASE_STATUS)
        return df_classify_employer_spark

    # Implementation of the Spark Code
    spark = SparkSession.builder.getOrCreate()
    sc = spark.read
    sc.option('header', True)
    sc.option('inferSchema', True)
    sqlContext = SQLContext(spark)

    # Creating the dataframe from the CSV file
    df_H1b_file = sc.csv(input)

    # Pre-processing the data
    df_dnv = drop_nan_values_spark(df_H1b_file)
    df_ml = merge_labels_spark(df_dnv)
    df_pw = prevailing_wage_spark(df_ml)
    df_ce = classify_employer_spark(df_pw)

    # DataFrame after pre-processing the data from the original dataframe
    print("Data after applying pre-processing methods:")
    df_ce.show(10)

    # Converting the values using StringIndexer and encoding the values with OneHotEncoder
    categoricalColumns = [
        "EMPLOYER_NAME", "INDUSTRY", "FULL_TIME_POSITION", "WAGE_RANGE",
        "WORKSITE_STATE"
    ]
    stages = []  # stages in the Pipeline
    for categoricalCol in categoricalColumns:
        # Category Indexing with StringIndexer
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + "Index")
        # Encoding the values
        encoder = OneHotEncoderEstimator(
            inputCols=[stringIndexer.getOutputCol()],
            outputCols=[categoricalCol + "classVec"])
        # Adding the stages
        stages += [stringIndexer, encoder]

    # Setting the label value from CASE_STATUS which is to be predicted
    label_stringIdx = StringIndexer(inputCol="CASE_STATUS", outputCol="label")
    stages += [label_stringIdx]

    # Using the VectorAssembler to get the labels vector for the prediction
    assemblerInputs = [c + "classVec"
                       for c in categoricalColumns] + ["CASE_SUBMITTED_YEAR"]
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="features")
    stages += [assembler]

    # Implementing the pipeline for the flow
    partialPipeline = Pipeline().setStages(stages)
    pipelineModel = partialPipeline.fit(df_ce)
    preppedDataDF = pipelineModel.transform(df_ce)

    selectedcols = ["label", "features"] + df_ce.columns
    final_dataset = preppedDataDF.select(
        selectedcols)  # DataFrame to be used for the Machine Learning Models

    # Dividing the dataset into training and testing samples
    (trainData, testData) = final_dataset.randomSplit([0.7, 0.3], seed=100)
    print("Number of samples to train the model: " + str(trainData.count()))
    print("Number of samples to test the model: " + str(testData.count()))

    # Calling the Logistic Regression Model from the MLlib in Spark
    lrModel = LogisticRegression(featuresCol='features',
                                 labelCol='label',
                                 maxIter=15)

    # Fitting the training data in the model to train the data
    LR_Model = lrModel.fit(trainData)

    # Predicting the outputs for the test data
    predictions_LR = LR_Model.transform(testData)

    print("Predictions analysis for Logistic Regression Model:")
    predictions_LR.select("EMPLOYER_NAME", "INDUSTRY", "FULL_TIME_POSITION",
                          "WAGE_RANGE", "WORKSITE_STATE", "label",
                          "rawPrediction", "prediction",
                          "probability").show(10)

    # Evaluating the Logistic Regression model using the area under ROC
    evaluator = BinaryClassificationEvaluator()
    LR_accuracy = str(
        evaluator.evaluate(predictions_LR,
                           {evaluator.metricName: "areaUnderROC"}))
    print("Accuracy for Logistic Regression Model: " + LR_accuracy)

    # Defining the accuracy list to store the accuracies of the model
    accuracy = []

    # Appending the accuracy of Logistic Regression Model
    accuracy.append(LR_accuracy)

    # Implementation of Random Forest Model
    rf = RandomForestClassifier(featuresCol='features', labelCol='label')

    rfModel = rf.fit(trainData)

    predictions = rfModel.transform(testData)

    print("Predictions analysis for Random Forest Model:")
    predictions.select("EMPLOYER_NAME", "INDUSTRY", "FULL_TIME_POSITION",
                       "WAGE_RANGE", "WORKSITE_STATE", "label",
                       "rawPrediction", "prediction",
                       "probability").show(10)

    evaluator = BinaryClassificationEvaluator()
    RF_accuracy = str(
        evaluator.evaluate(predictions,
                           {evaluator.metricName: "areaUnderROC"}))
    print("Accuracy for Random Forest Model: " + RF_accuracy)

    # Appending the accuracy of Random Forest Model
    accuracy.append(RF_accuracy)

    # Converting the list to a Dataframe
    df_accuracy = sqlContext.createDataFrame(accuracy, StringType())
    df_accuracy = df_accuracy.selectExpr("Value as Accuracy")

    models = ["Logistic Regression Model", "Random Forest Model"]
    df_models = sqlContext.createDataFrame(models, StringType())
    df_models = df_models.selectExpr("Value as Models")

    df_accuracy = df_accuracy.withColumn("id", F.monotonically_increasing_id())
    df_models = df_models.withColumn("id", F.monotonically_increasing_id())
    df_final = df_models.join(df_accuracy, "id", "outer").drop("id")
    df_final.show()

    # Writing the file back to the storage
    df_final.repartition(1).write.option("header",
                                         "true").format('csv').save(output)

    # Implementation of kMeans Model
    for k in range(2, 9):
        kmeans = KMeans(featuresCol="features", k=k)
        model = kmeans.fit(trainData)
        wsse = model.computeCost(trainData)
        print("k = {}, the error is {}".format(k, str(
            wsse)))  # Showing the Squared Sum Errors for different values of k

    spark.stop()
    (3, "a"),
    (4, "a"),
    (5, "c")
], ["id", "category"])
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)
# default setting: dropLast=True
encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec", dropLast=False)
encoded = encoder.transform(indexed)
encoded.show()

categoricalCols = ['category']
indexers = [StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categoricalCols]
# default setting: dropLast=True
encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(),
                          outputCol="{0}_encoded".format(indexer.getOutputCol()), dropLast=False)
            for indexer in indexers]
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                            , outputCol="features")
pipeline = Pipeline(stages=indexers + encoders + [assembler])

model = pipeline.fit(df)
data = model.transform(df)


# application: get dummy variable
def get_dummy(df, indexCol, categoricalCols, continuousCols, labelCol, dropLast=False):
    '''
    Get dummy variables and concat with continuous variables for ml modeling.
    :param df: the dataframe
Example #18

from pyspark.ml.feature import (
    OneHotEncoderEstimator,
    StringIndexer,
    VectorAssembler,
    OneHotEncoder,
)

fireServiceDF = fireServiceDF.withColumn("ALSUnit",
                                         col("ALSUnit").astype("string"))
for var in categorical_variables:
    indexer = StringIndexer(
        inputCol=var,
        outputCol=var + "_Index",
        handleInvalid="keep",
        stringOrderType="alphabetAsc",
    )
    encoder = OneHotEncoder(inputCol=indexer.getOutputCol(),
                            outputCol=var + "_classVec")
    stages += [indexer, encoder]

# create 'features' column using VectorAssembler
from pyspark.ml.feature import VectorAssembler
assemblerInputs = [c + "_classVec"
                   for c in categorical_variables] + numerical_variables
print(bcolors.WARNING + str(assemblerInputs) + bcolors.ENDC)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# create pipeline from stages and apply
print(bcolors.OKBLUE + bcolors.BOLD + "Applying Pipeline" + bcolors.ENDC)
from pyspark.ml import Pipeline
import time
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                                (4, "a"), (5, "c")], ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'" %
          (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
    converted = converter.transform(indexed)

    print(
        "Transformed indexed column '%s' back to original string column '%s' using "
        "labels in metadata" %
        (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$
Example #20

pipeline = Pipeline(stages=[
  indexer, 
  assembler, 
  multinomialRegression
])

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

dbTest("ML1-P-07-02-01", True, type(indexer) == type(StringIndexer()))
dbTest("ML1-P-07-02-02", True, indexer.getInputCol() == 'species')
dbTest("ML1-P-07-02-03", True, indexer.getOutputCol() == 'speciesClass')

dbTest("ML1-P-07-02-04", True, type(assembler) == type(VectorAssembler()))
dbTest("ML1-P-07-02-05", True, assembler.getInputCols() == irisDF.columns[:-1])
dbTest("ML1-P-07-02-06", True, assembler.getOutputCol() == 'features')

dbTest("ML1-P-07-02-07", True, type(multinomialRegression) == type(LogisticRegression()))
dbTest("ML1-P-07-02-08", True, multinomialRegression.getLabelCol() == "speciesClass")
dbTest("ML1-P-07-02-09", True, multinomialRegression.getFeaturesCol() == 'features')

dbTest("ML1-P-07-02-10", True, type(pipeline) == type(Pipeline()))

print("Tests passed!")

# COMMAND ----------
Example #21

spark = SparkSession \
    .builder \
    .appName("Pyspark Model") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Create a test data frame
l = [('Alice', 1), ('Bob', 2)]
rdd = sc.parallelize(l)
Person = Row('name', 'age')
person = rdd.map(lambda r: Person(*r))
df2 = spark.createDataFrame(person)
df2.collect()

# Build a very simple pipeline using two transformers
string_indexer = StringIndexer(inputCol='name', outputCol='name_string_index')

feature_assembler = VectorAssembler(inputCols=[string_indexer.getOutputCol()],
                                    outputCol="features")

feature_pipeline = [string_indexer, feature_assembler]

featurePipeline = Pipeline(stages=feature_pipeline)

fittedPipeline = featurePipeline.fit(df2)

fittedPipeline.serializeToBundle(
    "jar:file:/pyspark_examples/pyspark.example.zip",
    fittedPipeline.transform(df2))
data_all.printSchema()

# COMMAND ----------

categoricalColumns = [
    "workclass", "education", "marital_status", "occupation", "relationship",
    "race", "sex", "native_country"
]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages += [label_stringIdx]

# Transform all features into a vector using VectorAssembler
numericCols = [
    "age", "fnlwgt", "education_num", "capital_gain", "capital_loss",
    "hours_per_week"
]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
Example #23

cv = CrossValidator(estimator=dt_clf, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=3)
cv_pipeline = Pipeline(stages=[categeriesIndexer, onehotencoder, assember, cv])
cv_pipelineModel = cv_pipeline.fit(train_df)
bestModel = cv_pipelineModel.stages[3].bestModel
predictions = bestModel.transform(test_df)
auc = evaluator.evaluate(predictions)


from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

df = spark.createDataFrame([(0, "a", 1), (1, "b", 2), (2, "c", 3), (3, "a", 4), (4, "a", 4), (5, "c", 3)], ["id", "category1", "category2"])

indexer = StringIndexer(inputCol="category1", outputCol="category1Index")
inputs = [indexer.getOutputCol(), "category2"]
encoder = OneHotEncoderEstimator(inputCols=inputs, outputCols=["categoryVec1", "categoryVec2"])
pipeline = Pipeline(stages=[indexer, encoder])
pipeline.fit(df).transform(df).show()
# +---+---------+---------+--------------+-------------+-------------+
# | id|category1|category2|category1Index| categoryVec1| categoryVec2|
# +---+---------+---------+--------------+-------------+-------------+
# |  0|        a|        1|           0.0|(2,[0],[1.0])|(4,[1],[1.0])|
# |  1|        b|        2|           2.0|    (2,[],[])|(4,[2],[1.0])|
# |  2|        c|        3|           1.0|(2,[1],[1.0])|(4,[3],[1.0])|
# |  3|        a|        4|           0.0|(2,[0],[1.0])|    (4,[],[])|
# |  4|        a|        4|           0.0|(2,[0],[1.0])|    (4,[],[])|
# |  5|        c|        3|           1.0|(2,[1],[1.0])|(4,[3],[1.0])|
# +---+---------+---------+--------------+-------------+-------------+

def encode_columns(df, col_list):
schema = StructType([StructField('label',DoubleType(),True),StructField('Vectors',VectorUDT(),True)])


features=dfTrainTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)

print "Features created"

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"

lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest=dfTestTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)

df_test_pred = lr_model.transform(testIndexed)

res=evaluator.evaluate(df_test_pred)

print(res)
# Rename columns: any "-" in a column name is replaced with "_"
columns_new = [col.replace("-", "_") for col in data_all.columns]
data_all = data_all.toDF(*columns_new)

data_all.printSchema()

# COMMAND ----------

categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
stages = [] # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category Indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoder to convert categorical variables into binary SparseVectors
    # encoder = OneHotEncoderEstimator(inputCol=categoricalCol + "Index", outputCol=categoricalCol + "classVec")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Add stages.  These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]
    
    
# Convert label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages += [label_stringIdx]

# Transform all features into a vector using VectorAssembler
numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss", "hours_per_week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

partialPipeline = Pipeline().setStages(stages)
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

    spark.stop()
Example #27

# In short, all the analyzed columns are gathered into the vector column "features"
# We try feeding incomplete columns - it turns out the model trains even without cleaning the data
categoricalColumns = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
                      'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
                      'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                      'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
                      'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                      'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                      'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
                      'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
                      'MiscFeature', 'SaleType', 'SaleCondition']

stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index').setHandleInvalid("keep")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"]).setHandleInvalid("keep")
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol = 'SalePrice', outputCol = 'label').setHandleInvalid("keep")
stages += [label_stringIdx]

numericCols = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
               'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
               'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
               'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
               'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
               'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
               '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features").setHandleInvalid("keep")
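
# The example is truncated here; a minimal sketch (an assumption) of the closing steps
# used by the other examples in this file. The DataFrame name df is hypothetical.
stages += [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)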