print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ['review', 'label']) dfTrain, dfTest = df.randomSplit([0.8,0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words('english'))) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) #**************************************************************** #*********************CROSS VALIDATION: 80%/20%****************** #*******************Model: DecisionTreeClassifier***************** #***************************************************************** evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')
# final_data.printSchema()

# ## Machine Learning Pipeline

# In[66]:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, VectorIndexer
from pyspark.sql.functions import col

stages = []
for stringCols in string_cols:
    stringIndexer = StringIndexer(inputCol=stringCols, outputCol=stringCols + 'Index',
                                  handleInvalid='skip')
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                            outputCol=stringCols + "stringEnc")
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='base_plan_id', outputCol='label', handleInvalid='skip')
stages += [label_stringIdx]

assemblerInputs = [c + "stringEnc" for c in string_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# In[67]:

import time
start_time = time.time()
# Note: .rdd.map keeps this working on Spark 2.x+, where DataFrame.map was removed
features = dfBigram.rdd.map(partial(vectorizeBi, dico=dict_broad.value)).toDF(schema)
print("Features from bigrams created")

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print("labels indexed")

dt = DecisionTreeClassifier(featuresCol='bigramVectors',
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
grid = (ParamGridBuilder()
        .baseOn([evaluator.metricName, 'precision'])
        .addGrid(dt.maxDepth, [10, 20])
        .build())
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)

from time import time
print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ["review", "label"]) dfTrain, dfTest = df.randomSplit([0.8, 0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words("english")) ) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf") idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed") dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) # **************************************************************** # *********************CROSS VALIDATION: 80%/20%****************** # *******************Model: DecisionTreeClassifier***************** # ***************************************************************** evaluator = MulticlassClassificationEvaluator( predictionCol="prediction", labelCol="target_indexed", metricName="precision" ) grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()
print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ['review', 'label']) dfTrain, dfTest = df.randomSplit([0.8, 0.2]) print "Random split is done" tokenizer = Tokenizer(inputCol='review', outputCol='reviews_words') hashing_tf = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10) pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, string_indexer, dt]) evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision') grid = (ParamGridBuilder().baseOn([evaluator.metricName, 'precision' ]).addGrid(dt.maxDepth, [10, 20]).build()) cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator) print "Grid is build"
def DataPreparation():
    spark = SparkSession.builder.appName('SistemaDeDeteccion').master(
        "local[*]").getOrCreate()  # Create the Spark session
    data = spark.read.csv("Burnout_Data.csv", header=True, inferSchema=True)  # Load the dataset
    # Keep only the columns with importance p > 1 according to the component analysis
    data = data.select('Tiempo_PlazaActual', 'EstadoCivil', 'Burnout_Antes', 'Hora_Social',
                       'Horas_Cuidados', 'Calorias', 'Peso', 'Contrato_Adjunto', 'Musica',
                       'Sexo', 'Estudias', 'Sales_Social', 'Edad', 'Estado_Animo',
                       'Tiempo_Vida_Laboral', 'Hijos', 'Lectura', 'Hora_Gratificante',
                       'Horas_Activ_Fisica')
    cols = data.columns  # Save the column names

    # Import what we need to convert categorical data into numeric values
    # that the algorithms can work with
    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

    categoricalColumns = ['Contrato_Adjunto', 'Musica', 'Sexo', 'Estudias', 'Sales_Social',
                          'Edad', 'Estado_Animo', 'Lectura', 'EstadoCivil']
    stages = []  # Each preprocessing step is collected here and later applied in the Pipeline
    for categoricalCol in categoricalColumns:
        # Index each of the categorical variables in the list
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
        # Once indexed, OneHotEncoderEstimator maps each value of the categorical
        # variable to a number
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                         outputCols=[categoricalCol + "classVec"])
        # Store the step in `stages`, keeping invalid values instead of failing on them
        stages += [stringIndexer.setHandleInvalid("keep"), encoder]

    # Index the variable we want to predict, Burnout_Antes (values VERDADERO/FALSO),
    # as the label
    label_stringIdx = StringIndexer(inputCol="Burnout_Antes", outputCol="label")
    # Store the step, keeping invalid values instead of failing on them
    stages += [label_stringIdx.setHandleInvalid("keep")]

    numericCols = ['Tiempo_PlazaActual', 'Hora_Social', 'Horas_Cuidados', 'Calorias', 'Peso',
                   'Tiempo_Vida_Laboral', 'Hijos', 'Hora_Gratificante', 'Horas_Activ_Fisica']
    # With the categorical variables converted to numbers, we can build a vector
    # joining them with the numeric variables.
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    # This step produces the "features" vector holding both numeric and categorical variables
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler.setHandleInvalid("keep")]

    from pyspark.ml import Pipeline
    # Initialize the Pipeline with the list of steps to execute, collected in `stages`
    pipeline = Pipeline(stages=stages)
    # Fit and apply the model; this performs the data preprocessing
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    path = 'modelo_Pipeline'
    os.mkdir(path)
    # Save the model: predicting on new data requires applying this same pipeline to it
    pipelineModel.save(os.path.join(path, 'Pipeline'))

    # Select label and features, plus `cols`, the column names from before preprocessing
    selectedCols = ['label', 'features'] + cols
    data = data.select(selectedCols)
    # Split the dataset with randomSplit: 70% training, 30% testing
    train, test = data.randomSplit([0.7, 0.3])
    # Print the row count of each set and return both for use by the algorithms
    print("Training Dataset Count: " + str(train.count()))
    print("Test Dataset Count: " + str(test.count()))
    return train, test
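# Since DataPreparation saves the fitted pipeline so that new data can be processed
# the same way, inference would reload it roughly like this (a sketch, assuming a
# SparkSession `spark` is available; the CSV path is a placeholder, not from the
# original code):
from pyspark.ml import PipelineModel

loaded = PipelineModel.load(os.path.join('modelo_Pipeline', 'Pipeline'))
new_data = spark.read.csv("nuevos_datos.csv", header=True, inferSchema=True)
prepared = loaded.transform(new_data)  # same indexing/encoding/assembly as in training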
def data_processing(df):
    '''
    :param df: A PySpark dataframe
    :return: Preprocessed data that has been cleaned, indexed and assembled
    '''
    df.createOrReplaceTempView("data")
    processed_data = spark.sql("""
        select host_id, price, bathrooms, bedrooms, room_type, property_type,
            case when host_is_superhost = True then 1.0 else 0.0 end as host_is_superhost,
            accommodates, cancellation_policy, minimum_nights, maximum_nights,
            availability_30, availability_60, availability_90, availability_365,
            case when security_deposit is null then 0.0 else security_deposit end as security_deposit,
            case when number_of_reviews is null then 0.0 else number_of_reviews end as number_of_reviews,
            case when extra_people is null then 0.0 else extra_people end as extra_people,
            case when instant_bookable = True then 1.0 else 0.0 end as instant_bookable,
            case when cleaning_fee is null then 0.0 else cleaning_fee end as cleaning_fee,
            case when review_scores_rating is null then 0.0 else review_scores_rating end as review_scores_rating,
            case when review_scores_accuracy is null then 0.0 else review_scores_accuracy end as review_scores_accuracy,
            case when review_scores_cleanliness is null then 0.0 else review_scores_cleanliness end as review_scores_cleanliness,
            case when review_scores_checkin is null then 0.0 else review_scores_checkin end as review_scores_checkin,
            case when review_scores_communication is null then 0.0 else review_scores_communication end as review_scores_communication,
            case when review_scores_location is null then 0.0 else review_scores_location end as review_scores_location,
            case when review_scores_value is null then 0.0 else review_scores_value end as review_scores_value,
            case when square_feet is not null and square_feet > 100 then square_feet
                 when (square_feet is null or square_feet <= 100) and (bedrooms is null or bedrooms = 0) then 350.0
                 else 380 * bedrooms end as square_feet,
            case when bathrooms >= 2 then 1.0 else 0.0 end as n_bathrooms_more_than_two,
            case when amenity_wifi = True then 1.0 else 0.0 end as amenity_wifi,
            case when amenity_heating = True then 1.0 else 0.0 end as amenity_heating,
            case when amenity_essentials = True then 1.0 else 0.0 end as amenity_essentials,
            case when amenity_kitchen = True then 1.0 else 0.0 end as amenity_kitchen,
            case when amenity_tv = True then 1.0 else 0.0 end as amenity_tv,
            case when amenity_smoke_detector = True then 1.0 else 0.0 end as amenity_smoke_detector,
            case when amenity_washer = True then 1.0 else 0.0 end as amenity_washer,
            case when amenity_hangers = True then 1.0 else 0.0 end as amenity_hangers,
            case when amenity_laptop_friendly_workspace = True then 1.0 else 0.0 end as amenity_laptop_friendly_workspace,
            case when amenity_iron = True then 1.0 else 0.0 end as amenity_iron,
            case when amenity_shampoo = True then 1.0 else 0.0 end as amenity_shampoo,
            case when amenity_hair_dryer = True then 1.0 else 0.0 end as amenity_hair_dryer,
            case when amenity_family_kid_friendly = True then 1.0 else 0.0 end as amenity_family_kid_friendly,
            case when amenity_dryer = True then 1.0 else 0.0 end as amenity_dryer,
            case when amenity_fire_extinguisher = True then 1.0 else 0.0 end as amenity_fire_extinguisher,
            case when amenity_hot_water = True then 1.0 else 0.0 end as amenity_hot_water,
            case when amenity_internet = True then 1.0 else 0.0 end as amenity_internet,
            case when amenity_cable_tv = True then 1.0 else 0.0 end as amenity_cable_tv,
            case when amenity_carbon_monoxide_detector = True then 1.0 else 0.0 end as amenity_carbon_monoxide_detector,
            case when amenity_first_aid_kit = True then 1.0 else 0.0 end as amenity_first_aid_kit,
            case when amenity_host_greets_you = True then 1.0 else 0.0 end as amenity_host_greets_you,
            case when amenity_translation_missing_en_hosting_amenity_50 = True then 1.0 else 0.0 end as amenity_translation_missing_en_hosting_amenity_50,
            case when amenity_private_entrance = True then 1.0 else 0.0 end as amenity_private_entrance,
            case when amenity_bed_linens = True then 1.0 else 0.0 end as amenity_bed_linens,
            case when amenity_refrigerator = True then 1.0 else 0.0 end as amenity_refrigerator
        from data
        where bedrooms is not null
        """)
    processed_data = processed_data.na.drop()

    # Partition the columns by Spark SQL type
    cat_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, StringType)]
    num_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, IntegerType)]
    decimal_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, DecimalType)]
    double_cols = [f.name for f in processed_data.schema.fields if isinstance(f.dataType, DoubleType)]
    num_features = num_cols + decimal_cols + double_cols

    dataset_imputed = processed_data.persist()

    # Index and one-hot encode each categorical column, then assemble all features
    stages = []
    for x in cat_cols:
        cats_indexer = StringIndexer(inputCol=x, outputCol=x + 'Index')
        encoder = OneHotEncoderEstimator(inputCols=[cats_indexer.getOutputCol()],
                                         outputCols=[x + "encode"])
        stages += [cats_indexer, encoder]

    assembler_inputs = [c + "encode" for c in cat_cols] + num_features
    assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
    stages += [assembler]

    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(dataset_imputed)
    df = pipeline_model.transform(dataset_imputed)
    return df
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoderEstimator

# Convert the relevant categorical columns into one-hot encodings
indexer1 = StringIndexer(inputCol="EntityCode", outputCol="EntityCodeIdx").setHandleInvalid("skip")
indexer2 = StringIndexer(inputCol="Zip", outputCol="ZipIdx").setHandleInvalid("skip")
indexer3 = StringIndexer(inputCol="ProviderType", outputCol="ProviderTypeIdx").setHandleInvalid("skip")

# Gather all indexer outputs as inputs to the one-hot encoder
inputs = [indexer1.getOutputCol(), indexer2.getOutputCol(), indexer3.getOutputCol()]

# Create the one-hot encoder
encoder_outputs = ["EntityCodeVec", "ZipVec", "ProviderTypeVec"]
encoder = OneHotEncoderEstimator(inputCols=inputs, outputCols=encoder_outputs)

# Run it all through a pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, encoder])
encodedData = pipeline.fit(df_physicians_all).transform(df_physicians_all)

# COMMAND ----------

encodedData.select(encoder_outputs).show(5)
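# Note: OneHotEncoderEstimator drops the last category by default (dropLast=True), so
# one category per column is represented by the all-zero vector. Pass dropLast=False to
# keep an explicit slot for every category, as some of the snippets below do.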
def main(spark, data_file, model_file):
    '''Main routine for supervised training

    Parameters
    ----------
    spark : SparkSession object
    data_file : string, path to the parquet file to load
    model_file : string, path to store the serialized model file
    '''
    # Read the data
    df = spark.read.parquet(data_file)

    # Take a 1/10 sample of the data without replacement
    df = df.sample(False, 0.1, seed=0)

    # Vectorize the selected features
    features = ['mfcc_' + '%.2d' % i for i in range(20)]
    assembler = VectorAssembler(inputCols=features, outputCol="vectorized_features")

    # Standardize the features
    scaler = StandardScaler(inputCol="vectorized_features", outputCol="scaled_features",
                            withStd=True, withMean=False)

    # Transform the string target variable into numerical indices
    indexer = StringIndexer(inputCol="genre", outputCol="label", handleInvalid="skip")

    # Build the logistic regression
    lr = LogisticRegression(maxIter=20, featuresCol=scaler.getOutputCol(),
                            labelCol=indexer.getOutputCol())

    # Build a pipeline
    pipeline = Pipeline(stages=[assembler, scaler, indexer, lr])

    # Build the parameter grid and the cross-validator
    paramGrid = (ParamGridBuilder()
                 .addGrid(lr.elasticNetParam, [0.1, 0.3, 0.5, 0.8])
                 .addGrid(lr.regParam, [0.1, 0.08, 0.05, 0.02, 0.01])
                 .build())
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=MulticlassClassificationEvaluator(),
                              numFolds=5)

    # Fit and save the best model
    cvModel = crossval.fit(df)
    cvModel.bestModel.write().overwrite().save(model_file)
# COMMAND ----------

# MAGIC %md The ML package needs the label and feature vector to be added as columns to the input dataframe. We set up a pipeline to pass the data through transformers in order to extract the features and label. We index each categorical column using the `StringIndexer` into a column of numeric indices, then convert the indexed categories into one-hot encoded variables with at most a single one-value. These binary vectors are appended to the end of each row. Encoding categorical features lets decision trees treat them appropriately, improving performance. We then use the `StringIndexer` again to encode our labels as label indices.

# COMMAND ----------

categoricalColumns = ["OriginAirportCode", "Carrier", "DestAirportCode"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # OneHotEncoderEstimator would convert categorical variables into binary SparseVectors:
    # encoder = OneHotEncoderEstimator(dropLast=False, inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    # Using the slightly older OneHotEncoder (instead of OneHotEncoderEstimator) for
    # compatibility reasons when operationalizing within the DSVM
    encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(), outputCol=categoricalCol + "classVec")
    # Add stages. These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert the label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="DepDel15", outputCol="label")
stages += [label_stringIdx]

# COMMAND ----------

# MAGIC %md Now we use the `VectorAssembler` to combine all the feature columns into a single vector column. This includes our numeric columns as well as the one-hot encoded binary vector columns.

# COMMAND ----------

# Transform all features into a vector using VectorAssembler
numericCols = ["Month", "DayofMonth", "CRSDepHour", "DayOfWeek", "WindSpeed", "SeaLevelPressure", "HourlyPrecip"]
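# COMMAND ----------

# The cell above ends at the numeric column list; the assembly step the markdown
# describes would plausibly look like this (a sketch following the same naming
# pattern, not the original cell):
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]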
dataset_df = sqlContext.read.csv('salaries.csv', header='true', inferSchema='true')

# initializing stages of the main transformation pipeline
stages = []
# list of categorical features for later one-hot encoding
cat_features = ["rank", "discipline", "sincephd_bin", "service_bin", "sex"]

# removing the column with the ID field
dataset_df = dataset_df.drop('_c0')

# binning numeric features with a local binner udf function (specified for the current dataset if needed)
dataset_df = dataset_df.withColumn('sincephd_bin', binner(dataset_df['sincephd']))
dataset_df = dataset_df.withColumn('service_bin', binner(dataset_df['service']))
dataset_df = dataset_df.withColumn('model_type', sf.lit(0))
dataset_df = dataset_df.drop('sincephd', 'service')

# one-hot encoding the categorical features
for feature in cat_features:
    string_indexer = StringIndexer(inputCol=feature, outputCol=feature + "_index")
    encoder = OneHotEncoderEstimator(inputCols=[string_indexer.getOutputCol()],
                                     outputCols=[feature + "_vec"])
    encoder.setDropLast(False)
    stages += [string_indexer, encoder]

assembler_inputs = [feature + "_vec" for feature in cat_features]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="assembled_inputs")
stages += [assembler]

assembler_final = VectorAssembler(inputCols=["assembled_inputs"], outputCol="features")
stages += [assembler_final]

pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(dataset_df)
dataset_transformed = pipeline_model.transform(dataset_df)

df_transform_fin = dataset_transformed.select('features', label, 'model_type').toPandas()
train, test = train_test_split(df_transform_fin, test_size=0.3, random_state=0)
train_df = sqlContext.createDataFrame(train)
test_df = sqlContext.createDataFrame(test)

decode_dict = {}
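# Note on the split above: the pandas round-trip pulls the whole dataset onto the
# driver just to split it. Spark's own splitter avoids that (a sketch with the same
# 70/30 proportions; randomSplit's proportions are approximate, unlike sklearn's
# exact split):
train_df, test_df = dataset_transformed.select('features', label, 'model_type') \
                                       .randomSplit([0.7, 0.3], seed=0)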
def add_pyspark_features(self, transform_type='countvectorizer', pca=False, pca_k=500,
                         chi_sqr=False, chi_feature_num=500):
    '''
    Add built-in pyspark feature transformations using pyspark's Pipeline.

    Input:
    -------
    transform_type : str (how to transform the reviews)
        - 'countvectorizer'
        - 'bigram'
        - 'tfidf'
        - 'word2vec'
    pca : boolean (whether to run PCA on the transformed review)
    pca_k : int (number of features to reduce to)
    chi_sqr : boolean (whether to run chi-squared feature selection)
    chi_feature_num : int (number of top features to keep)

    Output:
    -------
    None
    '''
    # Set up stages.
    stages = []

    # Tokenize reviews into vectors of words.
    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
    stages += [regexTokenizer]

    # Remove stopwords from the word vectors.
    add_stopwords = ['the', 'a', 'to']
    stopwordsRemover = StopWordsRemover(inputCol="words",
                                        outputCol="filtered").setStopWords(add_stopwords)
    stages += [stopwordsRemover]

    # Using CountVectorizer as our review transformation.
    if transform_type == 'countvectorizer':
        # Create count vectors from the filtered bag of words.
        countVectors = CountVectorizer(inputCol="filtered", outputCol="review_vector",
                                       vocabSize=5000, minDF=5)
        stages += [countVectors]

    # Using TF-IDF as our review transformation.
    if transform_type == 'tfidf':
        # Create the IDF from the filtered words.
        hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=5000)
        idf = IDF(inputCol="rawFeatures", outputCol="review_vector", minDocFreq=5)
        stages += [hashingTF, idf]

    # Using bigrams as our review transformation.
    if transform_type == 'bigram':
        # Single grams.
        unigram = NGram(n=1, inputCol='words', outputCol='unigrams')
        stages += [unigram]
        # Add n-grams to the feature set.
        bigrams = NGram(n=2, inputCol="words", outputCol="bigrams")
        stages += [bigrams]
        # Vectorize the unigrams and the bigrams.
        unigrams_vector = CountVectorizer(inputCol="unigrams", outputCol="unigrams_vector",
                                          vocabSize=2500)
        stages += [unigrams_vector]
        bigrams_vector = CountVectorizer(inputCol="bigrams", outputCol="bigrams_vector",
                                         vocabSize=2500)
        stages += [bigrams_vector]
        # Vector-assemble the unigrams and the bigrams.
        ngrams = VectorAssembler(inputCols=['unigrams_vector', 'bigrams_vector'],
                                 outputCol='review_vector')
        stages += [ngrams]

    # Using word2vec as our review transformation.
    if transform_type == 'word2vec':
        word2vec = Word2Vec(vectorSize=5000, minCount=0, inputCol="words",
                            outputCol="review_vector")
        stages += [word2vec]

    # Use PCA if requested (named pca_stage to avoid shadowing the pca flag).
    if pca:
        pca_stage = PCA(k=pca_k, inputCol="review_vector", outputCol="pcaFeatures")
        stages += [pca_stage]

    # Perform one-hot encoding on all categorical variables.
    categorical_cols = ['reviewerID']
    for col in categorical_cols:
        # Map each categorical value to an index (number).
        stringIndexer = StringIndexer(inputCol=col, outputCol=col + "_Index")
        # Use OneHotEncoder to convert categorical variables into binary
        # SparseVectors, similar to pd.get_dummies().
        encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                                outputCol=col + "_classVec")
        stages += [stringIndexer, encoder]

    # Numeric columns.
    numericCols = ['overall_transform']

    # Columns carried over from before the Spark pipeline.
    prev_features = [
        #'neg', #'neu', #'pos', #'compound',
        'sentence_cnt', 'word_cnt', 'punctuation_cnt', 'capital_cnt',
        'upper_word_cnt', 'avg_word_cnt', 'avg_punc_cnt', 'avg_capital_cnt',
        'avg_upper_cnt'
    ]

    # Vector-assemble all features into one column called features.
    assemblerInputs = ['review_vector'] + numericCols + prev_features
    # Swap in the PCA output if requested.
    if pca:
        assemblerInputs += ['pcaFeatures']
        assemblerInputs.remove('review_vector')
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="unstandard_features")
    stages += [assembler]

    # Do chi-squared feature reduction if requested.
    if chi_sqr:
        chi_selector = ChiSqSelector(numTopFeatures=chi_feature_num,
                                     featuresCol="unstandard_features",
                                     outputCol="chi_features", labelCol="label")
        stages += [chi_selector]
        scaler = StandardScaler(inputCol="chi_features", outputCol="features",
                                withStd=True, withMean=False)
        stages += [scaler]
    else:
        scaler = StandardScaler(inputCol="unstandard_features", outputCol="features",
                                withStd=True, withMean=False)
        stages += [scaler]

    # Initialize the pipeline with the stages that were set.
    pipeline = Pipeline(stages=stages)
    # Fit the pipeline to the training documents.
    pipelineFit = pipeline.fit(self.df)
    self.df = pipelineFit.transform(self.df)
label = 'salary'
numerical_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
                  'hours_per_week']
categorical_cols = ["workclass", "education", "marital_status", "occupation",
                    "relationship", "race", "sex", "native_country"]

stages = []
# One-hot encode the categorical columns
for cname in categorical_cols:
    string_idxer = StringIndexer(inputCol=cname, outputCol=cname + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[string_idxer.getOutputCol()],
                                     outputCols=[cname + 'classVec'])
    stages += [string_idxer, encoder]

# Convert the labels (salary) to 0 and 1
label_idxer = StringIndexer(inputCol="salary", outputCol="label")
stages += [label_idxer]

# Standardize the numerical columns
numerical_assembler = VectorAssembler(inputCols=numerical_cols, outputCol='numFeatures')
scaler = StandardScaler(inputCol='numFeatures', outputCol='norm_cols',
                        withStd=True, withMean=True)
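# The snippet stops before wiring the numeric stages in; a plausible completion
# (a sketch, not the original code) adds them to the stage list and assembles the
# scaled numerics with the one-hot vectors into the final feature column:
stages += [numerical_assembler, scaler]
assembler = VectorAssembler(inputCols=[c + 'classVec' for c in categorical_cols] + ['norm_cols'],
                            outputCol='features')
stages += [assembler]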
    return when(col(x) != "", col(x)).otherwise(impute)


def impute_1pc(x, larger_than_1pc):
    return when(col(x).isin(list(larger_than_1pc)[0]), col(x)).otherwise('less_than_1pc')


def log_transformation(x):
    return when(col(x) < 0, col(x)).otherwise(log1p(col(x)))


from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, StandardScaler, ChiSqSelector

stages = []
for categoricalCol in categorical_features_select:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

assemblerInputs = [c + "classVec" for c in categorical_features_select] + numeric_features
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

scaler = StandardScaler(inputCol='features', outputCol='selected_features',
                        withStd=True, withMean=True)
stages += [scaler]

# selector = ChiSqSelector(numTopFeatures=50, featuresCol="scaled_features",
#                          outputCol="selected_features", labelCol="label")
print "Text is cleaned" sqlContext = SQLContext(sc) df = sqlContext.createDataFrame(rdd, ['review', 'label']) dfTrain, dfTest = df.randomSplit([0.8,0.2]) print "Random split is done" tokenizerNoSw = tr.NLTKWordPunctTokenizer( inputCol="review", outputCol="wordsNoSw", stopwords=set(nltk.corpus.stopwords.words('english'))) hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf') idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf") string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed') dt = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(),maxIter=30, regParam=0.01) pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt]) evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision') # grid=(ParamGridBuilder() # .baseOn([evaluator.metricName,'precision']) # .addGrid(dt.maxDepth, [10,20]) # .build()) #cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,evaluator=evaluator)
def flight(input, output):
    import pyspark
    from pyspark.sql import SparkSession, SQLContext
    from pyspark import SparkContext, SparkConf
    import pyspark.sql.functions as F
    from pyspark.sql.types import IntegerType, StringType
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
    from pyspark.ml.evaluation import BinaryClassificationEvaluator, ClusteringEvaluator
    from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
    from pyspark.ml.clustering import KMeans

    # Drop the null values from the dataset
    def drop_nan_values_spark(df):
        return df.na.drop()

    # Merge the labels into binary values to perform binary classification
    def merge_labels_spark(df):
        df_merge_labels_spark = df.select(
            df.CASE_SUBMITTED_YEAR, df.EMPLOYER_NAME, df.SOC_NAME,
            df.FULL_TIME_POSITION, df.PREVAILING_WAGE, df.WORKSITE_STATE,
            F.when(df.CASE_STATUS == "WITHDRAWN", "DENIED")
             .when(df.CASE_STATUS == "CERTIFIEDWITHDRAWN", "CERTIFIED")
             .otherwise(df.CASE_STATUS).alias("CASE_STATUS"))
        return df_merge_labels_spark

    # Divide the wages into ranges
    def prevailing_wage_spark(df):
        df_prevailing_wage_spark = df.select(
            df.CASE_SUBMITTED_YEAR, df.EMPLOYER_NAME, df.SOC_NAME, df.FULL_TIME_POSITION,
            F.when(df.PREVAILING_WAGE <= 20000, "0-20000")
             .when((df.PREVAILING_WAGE > 20000) & (df.PREVAILING_WAGE <= 50000), "20000-50000")
             .when((df.PREVAILING_WAGE > 50000) & (df.PREVAILING_WAGE <= 120000), "50000-120000")
             .when((df.PREVAILING_WAGE > 120000) & (df.PREVAILING_WAGE <= 250000), "120000-250000")
             .otherwise(">250000").alias("WAGE_RANGE"),
            df.WORKSITE_STATE, df.CASE_STATUS)
        return df_prevailing_wage_spark

    # Map each individual occupation field to its industry
    def classify_employer_spark(df):
        df_classify_employer_spark = df.select(
            df.CASE_SUBMITTED_YEAR, df.EMPLOYER_NAME,
            F.when((df.SOC_NAME == "COMPUTER OCCUPATION") | (df.SOC_NAME == "GRAPHIC DESIGNERS") |
                   (df.SOC_NAME == "ANALYSTS"), "IT INDUSTRY")
             .when((df.SOC_NAME == "ACCOUNTANTS") | (df.SOC_NAME == "BUSINESS OPERATIONS SPECIALIST") |
                   (df.SOC_NAME == "CHIEF EXECUTIVES") | (df.SOC_NAME == "CURATORS") |
                   (df.SOC_NAME == "EVENT PLANNERS") | (df.SOC_NAME == "FIRST LINE SUPERVISORS") |
                   (df.SOC_NAME == "HUMAN RESOURCES") | (df.SOC_NAME == "IT MANAGERS") |
                   (df.SOC_NAME == "MANAGEMENT") | (df.SOC_NAME == "MANAGERS") |
                   (df.SOC_NAME == "PUBLIC RELATIONS"), "MANAGEMENT")
             .when((df.SOC_NAME == "ACTUARIES") | (df.SOC_NAME == "FINANCE"), "FINANCE")
             .when((df.SOC_NAME == "AGRICULTURE") | (df.SOC_NAME == "ANIMAL HUSBANDARY") |
                   (df.SOC_NAME == "FOOD PREPARATION WORKERS"), "FOOD AND AGRICULTURE")
             .when((df.SOC_NAME == "COACHES AND SCOUTS") | (df.SOC_NAME == "COUNSELORS") |
                   (df.SOC_NAME == "EDUCATION") | (df.SOC_NAME == "FITNESS TRAINERS") |
                   (df.SOC_NAME == "INTERPRETERS AND TRANSLATORS") | (df.SOC_NAME == "LIBRARIANS") |
                   (df.SOC_NAME == "LOGISTICIANS") | (df.SOC_NAME == "SURVEYORS") |
                   (df.SOC_NAME == "WRITERS EDITORS AND AUTHORS"), "EDUCATION")
             .when((df.SOC_NAME == "SALES AND RELATED WORKERS") | (df.SOC_NAME == "MARKETING"), "MARKETING")
             .when((df.SOC_NAME == "DOCTORS") | (df.SOC_NAME == "SCIENTIST") |
                   (df.SOC_NAME == "INTERNIST"), "ADVANCED SCIENCES")
             .when((df.SOC_NAME == "COMMUNICATIONS") | (df.SOC_NAME == "ENGINEERS") |
                   (df.SOC_NAME == "LAB TECHNICIANS") | (df.SOC_NAME == "CONSTRUCTION") |
                   (df.SOC_NAME == "ARCHITECTURE") | (df.SOC_NAME == "MECHANICS"),
                   "ENGINEERING AND ARCHITECTURE")
             .otherwise("ARTISTS AND ENTERTAINMENT").alias("INDUSTRY"),
            df.FULL_TIME_POSITION, df.WAGE_RANGE, df.WORKSITE_STATE, df.CASE_STATUS)
        return df_classify_employer_spark

    # Implementation of the Spark code
    spark = SparkSession.builder.getOrCreate()
    reader = spark.read.option('header', True).option('inferSchema', True)
    sqlContext = SQLContext(spark.sparkContext)

    # Create the dataframe from the CSV file
    df_H1b_file = reader.csv(input)

    # Pre-process the data
    df_dnv = drop_nan_values_spark(df_H1b_file)
    df_ml = merge_labels_spark(df_dnv)
    df_pw = prevailing_wage_spark(df_ml)
    df_ce = classify_employer_spark(df_pw)

    # DataFrame after pre-processing the original dataframe
    print("Data after applying pre-processing methods:")
    df_ce.show(10)

    # Index the categorical values with StringIndexer and encode them with OneHotEncoderEstimator
    categoricalColumns = ["EMPLOYER_NAME", "INDUSTRY", "FULL_TIME_POSITION",
                          "WAGE_RANGE", "WORKSITE_STATE"]
    stages = []  # stages in the Pipeline
    for categoricalCol in categoricalColumns:
        # Category indexing with StringIndexer
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
        # Encode the indexed values
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                         outputCols=[categoricalCol + "classVec"])
        stages += [stringIndexer, encoder]

    # Set the label to be predicted from CASE_STATUS
    label_stringIdx = StringIndexer(inputCol="CASE_STATUS", outputCol="label")
    stages += [label_stringIdx]

    # Use the VectorAssembler to build the feature vector for prediction
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + ["CASE_SUBMITTED_YEAR"]
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
    stages += [assembler]

    # Run the pipeline over the pre-processed data
    partialPipeline = Pipeline().setStages(stages)
    pipelineModel = partialPipeline.fit(df_ce)
    preppedDataDF = pipelineModel.transform(df_ce)
    selectedcols = ["label", "features"] + df_ce.columns
    final_dataset = preppedDataDF.select(selectedcols)  # DataFrame used by the ML models

    # Divide the dataset into training and testing samples
    (trainData, testData) = final_dataset.randomSplit([0.7, 0.3], seed=100)
    print("Number of samples to train the model: " + str(trainData.count()))
    print("Number of samples to test the model: " + str(testData.count()))

    # Logistic regression model from Spark ML
    lrModel = LogisticRegression(featuresCol='features', labelCol='label', maxIter=15)
    # Fit the model on the training data
    LR_Model = lrModel.fit(trainData)
    # Predict on the test data
    predictions_LR = LR_Model.transform(testData)
    print("Predictions analysis for Logistic Regression Model:")
    predictions_LR.select("EMPLOYER_NAME", "INDUSTRY", "FULL_TIME_POSITION", "WAGE_RANGE",
                          "WORKSITE_STATE", "label", "rawPrediction", "prediction",
                          "probability").show(10)

    # Evaluate the logistic regression by area under ROC
    evaluator = BinaryClassificationEvaluator()
    LR_accuracy = str(evaluator.evaluate(predictions_LR, {evaluator.metricName: "areaUnderROC"}))
    print("Accuracy for Logistic Regression Model: " + LR_accuracy)

    # Accuracy list to store the metrics of each model
    accuracy = []
    # Append the accuracy of the logistic regression model
    accuracy.append(LR_accuracy)

    # Implementation of the random forest model
    rf = RandomForestClassifier(featuresCol='features', labelCol='label')
    rfModel = rf.fit(trainData)
    predictions = rfModel.transform(testData)
print("Predictions analysis for Random Forest Model:") predictions_LR.select("EMPLOYER_NAME", "INDUSTRY", "FULL_TIME_POSITION", "WAGE_RANGE", "WORKSITE_STATE", "label", "rawPrediction", "prediction", "probability").show(10) evaluator = BinaryClassificationEvaluator() RF_accuracy = str( evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})) print("Accuracy for Random Forest Model: " + RF_accuracy) # Appending the accuracy of Random Forest Model accuracy.append(RF_accuracy) # Converting the list to a Dataframe df_accuracy = sqlContext.createDataFrame(accuracy, StringType()) df_accuracy = df_accuracy.selectExpr("Value as Accuracy") models = ["Logistic Regression Model", "Random Forest Model"] df_models = sqlContext.createDataFrame(models, StringType()) df_models = df_models.selectExpr("Value as Models") df_accuracy = df_accuracy.withColumn("id", F.monotonically_increasing_id()) df_models = df_models.withColumn("id", F.monotonically_increasing_id()) df_final = df_models.join(df_accuracy, "id", "outer").drop("id") df_final.show() # Writing the file back to the storage df_final.repartition(1).write.option("header", "true").format('csv').save(output) # Implementation of kMeans Model for k in range(2, 9): kmeans = KMeans(featuresCol="features", k=k) model = kmeans.fit(trainData) wsse = model.computeCost(trainData) print("k = {}, the error is {}".format(k, str( wsse))) # Showing the Squared Sum Errors for different values of k spark.stop()
(3, "a"), (4, "a"), (5, "c") ], ["id", "category"]) stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex") model = stringIndexer.fit(df) indexed = model.transform(df) # default setting: dropLast=True encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec", dropLast=False) encoded = encoder.transform(indexed) encoded.show() categoricalCols = ['category'] indexers = [StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categoricalCols] # default setting: dropLast=True encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol()), dropLast=False) for indexer in indexers] assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] , outputCol="features") pipeline = Pipeline(stages=indexers + encoders + [assembler]) model = pipeline.fit(df) data = model.transform(df) # application: get dummy variable def get_dummy(df, indexCol, categoricalCols, continuousCols, labelCol, dropLast=False): ''' Get dummy variables and concat with continuous variables for ml modeling. :param df: the dataframe
    OneHotEncoderEstimator,
    StringIndexer,
    VectorAssembler,
    OneHotEncoder,
)

fireServiceDF = fireServiceDF.withColumn("ALSUnit", col("ALSUnit").astype("string"))

for var in categorical_variables:
    indexer = StringIndexer(
        inputCol=var,
        outputCol=var + "_Index",
        handleInvalid="keep",
        stringOrderType="alphabetAsc",
    )
    encoder = OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol=var + "_classVec")
    stages += [indexer, encoder]

# create the 'features' column using VectorAssembler
from pyspark.ml.feature import VectorAssembler

assemblerInputs = [c + "_classVec" for c in categorical_variables] + numerical_variables
print(bcolors.WARNING + str(assemblerInputs) + bcolors.ENDC)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# create the pipeline from the stages and apply it
print(bcolors.OKBLUE + bcolors.BOLD + "Applying Pipeline" + bcolors.ENDC)
from pyspark.ml import Pipeline
import time
if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("IndexToStringExample")\ .getOrCreate() # $example on$ df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") model = indexer.fit(df) indexed = model.transform(df) print("Transformed string column '%s' to indexed column '%s'" % (indexer.getInputCol(), indexer.getOutputCol())) indexed.show() print("StringIndexer will store labels in output column metadata\n") converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory") converted = converter.transform(indexed) print( "Transformed indexed column '%s' back to original string column '%s' using " "labels in metadata" % (converter.getInputCol(), converter.getOutputCol())) converted.select("id", "categoryIndex", "originalCategory").show() # $example off$
pipeline = Pipeline(stages=[indexer, assembler, multinomialRegression])

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

dbTest("ML1-P-07-02-01", True, type(indexer) == type(StringIndexer()))
dbTest("ML1-P-07-02-02", True, indexer.getInputCol() == 'species')
dbTest("ML1-P-07-02-03", True, indexer.getOutputCol() == 'speciesClass')

dbTest("ML1-P-07-02-04", True, type(assembler) == type(VectorAssembler()))
dbTest("ML1-P-07-02-05", True, assembler.getInputCols() == irisDF.columns[:-1])
dbTest("ML1-P-07-02-06", True, assembler.getOutputCol() == 'features')

dbTest("ML1-P-07-02-07", True, type(multinomialRegression) == type(LogisticRegression()))
dbTest("ML1-P-07-02-08", True, multinomialRegression.getLabelCol() == "speciesClass")
dbTest("ML1-P-07-02-09", True, multinomialRegression.getFeaturesCol() == 'features')

dbTest("ML1-P-07-02-10", True, type(pipeline) == type(Pipeline()))

print("Tests passed!")

# COMMAND ----------
spark = SparkSession \
    .builder \
    .appName("Pyspark Model") \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Create a test data frame
l = [('Alice', 1), ('Bob', 2)]
rdd = sc.parallelize(l)
Person = Row('name', 'age')
person = rdd.map(lambda r: Person(*r))
df2 = spark.createDataFrame(person)
df2.collect()

# Build a very simple pipeline using two transformers
string_indexer = StringIndexer(inputCol='name', outputCol='name_string_index')
feature_assembler = VectorAssembler(inputCols=[string_indexer.getOutputCol()],
                                    outputCol="features")
feature_pipeline = [string_indexer, feature_assembler]
featurePipeline = Pipeline(stages=feature_pipeline)
fittedPipeline = featurePipeline.fit(df2)
fittedPipeline.serializeToBundle("jar:file:/pyspark_examples/pyspark.example.zip",
                                 fittedPipeline.transform(df2))
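# Note: serializeToBundle is not part of stock PySpark; it is added to fitted pipelines
# by MLeap's Python bindings, which must be imported before the call. A sketch of the
# round trip, assuming the mleap-pyspark package is installed:
import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer  # patches serializeToBundle onto models
from pyspark.ml import PipelineModel

deserialized = PipelineModel.deserializeFromBundle("jar:file:/pyspark_examples/pyspark.example.zip")
deserialized.transform(df2).show()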
cv = CrossValidator(estimator=dt_clf, evaluator=evaluator,
                    estimatorParamMaps=paramGrid, numFolds=3)
cv_pipeline = Pipeline(stages=[categeriesIndexer, onehotencoder, assember, cv])
cv_pipelineModel = cv_pipeline.fit(train_df)
bestModel = cv_pipelineModel.stages[3].bestModel
# Transform with the full fitted pipeline so the feature stages run before the model
predictions = cv_pipelineModel.transform(test_df)
auc = evaluator.evaluate(predictions)

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator

df = spark.createDataFrame([(0, "a", 1), (1, "b", 2), (2, "c", 3),
                            (3, "a", 4), (4, "a", 4), (5, "c", 3)],
                           ["id", "category1", "category2"])

indexer = StringIndexer(inputCol="category1", outputCol="category1Index")
inputs = [indexer.getOutputCol(), "category2"]
encoder = OneHotEncoderEstimator(inputCols=inputs, outputCols=["categoryVec1", "categoryVec2"])
pipeline = Pipeline(stages=[indexer, encoder])
pipeline.fit(df).transform(df).show()

# +---+---------+---------+--------------+-------------+-------------+
# | id|category1|category2|category1Index| categoryVec1| categoryVec2|
# +---+---------+---------+--------------+-------------+-------------+
# |  0|        a|        1|           0.0|(2,[0],[1.0])|(4,[1],[1.0])|
# |  1|        b|        2|           2.0|    (2,[],[])|(4,[2],[1.0])|
# |  2|        c|        3|           1.0|(2,[1],[1.0])|(4,[3],[1.0])|
# |  3|        a|        4|           0.0|(2,[0],[1.0])|    (4,[],[])|
# |  4|        a|        4|           0.0|(2,[0],[1.0])|    (4,[],[])|
# |  5|        c|        3|           1.0|(2,[1],[1.0])|(4,[3],[1.0])|
# +---+---------+---------+--------------+-------------+-------------+


def encode_columns(df, col_list):
schema = StructType([StructField('label', DoubleType(), True),
                     StructField('Vectors', VectorUDT(), True)])

# Note: .rdd.map keeps this working on Spark 2.x+, where DataFrame.map was removed
features = dfTrainTok.rdd.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
print("Features created")

from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)
print("labels indexed")

lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

lr_model = lr.fit(featIndexed)

dfTestTok = tokenizer.transform(dfTest)
featuresTest = dfTestTok.rdd.map(partial(vectorize, dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)
df_test_pred = lr_model.transform(testIndexed)

res = evaluator.evaluate(df_test_pred)
print(res)
# Rename columns: every "-" in a column name is replaced with "_"
columns_new = [col.replace("-", "_") for col in data_all.columns]
data_all = data_all.toDF(*columns_new)
data_all.printSchema()

# COMMAND ----------

categoricalColumns = ["workclass", "education", "marital_status", "occupation",
                      "relationship", "race", "sex", "native_country"]
stages = []  # stages in our Pipeline
for categoricalCol in categoricalColumns:
    # Category indexing with StringIndexer
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + "Index")
    # Use OneHotEncoderEstimator to convert categorical variables into binary SparseVectors
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"])
    # Add stages. These are not run here, but will run all at once later on.
    stages += [stringIndexer, encoder]

# Convert the label into label indices using the StringIndexer
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
stages += [label_stringIdx]

# Transform all features into a vector using VectorAssembler
numericCols = ["age", "fnlwgt", "education_num", "capital_gain", "capital_loss",
               "hours_per_week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

partialPipeline = Pipeline().setStages(stages)
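# A likely continuation (a sketch mirroring the other pipelines in this section,
# not the original cell): fit the partial pipeline and materialize the features.
pipelineModel = partialPipeline.fit(data_all)
preppedDataDF = pipelineModel.transform(data_all)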
if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("IndexToStringExample")\ .getOrCreate() # $example on$ df = spark.createDataFrame( [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")], ["id", "category"]) indexer = StringIndexer(inputCol="category", outputCol="categoryIndex") model = indexer.fit(df) indexed = model.transform(df) print("Transformed string column '%s' to indexed column '%s'" % (indexer.getInputCol(), indexer.getOutputCol())) indexed.show() print("StringIndexer will store labels in output column metadata\n") converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory") converted = converter.transform(indexed) print("Transformed indexed column '%s' back to original string column '%s' using " "labels in metadata" % (converter.getInputCol(), converter.getOutputCol())) converted.select("id", "categoryIndex", "originalCategory").show() # $example off$ spark.stop()
# In short: put all of the analyzed columns into a single "features" vector column.
# We try feeding in incomplete columns: it works, and the model trains even without
# cleaning the data.
categoricalColumns = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
                      'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
                      'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
                      'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
                      'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
                      'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
                      'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
                      'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
                      'SaleType', 'SaleCondition']

stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + 'Index').setHandleInvalid("keep")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()],
                                     outputCols=[categoricalCol + "classVec"]).setHandleInvalid("keep")
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='SalePrice', outputCol='label').setHandleInvalid("keep")
stages += [label_stringIdx]

numericCols = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
               'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
               'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
               'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
               'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
               'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
               '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features").setHandleInvalid("keep")
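# The snippet ends before the assembler joins the stage list; the usual next steps
# would look like this (a sketch in the pattern of the other pipelines here, assuming
# a `houses_df` DataFrame that is not defined in the original):
stages += [assembler]
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(houses_df)
prepared = pipelineModel.transform(houses_df)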