def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Crossing", "Finishing", "HeadingAccuracy", "ShortPassing", "Volleys",
        "Dribbling", "Curve", "FKAccuracy", "LongPassing", "BallControl",
        "Acceleration", "SprintSpeed", "Agility", "Reactions", "Balance",
        "ShotPower", "Jumping", "Stamina", "Strength", "LongShots",
        "Aggression", "Interceptions", "Positioning", "Vision", "Penalties",
        "Composure", "Marking", "StandingTackle", "SlidingTackle", "GKDiving",
        "GKHandling", "GKKicking", "GKPositioning", "GKReflexes"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute the most to the model
    selector = ChiSqSelector(numTopFeatures=5, featuresCol="indexedFeatures",
                             labelCol="Position", outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "Edad", "Genero", "Zona", "Fumador_Activo",
        "ultimo_estado_de_Glicemia", "Enfermedad_Coronaria",
        "Tension_sistolica", "Tension_diastolica", "Colesterol_Total",
        "Trigliceridos", "Clasificacion_RCV_Global", "Glicemia_de_ayuno",
        "Perimetro_Abdominal", "Peso", "IMC", "CLAIFICACION_IMC",
        "Creatinina", "Factor_correccion", "Proteinuria",
        "Farmacos_Antihipertensivos", "Estatina", "Antidiabeticos",
        "Adherencia_tratamiento"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                            maxCategories=15)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute the most to the model
    selector = ChiSqSelector(numTopFeatures=15, featuresCol="indexedFeatures",
                             labelCol="Diabetes", outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show(100)
def test_chi_sq_selector(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
         (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
         (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0)],
        ["features", "label"])
    selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
    model = selector.fit(data)

    # the ONNX input name should match the Spark feature column name
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ChiSqSelector',
        [('features', FloatTensorType([None, feature_count]))])
    self.assertTrue(model_onnx is not None)

    # run the model
    predicted = model.transform(data)
    expected = predicted.toPandas().selectedFeatures.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlChiSqSelector")
    onnx_model_path = paths[-1]
    output, output_shapes = run_onnx_model(['selectedFeatures'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def feature_selector_process(spark, ml_df, spark_artefacts_dir, run_mode, i,
                             feature_cols):
    # APPLY CHI-SQUARE SELECTOR
    name = f"ChiSquareSelectorModel_{i}"
    selector_model_path = Path(spark_artefacts_dir).joinpath(name)

    if run_mode == 'first':
        # ChiSq test to obtain chi-square values (higher -> more dependence
        # between feature and label -> better)
        r = ChiSquareTest.test(ml_df, "features", "label")
        pValues = r.select("pvalues").collect()[0][0].tolist()
        stats = r.select("statistics").collect()[0][0].tolist()
        dof = r.select("degreesOfFreedom").collect()[0][0]

        # ChiSq selector
        selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                                 outputCol="selected_features",
                                 labelCol="label")
        selector_model = selector.fit(ml_df)
        selector_model.write().overwrite().save(
            str(selector_model_path.absolute()))

        top_10_features_importance = []
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features_importance.append(feature_cols[j])
            top_10_features.append(feature_cols[j])
            top_10_features_importance.append(stats[j])

        model_info = [
            name, ml_df.count(), None, None, None, None, None, None, None
        ] + top_10_features_importance
        model_info_df = spark.createDataFrame(data=[model_info],
                                              schema=MODEL_INFO_SCHEMA)
        model_info_df.write.jdbc(CONNECTION_STR, 'model_info', mode='append',
                                 properties=CONNECTION_PROPERTIES)

    elif run_mode == 'incremental':
        selector_model = ChiSqSelectorModel.load(
            str(selector_model_path.absolute()))
        top_10_features = []
        for j in selector_model.selectedFeatures:
            top_10_features.append(feature_cols[j])

    ml_df_10 = selector_model.transform(ml_df)
    ml_df_10 = ml_df_10.drop("features")

    # Work around a problem with ChiSqSelector output and tree-based algorithms
    ml_rdd_10 = ml_df_10.rdd.map(
        lambda row: Row(label=row[0], features=DenseVector(row[1].toArray())))
    ml_df_10 = spark.createDataFrame(ml_rdd_10)

    return ml_df_10, top_10_features
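# A minimal usage sketch for feature_selector_process. The feature column names
# below are assumptions for illustration only; MODEL_INFO_SCHEMA, CONNECTION_STR
# and CONNECTION_PROPERTIES must already be defined as in the original module.
#
# feature_cols = ["bug_density", "code_smells", "duplicated_lines"]  # assumed
# ml_df_10, top_10 = feature_selector_process(
#     spark, ml_df, "/tmp/spark_artefacts", run_mode="first", i=0,
#     feature_cols=feature_cols)
# print(top_10)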
def Chi_sqr(dataset_add, feature_colm, label_colm):
    dataset = spark.read.csv(dataset_add, header=True, inferSchema=True)
    dataset.show()

    # use RFormula for indexing, encoding and vectorising
    label = ''
    for y in label_colm:
        label = y
    print(label)

    f = label + " ~ "
    for x in feature_colm:
        f = f + x + "+"
    f = f[:-1]

    formula = RFormula(formula=f, featuresCol="features", labelCol="label")
    length = len(feature_colm)
    output = formula.fit(dataset).transform(dataset)
    output.select("features", "label").show()

    # chi-square selector
    from pyspark.ml.feature import ChiSqSelector
    selector = ChiSqSelector(numTopFeatures=length, featuresCol="features",
                             outputCol="selected_features", labelCol="label")
    result = selector.fit(output).transform(output)
    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # run the chi-square value test
    r = ChiSquareTest.test(result, "selected_features", "label").head()
    print("pValues: " + str(r.pValues))
    p_values = str(r.pValues)
    print("degreesOfFreedom: " + str(r.degreesOfFreedom))
    print("statistics: " + str(r.statistics))

    json_response = {'pvalues': p_values}
    return json_response


# Chi_sqr(dataset_add, features_colm, label_colm)
def chiSquareTest(self, categoricalFeatures, maxCategories):
    dataset = self.dataset
    labelColm = self.labelColm
    features = self.features
    length = len(features)

    featureassembler = VectorAssembler(inputCols=self.features,
                                       outputCol="featuresChiSquare",
                                       handleInvalid="skip")
    dataset = featureassembler.transform(dataset)

    vec_indexer = VectorIndexer(inputCol="featuresChiSquare",
                                outputCol='vecIndexedFeaturesChiSquare',
                                maxCategories=maxCategories,
                                handleInvalid="skip").fit(dataset)
    categorical_features = vec_indexer.categoryMaps
    print("Chose %d categorical features: %s" %
          (len(categorical_features),
           ", ".join(str(k) for k in categorical_features.keys())))
    dataset = vec_indexer.transform(dataset)

    # finalized_data = dataset.select(labelColm, 'vecIndexedFeaturesChiSquare')
    # finalized_data.show()

    # chi-square selector
    selector = ChiSqSelector(numTopFeatures=length,
                             featuresCol="vecIndexedFeaturesChiSquare",
                             outputCol="selectedFeatures",
                             labelCol=labelColm)
    result = selector.fit(dataset).transform(dataset)

    print("chi2 output with top %d features selected " %
          selector.getNumTopFeatures())
    result.show()

    # run the chi-square value test
    r = ChiSquareTest.test(result, "selectedFeatures", labelColm).head()
    p_values = list(r.pValues)
    PValues = []
    for val in p_values:
        PValues.append(round(val, 4))
    print(PValues)

    dof = list(r.degreesOfFreedom)
    stats = list(r.statistics)
    statistics = []
    for val in stats:
        statistics.append(round(val, 4))
    print(statistics)

    chiSquareDict = {}
    for pval, doF, stat, colm in zip(PValues, dof, statistics,
                                     categoricalFeatures):
        print(pval, doF, stat)
        chiSquareDict[colm] = pval, doF, stat
    chiSquareDict['summaryName'] = ['pValue', 'DoF', 'statistics']
    print(chiSquareDict)

    result = {'pvalues': chiSquareDict}
    return result
def feature_selection(t_data):
    # Feature selection with a chi-square selector
    css = ChiSqSelector(featuresCol='scaled_features', outputCol='Aspect',
                        labelCol='output', numTopFeatures=10)
    t_data = css.fit(t_data).transform(t_data)
    return t_data
def clasificar_chi2():
    # Read the data and cast each column value to float
    conf = SparkConf().setAppName("NN_1").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    rdd = sqlContext.read.csv(
        "/home/ulima-azure/data/Enfermedad_Oncologica_T3.csv",
        header=True).rdd
    rdd = rdd.map(lambda x: (float(x[0]), float(x[1]), float(x[2]),
                             float(x[3]), float(x[4]), float(x[5]),
                             float(x[6]), float(x[7]), float(x[8]),
                             float(x[9])))
    df = rdd.toDF([
        "Cellenght", "Cellsize", "Cellshape", "mgadhesion", "sepics",
        "bnuclei", "bchromatin", "nucleos", "mitoses", "P_Benigno"
    ])

    # Build the vector assembler (features)
    assembler = VectorAssembler(inputCols=[
        "Cellenght", "Cellsize", "Cellshape", "nucleos", "bchromatin",
        "mitoses"
    ], outputCol="featuresChi2")
    df_chi2 = assembler.transform(df)
    df_chi2 = df_chi2.select("featuresChi2", "P_Benigno")

    selector = ChiSqSelector(numTopFeatures=3, featuresCol="featuresChi2",
                             labelCol="P_Benigno",
                             outputCol="featuresSelected")
    df_result = selector.fit(df_chi2).transform(df_chi2)

    # Split the data into training and test sets
    (df_training, df_test) = df_result.randomSplit([0.7, 0.3])

    # Define the network architecture (hyperparameter)
    capas = [3, 4, 6, 2]

    # Build the trainer (hyperparameter: maxIter)
    entrenador = MultilayerPerceptronClassifier(
        featuresCol="featuresSelected", labelCol="P_Benigno", maxIter=1000,
        layers=capas)

    # Train the model
    modelo = entrenador.fit(df_training)

    # Validate the model
    df_predictions = modelo.transform(df_test)
    evaluador = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluador.evaluate(df_predictions)
    print(f"Accuracy: {accuracy}")
    df_predictions.select("prediction", "rawPrediction", "probability").show()

    # Show the number of 0s and 1s in the predictions
    df_predictions.groupby('prediction').count().show()
def run_feature_selection_on(data):
    LOGGER.warning("Running feature selection.")
    selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="label")
    data = selector.fit(data).transform(data).drop(
        'features').withColumnRenamed('selectedFeatures', 'features')
    LOGGER.warning("Ran feature selection.")
    return data
def pre_processing(df):
    '''feature selection'''
    selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="clicked")
    result = selector.fit(df).transform(df)
    print("ChiSqSelector output with top %d features selected" %
          selector.getNumTopFeatures())
    result.show()
def getAllMeasure(rf, selectorData, featureCols):
    measure = np.array([' ', ' ', ' '])
    for i in range(1, len(featureCols) + 1):
        selector = ChiSqSelector(numTopFeatures=i, featuresCol="features",
                                 outputCol="selectedFeatures",
                                 labelCol="label")
        selectedData = selector.fit(selectorData).transform(selectorData)
        trainSelected, testSelected = selectedData.randomSplit([0.7, 0.3])
        rfModel = rf.fit(trainSelected)
        prediction = rfModel.transform(testSelected)
        evaluator = BinaryClassificationEvaluator()
        measure = np.vstack([evaluateLr(prediction, evaluator, i), measure])
    return measure
def preprocess(inputCol=["text", "label"], n=4): tokenizer = [Tokenizer(inputCol="text", outputCol="words")] remover = [StopWordsRemover(inputCol="words", outputCol="filtered")] ngrams = [ NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i)) for i in range(1, n + 1) ] cv = [ CountVectorizer(vocabSize=2**14, inputCol="{0}_grams".format(i), outputCol="{0}_tf".format(i)) for i in range(1, n + 1) ] idf = [ IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=2) for i in range(1, n + 1) ] assembler = [ VectorAssembler( inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)], outputCol="rawFeatures") ] label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")] selector = [ ChiSqSelector(numTopFeatures=2**14, featuresCol='rawFeatures', outputCol="features") ] lr = [LogisticRegression(maxIter=1000)] return Pipeline(stages=tokenizer + remover + ngrams + cv + idf + assembler + label_stringIdx + selector + lr)
def appendselector(stages, percent=0.5):
    # A chi-square feature selector uses the chi-squared test of independence
    # to decide which features are the most "useful". By default, 50% of the
    # original number of features is kept.
    # With these Transformers, the stages for training hybrid classifiers are
    # set (a different Transformer for TF-IDF and word-embedding text-based
    # features).
    if (percent < 1.0):
        print("Appending Chi-Square to stages with percentage " + str(percent))
        selectorType = 'percentile'
        numTopFeatures = 50
        percentile = percent
    else:
        print("Appending Chi-Square to stage with numTopFeatures " + str(percent))
        selectorType = 'numTopFeatures'
        numTopFeatures = percent
        percentile = 0.1

    stages[-1].setOutputCol('prefeatures')
    selector = ChiSqSelector(numTopFeatures=numTopFeatures,
                             featuresCol='prefeatures',
                             outputCol='features',
                             selectorType=selectorType,
                             percentile=percentile)
    selectorstages = stages + [selector]
    return selectorstages
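# A minimal usage sketch for appendselector, assuming a simple tokenizer +
# CountVectorizer stage list; the `train_df` DataFrame (with 'text' and 'label'
# columns) is an assumption, not part of the original snippet.
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
cv = CountVectorizer(inputCol="words", outputCol="features")

# Keep roughly half of the term features via the 'percentile' selector type;
# appendselector redirects the last stage's output to 'prefeatures'.
pipeline = Pipeline(stages=appendselector([tokenizer, cv], percent=0.5))
# pipeline_model = pipeline.fit(train_df)  # train_df is assumed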
def MachineLearning(df):
    file_dataSVM = "G:/Projects/Spark-Machine-Learning/Spark Machine Learning/Spark Machine Learning/svm/"
    data = df.select(['Summary', 'Sentiment']).withColumnRenamed('Sentiment', 'label')
    data = data.withColumn('length', length(data['Summary']))

    # Basic sentence tokenizer
    tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
    # Remove stop words
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_features")
    # Transform the dataset into term-frequency vectors
    cv = HashingTF(inputCol="filtered_features", outputCol="features1",
                   numFeatures=1000)
    # Calculate IDF over the whole dataset
    idf = IDF(inputCol='features1', outputCol='tf_idf')
    normalizer = StandardScaler(inputCol="tf_idf", outputCol="normFeatures",
                                withStd=True, withMean=False)
    selector = ChiSqSelector(numTopFeatures=150, featuresCol="normFeatures",
                             outputCol="selectedFeatures", labelCol="label")
    # Prepare the data for the Spark ML library
    cleanUp = VectorAssembler(inputCols=['selectedFeatures'],
                              outputCol='features')

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, normalizer,
                                selector, cleanUp])
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    data.printSchema()

    train_data, test_data = data.randomSplit([0.7, 0.3], seed=2018)

    lr = LogisticRegression(featuresCol="features", labelCol='label')
    lrModel = lr.fit(train_data)

    beta = np.sort(lrModel.coefficients)
    plt.plot(beta)
    plt.ylabel('Beta Coefficients')
    plt.show()

    trainingSummary = lrModel.summary
    roc = trainingSummary.roc.toPandas()
    plt.plot(roc['FPR'], roc['TPR'])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
    print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

    pr = trainingSummary.pr.toPandas()
    plt.plot(pr['recall'], pr['precision'])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.show()

    predictions = lrModel.transform(test_data)
    evaluator = BinaryClassificationEvaluator()
    print('Test Area Under ROC', evaluator.evaluate(predictions))
def important_feature_selector(predicted):
    """Uses the chi-squared test to select important features for
    classification, and prints them out.

    Params:
    - predicted (pyspark.sql.DataFrame): The dataset, with predictions
    """
    selector = ChiSqSelector(numTopFeatures=50,
                             featuresCol='presence_feature_set',
                             labelCol='label',
                             outputCol='selected_features',
                             selectorType='numTopFeatures')
    model = selector.fit(predicted)
    important_features = model.selectedFeatures

    with open('bag_of_words_labels.json', 'r') as bow_file:
        bow_labels = json.loads(bow_file.readlines()[0])  # There is only one line
    important_feature_labels = [
        bow_labels[index] for index in important_features
    ]
    print("=====Important Feature Labels=====")
    print(important_feature_labels)
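# A minimal usage sketch for important_feature_selector. The toy DataFrame and
# the `spark` session below are assumptions for illustration; in the original
# code the 'presence_feature_set' vectors and bag_of_words_labels.json come from
# earlier pipeline steps.
from pyspark.ml.linalg import Vectors

predicted_df = spark.createDataFrame(
    [(Vectors.dense([1.0, 0.0, 1.0]), 1.0),
     (Vectors.dense([0.0, 1.0, 0.0]), 0.0)],
    ["presence_feature_set", "label"])

# Prints the labels of the selected bag-of-words features
# (requires bag_of_words_labels.json in the working directory).
# important_feature_selector(predicted_df)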
def pruebaChi(dataframe, categoricalCols, numericalCols,
              labelCol="TIPO PACIENTE"):
    """Function that does all the preprocessing of the categorical data of a
    training (or other) dataset.

    :param dataframe spark df: training dataset.
    :param categoricalCols list,array: names of the categorical columns of the dataset.
    :param numericalCols list,array: names of the numerical columns of the dataset.
    :param labelCol str: target variable or label
    :Returns spark dataframe with the 'label' and 'features' columns
    """
    # encode all the categorical variables
    stages = []
    for categoricalCol in categoricalCols:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol + "Index")
        encoder = OneHotEncoder(inputCol=stringIndexer.getOutputCol(),
                                outputCol=categoricalCol + "ohe")
        stages += [stringIndexer, encoder]

    # target variable (label)
    label_strIdx = StringIndexer(inputCol=labelCol, outputCol="label")
    stages += [label_strIdx]

    # put all the covariates into a single vector
    assemblerInputs = [c + "ohe" for c in categoricalCols] + numericalCols
    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="feat")
    stages += [assembler]

    # select the useful variables with ChiSqSelector
    selector = ChiSqSelector(featuresCol="feat", outputCol="feature",
                             labelCol="label", fpr=0.05, selectorType='fpr')
    stages += [selector]

    # scale to the 0-1 range
    scala = MinMaxScaler(inputCol="feature", outputCol="features")
    stages += [scala]

    # pipeline that runs the whole process
    pipe = Pipeline(stages=stages)
    pipeModel = pipe.fit(dataframe)
    df = pipeModel.transform(dataframe)

    # return the df with what we need
    return df
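# A minimal usage sketch for pruebaChi. The column names below are assumptions
# for illustration only; in practice categoricalCols/numericalCols come from the
# actual dataset and labelCol defaults to "TIPO PACIENTE".
categorical_cols = ["SEXO", "ENTIDAD"]   # assumed categorical columns
numerical_cols = ["EDAD"]                # assumed numerical column

# processed = pruebaChi(raw_df, categorical_cols, numerical_cols,
#                       labelCol="TIPO PACIENTE")   # raw_df is assumed
# processed.select("label", "features").show(5)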
def feature_selection(df):
    assembler = VectorAssembler(inputCols=[
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
        "exang", "oldpeak", "slope", "ca", "thal"
    ], outputCol="features")
    df = assembler.transform(df)

    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                            maxCategories=4)
    df = indexer.fit(df).transform(df)

    # Select the features that contribute the most to the model
    selector = ChiSqSelector(numTopFeatures=4, featuresCol="indexedFeatures",
                             labelCol="target", outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def feature_selection(df):
    # Create the VectorAssembler
    assembler = VectorAssembler(inputCols=[
        "EDAD", "GENERO", "ETNIA", "ZONA", "ESCOLARIDAD", "FUMADOR", "HAS",
        "HTADM", "GLICEMIA", "ENF_CORONARIA", "T_SISTOLICA", "T_DIASTOLICA",
        "COLESTEROL_TOTAL", "TRIGLICERIDOS", "RCV_GLOBAL", "GLICEMIA_AYUNO",
        "PERIMETRO_ABDOMINAL", "PESO", "TALLA", "IMC", "CREATININA",
        "MICROALBUMINURIA", "ESTADO_IRC", "FARMACOS_ANTIHIPERTENSIVOS"
    ], outputCol="features")
    df = assembler.transform(df)

    # VectorIndexer
    indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures")
    df = indexer.fit(df).transform(df)

    # Chi-square selection
    selector = ChiSqSelector(numTopFeatures=8, featuresCol="indexedFeatures",
                             labelCol="DIABETES", outputCol="selectedFeatures")
    resultado = selector.fit(df).transform(df)
    resultado.select("features", "selectedFeatures").show()
def initializePipeline(num_cols, cat_cols):
    cat_cols_index = []
    cat_cols_hoted = []
    for i in cat_cols:
        cat_cols_index.append(i + "_index")
        cat_cols_hoted.append(i + "_hoted")

    featureCols = []
    for i in num_cols:
        featureCols.append(i + "scaled")
    for i in cat_cols:
        featureCols.append(i + "_hoted")

    labelindexers = [StringIndexer(inputCol="Churn", outputCol="label")]
    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index")
        for column in cat_cols
    ]
    oneHotEncoder = [
        OneHotEncoderEstimator(inputCols=cat_cols_index,
                               outputCols=cat_cols_hoted,
                               dropLast=False)
    ]
    # Wrap each numeric column in its own single-element vector so it can be
    # scaled individually.
    assembler = [
        VectorAssembler(inputCols=[i], outputCol=i + "_indexe")
        for i in num_cols
    ]
    normalizers = [
        MinMaxScaler(inputCol=column + "_indexe", outputCol=column + "scaled")
        for column in num_cols
    ]
    featureAssembler = [
        VectorAssembler(inputCols=featureCols, outputCol="resultedfeatures")
    ]
    selector = [
        ChiSqSelector(numTopFeatures=13, featuresCol="resultedfeatures",
                      outputCol="features", labelCol="label")
    ]
    pipeline = Pipeline(stages=indexers + oneHotEncoder + assembler +
                        normalizers + featureAssembler + labelindexers +
                        selector)
    return pipeline
def build_trigrams(input_cols=("text", "target"), n=3): logging.warning("Building trigram model.") tokenizer = [Tokenizer(inputCol=input_cols[0], outputCol="words")] ngrams = [ NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i)) for i in range(1, n + 1) ] cv = [ CountVectorizer(vocabSize=2**14, inputCol="{0}_grams".format(i), outputCol="{0}_tf".format(i)) for i in range(1, n + 1) ] idf = [ IDF(inputCol="{0}_tf".format(i), outputCol="{0}_tfidf".format(i), minDocFreq=5) for i in range(1, n + 1) ] assembler = [ VectorAssembler( inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)], outputCol="rawFeatures") ] label_string_idx = [ StringIndexer(inputCol=input_cols[1], outputCol="label") ] selector = [ ChiSqSelector(numTopFeatures=2**14, featuresCol='rawFeatures', outputCol="features") ] lr = [LogisticRegression(maxIter=100)] return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler + label_string_idx + selector + lr)
for feature in feature_cols:
    indexed = feature + "_" + "indexed"
    indexed_cols.append(indexed)
    indexer = StringIndexer(inputCol=feature, outputCol=indexed,
                            handleInvalid="keep",
                            stringOrderType="frequencyDesc")
    stages.append(indexer)

stages.append(
    VectorAssembler(inputCols=indexed_cols, outputCol="features",
                    handleInvalid="keep"))
stages.append(
    ChiSqSelector(numTopFeatures=20, labelCol="HasDetections",
                  featuresCol="features", outputCol="selectedFeatures"))

print("Performing model fitting")
pipeline = Pipeline(stages=stages)
model = pipeline.fit(df)
df_features = model.transform(df)
df_features.select("features", "selectedFeatures").show()

print("Saving Pipeline Model")
model.write().overwrite().save(pipeline_model_path)
with open(feature_path, "wb") as f:
    pickle.dump(feature_cols, f)

features = model.stages[-1].selectedFeatures
# from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ChiSqSelectorExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
        (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
        (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)],
        ["id", "features", "clicked"])

    selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="clicked")

    result = selector.fit(df).transform(df)
    result.show()
    # $example off$

    spark.stop()
pe = PolynomialExpansion().setInputCol("features").setDegree(2).setOutputCol(
    "polyFeatures")
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
    .transform(sales.select("Description", "CustomerId"))\
    .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
    .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
    .setFeaturesCol("countVec")\
    .setLabelCol("CustomerId")\
    .setNumTopFeatures(2)
chisq.fit(prechi).transform(prechi)\
    .drop("customerId", "Description", "DescOut").show()


# COMMAND ----------

fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save("/tmp/fittedPCA")


# COMMAND ----------

from pyspark.ml.feature import PCAModel

loadedPCA = PCAModel.load("/tmp/fittedPCA")
loadedPCA.transform(scaleDF).show()
std_scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
scaled_df = std_scaler.fit(features_df).transform(features_df)
scaled_df.select("scaled_features").display()

# COMMAND ----------

# MAGIC %md ###Part 4: Feature Selection
# MAGIC Chi Square Selector

# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

chisq_selector = ChiSqSelector(numTopFeatures=1,
                               featuresCol="scaled_features",
                               outputCol="selected_features",
                               labelCol="cust_age")

result_df = chisq_selector.fit(scaled_df).transform(scaled_df)
result_df.select("selected_features").display()

# COMMAND ----------

# MAGIC %md Feature Selection using VectorSlicer

# COMMAND ----------

from pyspark.ml.feature import VectorSlicer

vec_slicer = VectorSlicer(inputCol="scaled_features",
class1_num = class1.count()
class2_num = class2.count()
fraction = 1.0 * class1_num / class2_num
class2 = class2.sample(fraction)
training_dataset_balanced = class1.union(class2)
training_dataset_balanced.groupBy("_c41").count().show()

####### 14.1 ###
converted_cols = ["s" + col for col in string_cols]
assembler = VectorAssembler(inputCols=converted_cols + numerical_cols,
                            outputCol="features")
labelIndexer = StringIndexer(inputCol="_c41", outputCol="label")
# classifier = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, maxBins=64, maxDepth=5, subsamplingRate=1.0)

## 14.2
# classifier = DecisionTreeClassifier(labelCol="label", featuresCol="features", maxBins=64)
selector = ChiSqSelector(numTopFeatures=35, featuresCol="features",
                         outputCol="selectedFeatures")
# modelType="multinomial" is not set: the problem here is binomial, so setting
# that parameter would not change the result.
classifier = NaiveBayes(smoothing=1.0)

pipeline = Pipeline(stages=indexers + [assembler, labelIndexer, selector,
                                       classifier])
model = pipeline.fit(training_dataset_balanced)

# predictions = model.transform(dataset_testing)  ## 14.1
predictions = model.transform(dataset_testing)  ## 14.2
predictions.show(10, False)

###### 14.2 ####
evaluator = BinaryClassificationEvaluator(labelCol="label",
                                          rawPredictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
from pyspark.ml.feature import PolynomialExpansion

pe = PolynomialExpansion().setInputCol("features").setDegree(2)
pe.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ChiSqSelector, Tokenizer

tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn\
    .transform(sales.select("Description", "CustomerId"))\
    .where("CustomerId IS NOT NULL")
prechi = fittedCV.transform(tokenized)\
    .where("CustomerId IS NOT NULL")
chisq = ChiSqSelector()\
    .setFeaturesCol("countVec")\
    .setLabelCol("CustomerId")\
    .setNumTopFeatures(2)
chisq.fit(prechi).transform(prechi)\
    .drop("customerId", "Description", "DescOut").show()


# COMMAND ----------

fittedPCA = pca.fit(scaleDF)
fittedPCA.write().overwrite().save("/tmp/fittedPCA")


# COMMAND ----------

from pyspark.ml.feature import PCAModel

loadedPCA = PCAModel.load("/tmp/fittedPCA")
# assemble all features into a single feature vector
features_assembler = VectorAssembler(inputCols=num_bool_features,
                                     outputCol="features")

# Index labels, adding metadata to the label column.
label_indexer = StringIndexer(inputCol="has_over_50k",
                              outputCol="label").fit(processed_train_set)

# Convert indexed labels back to original labels.
label_converter = IndexToString(inputCol="prediction",
                                outputCol="predicted_label",
                                labels=label_indexer.labels)

# - ChiSq feature selection
selector = ChiSqSelector(numTopFeatures=20, featuresCol="features",
                         outputCol="featuresSel", labelCol="label")

# - RandomForest model with parameter tuning using cross validation
rf = RandomForestClassifier(labelCol="label", featuresCol="featuresSel",
                            numTrees=20)

# - Create ParamGrid for cross validation
rf_param_grid = (ParamGridBuilder()
                 .addGrid(rf.maxDepth, [2, 3, 4, 5, 10, 20])
                 .addGrid(rf.maxBins, [10, 20, 40, 80, 100])
                 .build())

# - Model evaluation
rf_eval = BinaryClassificationEvaluator(labelCol="label")
def add_vectorized_features(self, transform_type, min_df, max_df, isCHISQR,
                            chi_feature_num, num_features):
    '''
    Creates the pySpark feature pipeline and stores the vectorized data under
    the features column.

    Input: transform_type: {'tfidf', 'tfidf_bigram'},
           minimum document frequency (min_df),
           whether to apply chi-squared feature reduction (isCHISQR),
           number of features kept by the chi-squared reduction (chi_feature_num),
           number of hashed features (num_features)
    Output: Returns the transformed dataframe with the label and features columns
    '''
    stages = []

    # This code transforms the review text into vectorized features.
    # Tokenize review sentences into vectors of words
    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words",
                                    pattern="\\W")
    stages += [regexTokenizer]

    # Remove stopwords from the tokenized words
    # nltk.download('stopwords')
    from nltk.corpus import stopwords
    sw = stopwords.words('english')
    stopwordsRemover = StopWordsRemover(inputCol="words",
                                        outputCol="filtered").setStopWords(sw)
    # lemmatizer = WordNetLemmatizer()
    # doc = [lemmatizer.lemmatize(token) for token in doc]
    stages += [stopwordsRemover]

    # Use TF-IDF on unigrams for the review transformation.
    if transform_type == 'tfidf':
        # Create IDF from the filtered words
        hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures",
                              numFeatures=num_features)
        idf = IDF(inputCol="rawFeatures", outputCol="review_vector",
                  minDocFreq=min_df)
        stages += [hashingTF, idf]

    # Use TF-IDF on unigrams and bigrams for the review transformation.
    if transform_type == 'tfidf_bigram':
        # Add unigram and bigram word vectors, then vectorize using TF-IDF
        unigram = NGram(n=1, inputCol='filtered', outputCol='unigrams')
        stages += [unigram]
        bigram = NGram(n=2, inputCol='filtered', outputCol='bigrams')
        stages += [bigram]

        # Create IDF from the unigram words
        hashingTF_unigram = HashingTF(inputCol="unigrams",
                                      outputCol="rawFeatures_unigrams",
                                      numFeatures=num_features)
        idf_unigram = IDF(inputCol="rawFeatures_unigrams",
                          outputCol="unigrams_vector", minDocFreq=min_df)
        stages += [hashingTF_unigram, idf_unigram]

        # Create IDF from the bigram words
        hashingTF_bigram = HashingTF(inputCol="bigrams",
                                     outputCol="rawFeatures_bigrams",
                                     numFeatures=num_features)
        idf_bigram = IDF(inputCol="rawFeatures_bigrams",
                         outputCol="bigrams_vector", minDocFreq=min_df)
        stages += [hashingTF_bigram, idf_bigram]

        ngrams = VectorAssembler(
            inputCols=['unigrams_vector', 'bigrams_vector'],
            outputCol='review_vector')
        stages += [ngrams]

    assemblerInputs = ['review_vector']
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol="unstandard_features")
    stages += [assembler]

    if isCHISQR:
        chi_selector = ChiSqSelector(numTopFeatures=chi_feature_num,
                                     featuresCol="unstandard_features",
                                     outputCol="chisq_features",
                                     labelCol="label")
        stages += [chi_selector]

        scaler = StandardScaler(inputCol="chisq_features",
                                outputCol="features",
                                withStd=True, withMean=False)
        stages += [scaler]
    else:
        scaler = StandardScaler(inputCol="unstandard_features",
                                outputCol="features",
                                withStd=True, withMean=False)
        stages += [scaler]

    pipeline = Pipeline(stages=stages)
    pipelineFit = pipeline.fit(self.df)
    self.df = pipelineFit.transform(self.df)
    return self.df
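# A minimal usage sketch for add_vectorized_features. The instance name (`prep`)
# and its `df` attribute are assumptions for illustration; the method expects
# `self.df` to contain 'reviewText' and 'label' columns.
#
# prep.df = reviews_df                  # DataFrame with 'reviewText' and 'label'
# features_df = prep.add_vectorized_features(
#     transform_type='tfidf_bigram',    # unigram + bigram TF-IDF
#     min_df=5, max_df=0.9,             # max_df is accepted but unused above
#     isCHISQR=True,                    # keep only the top chi-squared features
#     chi_feature_num=2000,
#     num_features=2**14)
# features_df.select('label', 'features').show(5)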
spark = SparkSession.builder.appName("ChiSqSelector").getOrCreate() df = spark.createDataFrame([( 7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0, ), ( 8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0, ), ( 9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0, )], ["id", "features", "clicked"]) selector = ChiSqSelector(numTopFeatures=1, featuresCol="features", outputCol="selectedFeatures", labelCol="clicked") model = selector.fit(df) result = model.transform(df) print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures()) result.show() spark.stop()
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ChiSqSelectorExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
        (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
        (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)],
        ["id", "features", "clicked"])

    selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="clicked")

    result = selector.fit(df).transform(df)

    print("ChiSqSelector output with top %d features selected" %
          selector.getNumTopFeatures())
    result.show()
    # $example off$

    spark.stop()
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

working_cols = df.columns
working_cols.remove("ID")
working_cols.remove("Target")

# This concatenates all feature columns into a single feature vector in a new
# column "rawFeatures".
vectorAssembler = VectorAssembler(inputCols=working_cols,
                                  outputCol="rawFeatures")

# Execute the vector assembler
assembled_df = vectorAssembler.transform(df)

# Select features
selector = ChiSqSelector(numTopFeatures=5, featuresCol="rawFeatures",
                         outputCol="selectedFeatures", labelCol="Target")

# Execute the selector
selected_df = selector.fit(assembled_df).transform(assembled_df)

# Display results
print("ChiSqSelector output with top %d features selected" %
      selector.getNumTopFeatures())
display(selected_df.select("rawFeatures", "selectedFeatures"))

# COMMAND ----------

# COMMAND ----------

display(assembled_df)
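# Optional follow-up (a sketch, not part of the original notebook): map the
# selector's chosen vector indices back to the original column names by fitting
# the selector model explicitly. Indices in selectedFeatures refer to positions
# in the assembled "rawFeatures" vector, which follows the working_cols order.
# selector_model = selector.fit(assembled_df)
# chosen_columns = [working_cols[i] for i in selector_model.selectedFeatures]
# print(chosen_columns)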
df = spark.createDataFrame([
    (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
    (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
    (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)],
    ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="clicked")

result = selector.fit(df).transform(df)

print("ChiSqSelector output with top %d features selected" %
      selector.getNumTopFeatures())
result.show()

# COMMAND ----------

# Locality-sensitive hashing (LSH) is used in clustering data
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
def issue_impact_process(ml_df, columns, project, organization):
    # ChiSquare test
    r = ChiSquareTest.test(ml_df, "features", "label")
    pValues = r.select("pvalues").collect()[0][0].tolist()
    stats = r.select("statistics").collect()[0][0].tolist()
    dof = r.select("degreesOfFreedom").collect()[0][0]

    # ChiSq selector
    selector = ChiSqSelector(numTopFeatures=10, featuresCol="features",
                             outputCol="selected_features", labelCol="label")
    selector_model = selector.fit(ml_df)

    top_10_features_importance = []
    for j in selector_model.selectedFeatures:
        top_10_features_importance.append(columns[j])
        top_10_features_importance.append(stats[j])

    top_issue_lines = []
    data_count = ml_df.count()

    # Skip when the first importance value is 0
    if top_10_features_importance[1] != 0:
        top_issue_lines.append(
            [organization, project, "ChiSquareSelectorModel", data_count] +
            top_10_features_importance)
    else:
        print("\tFirst ChiSquare selected issue's importance is 0")

    # Tree-based algorithms' feature importances
    dt = DecisionTreeClassifier(featuresCol='features', labelCol='label',
                                maxDepth=3)
    rf = RandomForestClassifier(featuresCol='features', labelCol='label',
                                numTrees=10)

    for algo, model_name in [(dt, "DecisionTreeModel"),
                             (rf, "RandomForestModel")]:
        model = algo.fit(ml_df)
        f_importances = model.featureImportances

        indices = f_importances.indices.tolist()
        values = f_importances.values.tolist()
        if len(values) < 2:
            print(f"\tOnly less or equal to 1 significant issue for model "
                  f"{model_name}. Skipping writing to Database.")
            continue

        value_index_lst = list(zip(values, indices))
        value_index_lst.sort(key=lambda x: x[0], reverse=True)

        importance_sorted_features = []
        for value, index in value_index_lst:
            importance_sorted_features.append(columns[index])
            importance_sorted_features.append(value)

        length = len(importance_sorted_features)
        if length > 20:
            importance_sorted_features = importance_sorted_features[:20]
        elif length < 20:
            importance_sorted_features = importance_sorted_features + (
                20 - length) * [None]

        top_issue_lines.append(
            [organization, project, model_name, data_count] +
            importance_sorted_features)

    if len(top_issue_lines) > 0:
        top_issue_df = spark.createDataFrame(data=top_issue_lines,
                                             schema=TOP_ISSUE_SCHEMA)
        top_issue_df.write.jdbc(CONNECTION_STR, 'top_issues', mode='append',
                                properties=CONNECTION_PROPERTIES)