def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
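# A brief usage sketch (not part of the original function): the folder written by
# fit_kmeans can be reloaded as a PipelineModel and applied to new data. The name
# `new_products_df` is an assumption; it only needs a "title" column.
from pyspark.ml import PipelineModel

reloaded = PipelineModel.load("./kmeans")
clustered = reloaded.transform(new_products_df)
# The cluster assignment lands in the "6_kmeans" column because of the
# step-numbering scheme used in fit_kmeans above.
clustered.select("title", "6_kmeans").show()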
def featureExtract(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # set the Chinese stop words
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[remover, hashingTF, idf])
    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()
    selected = resultDataframe.select("filtered", "features", "idff")
    for row in selected.collect():
        filtered, features, idff = row
        self.logger.info("features: %s", features)
        self.logger.info("idff: %s", idff)
        self.logger.info("filtered: %s", filtered)
    return selected
def train_lg(training_data, collection):
    # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    pipeline1 = Pipeline(stages=[hashingTF, idf])

    # Fit pipeline1 to the training documents.
    model1 = pipeline1.fit(training_data)

    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline2 = Pipeline(stages=[model1, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000, 10000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()

    crossval = CrossValidator(estimator=pipeline2,
                              estimatorParamMaps=paramGrid,
                              evaluator=BinaryClassificationEvaluator(),
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    cvModel = crossval.fit(training_data)

    # model_path = os.path.join(models_dir, time.strftime("%Y%m%d-%H%M%S") + '_'
    #                           + collection["Id"] + '_'
    #                           + collection["name"])
    # cvModel.save(sc, model_path)
    return cvModel
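# A brief usage sketch (an assumption, not in the original: `training_data` has
# "filtered" token and "label" columns; `collection` is only used by the
# commented-out save path). It shows how the returned CrossValidatorModel is
# typically inspected.
cv_model = train_lg(training_data, collection)
print(cv_model.avgMetrics)               # mean area under ROC per param-grid entry
best_stages = cv_model.bestModel.stages  # [fitted TF-IDF PipelineModel, LogisticRegressionModel]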
def train_svm_idf(sqlContext, df):
    training, test = df.randomSplit([0.8, 0.2])
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    hashingTF = HashingTF(numFeatures=2000, inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    svm = LinearSVC(featuresCol="features", labelCol="label")
    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, svm])
    model = pipeline.fit(training)
    test_df = model.transform(test)
    train_df = model.transform(training)
    test_df.show()
    train_df.show()
    # BinaryClassificationEvaluator reports area under the ROC curve by default
    # (it is not a confusion matrix); it reads the rawPrediction column.
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    train_metric = evaluator.evaluate(train_df)
    test_metric = evaluator.evaluate(test_df)
    test_p = test_df.select("prediction").rdd.map(lambda x: x['prediction']).collect()
    test_l = test_df.select("label").rdd.map(lambda x: x['label']).collect()
    train_p = train_df.select("prediction").rdd.map(lambda x: x['prediction']).collect()
    train_l = train_df.select("label").rdd.map(lambda x: x['label']).collect()

    print("\n\n\n\n")
    print("-" * 15 + " OUTPUT " + "-" * 15)
    print()
    print("area under ROC for training data")
    print(train_metric)
    print("train label")
    print(train_l)
    print("train prediction")
    print(train_p)
    print("-" * 30)
    print()
    print("area under ROC for testing data")
    print(test_metric)
    print("test label")
    print(test_l)
    print("test prediction")
    print(test_p)
    print("-" * 30)
    print("\n\n\n\n")
def __init__(self, data):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])
    self.model = pipeline.fit(data)
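# A hypothetical companion method (a sketch, not in the original class): applies the
# fitted pipeline to new rows that carry a "text" column.
def predict(self, new_data):
    return self.model.transform(new_data).select("text", "probability", "prediction")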
def nb_train(data):
    # Naive Bayes classifier
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel = label_stringIdx.fit(data)
    data = lsmodel.transform(data)
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    # Alternative TF-IDF features; defined here but not used in the pipeline below.
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5)
    nb = NaiveBayes(smoothing=1)
    pipeline = Pipeline(stages=[countVectors, nb])
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    return (evaluator.evaluate(predictions), lsmodel.labels, pipelineFit)
def pipeline(self):
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import HashingTF, IDF
    from pyspark.ml.feature import Tokenizer
    from pyspark.ml.classification import LogisticRegression

    tokenizer = Tokenizer(inputCol="message", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="tempfeatures")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    lrClassifier = LogisticRegression()
    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lrClassifier])
    return pipeline
def nb_train_cv(data):
    # Naive Bayes classifier with cross-validation
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel = label_stringIdx.fit(data)
    data = lsmodel.transform(data)
    data.cache()
    # (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    # Alternative TF-IDF features; defined here but not used in the pipeline below.
    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5)
    nb = NaiveBayes()
    pipeline = Pipeline(stages=[countVectors, nb])
    grid = ParamGridBuilder().addGrid(nb.smoothing, [1]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=grid,
                              evaluator=evaluator,
                              numFolds=10)
    cvmodel = crossval.fit(data)
    return (evaluator.evaluate(cvmodel.transform(data)), lsmodel.labels, cvmodel)
def create_pipeline(model_type, num_features=10000):
    """ Defines pipeline from BOW to prediction. """
    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                          numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unknown model_type: %s" % model_type)
    return Pipeline(stages=[remover, hashingTF, tfidf, model])
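# A minimal usage sketch (assumptions: a DataFrame `train_df` with a tokenized "bow"
# column and a numeric "label" column; the model choice and feature count are arbitrary):
pipeline = create_pipeline('log_reg', num_features=2 ** 15)
fitted = pipeline.fit(train_df)
predictions = fitted.transform(train_df).select("label", "prediction")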
class BaselinePipelineEngine(PipelineEngine):
    @keyword_only
    def __init__(self, cv):
        super(BaselinePipelineEngine, self).__init__(cv)
        self.hashing_tf_map = [pow(2, 20)]
        self.lr_map = [0.1, 0.01]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=[self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr])
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.hashing_tf = HashingTF(inputCol=self.tokenizer.getOutputCol(), outputCol="raw_features")
        self.idf_model = IDF(inputCol=self.hashing_tf.getOutputCol(), outputCol="features")
        self.lr = LogisticRegression(maxIter=10, regParam=0.01)
        return [self.bs_parser, self.tokenizer, self.hashing_tf, self.idf_model, self.lr]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.lr.regParam, self.lr_map)
        return param_grid_builder.build()
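# A hedged usage sketch (the PipelineEngine base class is not shown, so the exact
# training entry point is an assumption): the pipeline and param_grid built above
# plug directly into a CrossValidator.
engine = BaselinePipelineEngine(cv=5)
crossval = CrossValidator(estimator=engine.pipeline,
                          estimatorParamMaps=engine.param_grid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=5)
cv_model = crossval.fit(train_df)  # train_df with "review" and "label" columns (assumption)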
                                       count_verbs_udf(split_df['words']))
has_q_df = verb_count_df.withColumn('has_q', check_q_udf(verb_count_df['text']))
stem_df = has_q_df.withColumn('words', stem_udf(has_q_df['words']))
no_dupes_df = stem_df.dropDuplicates(['words'])
no_emptys_df = no_dupes_df.filter(no_dupes_df['word_count'] > 1)

# Split data set
training_df, testing_df = no_emptys_df.randomSplit([.75, .25])

# Make Spark ML pipeline using a MultilayerPerceptron classifier
hashingTF = HashingTF(inputCol='words', outputCol='word_hash', numFeatures=500)
idf = IDF(minDocFreq=1, inputCol=hashingTF.getOutputCol(), outputCol='tf-idf')
va = VectorAssembler(inputCols=[
    'has_link', 'verb_count', 'tf-idf', 'word_count', 'has_q', 'has_tag'
])
mp = MultilayerPerceptronClassifier(
    featuresCol=va.getOutputCol(), layers=[505, 250, 100, 50, 25, 10, 5, 2])

# Create param grid
grid = ParamGridBuilder().addGrid(mp.maxIter, [50, 100, 200]).addGrid(
    mp.tol, [.0000001, .000001, .0001, .01]).addGrid(mp.stepSize,
                                                     [.001, .01, .1]).build()
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction')
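# A possible continuation (a sketch; the original snippet is truncated here). It
# assumes training_df already carries the upstream columns (has_link, has_tag, a
# binary "label", ...) and that three folds are acceptable.
pipeline = Pipeline(stages=[hashingTF, idf, va, mp])
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
cv_model = cv.fit(training_df)
print(evaluator.evaluate(cv_model.transform(testing_df)))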
rdd = labeledRdd.map(lambda doc: (cleanLower(doc[0]), doc[1]))
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ["review", "label"])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words("english"))
)
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(),
                            labelCol=string_indexer.getOutputCol(),
                            maxDepth=10)

pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier****************
# *****************************************************************

# Note: "precision" was a metric name accepted by older Spark releases;
# newer versions expect "accuracy" or "weightedPrecision" instead.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)
    return dataset.withColumn(
        out_col, udf(lambda x: LabeledPoint(1, Vectors.fromML(x)), t)(in_col))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Create data processing pipeline

# COMMAND ----------

# Configure an ML pipeline, which consists of four stages: tokenizer, hashingTF, labelPointTF, and lsvc.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
labelPointTF = LabelPointTF(inputCol=hashingTF.getOutputCol(), outputCol="vectors")
lsvc = LinearSVC(maxIter=10, regParam=0.1)
pipeline = Pipeline(stages=[tokenizer, hashingTF, labelPointTF, lsvc])

# COMMAND ----------

# MAGIC %md
# MAGIC #### Train Email Spam Classifier Model

# COMMAND ----------

model = pipeline.fit(
    emails.select("text", column("spam").alias("label").cast(IntegerType())))
# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4
# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')

# Create a cross validator
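# A plausible completion of the last step above (a sketch; the fold count is an assumption):
cv = CrossValidator(estimator=regression, estimatorParamMaps=params,
                    evaluator=evaluator, numFolds=5)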
sqc = SQLContext(sc)
sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

logging.basicConfig(format='%(asctime)s %(message)s')
grid_search = logging.getLogger('main')
grid_search.setLevel(logging.DEBUG)
handler = logging.FileHandler('../logs/grid_search.txt')
grid_search.addHandler(handler)

# (Python 3: tuple-unpacking lambdas are no longer allowed, so index explicitly)
bow_rdd = sm.RDD.map(lambda kv: (kv[0], kv[1][0]))
bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0]))

remover = StopWordsRemover(inputCol="raw", outputCol="words")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                      numFeatures=10000)
tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=20)
indexer = StringIndexer(inputCol="string_label", outputCol="label")

for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:
    if type(model) == MultilayerPerceptronClassifier:
        layers = [10000, 100, 2]
        model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)
    pipeline = Pipeline(stages=[remover, hashingTF, tfidf,
                                # scaler,
                                indexer, model])
    scores = cross_val_score(pipeline, bow_rdd)
    grid_search.debug('Model: %s\nscores: %s\nAverage: %s'
                      % (type(model), scores, scores.mean()))
print("Text is cleaned")

sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print("Random split is done")

tokenizerNoSw = tr.NLTKWordPunctTokenizer(
    inputCol="review", outputCol="wordsNoSw",
    stopwords=set(nltk.corpus.stopwords.words('english')))
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol='reviews_tf')
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
dt = LogisticRegression(featuresCol=idf.getOutputCol(),
                        labelCol=string_indexer.getOutputCol(),
                        maxIter=30, regParam=0.01)

pipeline = Pipeline(
    stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='target_indexed',
                                              metricName='precision')

# grid=(ParamGridBuilder()
#       .baseOn([evaluator.metricName,'precision'])
###############################################################################
# Pipeline
###############################################################################
# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stop words in the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="no_stops")

# The cheaper way to do TF-IDF: HashingTF builds a sparse term-frequency vector,
# so no (index, value) pairs with a zero value are stored. The output looks like
# (num_features, {index: value, ...}); zero-count indices are skipped, so the
# stored indices can jump, e.g. 0, 1, 6, 8, ... depending on the previous step.
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")

# Performs the IDF part of TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5)

# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Creates pipeline
pipeline = Pipeline(stages=[tokenizer, stopword, hashingTF, idf, lr])

###############################################################################
# Fit model to training set
# lr_model = PipelineModel.load('./ModelTest')
lr_model = pipeline.fit(train)

# Make predictions on test set
lr_prediction = lr_model.transform(test)

# Schema of prediction outcome
lr_prediction.printSchema()
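# A possible follow-up (a sketch; assumes `test` carries a binary "label" column):
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label")
print("Test area under ROC:", evaluator.evaluate(lr_prediction))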
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300,  # 5 minutes
                   predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(),
                                  hashingTF.getOutputCol(),
                                  stopWordsRemover.getOutputCol(),
                                  tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl, colPruner])
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
elif algo == "dl":
    ## Create H2ODeepLearning model
    algoStage = H2ODeepLearning(epochs=10,
                                seed=1,
                                l1=0.001,
normalizerBi = Normalizer(inputCol="bigrams", outputCol='normBigrams', p=2.0)
dfNorm = normalizerUni.transform(dfVect2)
dfNorm2 = normalizerBi.transform(dfNorm)
print("DataFrame (bi-gram): normalized")
dfNorm2.select('words', 'normWords').show()
# The difference does not show up in the table, since there is only room to display
# the indices of the non-zero elements, not their values.

# Move on to TF-IDF.
# Of course, by choosing the right DataFrame among those above, these computations
# can be applied to any column (bigrams, with or without stop words, ...).
from pyspark.ml.feature import HashingTF
htf = HashingTF(inputCol='words', outputCol='wordsTF', numFeatures=10000)
dfTrainTF = htf.transform(dfTrainTokNoSw)

# Inverse document frequency
from pyspark.ml.feature import IDF
idf = IDF(inputCol=htf.getOutputCol(), outputCol="wordsTFIDF")
idfModel = idf.fit(dfTrainTF)
dfTrainTFIDF = idfModel.transform(dfTrainTF)
dfTrainTFIDF.select('review', 'wordsTF', 'wordsTFIDF').show()

# I know this step was useful to me once; here it does not seem to matter much.
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainTFIDF)
dfTrainFinal = string_indexer_model.transform(dfTrainTFIDF)
dfTrainFinal.select('review', 'label', 'target_indexed').show()

#**********************************************************************
#-----------Training the model for prediction--------------------------
###############################################################################
# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stop words in the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                            outputCol="no_stops",
                            stopWords=swords)

# The cheaper way to do TF-IDF: HashingTF builds a sparse term-frequency vector,
# so no (index, value) pairs with a zero value are stored. The output looks like
# (num_features, {index: value, ...}); zero-count indices are skipped, so the
# stored indices can jump, e.g. 0, 1, 6, 8, ... depending on the previous step.
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")

# Performs the IDF part of TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features1", minDocFreq=5)

# Appends the Tokenizer-StopWords-HashingTF-IDF output to the output of Vader
assembler = VectorAssembler(inputCols=["features1", "vader"], outputCol="features")

# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Creates pipeline
pipeline = Pipeline(
    stages=[tokenizer, stopword, hashingTF, idf, assembler, lr])

###############################################################################
# Fit model to training set
# lr_model = PipelineModel.load('./ModelTest')
from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")
idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")
normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`. This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setFeaturesCol("features").setPredictionCol("prediction").setK(5).setSeed(0)
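# COMMAND ----------

# A plausible continuation of the truncated cell above (a sketch; the input
# DataFrame name `dataset` is an assumption): assemble all stages and fit to get
# a PipelineModel, as described in the markdown cell.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(dataset)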
smsXformed = smsData.map(TransformToVector)

smsDf = SpSession.createDataFrame(smsXformed, ["label", "message"])
smsDf.cache()
smsDf.select("label", "message").show()

(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])

from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.feature import IDF, HashingTF, Tokenizer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol='message', outputCol='words')
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol='tempfeatures')
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol='features')
nb_classifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb_classifier])
nb_model = pipeline.fit(trainingData)

prediction = nb_model.transform(testData)

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='label',
                                              metricName='accuracy')
evaluator.evaluate(prediction)
evaluator.evaluate(nb_model.transform(trainingData))

prediction.groupBy('label', 'prediction').count().show()
prediction2 = nb_model.transform(trainingData)
prediction2.groupBy('label', 'prediction').count().show()
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
# (Python 3: tuple-unpacking lambdas are no longer allowed, so index explicitly)
bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0])) \
                .sample(withReplacement=False, fraction=.5, seed=1)
df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)

results = []
num_features = 5000
min_doc_freq = 20
layers = [[5000, 2056, 512, 128, 2],
          [5000, 1000, 128, 2],
          [5000, 100, 2],
          [5000, 5000, 2]]

for l in layers:
    remover = StopWordsRemover(inputCol="raw", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts",
                          numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",
                minDocFreq=min_doc_freq)
    indexer = StringIndexer(inputCol="string_label", outputCol="label")
    mlpc = MultilayerPerceptronClassifier(maxIter=100, layers=l, blockSize=128)
    pipeline = Pipeline(stages=[remover, hashingTF, tfidf, indexer, mlpc])
    model = pipeline.fit(train_rdd)
    df_output = model.transform(train_rdd)
    test_output = model.transform(test_rdd).select("label", "prediction")
    score = test_output.rdd.map(lambda row: row.label == row.prediction).mean()
    # Log the layer configuration of the current run (the original logged the
    # whole `layers` list here, which looks like a bug).
    nn_gridsearch.debug("Layers: %s, Accuracy: %s" % (l, score))
"spark.some.config.option", "some-value").getOrCreate() df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv', header=True) df = df.select(df['ItemID'], df['SentimentText'], df['label']) training = df.selectExpr("cast(itemID as int) id", "SentimentText", "cast(label as int) label") tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words") remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered") ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams") hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures") idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures") normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features", p=1.0) #lr = LogisticRegression(maxIter=10, regParam=0.001) nb = NaiveBayes(smoothing=1.0) pipeline = Pipeline( stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb]) model = pipeline.fit(training) """ paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build() crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(),
def Pipeline_model(ngram, nb_hash, data, opm, vec="tf_idf", maxIter=200, regParam=0.01,
                   elasticNetParam=0.0, numTrees=200, maxdepth=16):
    # Tokenizer: replaces a long text with a list of words
    regexTokenizer = RegexTokenizer(inputCol="lib1", outputCol="tokenizedDescr",
                                    pattern="[^a-z_]", minTokenLength=3, gaps=True)
    # StopWordsRemover: drops stop words
    remover1 = remover(inputCol="tokenizedDescr", outputCol="stopTokenizedDescr")
    # Stemmer
    # stemmer = MyNltkStemmer(inputCol="stopTokenizedDescr", outputCol="cleanDescr")
    # Define NGram transformer
    ngram1 = NGram(n=ngram, inputCol="stopTokenizedDescr", outputCol="bigrams")
    # Indexer
    indexer = StringIndexer(inputCol="lib4", outputCol="categoryIndex").fit(data)
    if vec == "tf_idf":
        # Hashing term frequencies
        hashing_tf = HashingTF(inputCol="bigrams", outputCol='tf', numFeatures=nb_hash)
        # Inverse document frequency
        idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="tfidf")
    else:
        Word2Vec_ = Word2Vec(vectorSize=300, minCount=0, inputCol="bigrams",
                             outputCol="tfidf", seed=42)
    assembler = VectorAssembler(inputCols=["tfidf", "credit_o_n"], outputCol="features")

    # Logistic regression
    if opm == "rl":
        print("use LogisticRegression")
        model = LogisticRegression(maxIter=maxIter, regParam=regParam, fitIntercept=False,
                                   tol=0.0001, family="multinomial",
                                   elasticNetParam=elasticNetParam,
                                   featuresCol="features", labelCol="categoryIndex")
    elif opm == "rf":
        print("RandomForestClassifier")
        model = RandomForestClassifier(labelCol="categoryIndex", featuresCol="features",
                                       numTrees=numTrees, maxMemoryInMB=1000,
                                       maxDepth=maxdepth, seed=42)
    elif opm == "mp":
        print("MultilayerPerceptronClassifier")
        model = MultilayerPerceptronClassifier(maxIter=maxIter, labelCol="categoryIndex",
                                               featuresCol="features",
                                               layers=[1001, 70, 26], blockSize=128, seed=42)
    else:
        print("GradientBoostedTree")
        model = GBTClassifier(labelCol="categoryIndex", featuresCol="features",
                              maxIter=maxIter, seed=42)

    # Convert indexed labels back to original labels.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=indexer.labels)

    # Build the pipeline
    if vec == "tf_idf":
        return Pipeline(stages=[
            regexTokenizer, remover1, ngram1, indexer, hashing_tf, idf,
            assembler, model, labelConverter
        ])
    else:
        return Pipeline(stages=[
            regexTokenizer, remover1, ngram1, indexer, Word2Vec_,
            assembler, model, labelConverter
        ])
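# A minimal usage sketch (assumptions: a DataFrame `df` with the "lib1", "lib4" and
# numeric "credit_o_n" columns expected above; bigrams, 10000 hash features and
# logistic regression are arbitrary choices):
pipe = Pipeline_model(ngram=2, nb_hash=10000, data=df, opm="rl")
fitted = pipe.fit(df)
predictions = fitted.transform(df).select("lib1", "predictedLabel")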