def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)

    tokenizer = RegexTokenizer(inputCol="jobtitle", minTokenLength=2, outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0, epochs=10, inputCol=stopWordsRemover.getOutputCol(), outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])

    pipeline.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def featureExtractLr(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        # pipeline = PipelineModel.load(ROOT_PATH + '/logistic')
        pipeline = Pipeline.load(ROOT_PATH + '/logistic')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # set the stop words
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001).setElasticNetParam(0.8)
        pipeline = Pipeline(stages=[remover, hashingTF, lr])
    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
    # model.write().overwrite().save(ROOT_PATH + '/logistic')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()
    selected = resultDataframe.select("id", "features", "probability", "prediction")
    for row in selected.collect():
        rid, features, prob, prediction = row
        self.logger.info("features: %s", features)
        self.logger.info("prob: %s", str(prob))
        self.logger.info("prediction: %s", str(prediction))
class SentimentalPipelineEngine(PipelineEngine):
    def __init__(self, cv):
        super(SentimentalPipelineEngine, self).__init__(cv)
        self.tokenizer_map = [TweetTokenizer()]
        self.ngram_map = [1]
        self.hashing_tf_map = [pow(2, 20)]
        self.clf_map = [0.1]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=self.stages)
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
        self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
        self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
        self.idf = IDF(inputCol="features", outputCol="idf_features")
        self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
        self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
        # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000,
        #                                           layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
        return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter,
                self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
        param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
        return param_grid_builder.build()
def featureExtract(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # set the stop words
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[remover, hashingTF, idf])
    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()
    selected = resultDataframe.select("filtered", "features", "idff")
    for row in selected.collect():
        filtered, features, idff = row
        self.logger.info("features: %s", features)
        self.logger.info("idff: %s", idff)
        self.logger.info("filtered: %s", str(filtered))
    return selected
def fit_kmeans(spark, products_df):
    step = 0

    step += 1
    tokenizer = Tokenizer(inputCol="title", outputCol=str(step) + "_tokenizer")

    step += 1
    stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol=str(step) + "_stopwords")

    step += 1
    tf = HashingTF(inputCol=stopwords.getOutputCol(), outputCol=str(step) + "_tf", numFeatures=16)

    step += 1
    idf = IDF(inputCol=tf.getOutputCol(), outputCol=str(step) + "_idf")

    step += 1
    normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol=str(step) + "_normalizer")

    step += 1
    kmeans = KMeans(featuresCol=normalizer.getOutputCol(), predictionCol=str(step) + "_kmeans", k=2, seed=20)

    kmeans_pipeline = Pipeline(stages=[tokenizer, stopwords, tf, idf, normalizer, kmeans])

    model = kmeans_pipeline.fit(products_df)
    words_prediction = model.transform(products_df)
    model.save("./kmeans")  # the whole machine learning instance is saved in a folder
    return model, words_prediction
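# A minimal usage sketch for fit_kmeans above, not part of the original snippet.
# It assumes a SparkSession named `spark` and a DataFrame `products_df` with a
# string "title" column; reloading the saved folder with PipelineModel.load is
# standard pyspark.ml behaviour.
from pyspark.ml import PipelineModel

model, predictions = fit_kmeans(spark, products_df)
predictions.show(5)

# The saved folder can be reloaded later without refitting.
reloaded = PipelineModel.load("./kmeans")
reloaded.transform(products_df).show(5)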
def main(*args):
    if len(args) != 2:
        print("Please provide one input and one output directories!")
        sys.exit(1)
    input_fn, output_fn = args[0], args[1]

    conf = SparkConf()
    conf.setAppName("grant")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Load the abstract content in the test folder into spark,
    # clean text, tokenize the corpus, and stem the words
    abstract = sc.textFile(input_fn)
    df_abs = (abstract.map(lambda doc: text_cleaning(doc))
              .filter(lambda doc: len(doc) > 0)
              .filter(lambda line: not line.startswith('app'))
              .map(lambda doc: doc.split(' '))
              .map(lambda word: [x for x in word if len(x) > 0])
              .map(lambda word: stem(word))
              .map(lambda doc: (int(doc[0]), doc[1:]))
              .filter(lambda doc: len(doc[1]) > 0)
              .toDF(['Id', 'words']))

    # build the pipeline and lda model with online optimizer
    stop_words = StopWordsRemover(inputCol='words', outputCol='clean')
    stop_words.setStopWords(stop_words.loadDefaultStopWords('english'))
    countv = CountVectorizer(inputCol=stop_words.getOutputCol(), outputCol="tokens")
    idf = IDF(inputCol=countv.getOutputCol(), outputCol="features")
    lda = LDA(maxIter=10, k=10, optimizer='online')
    pipeline = Pipeline(stages=[stop_words, countv, idf, lda])

    lda_model = pipeline.fit(df_abs)
    labels = lda_model.transform(df_abs)

    # identify the label as the topic with the max probability
    # save the label to file
    topic_labels = (labels.select('Id', 'topicDistribution')
                    .rdd
                    .map(lambda x: (x[0], np.argmax(x[1])))
                    .saveAsTextFile(os.path.join(output_fn, 'labels')))

    # Get the topics
    wordnum = 5  # choose the number of topic words
    vocabulary = lda_model.stages[1].vocabulary
    voc_bv = sc.broadcast(vocabulary)
    topic_df = (lda_model.stages[3].describeTopics(wordnum)
                .rdd
                .map(lambda x: (x[0], [voc_bv.value[Id] for Id in x[1]], x[2]))
                .saveAsTextFile(os.path.join(output_fn, 'words')))
def pipeline(cleaned_dataframe, stopwordlist=None):
    """Pipeline for tokenizing, removing stop words, and performing word count."""
    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")
    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(), outputCol="features")
    pipe_line = Pipeline(stages=[tokenizer, stop_remover, count_vect])
    model = pipe_line.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)
    return featurized_data, model.stages[-1].vocabulary
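# A minimal sketch of how the helper above might be called, not part of the
# original snippet; the SparkSession `spark` and the sample rows are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
cleaned_df = spark.createDataFrame(
    [("the cat sat on the mat",), ("dogs chase cats",)], ["Text"])

featurized, vocab = pipeline(cleaned_df)
featurized.select("Text", "features").show(truncate=False)
print(vocab)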
def main(spark, numTopics):
    jokesDF = spark.read.schema(
        StructType([
            StructField("jokeID", IntegerType(), False),
            StructField("raw_text", StringType(), False),
        ])).csv("s3://aws-emr-resources-257018485161-us-east-1/jokes_3.csv", header="true")
    # jokesDF = jokesDF.withColumn("text", clean_text_udf("raw_text"))
    (training, test) = jokesDF.randomSplit([0.8, 0.2])

    register_remove_punctuation_udf(spark)

    stopwords = spark.sparkContext.textFile(
        "s3://aws-emr-resources-257018485161-us-east-1/stopwords").collect()

    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    remover = StopWordsRemover(stopWords=stopwords, inputCol=tokenizer.getOutputCol(), outputCol="filtered")
    vectorizer = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="features", minDF=2)
    lda = LDA(k=numTopics)

    pipeline = Pipeline(stages=[
        SQLTransformer(
            statement="SELECT jokeID, remove_punctuation_udf(raw_text) text FROM __THIS__"),
        tokenizer, remover, vectorizer, lda
    ])

    model = pipeline.fit(training)
    model.write().overwrite().save(
        "s3://aws-emr-resources-257018485161-us-east-1/ldaPipelineModel")

    prediction = model.transform(test)
    prediction.show()
def create_pipeline(model_type, num_features=10000):
    """Defines pipeline from BOW to prediction."""
    remover = StopWordsRemover(inputCol="bow", outputCol="words")
    hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                          outputCol="word_counts", numFeatures=num_features)
    tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

    if model_type == 'log_reg':
        model = LogisticRegression()
    elif model_type == 'gbt':
        model = GBTClassifier()
    elif model_type == 'naive_bayes':
        model = NaiveBayes()
    elif model_type == 'rf':
        model = RandomForestClassifier()
    else:
        raise ValueError("Unknown model_type: %s" % model_type)

    return Pipeline(stages=[remover, hashingTF, tfidf, model])
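# A minimal sketch of fitting the pipeline returned above, not part of the
# original snippet; the `spark` session and the toy DataFrame (token arrays in
# "bow" plus a binary "label") are assumptions made for illustration.
train_df = spark.createDataFrame(
    [(["free", "money", "now"], 1.0), (["meeting", "at", "noon"], 0.0)],
    ["bow", "label"])

pipe = create_pipeline('log_reg')
model = pipe.fit(train_df)
model.transform(train_df).select("features", "label", "prediction").show()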
def benchmark_body_pipeline(cleaned_dataframe, stopwordlist=None):
    """NLP pipeline. Tokenizes, removes stopwords, and computes TF-IDF.

    Returns the transformed data as 'features' and the vocabulary of words."""
    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")
    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(),
                                 outputCol="Text_counts_raw")
    idf = IDF(inputCol=count_vect.getOutputCol(), outputCol="features")
    pipeline = Pipeline(stages=[tokenizer, stop_remover, count_vect, idf])
    model = pipeline.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)
    return featurized_data, model.stages[-2].vocabulary
class NaiveBayesModel:
    """ Creates a Naive Bayes model using pipelines """

    def __init__(self, training_data):
        self.training_data = training_data
        self.regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
        self.remover = StopWordsRemover(inputCol=self.regex_tokenizer.getOutputCol(), outputCol="filtered")
        self.hashing_tf = HashingTF(inputCol=self.remover.getOutputCol(), outputCol="features")
        # column names "features" and "label" are defaults in the spark ml NB API,
        # so there is no need to specify the columns to run the model on
        self.naive_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")
        self.model = (
            Pipeline(stages=[
                self.regex_tokenizer, self.remover, self.hashing_tf, self.naive_bayes
            ])
            .fit(training_data)
        )

    def get_model(self):
        return self.model

    def calculate_accuracy(self, test_data):
        predictions = self.model.transform(test_data)
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction", metricName="accuracy"
        )
        accuracy = evaluator.evaluate(predictions)
        print("Model accuracy: %s" % accuracy)
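# A minimal sketch of using the class above, not part of the original snippet;
# the `spark` session and the toy labelled DataFrames are assumptions made for
# illustration.
train_df = spark.createDataFrame(
    [("win a free prize today", 1.0), ("see you at the office tomorrow", 0.0)],
    ["text", "label"])
test_df = spark.createDataFrame([("free prize inside", 1.0)], ["text", "label"])

nb = NaiveBayesModel(train_df)
nb.calculate_accuracy(test_df)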
def train_validate(self, df):
    # Split the data into training and test sets (30% held out for testing)
    (training, test) = df.randomSplit([0.7, 0.3])

    # Configure an ML pipeline consisting of four stages: tokenizer, remover, hashingTF, and the classifier.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered")
    hashingTF = HashingTF(numFeatures=10000, inputCol=remover.getOutputCol(), outputCol="features")

    ####################
    # lr = LogisticRegression(maxIter=10, regParam=0.001)
    # pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])
    ####################

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, ovr])
    #####################

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(prediction)
    print("Test Error : " + str(1 - accuracy))

    return model
plt.title('Review Sentence Length')
plt.savefig(IMG_PATH + 'lengthSorted.png', format='png', transparent=True)
plt.show()

# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol='stoppedWords').setStopWords(stopWords)
pipeline = Pipeline(stages=[tokenizer, stopWordRemover])
dataSet = pipeline.fit(dataSet).transform(dataSet)

# %%
newLengthDF = dataSet.withColumn('newLength', F.size(stopWordRemover.getOutputCol()))

# %%
newSentenceLen = newLengthDF.select('class', 'newLength').collect()

# %%
y = [
    int(row['newLength']) for row in newSentenceLen if (int(row['class']) == 0)
]
mx, mn = max(y), min(y)
ptp = mx - mn
y1 = [(i - mn) / ptp for i in y]
y = [
    int(row['newLength']) for row in newSentenceLen if (int(row['class']) == 1)
]
conn = S3Connection()
sc = set_spark_context()
sqc = SQLContext(sc)
sm = SparkModel(sc, conn, rdd_path='meta_rdd.pkl')

logging.basicConfig(format='%(asctime)s %(message)s')
grid_search = logging.getLogger('main')
grid_search.setLevel(logging.DEBUG)
handler = logging.FileHandler('../logs/grid_search.txt')
grid_search.addHandler(handler)

bow_rdd = sm.RDD.map(lambda kv: (kv[0], kv[1][0]))  # (key, (bow, meta)) -> (key, bow)
bow_rdd = sm.RDD.join(sm.target).map(lambda kv: (kv[1][1], kv[1][0]))  # (key, (bow, label)) -> (label, bow)

remover = StopWordsRemover(inputCol="raw", outputCol="words")
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="word_counts", numFeatures=10000)
tfidf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=20)
indexer = StringIndexer(inputCol="string_label", outputCol="label")

for model in [GBTClassifier(), RandomForestClassifier(), MultilayerPerceptronClassifier()]:
    if type(model) == MultilayerPerceptronClassifier:
        layers = [10000, 100, 2]
        model = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128)
    pipeline = Pipeline(stages=[remover, hashingTF, tfidf,  # scaler,
                                indexer, model])
    scores = cross_val_score(pipeline, bow_rdd)
    grid_search.debug('Model: %s\nscores: %s\nAverage: %s' \
reviews_mini = update_text_with_key_ngrams(reviews_mini,
                                           n=2,
                                           seed=42,
                                           outputCol=outputCol,
                                           pattern=pattern)
print("\n")

## PREDICT LABEL BASED ON TF-IDF OF UPDATED TEXT
print("Computing TF-IDF matrix for updated text...")
tokenizer = RegexTokenizer(inputCol=outputCol,
                           outputCol="words_with_ngrams",
                           pattern=pattern,
                           gaps=False)
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                      outputCol="filtered_words")
cv = CountVectorizer(inputCol=stop_words_remover.getOutputCol(),
                     outputCol="final_tf")
idf = IDF(inputCol=cv.getOutputCol(), outputCol="final_tf_idf")

pipe = Pipeline(stages=[tokenizer, stop_words_remover, cv, idf])
reviews_mini = pipe.fit(reviews_mini).transform(reviews_mini)

## Train test split
train, test = reviews_mini.randomSplit([0.8, 0.2], seed=seed)
sc = SparkContext("local", "Simple App") spark = SparkSession.builder.master("local").appName("Word Count").config( "spark.some.config.option", "some-value").getOrCreate() df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv', header=True) df = df.select(df['ItemID'], df['SentimentText'], df['label']) training = df.selectExpr("cast(itemID as int) id", "SentimentText", "cast(label as int) label") tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words") remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered") ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams") hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures") idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures") normalizer = Normalizer(inputCol=idf.getOutputCol(), outputCol="features", p=1.0) #lr = LogisticRegression(maxIter=10, regParam=0.001) nb = NaiveBayes(smoothing=1.0) pipeline = Pipeline( stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb]) model = pipeline.fit(training) """ paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build() crossval = CrossValidator(estimator=pipeline,
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
automl = H2OAutoML(convertUnknownCategoricalLevelsToNa=False,
                   seed=1,
                   maxRuntimeSecs=300,  # 5 minutes
                   predictionCol="label")

## Remove all helper columns
nltk.download('stopwords')

# list of stopwords to be removed from the posts
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
RegexTokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                outputCol="words",
                                pattern="[^0-9a-z#+_]+")
StopwordRemover = StopWordsRemover(
    inputCol=RegexTokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
CountVectorizer = CountVectorizer(inputCol=StopwordRemover.getOutputCol(),
                                  outputCol="countFeatures",
                                  minDF=5)
idf = IDF(inputCol=CountVectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, RegexTokenizer, StopwordRemover,
    CountVectorizer, idf, rf, idx_2_string
])
###############################################################################################
# Pipeline
###############################################################################################

# Tokenize by word
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stop words in the text
stopword = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="no_stops", stopWords=swords)

# The cheaper way to do TF-IDF.
# Creates a hash that contains the term frequency, so there are no pairs with the value 0.
# The output looks like: (number_of_words, {index: value, ...}) with zero values omitted.
# When a value is 0 its index is skipped, so the keys can run 0, 1, 6, 8, ... etc.,
# all depending on the contents of the previous step.
hashingTF = HashingTF(inputCol=stopword.getOutputCol(), outputCol="hashing")

# Performs the IDF part of TF-IDF
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features1", minDocFreq=5)

# Appends the Tokenizer-StopWordsRemover-HashingTF-IDF output to the Vader output
assembler = VectorAssembler(inputCols=["features1", "vader"], outputCol="features")

# Initialize Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Creates pipeline
pipeline = Pipeline(
    stages=[tokenizer, stopword, hashingTF, idf, assembler, lr])

###############################################################################################
# Fit model to training set
class SentAnalysisModelTraining(object):

    def __init__(self):
        print("Initializing SentAnalysisModelTraining!!")
        self.appName = "Sentiment Analysis Model Training"
        # create Spark session
        self.spark = SparkSession.builder.appName(self.appName) \
            .config("spark.executor.heartbeatInterval", "200000") \
            .config("spark.network.timeout", "300000") \
            .getOrCreate()
        self.modelpath = "sentiment.model"
        self.data = None
        self.training_data = None
        self.testing_data = None
        self.pipeline = None
        self.cross_validator = None
        self.evaluator = None
        self.predict_training_data = None
        self.predict_testing_data = None
        self.cv_model = None
        self.labeled_output = None
        self.tokenizer = None
        self.stopwordsRemover = None
        self.hashTF = None
        self.labededIndexer = None
        return

    # read csv file into dataFrame with automatically inferred schema
    def read_data(self):
        emotion_csv = self.spark.read.csv('dataset/text_emotions1.csv', inferSchema=True, header=True)
        emotion_csv.show(truncate=False, n=10)
        emotion_csv.select("sentiment").distinct().show()
        print(emotion_csv)
        # select only the "content" and "sentiment" columns
        self.data = emotion_csv.select("content", "sentiment")
        self.data.show(truncate=False, n=10)
        return self.data.count()

    # divide data: 90% for training, 10% for testing
    def split_data(self):
        dividedData = self.data.randomSplit([0.9, 0.1])
        self.training_data = dividedData[0]  # index 0 = training data
        self.testing_data = dividedData[1]   # index 1 = testing data
        train_rows = self.training_data.count()
        test_rows = self.testing_data.count()
        print("Training data rows:", train_rows, "; Testing data rows:", test_rows)
        return

    def create_pipeline(self):
        # Creating all the pipeline elements
        self.tokenizer = Tokenizer(inputCol="content", outputCol="SentimentWords")
        labelStringIdx = StringIndexer(inputCol="sentiment", outputCol="label")
        self.labededIndexer = labelStringIdx.fit(self.training_data)
        self.stopwordsRemover = StopWordsRemover(inputCol=self.tokenizer.getOutputCol(),
                                                 outputCol="RelevantWords")
        self.hashTF = HashingTF(inputCol=self.stopwordsRemover.getOutputCol(), outputCol="features")
        self.pipeline = Pipeline(stages=[self.tokenizer, labelStringIdx, self.stopwordsRemover, self.hashTF])

        lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=15, regParam=0.001,
                                elasticNetParam=0.8, family="multinomial")
        self.evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

        # paramGrid = ParamGridBuilder() \
        #     .addGrid(lr.aggregationDepth, [2, 5, 10]) \
        #     .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
        #     .addGrid(lr.fitIntercept, [False, True]) \
        #     .addGrid(lr.maxIter, [10, 15]) \
        #     .addGrid(lr.regParam, [0.01, 0.1]) \
        #     .build()
        paramGrid = ParamGridBuilder() \
            .addGrid(lr.aggregationDepth, [2]) \
            .addGrid(lr.elasticNetParam, [0.8]) \
            .addGrid(lr.maxIter, [15]) \
            .addGrid(lr.regParam, [0.001]) \
            .build()

        # Create 5-fold CrossValidator
        self.cross_validator = CrossValidator(estimator=lr,
                                              estimatorParamMaps=paramGrid,
                                              evaluator=self.evaluator,
                                              numFolds=5)
        return

    # Fit the pipeline to training documents.
    def train_model(self):
        pipelineFit = self.pipeline.fit(self.training_data)
        dataset = pipelineFit.transform(self.training_data)
        # Run cross validations; this can take a while because of the number
        # of models that are created and tested
        self.cv_model = self.cross_validator.fit(dataset)
        self.predict_training_data = self.cv_model.transform(dataset)
        print("The area under ROC for train set after CV is {}".format(
            self.evaluator.evaluate(self.predict_training_data)))
        print("Training is done!")
        return

    # Fit the pipeline to testing documents.
    def test_model(self):
        pipelineFit = self.pipeline.fit(self.testing_data)
        preparedTestData = pipelineFit.transform(self.testing_data)
        self.predict_testing_data = self.cv_model.transform(preparedTestData)
        print("The area under ROC for test set after CV is {}".format(
            self.evaluator.evaluate(self.predict_testing_data)))
        print("Testing is done!")
        return

    def print_model_summary(self):
        predictionFinal = self.predict_training_data.select(
            "RelevantWords", "prediction", "label", "sentiment")
        predictionFinal.show(n=20, truncate=False)

        # check the accuracy
        correctPrediction = self.predict_testing_data.filter(
            self.predict_testing_data['prediction'] == self.predict_testing_data['label']).count()
        totalData = self.predict_testing_data.count()
        print("correct prediction:", correctPrediction, ", total data:", totalData,
              ", accuracy:", correctPrediction / totalData)

        trainingSummary = self.cv_model.bestModel.summary

        # Obtain the objective per iteration
        objectiveHistory = trainingSummary.objectiveHistory
        # print("objectiveHistory:")
        # for objective in objectiveHistory:
        #     print(objective)

        # for multiclass, we can inspect metrics on a per-label basis
        print("False positive rate by label:")
        for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
            print("label %d: %s" % (i, rate))

        print("True positive rate by label:")
        for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
            print("label %d: %s" % (i, rate))

        print("Precision by label:")
        for i, prec in enumerate(trainingSummary.precisionByLabel):
            print("label %d: %s" % (i, prec))

        print("Recall by label:")
        for i, rec in enumerate(trainingSummary.recallByLabel):
            print("label %d: %s" % (i, rec))

        print("F-measure by label:")
        for i, f in enumerate(trainingSummary.fMeasureByLabel()):
            print("label %d: %s" % (i, f))

        accuracy = trainingSummary.accuracy
        falsePositiveRate = trainingSummary.weightedFalsePositiveRate
        truePositiveRate = trainingSummary.weightedTruePositiveRate
        fMeasure = trainingSummary.weightedFMeasure()
        precision = trainingSummary.weightedPrecision
        recall = trainingSummary.weightedRecall
        print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
              % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
        return

    def save_model_pipeline(self):
        # pipeline for later recovering the original label value from the prediction
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictionLabel",
                                       labels=self.labededIndexer.labels)
        pipeline = Pipeline(stages=[self.tokenizer, self.stopwordsRemover, self.hashTF,
                                    self.cv_model.bestModel, labelConverter])
        pipeline.write().overwrite().save("prediction_pipeline")
# Do the writing
trainingFile = open(outputFile, 'wt', encoding="utf-8", newline='')
writer = csv.writer(trainingFile)
writer.writerows(newRows)
trainingFile.close()

trainingFile = open(outputFile, "r")
pandas_df = pd.read_csv(trainingFile)
spark_df = sqlContext.createDataFrame(pandas_df)

# Pipeline
countTokens = udf(lambda words: len(words), IntegerType())
tokenizer = Tokenizer(inputCol="text", outputCol="new_text")
add_stopwords = ["http", "https", "amp", "RT", "the"]
stopwordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered").setStopWords(add_stopwords)
cv = CountVectorizer(inputCol=stopwordsRemover.getOutputCol(),
                     outputCol="features", vocabSize=3, minDF=2.0)
lr = LogisticRegression(maxIter=20, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, stopwordsRemover, cv, lr])
model = pipeline.fit(spark_df)

# Sentiments Predicted
testingData = sqlContext.createDataFrame(testing_df)
prediction = model.transform(testingData)
selected = prediction.select("text", "probability", "prediction")
for row in selected.collect():
    text, prob, prediction = row
    print("(%s) --> prob=%s, prediction=%d" % (text, str(prob), prediction))

# Evaluating the model's accuracy
(train, test) = spark_df.randomSplit([0.7, 0.3], seed=100)
print("Training data count: " + str(train.count()))
# %%
trainDF, testDF = sql.SQLContext(spark.sparkContext).createDataFrame(trainDF), \
    sql.SQLContext(spark.sparkContext).createDataFrame(testDF)
# trainDF.show()
# testDF.show()

# %%
print(trainDF.count())
print(trainDF.filter(F.col('class') == 1).count())
print(trainDF.filter(F.col('class') == 0).count())

# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                   outputCol='stoppedWords').setStopWords(stopWords)
countVector = CountVectorizer(inputCol=stopWordRemover.getOutputCol(), outputCol='vectors')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')

pipeline = Pipeline(stages=[tokenizer, stopWordRemover, countVector, idf])
model = pipeline.fit(trainDF)
ptrainDF = model.transform(trainDF)
ptestDF = model.transform(testDF)
ptrainDF.show()

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class",
                                              predictionCol="prediction",
                                              metricName="f1")

# %%
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
lrModel = lr.fit(ptrainDF)
predictionsLR = lrModel.transform(ptestDF)
evaluator.evaluate(predictionsLR)
    notes_df['rowid'],
    remove_features_udf(notes_df['text'].cast("string")).alias('text')))
lemmatized_notes_df = filtered_notes_df.withColumn(
    "lemmatized", lemmatize_udf(filtered_notes_df['text']))

# Defining our ML pipeline.
tokenizer = RegexTokenizer(inputCol='lemmatized', outputCol='tokens', pattern='\\W')
common_words = ['admission', 'discharge'] + StopWordsRemover().getStopWords()
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           stopWords=common_words,
                           outputCol='tokens_filtered')
cv = CountVectorizer(inputCol=remover.getOutputCol(), outputCol='features')
lda = LDA().setK(3)
pipeline = Pipeline(stages=[tokenizer, remover, cv, lda])

# Fitting our pipeline.
model = pipeline.fit(lemmatized_notes_df)

# Here we access the last stage of the model, as this is where we applied the LDA.
lda_model = model.stages[-1]
cv_model = model.stages[2]

topics = lda_model.describeTopics()
topics_with_words = (topics.select(
    topics["topic"],
    make_indices_mapping(cv_model.vocabulary)(topics["termIndices"]).alias("terms"),
    topics["termWeights"]))
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

if algo == "gbm":
    ## Create GBM model
    algoStage = H2OGBM(ratio=0.8,
                       seed=1,
                       featuresCols=[idf.getOutputCol()],
                       predictionCol="label")
# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Break text into tokens at non-word characters
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4
# Create an empty parameter grid
params = ParamGridBuilder().build()

# Create objects for building and evaluating a regression model
regression = LinearRegression(labelCol='duration')
evaluator = RegressionEvaluator(labelCol='duration')
    ]).map(lambda x: x.split(' ')).map(lambda y: Row(y)).toDF(
        schema=StructType().add("text", StringType(), True)))
positive_string = (sc.parallelize([
    "im so happy with my results"
]).map(lambda x: x.split(' ')).map(lambda y: Row(y)).toDF(
    schema=StructType().add("text", StringType(), True)))

test_data_frame.cache()

regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
remover = StopWordsRemover(inputCol=regex_tokenizer.getOutputCol(), outputCol="filtered")
hashing_tf = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")

# Column names "features" and "label" are defaults in the ml NB API,
# so there is no need to specify the columns to run the model on
naive_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")

# Can just pipeline the DataFrame, no need to turn it into labelled points!
pipeline = Pipeline(
    stages=[regex_tokenizer, remover, hashing_tf, naive_bayes])

# form the model
model = pipeline.fit(training_data_frame)

# run the prediction
predictions = model.transform(test_data_frame)
# MAGIC %md ### Define the Pipeline
# MAGIC The pipeline for the model consists of the following stages:
# MAGIC - A Tokenizer to split the tweets into individual words.
# MAGIC - A StopWordsRemover to remove common words such as "a" or "the" that have little predictive value.
# MAGIC - A HashingTF class to generate numeric vectors from the text values.
# MAGIC - A LogisticRegression algorithm to train a binary classification model.

# COMMAND ----------

# convert each sentence to a list of words
tokenizer = Tokenizer(inputCol="text", outputCol="SentimentWords")
# remove stop words
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="MeaningfulWords")
# convert words to numbers as word frequencies
hashTF = HashingTF(inputCol=swr.getOutputCol(), outputCol="features")
# set the model
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.01)
# process pipeline with the series of transforms - 4 transforms
pipeline = Pipeline(stages=[tokenizer, swr, hashTF, lr])

# COMMAND ----------

# MAGIC %md ### Run the Pipeline as an Estimator
# MAGIC The pipeline itself is an estimator, and so it has a **fit** method that we called to run the pipeline on a specified DataFrame. In this case, we ran the pipeline on the training data to train a model.

# COMMAND ----------
## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
                     l2=0.0,
                     hidden=[200, 200],
                     featuresCols=[idf.getOutputCol()],
                     predictionCol="label")

## Remove all helper columns
StructField("id", IntegerType(), True), StructField("text", StringType(), True), StructField("label", DoubleType(), True) ] finalSchema = StructType(fields=newDF) dataset = sqlContext.read.format('csv').options( header='true', schema=finalSchema, delimiter='|').load('/FileStore/tables/dataset.csv') dataset = dataset.withColumn("label", dataset["label"].cast(DoubleType())) dataset = dataset.withColumn("id", dataset["id"].cast(IntegerType())) training, test = dataset.randomSplit([0.8, 0.2], seed=12345) tokenizer = Tokenizer(inputCol="text", outputCol="words") remover = StopWordsRemover(inputCol="words", outputCol="filtered") hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features") lr = LogisticRegression(maxIter=2, regParam=0.001) nb = NaiveBayes(smoothing=1.0, modelType="multinomial") pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr]) # Fit the pipeline to training documents. model = pipeline.fit(training) result = model.transform(test)\ .select("features", "label", "prediction") correct = result.where(result["label"] == result["prediction"]) accuracy = correct.count() / test.count() print("Accuracy of model = " + str(accuracy)) test_error = 1 - accuracy print("Test error = " + str(test_error)) evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                      outputCol='body')
cleaner = Cleaner(key='subreddit', val='body',
                  inputCol=extractor.getOutputCol(), outputCol='body')
filterer = Filterer(key='subreddit', val='body',
                    inputCol='subreddit', outputCol='body',
                    minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(),
                           outputCol="tokens", pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="tf",
                     minDF=args.mindf, vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(), outputCol='top_words', nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit', outputCol='norm', spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits',
                                nsubreddits=args.nsubreddits)

pipeline = Pipeline(stages=[
from pyspark.sql.functions import udf, col
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover, RegexTokenizer
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()
data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)

textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(),
                              stopWords=selfstopwords, outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords, stopwords1, cv, idf, lda])

model = pipe1.fit(df)
output = model.transform(df)

def topicsTerms(vocab, termindices, leng=None):
    if not leng:
        return [vocab[t] for t in termindices]
    return [vocab[t] for t in termindices][:leng]

def topicsTerm_udf(vocab, leng=None):
from pyspark.sql.functions import udf, col, explode, collect_list, to_date, concat
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, \
    FloatType, ArrayType, BooleanType
from nltk.stem import SnowballStemmer

# Import json objects from tar file
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

# Setup pipeline for adding ML features - tokens, stems, n-grams, tf, tfidf, word2vec
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens",
                           pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(),
                  outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(),
                     outputCol='word2vec_large')

pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

# Use the model to transform the data
df_transformed = model.transform(opinion_df)
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol='reviewText', outputCol='reviewWords')
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                      outputCol='reviewWordsWithoutTrash')
vectorizer = CountVectorizer(inputCol=stop_words_remover.getOutputCol(),
                             outputCol="word_vector", minDF=150)
lr = LinearRegression(featuresCol=vectorizer.getOutputCol(), labelCol='overall')

pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer, lr])
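# A minimal sketch of fitting and evaluating the regression pipeline above, not
# part of the original snippet; the `spark` session, the reviews file path, and
# the DataFrame columns ("reviewText", "overall") are assumptions for illustration.
from pyspark.ml.evaluation import RegressionEvaluator

reviews_df = spark.read.json("reviews.json")  # hypothetical input path
train_df, test_df = reviews_df.randomSplit([0.8, 0.2], seed=42)

model = pipeline.fit(train_df)
predictions = model.transform(test_df)
rmse = RegressionEvaluator(labelCol='overall').evaluate(predictions)
print("RMSE:", rmse)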