def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])

    # Serialize the unfitted pipeline, reload it, and fit it
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    # Serialize the fitted pipeline model, reload it, and compare predictions
    model.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
class NaiveBayesModel:
    """
    Creates a Naive Bayes model using pipelines
    """

    def __init__(self, training_data):
        self.training_data = training_data
        self.regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
        self.remover = StopWordsRemover(inputCol=self.regex_tokenizer.getOutputCol(),
                                        outputCol="filtered")
        self.hashing_tf = HashingTF(inputCol=self.remover.getOutputCol(), outputCol="features")
        # The column names "features" and "label" are the defaults in the Spark ML
        # NaiveBayes API, so there is no need to specify which columns the model runs on.
        self.naive_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")
        self.model = (
            Pipeline(stages=[
                self.regex_tokenizer,
                self.remover,
                self.hashing_tf,
                self.naive_bayes
            ])
            .fit(training_data)
        )

    def get_model(self):
        return self.model

    def calculate_accuracy(self, test_data):
        predictions = self.model.transform(test_data)
        evaluator = MulticlassClassificationEvaluator(
            labelCol="label", predictionCol="prediction", metricName="accuracy"
        )
        accuracy = evaluator.evaluate(predictions)
        print("Model accuracy: %s" % accuracy)
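# Hedged usage sketch (not part of the original snippet): assumes `train_df` and
# `test_df` are DataFrames with a string "text" column and a numeric "label"
# column; the variable names are illustrative only.
nb = NaiveBayesModel(train_df)
fitted_pipeline = nb.get_model()   # PipelineModel: tokenizer -> remover -> HashingTF -> NaiveBayes
nb.calculate_accuracy(test_df)     # prints multiclass accuracy on the held-out data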
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, NGram, \
    CountVectorizer, IDF, Word2Vec
from pyspark.sql.functions import udf, col, explode, collect_list, to_date, concat
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, \
    FloatType, ArrayType, BooleanType
from nltk.stem import SnowballStemmer

# Import json objects from tar file
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

# Set up the pipeline for adding ML features - tokens, stems, n-grams, tf, tf-idf, word2vec
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens",
                           pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(),
                  outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(),
                     outputCol='word2vec_large')

pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

# Use the model to transform the data
df_transformed = model.transform(opinion_df)
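# Hedged usage sketch (not in the original snippet): inspect a few of the
# generated feature columns to confirm the transform produced what we expect.
df_transformed.select('tokens', 'token_idf', 'word2vec_2d').show(5, truncate=80)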
                      val='body', inputCol='subreddit', outputCol='body')
cleaner = Cleaner(key='subreddit', val='body',
                  inputCol=extractor.getOutputCol(), outputCol='body')
filterer = Filterer(key='subreddit', val='body', inputCol='subreddit', outputCol='body',
                    minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(), outputCol="tokens", pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(), outputCol="tf",
                     minDF=args.mindf, vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(), outputCol='top_words', nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit', outputCol='norm', spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits', nsubreddits=args.nsubreddits)
    return spark.createDataFrame(row_rdd, ["label", "text"])

##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
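## Hedged sketch (not in the original snippet): the AutoML stage and pipeline
## assembly below follow the surrounding Sparkling Water example; the exact
## H2OAutoML parameters (labelCol, maxModels) and the DataFrame name `data`
## are assumptions.
automl = H2OAutoML(labelCol="label", maxModels=10)

pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl])
model = pipeline.fit(data)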
##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))


nltk.download('stopwords')

# list of stopwords to be removed from the posts
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
RegexTokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                outputCol="words",
                                pattern="[^0-9a-z#+_]+")
StopwordRemover = StopWordsRemover(
    inputCol=RegexTokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
CountVectorizer = CountVectorizer(inputCol=StopwordRemover.getOutputCol(),
                                  outputCol="countFeatures",
                                  minDF=5)
idf = IDF(inputCol=CountVectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer,
    bs_text_extractor,
    RegexTokenizer,
    StopwordRemover,
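    # The original snippet is truncated at this point; the remaining stages below
    # are a hedged completion using the transformers defined above.
    CountVectorizer,
    idf,
    rf,
    idx_2_string])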
def update_text_with_key_ngrams(df, n, seed=42, outputCol="ngram_text",
                                pattern=r"(?!(?<='))\w+"):
    def build_text(words):
        # Turn the bag of words back into a sentence and check whether any of the
        # key n-grams occurs in it, e.g. bag of words = ["hi", "i", "ralf"] and
        # key_bigram = "i ralf" --> sentence = "hi i ralf", so the bigram occurs.
        # If an n-gram occurs, replace the matching words in the sentence with the
        # underscore version of the n-gram ("i_ralf").
        sentence = ' '.join(words)
        for ngram in key_ngrams:
            if ngram in sentence:
                sentence = sentence.replace(ngram, ngram.replace(" ", "_"))
        return sentence

    outputs = {
        "tokenizer": "words",
        "ngram": "ngrams",
        "cv": "tf",
        "idf": "tf_idf",
        "build_text_udf": outputCol
    }

    # Build pipeline
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol=outputs["tokenizer"],
                               pattern=pattern,
                               gaps=False)
    ngram = NGram(n=n, inputCol=tokenizer.getOutputCol(), outputCol=outputs["ngram"])
    cv = CountVectorizer(inputCol=ngram.getOutputCol(), outputCol=outputs["cv"])
    idf = IDF(inputCol=cv.getOutputCol(), outputCol=outputs["idf"])
    pipe = Pipeline(stages=[
        tokenizer,  # transform
        ngram,      # transform
        cv,         # fit_transform
        idf         # fit
    ])

    print("\t Computing tf_idf matrix for {}-grams...".format(n))
    pipe_model = pipe.fit(df)  # calls transform on tokenizer & ngram,
                               # fit_transform on cv and fit on idf
    vocabulary = np.array(pipe_model.stages[2].vocabulary)
    print("\t\t vocabulary size: {}".format(len(vocabulary)))
    df = pipe_model.transform(df)

    # train test split
    train, _ = df.randomSplit([0.8, 0.2], seed=seed)
    train.persist(StorageLevel.MEMORY_AND_DISK)

    # fit linear SVM
    svc = LinearSVC(maxIter=100, regParam=0.1, featuresCol="tf_idf")
    print("\t Estimating key {}-grams with SVC...".format(n))
    svc_model = svc.fit(train)

    # Pick the n-grams with the worst/best weights
    print("\t Update text with key {}-grams...".format(n))
    coeffs = svc_model.coefficients.toArray()
    key_ngrams = get_n_extremes_of_a_in_b(coeffs, vocabulary, 50)
    build_text_udf = F.udf(build_text)
    df = df.withColumn(outputs["build_text_udf"],
                       build_text_udf(F.col(tokenizer.getOutputCol())))
    print()
    return df
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover, RegexTokenizer
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()

data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)

textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(),
                              stopWords=selfstopwords, outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords, stopwords1, cv, idf, lda])

model = pipe1.fit(df)
output = model.transform(df)


def topicsTerms(vocab, termindices, leng=None):
    if not leng:
        return [vocab[t] for t in termindices]
    return [vocab[t] for t in termindices][:leng]
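# Hedged usage sketch (not in the original snippet): map each LDA topic's term
# indices back to words using the fitted CountVectorizer vocabulary. Stage
# indices follow pipe1 above: stages[3] is the CountVectorizerModel and
# stages[-1] is the LDAModel.
vocab = model.stages[3].vocabulary
topics = model.stages[-1].describeTopics(numOfKeywords)
topicsWords = [topicsTerms(vocab, row.termIndices) for row in topics.collect()]
for i, words in enumerate(topicsWords):
    print("Topic {}: {}".format(i, words))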
sampled = malicious.unionAll(sample_bening)
sampled.groupby('label').count().toPandas()

# # Data Ingestion and Vectorization

# In[18]:

# Tokenize the training data - split the URL string into words
regexTokenizer = RegexTokenizer(inputCol="url", outputCol="Words", pattern="\\W")

# CountVectorizer converts the words into feature vectors - this is used as it gives better results
countVectors = CountVectorizer(inputCol=regexTokenizer.getOutputCol(),
                               outputCol="rawfeatures", vocabSize=10000, minDF=5)

idf = IDF(inputCol=countVectors.getOutputCol(), outputCol="features")

# create the pipeline
pipeline = Pipeline(stages=[regexTokenizer, countVectors, idf])

# Fit the pipeline to training documents.
# Pass 'sampled' in the param to use the balanced dataset
pipelineFit = pipeline.fit(sampled)

# Transform the dataset with the fitted pipeline
# Pass 'sampled' in the param to use the balanced dataset
dataset = pipelineFit.transform(sampled)
    row_rdd = spark.sparkContext.textFile(_locate("smsData.txt")) \
        .map(lambda x: x.split("\t", 1)) \
        .filter(lambda r: r[0].strip())
    return spark.createDataFrame(row_rdd, ["label", "text"])

##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# schema for dataframe
schema = StructType(
    [StructField("url", StringType()),
     StructField("label", IntegerType())])

# read the data to generate a dataframe
df = spark.read.schema(schema).csv(
    "file:///home/hadoop/Documents/LR-on-Malicious-Link/data/datacp.csv",
    header=True)

# string regex tokenizer
tokenizer = RegexTokenizer(inputCol="url", outputCol="words", pattern="/")

# hashing term frequency
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="rawFeatures",
                      numFeatures=20)

# inverse document frequency
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

# logistic regression
lr = LogisticRegression(maxIter=100, regParam=0.001)

# add stages to pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

# train the model
model = pipeline.fit(df)
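# Hedged usage sketch (not in the original snippet): score the training data with
# the fitted pipeline and report area under ROC; evaluating on a held-out split
# would be the more rigorous choice.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.transform(df)
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
print("AUC: %s" % evaluator.evaluate(predictions))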
# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents. Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")
idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")
normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages. We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`. This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
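# The original cell ends after the imports above; the lines below are a hedged
# sketch of the step the markdown cell describes. The DataFrame name `corpusDF`
# and the cluster count k=20 are illustrative assumptions.
kmeans = KMeans().setK(20).setFeaturesCol(normalizer.getOutputCol()).setSeed(0)

pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
model = pipeline.fit(corpusDF)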