Example #1
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])

    pipeline.write().overwrite().save("file://" +
                                      os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" +
                                   os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" +
                                   os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
Example #2
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


class NaiveBayesModel:
	"""
	Creates a Naive Bayes model using pipelines
	"""
	def __init__(self, training_data):
		self.training_data = training_data

		self.regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
		self.remover = StopWordsRemover(inputCol=self.regex_tokenizer.getOutputCol(), outputCol="filtered")
		self.hashing_tf = HashingTF(inputCol=self.remover.getOutputCol(), outputCol="features")

		# The column names "features" and "label" are the defaults in the Spark ML
		# NaiveBayes API, so there is no need to specify them explicitly
		self.naive_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")

		self.model = (
			Pipeline(stages=[
				self.regex_tokenizer,
				self.remover,
				self.hashing_tf,
				self.naive_bayes
			])
			.fit(training_data)
		)

	def get_model(self):
		return self.model

	def calculate_accuracy(self, test_data):
		predictions = self.model.transform(test_data)

		evaluator = MulticlassClassificationEvaluator(
			labelCol="label", predictionCol="prediction",
			metricName="accuracy"
		)

		accuracy = evaluator.evaluate(predictions)
		print("Model accuracy: %s" % accuracy)
Example #3
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, NGram, \
        CountVectorizer, IDF, Word2Vec
from pyspark.sql.functions import udf, col, explode, collect_list, to_date, concat
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, \
        FloatType, ArrayType, BooleanType
from nltk.stem import SnowballStemmer


# Import json objects from tar file
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

# Set up the pipeline stages for adding ML features - tokens, stems, n-grams, tf, tf-idf, word2vec
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens", pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_large')

pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

# Use the model to transform the data
df_transformed = model.transform(opinion_df)
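Since the fitted pipeline contains both a 2-dimensional and a 250-dimensional Word2Vec model, here is a short sketch of how the 2-D vectors could be pulled out of the fitted model (e.g. for plotting); the stage index follows the stages list above, and the rest is an assumption.

# The Word2VecModel for the 2-D embedding is the sixth stage (index 5) of the fitted pipeline.
w2v_2d_model = model.stages[5]
word_vectors_2d = w2v_2d_model.getVectors()   # DataFrame with columns: word, vector
word_vectors_2d.show(5, truncate=False)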
Example #4
                      val='body',
                      inputCol='subreddit',
                      outputCol='body')
cleaner = Cleaner(key='subreddit',
                  val='body',
                  inputCol=extractor.getOutputCol(),
                  outputCol='body')
filterer = Filterer(key='subreddit',
                    val='body',
                    inputCol='subreddit',
                    outputCol='body',
                    minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(),
                           outputCol="tokens",
                           pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(),
                     outputCol="tf",
                     minDF=args.mindf,
                     vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(),
                      outputCol='top_words',
                      nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit',
                                  outputCol='norm',
                                  spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits',
                                nsubreddits=args.nsubreddits)
Example #5
    return spark.createDataFrame(row_rdd, ["label", "text"])


##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

## Create H2OAutoML model
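The snippet ends right where the H2OAutoML stage would be defined. Below is a minimal sketch of how that stage and the pipeline might look, loosely following the Sparkling Water ham-or-spam example; the import path and parameter names are assumptions and may differ between Sparkling Water versions.

# Assumed completion, not part of the original snippet.
from pysparkling.ml import H2OAutoML

automl = H2OAutoML(labelCol="label",      # assumed label column name
                   maxRuntimeSecs=300,    # assumed time budget
                   seed=1)

pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, automl])
model = pipeline.fit(data)  # `data` stands for the ["label", "text"] DataFrame built above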
Example #6

##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered",
    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="tf_idf", minDocFreq=4)

## Create H2ODeepLearning model
dl = H2ODeepLearning(epochs=10,
                     l1=0.001,
Example #7
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))


nltk.download('stopwords')

# list of stopwords to be removed from the posts
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
RegexTokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                outputCol="words",
                                pattern="[^0-9a-z#+_]+")
StopwordRemover = StopWordsRemover(
    inputCol=RegexTokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
CountVectorizer = CountVectorizer(inputCol=StopwordRemover.getOutputCol(),
                                  outputCol="countFeatures",
                                  minDF=5)
idf = IDF(inputCol=CountVectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, RegexTokenizer, StopwordRemover,
Example #8
def update_text_with_key_ngrams(df, n, seed=42,
                                outputCol="ngram_text",
                                pattern=r"(?!(?<='))\w+"):
    def build_text(words):
        # Turn the bag of words back into a sentence and check whether any of
        # the key n-grams occurs in it,
        # e.g. bag of words = ["hi", "i", "ralf"] and key_bigram = "i ralf" -->
        # sentence = "hi i ralf", so the key_bigram occurs in it.
        # If an n-gram occurs, replace the corresponding words in the sentence
        # with the underscore version of the n-gram ("i_ralf").
        sentence = ' '.join(words)
        for ngram in key_ngrams:
            if ngram in sentence:
                sentence = sentence.replace(ngram, ngram.replace(" ", "_"))
        return sentence

    outputs = {
        "tokenizer": "words",
        "ngram": "ngrams",
        "cv": "tf",
        "idf": "tf_idf",
        "build_text_udf": outputCol
    }

    # Build pipeline
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol=outputs["tokenizer"],
                               pattern=pattern,
                               gaps=False)
    ngram = NGram(n=n,
                  inputCol=tokenizer.getOutputCol(),
                  outputCol=outputs["ngram"])
    cv = CountVectorizer(inputCol=ngram.getOutputCol(),
                         outputCol=outputs["cv"])
    idf = IDF(inputCol=cv.getOutputCol(),
              outputCol=outputs["idf"])
    pipe = Pipeline(stages=[
        tokenizer,  # transform
        ngram,  # transform
        cv,  # fit_transform
        idf  # fit
    ])

    print("\t Computing tf_idf matrix for {}-grams...".format(n))
    pipe_model = pipe.fit(df)  # calls transform on tokenizer & ngram,
    # fit_transform on cv and fit on idf
    vocabulary = np.array(pipe_model.stages[2].vocabulary)
    print("\t\t vocabulary size: {}".format(len(vocabulary)))
    df = pipe_model.transform(df)

    # train test split
    train, _ = df.randomSplit([0.8, 0.2], seed=seed)
    train.persist(StorageLevel.MEMORY_AND_DISK)

    # fit linear SVM
    svc = LinearSVC(maxIter=100,
                    regParam=0.1,
                    featuresCol="tf_idf")
    print("\t Estimating key {}-grams with SVC...".format(n))
    svc_model = svc.fit(train)

    # Select the n-grams with the most extreme (worst/best) weights
    print("\t Update text with key {}-grams...".format(n))
    coeffs = svc_model.coefficients.toArray()
    key_ngrams = get_n_extremes_of_a_in_b(coeffs, vocabulary, 50)

    build_text_udf = F.udf(build_text)

    df = df.withColumn(outputs["build_text_udf"],
                       build_text_udf(
                           F.col(tokenizer.getOutputCol())))
    print()
    return df
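A brief usage sketch for the function above, assuming a DataFrame df with a "text" column and a numeric "label" column (the LinearSVC fitted inside the function expects the default label column); the call site is an assumption.

# Hypothetical call: adds a column in which the 50 n-grams with the most extreme
# SVM weights are joined by underscores in the rebuilt text.
df_bigrams = update_text_with_key_ngrams(df, n=2, seed=42, outputCol="ngram_text")
df_bigrams.select("ngram_text").show(5, truncate=80)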
Example #9
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover, RegexTokenizer
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()

data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)
textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(), stopWords=selfstopwords, outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords, stopwords1, cv, idf, lda])

model = pipe1.fit(df)
output = model.transform(df)

def topicsTerms(vocab, termindices, leng=None):
  if not leng:
    return [vocab[t] for t in termindices]
  return [vocab[t] for t in termindices][:leng]
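A sketch of how topicsTerms might be wired to the fitted pipeline above, taking the vocabulary from the CountVectorizer model (stage 3) and the topic/term indices from the LDA model's describeTopics output (stage 5); this wiring is an assumption, not part of the original snippet.

from pyspark.sql.types import ArrayType, StringType

vocab = model.stages[3].vocabulary        # CountVectorizerModel vocabulary
lda_model = model.stages[5]               # fitted LDAModel

topics = lda_model.describeTopics(numOfKeywords)
terms_udf = udf(lambda indices: topicsTerms(vocab, indices), ArrayType(StringType()))
topics.withColumn('terms', terms_udf(col('termIndices'))) \
      .select('topic', 'terms') \
      .show(truncate=False)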
Example #10
sampled = malicious.unionAll(sample_bening)

sampled.groupby('label').count().toPandas()


# # Data Ingestion and Vectorization

# In[18]:


# Tokenize the training data - parse the URL string into words
regexTokenizer = RegexTokenizer(inputCol="url", outputCol="Words", pattern="\\W")

# CountVectorizer converts the words into feature vectors - this is used as it gives better results
countVectors = CountVectorizer(inputCol=regexTokenizer.getOutputCol(), outputCol="rawfeatures", vocabSize=10000, minDF=5)

# IDF rescales the raw term counts by inverse document frequency
idf = IDF(inputCol=countVectors.getOutputCol(), outputCol="features") 

# Create the pipeline
pipeline = Pipeline(stages=[regexTokenizer, countVectors, idf ])


# Fit the pipeline to the training documents.
# Pass 'sampled' as the parameter to use the balanced dataset
pipelineFit = pipeline.fit(sampled)

# Transform the dataset with the fitted pipeline
# Pass 'sampled' as the parameter to use the balanced dataset
dataset = pipelineFit.transform(sampled)
Example #11
    row_rdd = spark.sparkContext.textFile(_locate("smsData.txt")).map(lambda x: x.split("\t", 1)).filter(lambda r: r[0].strip())
    return spark.createDataFrame(row_rdd, ["label", "text"])

##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
idf = IDF(inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf",
          minDocFreq=4)

Example #12
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.ml.feature import RegexTokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# schema for dataframe
schema = StructType(
    [StructField("url", StringType()),
     StructField("label", IntegerType())])
# read the data to generate a dataframe
df = spark.read.schema(schema).csv(
    "file:///home/hadoop/Documents/LR-on-Malicious-Link/data/datacp.csv",
    header=True)

# string regex tokenizer
tokenizer = RegexTokenizer(inputCol="url", outputCol="words", pattern="/")
# hashing term frequency
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                      outputCol="rawFeatures",
                      numFeatures=20)
# inverse document frequency
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")

# logistic regression
lr = LogisticRegression(maxIter=100, regParam=0.001)

# add stages to pipeline
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, lr])

# train the model
model = pipeline.fit(df)
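A short follow-up sketch scoring unseen URLs with the fitted model; the sample URLs are illustrative only, and the same spark session and schema defined above are assumed.

# Hypothetical scoring of new URLs with the fitted pipeline model.
new_urls = spark.createDataFrame(
    [("example.com/login.php?id=1", None),
     ("github.com/apache/spark", None)],
    schema)
predictions = model.transform(new_urls)
predictions.select("url", "probability", "prediction").show(truncate=False)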
Example #13
# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents.  Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")

idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")

normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")

# COMMAND ----------

# MAGIC %md
# MAGIC Now, let's build the `KMeans` estimator and a `Pipeline` that will contain all of the stages.  We'll then call fit on the `Pipeline` which will give us back a `PipelineModel`.  This will take about a minute to run.

# COMMAND ----------

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
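The notebook is truncated here; the lines below are a minimal sketch of the cell the markdown above describes (build the KMeans estimator, assemble the Pipeline, and fit it to get a PipelineModel), assuming the documents DataFrame is named df and an arbitrary k of 20.

# Assumed completion, not part of the original notebook.
kmeans = KMeans().setK(20).setSeed(1).setFeaturesCol(normalizer.getOutputCol())

pipeline = Pipeline().setStages([tokenizer, hashingTF, idf, normalizer, kmeans])
pipelineModel = pipeline.fit(df)  # returns a PipelineModel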