class SentimentalPipelineEngine(PipelineEngine):
    def __init__(self, cv):
        super(SentimentalPipelineEngine, self).__init__(cv)
        self.tokenizer_map = [TweetTokenizer()]
        self.ngram_map = [1]
        self.hashing_tf_map = [pow(2, 20)]
        self.clf_map = [0.1]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=self.stages)
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
        self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
        self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
        self.idf = IDF(inputCol="features", outputCol="idf_features")
        self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
        self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
        # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000, layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
        return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter, self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
        param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
        return param_grid_builder.build()
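
# The custom stages above (BeautifulSoupParser, Tokenizzzer, PorterStemmerTransformer)
# are not shown in this snippet. Below is a hedged sketch of what Tokenizzzer might
# look like, assuming it wraps an NLTK-style tokenizer object exposed as a tunable
# Param (which is what addGrid(self.tokenizer.tokenizer, ...) suggests); this is an
# illustration, not the original implementation.
from pyspark.ml import Transformer
from pyspark.ml.param import Param, Params
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType


class TokenizzzerSketch(Transformer, HasInputCol, HasOutputCol):
    """Wraps any object with a tokenize(str) -> list[str] method as a Transformer."""

    tokenizer = Param(Params._dummy(), "tokenizer",
                      "object exposing a tokenize(str) -> list[str] method")

    def __init__(self, inputCol=None, outputCol=None, tokenizer=None):
        super(TokenizzzerSketch, self).__init__()
        if inputCol is not None:
            self._set(inputCol=inputCol)
        if outputCol is not None:
            self._set(outputCol=outputCol)
        if tokenizer is not None:
            self._set(tokenizer=tokenizer)

    def _transform(self, dataset):
        # Apply the wrapped tokenizer row by row via a UDF
        tok = self.getOrDefault(self.tokenizer)
        tokenize_udf = F.udf(lambda text: tok.tokenize(text),
                             ArrayType(StringType()))
        return dataset.withColumn(self.getOutputCol(),
                                  tokenize_udf(F.col(self.getInputCol())))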
Example #2
def test_ngram(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
    ngram0 = NGram(n=4, inputCol="input", outputCol="output")
    self.assertEqual(ngram0.getN(), 4)
    self.assertEqual(ngram0.getInputCol(), "input")
    self.assertEqual(ngram0.getOutputCol(), "output")
    transformedDF = ngram0.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
Example #4
def create_pipeline():
    """
    creates model pipeline
    Currently uses RegexTokenizer to get bytewords as tokens, hashingTF to
    featurize the tokens as word counts, and NaiveBayes to fit and classify

    This is where most of the work will be done in improving the model
    """

    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol="words",
                               pattern="(?<=\\s)..",
                               gaps=False)
    ngram = NGram(n=2, inputCol="words", outputCol="grams")
    hashingTF = HashingTF(numFeatures=65792,
                          inputCol=ngram.getOutputCol(),
                          outputCol="features")
    nb = NaiveBayes(smoothing=1)
    pipeline = Pipeline(stages=[tokenizer, ngram, hashingTF, nb])

    return pipeline
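
# Hedged usage sketch (not part of the original example): fitting the pipeline
# returned by create_pipeline() on a tiny toy DataFrame. Assumes the pyspark.ml
# imports used by create_pipeline (RegexTokenizer, NGram, HashingTF, NaiveBayes,
# Pipeline) are in scope; the hex-style "byteword" rows and labels are made up
# for illustration only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("bytewords-nb").getOrCreate()
toy = spark.createDataFrame(
    [("00 1a ff 00 3b", 0.0), ("de ad be ef 00", 1.0),
     ("00 1a ff ff 3b", 0.0), ("de ad be ef ff", 1.0)],
    ["text", "label"])

model = create_pipeline().fit(toy)   # tokenize -> bigrams -> hashed counts -> NaiveBayes
model.transform(toy).select("text", "prediction").show(truncate=False)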
Example #5
spark = SparkSession.builder.master("local").appName("Word Count").config(
    "spark.some.config.option", "some-value").getOrCreate()

df = spark.read.csv('file:///home/zfar/Sentiment Analysis Dataset.csv',
                    header=True)

df = df.select(df['ItemID'], df['SentimentText'], df['label'])

training = df.selectExpr("cast(itemID as int) id", "SentimentText",
                         "cast(label as int) label")

tokenizer = Tokenizer(inputCol="SentimentText", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")
ngrams = NGram(n=2, inputCol=remover.getOutputCol(), outputCol="ngrams")
hashingTF = HashingTF(inputCol=ngrams.getOutputCol(), outputCol="rawfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idffeatures")
normalizer = Normalizer(inputCol=idf.getOutputCol(),
                        outputCol="features",
                        p=1.0)

#lr = LogisticRegression(maxIter=10, regParam=0.001)
nb = NaiveBayes(smoothing=1.0)
pipeline = Pipeline(
    stages=[tokenizer, remover, ngrams, hashingTF, idf, normalizer, nb])
model = pipeline.fit(training)
"""
paramGrid = ParamGridBuilder().addGrid(hashingTF.numFeatures, [10, 100, 1000]).addGrid(lr.regParam, [0.1, 0.01]).build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
"""
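
# Hedged completion sketch (not in the original snippet, which breaks off in the
# commented-out block above): how the grid search it hints at could be wired up
# with the live `pipeline`, `training`, `hashingTF` and `nb` objects defined
# earlier. The grid values, the evaluator choice and numFolds are assumptions,
# and nb.smoothing stands in for the commented-out lr.regParam.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

paramGrid = (ParamGridBuilder()
             .addGrid(hashingTF.numFeatures, [1 << 10, 1 << 16])
             .addGrid(nb.smoothing, [0.5, 1.0])
             .build())
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=MulticlassClassificationEvaluator(metricName="accuracy"),
                          numFolds=3)
cv_model = crossval.fit(training)
Example #6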
def update_text_with_key_ngrams(df, n, seed=42,
                                outputCol="ngram_text",
                                pattern=r"(?!(?<='))\w+"):
    def build_text(words):
        # Join the bag of words into a sentence and check whether any of the
        # key_ngrams occurs in it, e.g. bag of words = ["hi", "i", "ralf"] and
        # key_bigram = "i ralf" --> sentence = "hi i ralf", which contains the
        # key_bigram. If an ngram does occur, replace its words in the sentence
        # with the underscore version of the ngram ("i_ralf").
        sentence = ' '.join(words)
        for ngram in key_ngrams:
            if ngram in sentence:
                sentence = sentence.replace(ngram, ngram.replace(" ", "_"))
        return sentence

    outputs = {
        "tokenizer": "words",
        "ngram": "ngrams",
        "cv": "tf",
        "idf": "tf_idf",
        "build_text_udf": outputCol
    }

    # Build pipeline
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol=outputs["tokenizer"],
                               pattern=pattern,
                               gaps=False)
    ngram = NGram(n=n,
                  inputCol=tokenizer.getOutputCol(),
                  outputCol=outputs["ngram"])
    cv = CountVectorizer(inputCol=ngram.getOutputCol(),
                         outputCol=outputs["cv"])
    idf = IDF(inputCol=cv.getOutputCol(),
              outputCol=outputs["idf"])
    pipe = Pipeline(stages=[
        tokenizer,  # transform
        ngram,  # transform
        cv,  # fit_transform
        idf  # fit
    ])

    print("\t Computing tf_idf matrix for {}-grams...".format(n))
    pipe_model = pipe.fit(df)  # calls transform on tokenizer & ngram,
    # fit_transform on cv and fit on idf
    vocabulary = np.array(pipe_model.stages[2].vocabulary)
    print("\t\t vocabulary size: {}".format(len(vocabulary)))
    df = pipe_model.transform(df)

    # train test split
    train, _ = df.randomSplit([0.8, 0.2], seed=seed)
    train.persist(StorageLevel.MEMORY_AND_DISK)

    # fit linear SVM
    svc = LinearSVC(maxIter=100,
                    regParam=0.1,
                    featuresCol="tf_idf")
    print("\t Estimating key {}-grams with SVC...".format(n))
    svc_model = svc.fit(train)

    # Pick the ngrams with the most extreme (most negative/most positive)
    # weights; see the sketch after this function for the undefined helper.
    print("\t Update text with key {}-grams...".format(n))
    coeffs = svc_model.coefficients.toArray()
    key_ngrams = get_n_extremes_of_a_in_b(coeffs, vocabulary, 50)

    build_text_udf = F.udf(build_text)

    df = df.withColumn(outputs["build_text_udf"],
                       build_text_udf(
                           F.col(tokenizer.getOutputCol())))
    print()
    return df
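
# The helper get_n_extremes_of_a_in_b used above is not defined in this snippet.
# A minimal sketch of what it appears to do, judging from how it is called
# (return the entries of `b` whose values in `a` are among the n most negative
# and n most positive); this is an assumption, not the original implementation.
import numpy as np

def get_n_extremes_of_a_in_b(a, b, n):
    order = np.argsort(a)                        # ascending by coefficient
    extreme_idx = np.concatenate([order[:n],     # n most negative weights
                                  order[-n:]])   # n most positive weights
    return list(np.asarray(b)[extreme_idx])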
Example #7
text = tokenizer.transform(text)
text.show(5)
#tokenizer = Tokenizer(inputCol='paragraph', outputCol='words')
#text = tokenizer.transform(text)
#text.show(5)

remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
text = remover.transform(text)
text.show(5)

ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)


words_len = text.select(ngramer.getOutputCol()).rdd.map(lambda x: len(x[0])).collect()
# Word2Vec's vectorSize must be an integer, so cast the median ngram count
word2vec = Word2Vec(vectorSize=int(np.quantile(words_len, 0.5)),
                    minCount=0,
                    inputCol='ngrams',
                    outputCol='word_vec')
text = word2vec.fit(text).transform(text)
text.show(5)


# Note: Spark's LDA expects non-negative term-count style feature vectors;
# Word2Vec embeddings can contain negative values, so a count-based featurizer
# (as in the next example) is the safer input here.
lda = LDA(featuresCol=word2vec.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)

topics = lda_model.describeTopics(5)
topics.show(5)

text = lda_model.transform(text)
text.show(5)

Example #8
tokenizer = RegexTokenizer(inputCol='paragraph',
                           outputCol='words',
                           gaps=False,
                           pattern='[a-zA-Z]+')  # letters only; '|' inside a character class is matched literally
text = tokenizer.transform(text)
text.show(5)

remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
text = remover.transform(text)
text.show(5)

ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)

count_vec = CountVectorizer(inputCol=ngramer.getOutputCol(),
                            outputCol='ft_features')
count_vec_model = count_vec.fit(text)
vocab = count_vec_model.vocabulary
text = count_vec_model.transform(text)
text.show(5)

idf = IDF(inputCol=count_vec.getOutputCol(), outputCol='features')
text = idf.fit(text).transform(text)

lda = LDA(featuresCol=idf.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)

topics = lda_model.describeTopics()
# topics_words = topics.rdd.map(lambda x: x['termIndices']).map(lambda x:[vocab[i] for i in x]).collect()
get_topics_words = F.udf(lambda x: [vocab[i] for i in x],