Exemple #1
0
 def test_stopwordsremover(self):
     """Check StopWordsRemover with default, custom, per-language and
     locale-sensitive stop word lists.

     Each section reconfigures the same remover instance and asserts on the
     ``output`` column of the first transformed row.
     """
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     remover = StopWordsRemover(inputCol="input", outputCol="output")

     # Default: "a" is filtered out by the built-in list, "panda" survives.
     self.assertEqual(remover.getInputCol(), "input")
     transformed = remover.transform(dataset)
     self.assertEqual(transformed.head().output, ["panda"])
     default_stopwords = remover.getStopWords()
     self.assertIsInstance(default_stopwords, list)
     # `basestring` only exists on Python 2; `str` is the text type on
     # Python 3, and assertIsInstance reports the offending type on failure.
     self.assertIsInstance(default_stopwords[0], str)

     # Custom: replacing the list means only "panda" is removed now.
     stopwords = ["panda"]
     remover.setStopWords(stopwords)
     self.assertEqual(remover.getInputCol(), "input")
     self.assertEqual(remover.getStopWords(), stopwords)
     transformed = remover.transform(dataset)
     self.assertEqual(transformed.head().output, ["a"])

     # Language selection: every input token is expected to be in the
     # bundled Turkish list, so nothing is left.
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame(
         [Row(input=["acaba", "ama", "biri"])])
     remover.setStopWords(stopwords)
     self.assertEqual(remover.getStopWords(), stopwords)
     transformed = remover.transform(dataset)
     self.assertEqual(transformed.head().output, [])

     # Locale: with the "tr" locale the upper-case stop word "BELKİ" is
     # expected to match the lower-case token "belki".
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     remover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(remover.getStopWords(), stopwords)
     transformed = remover.transform(dataset)
     self.assertEqual(transformed.head().output, [])
Exemple #2
0
 def test_stopwordsremover(self):
     """Exercise StopWordsRemover across four configurations.

     Covers the default stop word list, a caller-supplied list, the list
     loaded for a named language, and matching under an explicit locale.
     The same remover instance is reconfigured between sections.
     """
     dataset = self.spark.createDataFrame([Row(input=["a", "panda"])])
     stopWordRemover = StopWordsRemover(inputCol="input", outputCol="output")

     # Default
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["panda"])
     self.assertIsInstance(stopWordRemover.getStopWords(), list)
     # Use `str` rather than the Python 2-only `basestring`, which is
     # undefined on Python 3 unless a compatibility alias is provided.
     self.assertIsInstance(stopWordRemover.getStopWords()[0], str)

     # Custom
     stopwords = ["panda"]
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getInputCol(), "input")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a"])

     # With language selection: all three tokens should be removed.
     stopwords = StopWordsRemover.loadDefaultStopWords("turkish")
     dataset = self.spark.createDataFrame([Row(input=["acaba", "ama", "biri"])])
     stopWordRemover.setStopWords(stopwords)
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])

     # With locale: the stop word is upper-case and the token lower-case;
     # the test expects them to match once the locale is set to "tr".
     stopwords = ["BELKİ"]
     dataset = self.spark.createDataFrame([Row(input=["belki"])])
     stopWordRemover.setStopWords(stopwords).setLocale("tr")
     self.assertEqual(stopWordRemover.getStopWords(), stopwords)
     transformedDF = stopWordRemover.transform(dataset)
     self.assertEqual(transformedDF.head().output, [])