Example 1
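Checks StarTable.isStarTable: the same DataFrame is saved once in parquet format and once in star format, and only the star path is recognized as a star table.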
    def test_is_star_table(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        tempFile2 = self.tempFile + '_2'
        df.write.format("star").save(tempFile2)
        self.assertFalse(StarTable.isStarTable(self.spark, self.tempFile))
        self.assertTrue(StarTable.isStarTable(self.spark, tempFile2))
Example 2
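Registers a Scala merge operator class under the SQL function name long_op and applies it on read via expr (from pyspark.sql.functions). Judging by the expected answer, MergeOpLong adds each upserted value to the existing value for matching keys.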
    def test_read_with_merge_operator(self):
        self.__overwriteHashStarTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        table = StarTable.forPath(self.spark, self.tempFile)
        df = self.spark.createDataFrame([('a', 11), ('b', 22), ('e', 55), ('f', 66)], ["key", "value"])
        table.upsert(df._jdf)

        StarTable.registerMergeOperator(self.spark, "org.apache.spark.sql.star.MergeOpLong", "long_op")
        re = table.toDF().withColumn("value", expr("long_op(value)"))
        print(str(re.schema))
        re.explain(True)  # explain() prints the plan itself and returns None
        self.__checkAnswer(re.select("key", "value"),
                           ([('a', 12), ('b', 24), ('c', 3), ('d', 4), ('e', 55), ('f', 66)]))
Example 3
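Upserts overlapping rows into a hash-partitioned table and then calls compaction(); the compacted table holds the merged result.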
    def test_compaction(self):
        self.__overwriteHashStarTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        table = StarTable.forPath(self.spark, self.tempFile)
        df = self.spark.createDataFrame([('a', 11), ('b', 22), ('e', 55), ('f', 66)], ["key", "value"])
        table.upsert(df._jdf)
        table.compaction()
        self.__checkAnswer(table.toDF().select("key", "value"),
                           ([('a', 11), ('b', 22), ('c', 3), ('d', 4), ('e', 55), ('f', 66)]))
Example 4
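Basic upsert: rows with matching keys are replaced and new keys are appended. Note that the Python wrapper is handed the underlying Java DataFrame via df._jdf.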
    def test_upsert(self):
        self.__overwriteHashStarTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        source = self.spark.createDataFrame([('a', -1), ('b', 0), ('e', -5), ('f', -6)], ["key", "value"])

        table = StarTable.forPath(self.spark, self.tempFile)
        table.upsert(source._jdf)
        self.__checkAnswer(table.toDF(),
                           ([('a', -1), ('b', 0), ('c', 3), ('d', 4), ('e', -5), ('f', -6)]))
Example 5
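Compaction combined with a merge operator: mergeOperatorInfo maps the value column to MergeOpLong, so compaction merges values for duplicate keys instead of keeping only the latest one.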
    def test_compaction_with_merge_operator(self):
        self.__overwriteHashStarTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        table = StarTable.forPath(self.spark, self.tempFile)
        df = self.spark.createDataFrame([('a', 11), ('b', 22), ('e', 55), ('f', 66)], ["key", "value"])
        table.upsert(df._jdf)
        merge_info = {"value": "org.apache.spark.sql.star.MergeOpLong"}
        table.compaction(mergeOperatorInfo=merge_info)
        self.__checkAnswer(table.toDF().select("key", "value"),
                           ([('a', 12), ('b', 24), ('c', 3), ('d', 4), ('e', 55), ('f', 66)]))
Example 6
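Exercises cleanup(): with the default retention interval the stray files survive, but after setting spark.engineplus.star.cleanup.interval to 0 a second cleanup() removes them.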
    def test_cleanup(self):
        self.__writeStarTable([('a', 1), ('b', 2), ('c', 3)])
        table = StarTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertTrue(self.__checkFileExists('abc.txt'))
        table.cleanup()  # will not delete files as default retention is used.

        self.assertTrue(self.__checkFileExists('bac.txt'))
        retentionConf = "spark.engineplus.star.cleanup.interval"
        self.spark.conf.set(retentionConf, "0")
        table.cleanup()
        self.assertFalse(self.__checkFileExists('bac.txt'))
        self.assertFalse(self.__checkFileExists('abc.txt'))
Example 7
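delete accepts the condition as a SQL string or as a Column (col and lit come from pyspark.sql.functions); calling it without a condition deletes all rows, and any other argument type raises a TypeError.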
    def test_delete(self):
        self.__writeStarTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        table = StarTable.forPath(self.spark, self.tempFile)

        # delete with condition as str
        table.delete("key = 'a'")
        self.__checkAnswer(table.toDF(), [('b', 2), ('c', 3), ('d', 4)])

        # delete with condition as Column
        table.delete(col("key") == lit("b"))
        self.__checkAnswer(table.toDF(), [('c', 3), ('d', 4)])

        # delete without condition
        table.delete()
        self.__checkAnswer(table.toDF(), [])

        # bad args
        with self.assertRaises(TypeError):
            table.delete(condition=1)
Example 8
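update takes an optional condition (string or Column) and a set dict mapping column names to string or Column expressions; the second half verifies the validation errors for missing or mistyped arguments.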
    def test_update(self):
        self.__writeStarTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        table = StarTable.forPath(self.spark, self.tempFile)

        # update with condition as str and with set exprs as str
        table.update("key = 'a' or key = 'b'", {"value": "1"})
        self.__checkAnswer(table.toDF(), [('a', 1), ('b', 1), ('c', 3), ('d', 4)])

        # update with condition as Column and with set exprs as Columns
        table.update(expr("key = 'a' or key = 'b'"), {"value": expr("0")})
        self.__checkAnswer(table.toDF(), [('a', 0), ('b', 0), ('c', 3), ('d', 4)])

        # update without condition
        table.update(set={"value": "200"})
        self.__checkAnswer(table.toDF(), [('a', 200), ('b', 200), ('c', 200), ('d', 200)])

        # bad args
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            table.update({"value": "200"})

        with self.assertRaisesRegex(ValueError, "cannot be None"):
            table.update(condition='a')

        with self.assertRaisesRegex(TypeError, "must be a dict"):
            table.update(set=1)

        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            table.update(1, {})

        with self.assertRaisesRegex(TypeError, "Values of dict in .* must contain only"):
            table.update(set={"value": 1})

        with self.assertRaisesRegex(TypeError, "Keys of dict in .* must contain only"):
            table.update(set={1: ""})

        with self.assertRaises(TypeError):
            table.update(set=1)
Example 9
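toDF() yields an ordinary Spark DataFrame, so standard operations such as alias and qualified column selection work unchanged.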
    def test_alias_and_toDF(self):
        self.__writeStarTable([('a', 1), ('b', 2), ('c', 3)])
        table = StarTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            table.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])
Example 10
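Looks up a star table by its catalog name with StarTable.forName after saving it as a named table.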
    def test_forName(self):
        self.__writeAsTable([('a', 1), ('b', 2), ('c', 3)], "test")
        df = StarTable.forName(self.spark, "test").toDF()
        self.__checkAnswer(df, [('a', 1), ('b', 2), ('c', 3)])
Example 11
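Loads a star table from a filesystem path with StarTable.forPath.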
    def test_forPath(self):
        self.__writeStarTable([('a', 1), ('b', 2), ('c', 3)])
        table = StarTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(table, [('a', 1), ('b', 2), ('c', 3)])
Example 12
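dropTable() removes the table, after which isStarTable reports False for the path.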
    def test_drop_table(self):
        self.__writeStarTable([('a', 1), ('b', 2), ('c', 3)])
        StarTable.forPath(self.spark, self.tempFile).dropTable()
        self.assertFalse(StarTable.isStarTable(self.spark, self.tempFile))
Example 13
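Writes a table range-partitioned on key (the rangePartitions write option) and removes a single partition with dropPartition.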
    def test_drop_partition(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("star").option("rangePartitions", "key").save(self.tempFile)
        StarTable.forPath(self.spark, self.tempFile).dropPartition("key='a'")
        self.__checkAnswer(StarTable.forPath(self.spark, self.tempFile).toDF().select("key", "value"),
                           ([('b', 2), ('c', 3)]))