Example #1
class TestWriteCassandra(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        CassandraFixture(
            'cassandra.docker',  # Cassandra host
            # CQL script executed on fixture setup:
            absolute_path(__file__, 'resources', 'test_write', 'cassandra_setup.cql'),
            # CQL script executed on fixture teardown:
            absolute_path(__file__, 'resources', 'test_write', 'cassandra_teardown.cql'),
        )
    ]

    def test_write_cassandra(self):
        df = self.spark.createDataFrame(TEST_DATA)

        df.write_ext.cassandra(
            host='cassandra.docker',
            port=9042,
            keyspace='sparkly_test',
            table='test_writer',
            consistency='ONE',
            mode='overwrite',
        )

        written_df = self.spark.read_ext.by_url(
            'cassandra://cassandra.docker/'
            'sparkly_test/test_writer'
            '?consistency=ONE'
        )
        self.assertDataFrameEqual(written_df, TEST_DATA)
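
The test reads the written rows back through sparkly's URL-based reader. The writer has a URL form as well; a minimal sketch of the same write expressed that way, assuming write_ext.by_url accepts the cassandra:// scheme with consistency and mode as query parameters:

        # Hypothetical URL form of the .cassandra(...) call above.
        df.write_ext.by_url(
            'cassandra://cassandra.docker/sparkly_test/test_writer'
            '?consistency=ONE&mode=overwrite'
        )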
Example #2
class TestElastic6Fixture(SparklyTest):

    session = SparklyTestSessionWithES6

    class_fixtures = [
        ElasticFixture(
            'elastic6.docker',  # Elasticsearch host
            'sparkly_test_fixture',  # index
            'test',  # document type (still required by Elasticsearch 6)
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'mapping.json'),  # mapping
            absolute_path(__file__, 'resources', 'test_fixtures', 'data.json'),  # data
        )
    ]

    def test_elastic_fixture(self):
        df = self.spark.read_ext.by_url(
            'elastic://elastic6.docker/sparkly_test_fixture/test?es.read.metadata=false'
        )
        self.assertDataFrameEqual(df, [
            {
                'name': 'John',
                'age': 56
            },
        ])
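
Unlike the per-test fixtures lists in the other examples, this test declares class_fixtures; as the names suggest, class-level fixtures are set up once for the whole test class rather than around every individual test.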
Example #3
class TestWriteMysql(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        MysqlFixture(
            'mysql.docker',  # MySQL host
            'root',  # user
            None,  # password
            absolute_path(__file__, 'resources', 'test_write', 'mysql_setup.sql'),  # setup script
            absolute_path(__file__, 'resources', 'test_write', 'mysql_teardown.sql'),  # teardown script
        )
    ]

    def test_write_mysql(self):
        df = self.spark.createDataFrame(TEST_DATA)

        df.write_ext.mysql(
            host='mysql.docker',
            port=3306,
            database='sparkly_test',
            table='test_writer',
            mode='overwrite',
            options={'user': 'root', 'password': ''},
        )

        df = self.spark.read_ext.by_url(
            'mysql://mysql.docker/'
            'sparkly_test/test_writer'
            '?user=root&password='
        )
        self.assertDataFrameEqual(df, TEST_DATA)
Example #4
class TestWriteCassandra(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        CassandraFixture(
            'cassandra.docker',
            absolute_path(__file__, 'resources', 'test_write', 'cassandra_setup.cql'),
            absolute_path(__file__, 'resources', 'test_write', 'cassandra_teardown.cql'),
        )
    ]

    def test_write_cassandra(self):
        df = self.spark.createDataFrame(TEST_DATA)

        df.write_ext.cassandra(
            host='cassandra.docker',
            port=9042,
            keyspace='sparkly_test',
            table='test_writer',
            consistency='ONE',
            mode='overwrite',
            # 'overwrite' mode truncates the table first. Either switch the
            # mode to 'append' to modify data already in the table, or keep
            # 'overwrite' and set confirm.truncate to true to acknowledge
            # the truncation.
            options={'confirm.truncate': True},
        )

        written_df = self.spark.read_ext.by_url(
            'cassandra://cassandra.docker/'
            'sparkly_test/test_writer'
            '?consistency=ONE'
        )
        self.assertDataFrameEqual(written_df, TEST_DATA)
Example #5
    def test_cassandra_fixture(self):
        data_in_cassandra = CassandraFixture(
            'cassandra.docker',
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'cassandra_setup.cql'),
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'cassandra_teardown.cql'),
        )

        with data_in_cassandra:
            df = self.spark.read_ext.by_url(
                'cassandra://cassandra.docker/sparkly_test/test')
            self.assertDataFrameEqual(df, [
                {
                    'uid': '1',
                    'countries': {
                        'AE': 13,
                        'BE': 1,
                        'BH': 3,
                        'CA': 1,
                        'DZ': 1,
                        'EG': 206
                    },
                },
            ], fields=['uid', 'countries'])
Example #6
class TestMysqlFixtures(SparklyGlobalSessionTest):

    session = SparklyTestSession

    fixtures = [
        MysqlFixture(
            'mysql.docker',
            'root',
            None,
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'mysql_setup.sql'),
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'mysql_teardown.sql'),
        )
    ]

    def test_mysql_fixture(self):
        df = self.spark.read_ext.by_url(
            'mysql://mysql.docker/sparkly_test/test?user=root&password='
        )
        self.assertDataFrameEqual(df, [
            {
                'id': 1,
                'name': 'john',
                'surname': 'sk',
                'age': 111
            },
        ])
Example #7
class SparklyReaderCassandraTest(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        CassandraFixture(
            'cassandra.docker',
            absolute_path(__file__, 'resources', 'test_read',
                          'cassandra_setup.cql'),
            absolute_path(__file__, 'resources', 'test_read',
                          'cassandra_teardown.cql'),
        )
    ]

    def test_read(self):
        df = self.spark.read_ext.cassandra(
            host='cassandra.docker',
            port=9042,
            keyspace='sparkly_test',
            table='test',
            consistency='ONE',
        )

        self.assertDataFrameEqual(df, [{
            'countries': {
                'DZ': 1,
                'EG': 206,
                'BE': 1,
                'CA': 1,
                'AE': 13,
                'BH': 3
            },
            'uid': '1',
            'created': '1234567894',
        }, {
            'countries': {
                'DZ': 1,
                'EG': 206,
                'BE': 1,
                'CA': 1,
                'AE': 13,
                'BH': 3
            },
            'uid': '2',
            'created': '1234567893',
        }, {
            'countries': {
                'DZ': 1,
                'EG': 206,
                'BE': 1,
                'CA': 1,
                'AE': 13,
                'BH': 3
            },
            'uid': '3',
            'created': '1234567891',
        }])
Example #8
class TestKafkaFixture(SparklyGlobalSessionTest):

    session = SparklyTestSession

    topic = 'sparkly.test.fixture.{}'.format(uuid.uuid4().hex[:10])
    fixtures = [
        KafkaFixture(
            'kafka.docker',
            topic=topic,
            key_serializer=lambda item: json.dumps(item).encode('utf-8'),
            value_serializer=lambda item: json.dumps(item).encode('utf-8'),
            data=absolute_path(__file__, 'resources', 'test_fixtures',
                               'kafka.json'),
        )
    ]

    def test_kafka_fixture(self):
        consumer = KafkaConsumer(
            self.topic,
            bootstrap_servers='kafka.docker:9092',
            key_deserializer=lambda item: json.loads(item.decode('utf-8')),
            value_deserializer=lambda item: json.loads(item.decode('utf-8')),
            auto_offset_reset='earliest',
        )

        actual_data = []
        for _ in range(5):
            message = next(consumer)
            data = {'key': message.key, 'value': message.value}
            actual_data.append(data)

        expected_data = self.spark.read.json(
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'kafka.json'))
        self.assertDataFrameEqual(expected_data, actual_data)
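
This example leans on the kafka-python client in addition to sparkly; a minimal sketch of the imports the snippet assumes (the sparkly paths reflect its usual module layout, stated here as an assumption):

import json
import uuid

from kafka import KafkaConsumer  # kafka-python client

# Assumed sparkly import locations:
from sparkly.testing import KafkaFixture, SparklyGlobalSessionTest
from sparkly.utils import absolute_path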
Example #9
class TestWriteElastic7(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        ElasticFixture(
            'elastic7.docker',  # Elasticsearch host
            'sparkly_test',  # index
            None,  # document type: not used with Elasticsearch 7
            None,  # mapping
            absolute_path(__file__, 'resources', 'test_write', 'elastic7_setup.json'),
        ),
    ]

    def test_write_elastic(self):
        df = self.spark.createDataFrame(TEST_DATA)

        df.write_ext.elastic(
            host='elastic7.docker',
            port=9200,
            es_index='sparkly_test',
            es_type=None,
            mode='overwrite',
            options={
                'es.mapping.id': 'uid',
            },
        )

        df = self.spark.read_ext.by_url(
            'elastic://elastic7.docker/sparkly_test?es.read.metadata=false',
        )
        self.assertDataFrameEqual(df, TEST_DATA)
Example #10
class SparklyReaderElastic7Test(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        ElasticFixture(
            'elastic7.docker',
            'sparkly_test',
            None,
            None,
            absolute_path(__file__, 'resources', 'test_read',
                          'elastic7_setup.json'),
        )
    ]

    def test_elastic(self):
        df = self.spark.read_ext.elastic(
            host='elastic7.docker',
            port=9200,
            es_index='sparkly_test',
            es_type=None,
            query='?q=name:*Smith*',
            options={
                'es.read.field.as.array.include': 'topics',
                'es.read.metadata': 'false',
            },
        )

        self.assertDataFrameEqual(df, ELASTIC_TEST_DATA)
Example #11
class SparklyReaderMySQLTest(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        MysqlFixture(
            'mysql.docker',
            'root',
            None,
            absolute_path(__file__, 'resources', 'test_read',
                          'mysql_setup.sql'),
            absolute_path(__file__, 'resources', 'test_read',
                          'mysql_teardown.sql'),
        )
    ]

    def test_read_mysql(self):
        df = self.spark.read_ext.mysql(host='mysql.docker',
                                       database='sparkly_test',
                                       table='test',
                                       options={
                                           'user': 'root',
                                           'password': '',
                                       })

        self.assertDataFrameEqual(df, [
            {
                'id': 1,
                'name': 'john',
                'surname': 'sk',
                'age': 111
            },
            {
                'id': 2,
                'name': 'john',
                'surname': 'po',
                'age': 222
            },
            {
                'id': 3,
                'name': 'john',
                'surname': 'ku',
                'age': 333
            },
        ])
Example #12
class TestElastic7Fixture(SparklyGlobalSessionTest):

    session = SparklyTestSession

    class_fixtures = [
        ElasticFixture(
            'elastic7.docker',
            'sparkly_test_fixture',
            None,
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'mapping.json'),
            absolute_path(__file__, 'resources', 'test_fixtures',
                          'data_for_es7.json'),
        )
    ]

    def test_elastic_fixture(self):
        df = self.spark.read_ext.by_url(
            'elastic://elastic7.docker/sparkly_test_fixture?es.read.metadata=false'
        )
        self.assertDataFrameEqual(df, [{'name': 'John', 'age': 56}])
Example #13
def setUp(self):
    self.json_decoder = lambda item: json.loads(item.decode('utf-8'))
    self.json_encoder = lambda item: json.dumps(item).encode('utf-8')
    self.topic = 'test.topic.write.kafka.{}'.format(uuid.uuid4().hex[:10])
    self.fixture_path = absolute_path(
        __file__,
        '..',
        'integration',
        'resources',
        'test_write',
        'kafka_setup.json',
    )
    self.expected_data = self.spark.read.json(self.fixture_path)
Example #14
class SparklyReaderElasticTest(SparklyGlobalSessionTest):
    session = SparklyTestSession

    fixtures = [
        ElasticFixture(
            'elastic.docker',
            'sparkly_test',
            'test',
            None,
            absolute_path(__file__, 'resources', 'test_read',
                          'elastic_setup.json'),
        )
    ]

    def test_elastic(self):
        df = self.spark.read_ext.elastic(
            host='elastic.docker',
            port=9200,
            es_index='sparkly_test',
            es_type='test',
            query='?q=name:*Smith*',
            options={
                'es.read.field.as.array.include': 'topics',
                'es.read.metadata': 'false',
            },
        )

        self.assertDataFrameEqual(df, [{
            'name': 'Smith3',
            'topics': [1, 4, 5],
            'age': 31,
            'demo': {
                'age_30': 110,
                'age_10': 50,
            }
        }, {
            'name': 'Smith4',
            'topics': [4, 5],
            'age': 12,
            'demo': {
                'age_30': 20,
                'age_10': 1,
            }
        }])
Example #15
def setUp(self):
    self.json_decoder = lambda item: json.loads(item.decode('utf-8'))
    self.json_encoder = lambda item: json.dumps(item).encode('utf-8')
    self.topic = 'test.topic.write.kafka.{}'.format(uuid.uuid4().hex[:10])
    self.fixture_path = absolute_path(__file__, 'resources', 'test_read',
                                      'kafka_setup.json')
    self.fixture = KafkaFixture(
        'kafka.docker',
        topic=self.topic,
        key_serializer=self.json_encoder,
        value_serializer=self.json_encoder,
        data=self.fixture_path,
    )
    self.fixture.setup_data()
    self.expected_data_df = self.spark.read.json(self.fixture_path)
    self.expected_data = [
        item.asDict(recursive=True)
        for item in self.expected_data_df.collect()
    ]
Example #16
class SparklyTestSession(SparklySession):
    packages = [
        'datastax:spark-cassandra-connector:2.0.0-M2-s_2.11',
        'org.elasticsearch:elasticsearch-spark-20_2.11:5.1.1',
        'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0',
        'mysql:mysql-connector-java:5.1.39',
        'io.confluent:kafka-avro-serializer:3.0.1',
    ]

    repositories = [
        'http://packages.confluent.io/maven/',
    ]

    jars = [
        absolute_path(__file__, 'resources', 'brickhouse-0.7.1.jar'),
    ]

    udfs = {
        # Java UDAF provided by the brickhouse jar listed above.
        'collect_max': 'brickhouse.udf.collect.CollectMaxUDAF',
        # Python UDF: a callable paired with its Spark SQL return type.
        'length_of_text': (lambda text: len(text), StringType()),
    }
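
Once such a session is instantiated, the entries in udfs become callable from Spark SQL under their dictionary keys; a minimal sketch (not from the original source):

session = SparklyTestSession()
# Both the Java UDAF and the Python lambda are registered by name.
session.sql("SELECT length_of_text('hello') AS length").show()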
Example #17
def get_test_data(self, filename):
    file_path = absolute_path(__file__, 'resources', 'test_testing',
                              filename)
    df = self.spark.read.json(file_path)
    data = [item.asDict(recursive=True) for item in df.collect()]
    return df, data
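
A typical call site for this helper inside a test body; the fixture filename below is hypothetical:

df, data = self.get_test_data('books.json')  # hypothetical fixture file
self.assertDataFrameEqual(df, data)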