Example n. 1
0
 def test_replace_nested_properties_with_underscores_for_default_column_name(
         self):
     """Nested property names are joined with underscores by default."""
     grandchild = Property('grandchild', 'VARCHAR(MAX)')
     child = Property('child', grandchild)
     parent = Property('parent', child)
     self.assertEqual('parent_child_grandchild', parent.column_name)
    def test_bulk_copy_for_hourly_time_series(self):
        """One bulk copy is issued per hourly source path, then the update SQL."""
        source_paths = ['test.table/2014-11-10/00', 'test.table/2014-11-11/01']
        columns = (Property('userid', 'VARCHAR(36)'),
                   Property('timestamp', 'TIMESTAMP'))
        series_column = 'test'
        max_error = 20

        pipeline = Mock()
        sources = Mock()
        sources.get = Mock(return_value=source_paths)

        importer = SqlTimeSeriesImport('test_table', '2014-11-10 00:00:00',
                                       sources, *columns)
        importer.bulk_copy(pipeline, '', max_error, series_column)

        # call_args_list holds (args, kwargs) pairs; only the kwargs of each
        # bulk_copy call are inspected here.
        first_copy, second_copy = [
            recorded[1] for recorded in pipeline.bulk_copy.call_args_list
        ]

        self.__assert_bulk_copy(
            first_copy, '', source_paths[0],
            self.__expected_schema('test_table_2014_11_10_00'), max_error)
        self.__assert_bulk_copy(
            second_copy, '', source_paths[1],
            self.__expected_schema('test_table_2014_11_11_01'), max_error)
        self.assertEqual(pipeline.bulk_copy.call_count, 2)
        pipeline.sql.assert_called_once_with(
            *self.expected_hourly_update_sql(series_column))
Example n. 3
0
 def test_have_column_name_for_nested_properties(self):
     """A custom column name on the innermost property wins over the default."""
     leaf = Property('grandchild', 'VARCHAR(MAX)', 'my_column_name')
     nested = Property('parent', Property('child', leaf))
     self.assertEqual('my_column_name', nested.column_name)
Example n. 4
0
    def test_stage_update_when_column_name_defined(self):
        """stage_update() creates the _update table using the custom column names."""
        with patch.object(Database, 'execute') as execute:
            columns = [
                Property('property1', 'VARCHAR(10)', 'someColumn'),
                Property('property2', 'TIMESTAMP', 'anotherColumn'),
            ]
            target = TargetTable(JsonObject(TABLE_NAME, *columns),
                                 Database(Mock()))

            target.stage_update()

            execute.assert_called_once_with(
                'CREATE TABLE {0}_update (someColumn VARCHAR(10), '
                'anotherColumn TIMESTAMP)'.format(TABLE_NAME))
Example n. 5
0
    def test_create_when_column_name_not_defined_for_nested_property(self):
        """create() derives nested column names by joining parts with underscores."""
        with patch.object(Database, 'execute') as execute:
            columns = [
                Property('property1', 'VARCHAR(10)'),
                Property('property2', Property('timestamp', 'TIMESTAMP')),
            ]
            target = TargetTable(JsonObject(TABLE_NAME, *columns),
                                 Database(Mock()))

            target.create()

            execute.assert_called_once_with(
                'CREATE TABLE {0} (property1 VARCHAR(10), '
                'property2_timestamp TIMESTAMP)'.format(TABLE_NAME))
Example n. 6
0
 def setUp(self):
     """Open a database connection and create the single-column target table."""
     self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
     self.database = Database(psycopg2.connect(CONNECTION))
     self.database.open()
     TargetTable(self.schema, self.database).create()
     self.database.commit()
Example n. 7
0
    def test_add_step(self):
        """step() queues a bulk-copy step wired with the pipeline's credentials."""
        schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
        bucket = Mock()
        database = create_autospec(Database)
        expected = BulkCopyFromS3JsonStep(
            metadata='',
            source='',
            schema=schema,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            bucket=bucket,
            table=TargetTable(schema, database))

        pipeline = S3BulkCopyPipeline(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                                      bucket, database)
        pipeline.step(metadata='', source='', schema=schema)

        step = pipeline.steps()[0]

        # Field-by-field comparison of the queued step against the expected one.
        for attribute in ('metadata', 'source', 'schema', 'aws_access_key_id',
                          'aws_secret_access_key', 'bucket'):
            self.assertEqual(getattr(expected, attribute),
                             getattr(step, attribute))
        self.assertEqual(expected.table.schema, step.table.schema)
        self.assertEqual(expected.table.database, step.table.database)
    def test_have_paths_for_nested_objects(self):
        """paths() emits one bracketed JSONPath per leaf property."""
        schema = JsonObject(
            TABLE_NAME,
            Property('property1', 'VARCHAR(10)'),
            Property('property2', 'TIMESTAMP'),
            Property('property3.dottedName', 'DOUBLE PRECISION'),
            Property('property4',
                     Property('child', Property('subchild', 'BOOLEAN'))))

        expected_paths = [
            "$['property1']",
            "$['property2']",
            "$['property3.dottedName']",
            "$['property4']['child']['subchild']",
        ]
        self.assertEqual({'jsonpaths': expected_paths}, schema.paths())
Example n. 9
0
 def setUp(self):
     # Minimal single-column schema shared by the tests in this example.
     # NOTE(review): the snippet may be truncated right after this line —
     # confirm against the original file.
     self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
By default the name of the JSON property is used as the column, but can be set
to a custom column name.
"""

if __name__ == '__main__':
    # Build the pipeline from credentials and connection details held in the
    # environment.
    pipeline = S3CopyPipeline(
        aws_access_key_id=env('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=env('AWS_SECRET_ACCESS_KEY'),
        bucket=env('BUCKET_NAME'),
        db_connection=psycopg2.connect(env('REDSHIFT_CONNECTION')))

    full_schema = JsonObject('destination_table_name',
                             Property('id', 'VARCHAR(36)'),
                             Property('someNumber', 'INTEGER',
                                      'custom_column_name'))
    pipeline.bulk_copy(metadata='path_to_save_pipeline_metadata',
                       source='path_of_source_data',
                       schema=full_schema)

    incremental_schema = JsonObject('incremental_destination_table_name',
                                    Property('id', 'VARCHAR(36)'),
                                    Property('someNumber', 'INTEGER',
                                             'custom_column_name'))
    pipeline.manifest_copy(metadata='path_to_save_pipeline_metadata',
                           source='path_of_incremental_source_data',
                           schema=incremental_schema)

    # NOTE(review): the first argument is a (sql, parameter) tuple while the
    # second is a plain string (its parentheses only concatenate the literals)
    # — presumably pipeline.sql accepts both forms; confirm.
    pipeline.sql(('SELECT someNumber + %s '
                  'INTO some_olap_table FROM destination_table_name', 1),
                 ('SELECT * INTO destination_table_name_copy '
                  'FROM destination_table_name'))
 def __expected_schema(table):
     """Build the schema the hourly bulk copies are expected to receive."""
     userid = Property('userid', 'VARCHAR(36)')
     timestamp = Property('timestamp', 'TIMESTAMP')
     return JsonObject(table, userid, timestamp)
Example n. 12
0
 def setUp(self):
     """Build a SqlManifest over a stubbed bucket listing eight object keys."""
     self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
     self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                          BUCKET_NAME, Mock())
     self.bucket.save = Mock()
     self.database = create_autospec(Database)
     self.key_names = [
         'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
         'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
         'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
         'object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
         'object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
         'object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
         'object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
         'object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1'
     ]
     self.bucket.list = Mock(
         return_value=[self.mock_key(name) for name in self.key_names])
     self.manifest = SqlManifest(metadata='',
                                 source='',
                                 schema=self.schema,
                                 bucket=self.bucket,
                                 db_connection=self.database)
     # Every listed key is expected to appear as a mandatory manifest entry,
     # in listing order.
     self.expected_manifest = {
         'entries': [{
             'url': 's3://{0}/{1}'.format(BUCKET_NAME, name),
             'mandatory': True
         } for name in self.key_names]
     }
Example n. 13
0
 def test_preserve_periods_for_default_column_name(self):
     """A dotted property name is kept verbatim as the default column name."""
     dotted = Property('some.column.name', 'VARCHAR(MAX)')
     self.assertEqual('some.column.name', dotted.column_name)
Example n. 14
0
 def test_have_column_name(self):
     """An explicitly supplied column name overrides the property name."""
     named = Property('some.column.name', 'VARCHAR(MAX)', 'my_column_name')
     self.assertEqual('my_column_name', named.column_name)