def test_add_property_and_update_paths(self):
    """Chaining property() accumulates the corresponding JSONPaths."""
    one_property = JsonObject(TABLE_NAME).property('property1',
                                                   'VARCHAR(10)')
    self.assertEqual({'jsonpaths': ["$['property1']"]},
                     one_property.paths())
    two_properties = (JsonObject(TABLE_NAME)
                      .property('property1', 'VARCHAR(10)')
                      .property('property2', 'TIMESTAMP'))
    self.assertEqual(
        {'jsonpaths': ["$['property1']", "$['property2']"]},
        two_properties.paths())
def setUp(self):
    """Wire a BulkCopyFromS3JsonStep to mocked S3 and database objects."""
    self.schema = JsonObject(TABLE_NAME).property('eventId', 'VARCHAR(36)')
    self.table = TargetTable(self.schema, Mock())
    self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                         BUCKET_NAME, Mock())
    # Stub the S3 round-trips so run()/validate() stay fully local.
    self.bucket.save = Mock()
    self.bucket.delete = Mock()
    self.step = BulkCopyFromS3JsonStep(
        metadata='',
        source=SOURCE,
        schema=self.schema,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        bucket=self.bucket,
        table=self.table)
    self.step.sql = Mock()
def test_add_step(self):
    """pipeline.step() should append a fully configured bulk-copy step."""
    schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
    bucket = Mock()
    database = create_autospec(Database)
    expected = BulkCopyFromS3JsonStep(
        metadata='', source='', schema=schema,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        bucket=bucket,
        table=TargetTable(schema, database))
    pipeline = S3BulkCopyPipeline(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                                  bucket, database)
    pipeline.step(metadata='', source='', schema=schema)
    step = pipeline.steps()[0]
    # Compare field by field rather than relying on a step __eq__.
    for attribute in ('metadata', 'source', 'schema', 'aws_access_key_id',
                      'aws_secret_access_key', 'bucket'):
        self.assertEqual(getattr(expected, attribute),
                         getattr(step, attribute))
    self.assertEqual(expected.table.schema, step.table.schema)
    self.assertEqual(expected.table.database, step.table.database)
def test_have_paths_for_nested_objects(self):
    """Nested Property trees flatten into bracketed JSONPath strings."""
    schema = JsonObject(
        TABLE_NAME,
        Property('property1', 'VARCHAR(10)'),
        Property('property2', 'TIMESTAMP'),
        Property('property3.dottedName', 'DOUBLE PRECISION'),
        Property('property4',
                 Property('child', Property('subchild', 'BOOLEAN'))))
    expected_paths = [
        "$['property1']",
        "$['property2']",
        "$['property3.dottedName']",
        "$['property4']['child']['subchild']",
    ]
    self.assertEqual({'jsonpaths': expected_paths}, schema.paths())
def setUp(self):
    """Open a real database connection and create the target table."""
    self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
    self.database = Database(psycopg2.connect(CONNECTION))
    self.database.open()
    # The table object is only needed long enough to create the table.
    TargetTable(self.schema, self.database).create()
    self.database.commit()
def test_have_paths_for_nested_objects(self):
    """Dotted names stay one path segment; nested Properties add segments."""
    nested = Property('property4',
                      Property('child', Property('subchild', 'BOOLEAN')))
    schema = JsonObject(TABLE_NAME,
                        Property('property1', 'VARCHAR(10)'),
                        Property('property2', 'TIMESTAMP'),
                        Property('property3.dottedName', 'DOUBLE PRECISION'),
                        nested)
    self.assertEqual(
        {'jsonpaths': ["$['property1']",
                       "$['property2']",
                       "$['property3.dottedName']",
                       "$['property4']['child']['subchild']"]},
        schema.paths())
def setUp(self):
    """Wire a ManifestCopyFromS3JsonStep with S3, manifest and SQL mocked."""
    self.schema = JsonObject(TABLE_NAME).property('eventId', 'VARCHAR(36)')
    self.table = TargetTable(self.schema, Mock())
    self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                         BUCKET_NAME, Mock())
    self.bucket.save = Mock()
    self.bucket.delete = Mock()
    self.step = ManifestCopyFromS3JsonStep(
        metadata='',
        source=SOURCE,
        schema=self.schema,
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
        bucket=self.bucket,
        table=self.table)
    self.step.manifest = Mock()
    # Journal entries the mocked manifest reports as newly saved.
    self.updated_journal = [
        'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
        'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
        'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c']
    self.step.manifest.save = Mock(return_value=self.updated_journal)
    self.step.sql = Mock()
def test_create_when_column_name_not_defined_for_nested_property(self):
    """Nested properties default to parent_child column names."""
    with patch.object(Database, 'execute') as execute:
        schema = JsonObject(
            TABLE_NAME,
            Property('property1', 'VARCHAR(10)'),
            Property('property2', Property('timestamp', 'TIMESTAMP')))
        TargetTable(schema, Database(Mock())).create()
        execute.assert_called_once_with(
            'CREATE TABLE {0} (property1 VARCHAR(10), '
            'property2_timestamp TIMESTAMP)'.format(TABLE_NAME))
def test_stage_update_when_column_name_defined(self):
    """stage_update builds the staging table using custom column names."""
    with patch.object(Database, 'execute') as execute:
        schema = JsonObject(
            TABLE_NAME,
            Property('property1', 'VARCHAR(10)', 'someColumn'),
            Property('property2', 'TIMESTAMP', 'anotherColumn'))
        TargetTable(schema, Database(Mock())).stage_update()
        expected = ('CREATE TABLE {0}_update (someColumn VARCHAR(10), '
                    'anotherColumn TIMESTAMP)').format(TABLE_NAME)
        execute.assert_called_once_with(expected)
def bulk_copy(self, pipeline, metadata, max_error, order_by_column):
    """Queue one bulk-copy step per source, then the series-update SQL.

    :param pipeline: pipeline that receives bulk_copy() and sql() steps
    :param metadata: metadata path passed through to every bulk-copy step
    :param max_error: maximum COPY error count per step
    :param order_by_column: series column used by the update statements
    """
    dates = []
    source_tables = []
    for source in self.sources.get():
        date = self.__get_date_from_path(source)
        # Derive a table name from the source path by replacing characters
        # that are not legal in identifiers. (Fixed: the original wrapped
        # this already-str value in a redundant '{0}'.format(...).)
        target_table = (source.replace('-', '_')
                        .replace('.', '_')
                        .replace('/', '_'))
        pipeline.bulk_copy(metadata=metadata,
                           source=source,
                           schema=JsonObject(target_table, *self.properties),
                           max_error_count=max_error)
        self.sources.commit(self.update_date or date)
        dates.append(date)
        source_tables.append(target_table)
    # NOTE(review): if sources is empty and update_date is falsy, dates[0]
    # raises IndexError here — presumably callers guarantee at least one
    # source; confirm before hardening.
    update_statements = _SqlSeriesDataUpdate(
        target_table=self.destination_table,
        series_column=order_by_column,
        start=self.update_date or dates[0],
        source_tables=source_tables).statements()
    pipeline.sql(*update_statements)
def test_have_empty_paths(self):
    """A schema with no properties yields an empty jsonpaths list."""
    schema = JsonObject(TABLE_NAME)
    self.assertEqual({'jsonpaths': []}, schema.paths())
def setUp(self):
    """Create a SqlManifest over a mocked bucket listing eight keys."""
    self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
    self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                         BUCKET_NAME, Mock())
    self.bucket.save = Mock()
    self.database = create_autospec(Database)
    self.key_names = [
        'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
        'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
        'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
        'object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
        'object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
        'object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
        'object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
        'object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
    ]
    self.bucket.list = Mock(
        return_value=[self.mock_key(key) for key in self.key_names])
    self.manifest = SqlManifest(metadata='', source='', schema=self.schema,
                                bucket=self.bucket,
                                db_connection=self.database)
    # One mandatory manifest entry per listed key, in listing order.
    self.expected_manifest = {
        'entries': [{'url': 's3://{0}/{1}'.format(BUCKET_NAME, key),
                     'mandatory': True}
                    for key in self.key_names]
    }
def test_throw_schema_exception_when_adding_duplicate_property(self):
    """Adding the same property name twice raises SchemaException."""
    schema = JsonObject(TABLE_NAME).property('property1', 'VARCHAR(10)')
    self.assertRaises(SchemaException,
                      schema.property, 'property1', 'VARCHAR(10)')
def test_have_jsonpath_file_name(self):
    """The jsonpath file name is derived from the table name."""
    expected = '{0}_jsonpath.json'.format(TABLE_NAME)
    self.assertEqual(expected, JsonObject(TABLE_NAME).file_name)
def setUp(self):
    """Provide a fresh single-property schema for each test."""
    self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
class BulkCopyFromS3JsonStepShould(unittest.TestCase):
    """Behavior of BulkCopyFromS3JsonStep with S3 and SQL mocked out.

    run() saves the jsonpaths schema to S3, migrates the target table
    (dropping it first when it already exists), commits, and deletes the
    schema key; validate() follows the same path but rolls back and its
    COPY statement carries NOLOAD.
    """

    def setUp(self):
        # Bucket with stubbed save/delete so no S3 traffic happens.
        self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                             BUCKET_NAME, Mock())
        self.bucket.save = Mock()
        self.bucket.delete = Mock()
        self.schema = JsonObject(TABLE_NAME).property('eventId',
                                                      'VARCHAR(36)')
        self.table = TargetTable(self.schema, Mock())
        self.step = BulkCopyFromS3JsonStep(
            metadata='', source=SOURCE, schema=self.schema,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            bucket=self.bucket, table=self.table)
        self.step.sql = Mock()

    def assert_migration_with_drop(self, target_table):
        # Existing table: opened, staged, dropped, then update promoted.
        target_table.database.open.assert_called_once_with()
        target_table.stage_update.assert_called_once_with()
        target_table.drop.assert_called_once_with()
        target_table.promote_update.assert_called_once_with()

    def assert_migration_without_drop(self, target_table):
        # Missing table: same flow but drop must never be called.
        target_table.database.open.assert_called_once_with()
        target_table.stage_update.assert_called_once_with()
        self.assertEqual(False, target_table.drop.called)
        target_table.promote_update.assert_called_once_with()

    def test_save_schema_to_s3_bucket_on_run(self):
        self.step.run()
        # The serialized jsonpaths document is stored under schema_key.
        self.bucket.save.assert_called_once_with(
            self.step.schema_key, json.dumps(self.schema.paths()))

    def test_copy_schema_to_redshift_with_drop_on_run(self):
        with patch.object(TargetTable, 'exists') as exists:
            exists.return_value = True
            target_table = TargetTable(self.schema, Mock())
            target_table.stage_update = Mock()
            target_table.drop = Mock()
            target_table.promote_update = Mock()
            step = BulkCopyFromS3JsonStep(
                metadata='', source=SOURCE, schema=self.schema,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                bucket=self.bucket, table=target_table)
            step.sql = Mock()
            step.run()
            self.assert_migration_with_drop(target_table)
            # run() commits the transaction.
            target_table.database.commit.assert_called_once_with()

    def test_copy_schema_to_redshift_without_drop_on_run(self):
        with patch.object(TargetTable, 'exists') as exists:
            exists.return_value = False
            target_table = TargetTable(self.schema, Mock())
            target_table.stage_update = Mock()
            target_table.drop = Mock()
            target_table.promote_update = Mock()
            step = BulkCopyFromS3JsonStep(
                metadata='', source=SOURCE, schema=self.schema,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                bucket=self.bucket, table=target_table)
            step.sql = Mock()
            step.run()
            self.assert_migration_without_drop(target_table)
            target_table.database.commit.assert_called_once_with()

    def test_validate_with_drop_on_run(self):
        with patch.object(TargetTable, 'exists') as exists:
            exists.return_value = True
            target_table = TargetTable(self.schema, Mock())
            target_table.stage_update = Mock()
            target_table.drop = Mock()
            target_table.promote_update = Mock()
            step = BulkCopyFromS3JsonStep(
                metadata='', source=SOURCE, schema=self.schema,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                bucket=self.bucket, table=target_table)
            step.sql = Mock()
            step.validate()
            self.assert_migration_with_drop(target_table)
            # validate() rolls back instead of committing.
            target_table.database.rollback.assert_called_once_with()

    def test_validate_without_drop_on_run(self):
        with patch.object(TargetTable, 'exists') as exists:
            exists.return_value = False
            target_table = TargetTable(self.schema, Mock())
            target_table.stage_update = Mock()
            target_table.drop = Mock()
            target_table.promote_update = Mock()
            step = BulkCopyFromS3JsonStep(
                metadata='', source=SOURCE, schema=self.schema,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                bucket=self.bucket, table=target_table)
            step.sql = Mock()
            step.validate()
            self.assert_migration_without_drop(target_table)
            target_table.database.rollback.assert_called_once_with()

    def test_source_url(self):
        self.assertEqual('s3://bucket/arbalest_test.event.created',
                         self.step.source_url)

    def test_schema_url(self):
        self.assertEqual('s3://bucket/event_created_jsonpath.json',
                         self.step.schema_url)

    def test_copy_sql(self):
        # COPY template with placeholders for table, source, credentials,
        # jsonpaths file and max error count.
        expected_copy_table_sql = "COPY %s FROM '%s' CREDENTIALS " \
                                  "'aws_access_key_id=%s;" \
                                  "aws_secret_access_key=%s' " \
                                  "JSON '%s' TIMEFORMAT 'auto' " \
                                  "MAXERROR %s"
        self.assertEqual(expected_copy_table_sql, self.step.copy_sql)

    def test_validate_sql(self):
        # Same COPY template with NOLOAD appended for dry-run validation.
        expected_validate_sql = "COPY %s FROM '%s' CREDENTIALS " \
                                "'aws_access_key_id=%s;" \
                                "aws_secret_access_key=%s' " \
                                "JSON '%s' TIMEFORMAT 'auto' " \
                                "MAXERROR %s " \
                                "NOLOAD"
        self.assertEqual(expected_validate_sql, self.step.validate_sql)

    def test_delete_schema_from_s3_bucket_on_run(self):
        self.step.run()
        self.bucket.delete.assert_called_once_with(self.step.schema_key)
By default the name of the JSON property is used as the column,
but can be set to a custom column name.
"""

if __name__ == '__main__':
    # Assemble a copy pipeline from environment-provided AWS and
    # Redshift connection settings.
    pipeline = S3CopyPipeline(
        aws_access_key_id=env('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=env('AWS_SECRET_ACCESS_KEY'),
        bucket=env('BUCKET_NAME'),
        db_connection=psycopg2.connect(env('REDSHIFT_CONNECTION')))

    # One-shot bulk copy of a source path into a destination table.
    pipeline.bulk_copy(metadata='path_to_save_pipeline_metadata',
                       source='path_of_source_data',
                       schema=JsonObject(
                           'destination_table_name',
                           Property('id', 'VARCHAR(36)'),
                           Property('someNumber', 'INTEGER',
                                    'custom_column_name')))

    # Incremental copy of new keys only.
    pipeline.manifest_copy(metadata='path_to_save_pipeline_metadata',
                           source='path_of_incremental_source_data',
                           schema=JsonObject(
                               'incremental_destination_table_name',
                               Property('id', 'VARCHAR(36)'),
                               Property('someNumber', 'INTEGER',
                                        'custom_column_name')))

    # Post-load SQL steps; a tuple carries positional query parameters.
    pipeline.sql(('SELECT someNumber + %s '
                  'INTO some_olap_table FROM destination_table_name', 1),
                 ('SELECT * INTO destination_table_name_copy '
                  'FROM destination_table_name'))
def __expected_schema(table):
    """Build the schema an imported event table is expected to match."""
    properties = (Property('userid', 'VARCHAR(36)'),
                  Property('timestamp', 'TIMESTAMP'))
    return JsonObject(table, *properties)
class ManifestCopyFromS3JsonStepShould(unittest.TestCase):
    """Behavior of ManifestCopyFromS3JsonStep with S3/manifest/SQL mocked.

    The step saves the jsonpaths schema, saves a manifest of new keys,
    creates the target table only when needed (dropping an existing one
    first unless a journal already exists), executes the COPY, and
    commits on run() or rolls back on validate().
    """

    def setUp(self):
        # Bucket with stubbed save/delete so no S3 traffic happens.
        self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                             BUCKET_NAME, Mock())
        self.bucket.save = Mock()
        self.bucket.delete = Mock()
        self.schema = JsonObject(TABLE_NAME).property('eventId',
                                                      'VARCHAR(36)')
        self.table = TargetTable(self.schema, Mock())
        self.step = ManifestCopyFromS3JsonStep(
            metadata='', source=SOURCE, schema=self.schema,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
            bucket=self.bucket, table=self.table)
        self.step.manifest = Mock()
        # Journal entries the mocked manifest reports as newly saved.
        self.updated_journal = [
            'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
            'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
            'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c']
        self.step.manifest.save = Mock(return_value=self.updated_journal)
        self.step.sql = Mock()

    def assert_migration_with_drop(self, step):
        # Existing table, no journal: dropped and re-created.
        step.table.database.open.assert_called_once_with()
        step.manifest.save.assert_called_once_with()
        step.table.drop.assert_called_once_with()
        step.table.create.assert_called_once_with()

    def assert_migration_without_drop(self, step):
        # Table absent: created without any drop.
        step.table.database.open.assert_called_once_with()
        step.manifest.save.assert_called_once_with()
        self.assertEqual(False, step.table.drop.called)
        step.table.create.assert_called_once_with()

    def assert_migration_without_drop_and_create(self, step):
        # Journal present: table is reused, neither dropped nor created.
        step.table.database.open.assert_called_once_with()
        step.manifest.save.assert_called_once_with()
        self.assertEqual(False, step.table.drop.called)
        self.assertEqual(False, step.table.create.called)

    def assert_copy_schema_to_redshift_with_drop(self, sql, execute):
        # Table exists, no journal yet: expect drop + create, and the
        # COPY executed with the manifest URL and credentials.
        with patch.object(TargetTable, 'exists') as exists:
            exists.return_value = True
            target_table = TargetTable(self.schema, Mock())
            target_table.stage_update = Mock()
            target_table.drop = Mock()
            target_table.create = Mock()
            target_table.insert_update = Mock()
            step = ManifestCopyFromS3JsonStep(
                metadata='', source=SOURCE, schema=self.schema,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                bucket=self.bucket, table=target_table)
            step.manifest.save = Mock(return_value=self.updated_journal)
            step.manifest.journal_exists = Mock(return_value=False)
            step.sql = Mock()
            execute(step)
            self.assert_migration_with_drop(step)
            step.sql.execute.assert_called_once_with(
                (sql, self.schema.table, step.manifest.manifest_url,
                 AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                 step.schema_url, step.max_error_count))
            # Hand the database back so callers can assert commit/rollback.
            return target_table.database

    def assert_copy_schema_to_redshift_without_drop(self, sql, execute):
        # Table absent, no journal: expect create without drop.
        with patch.object(TargetTable, 'exists') as exists:
            exists.return_value = False
            target_table = TargetTable(self.schema, Mock())
            target_table.stage_update = Mock()
            target_table.drop = Mock()
            target_table.create = Mock()
            target_table.insert_update = Mock()
            step = ManifestCopyFromS3JsonStep(
                metadata='', source=SOURCE, schema=self.schema,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                bucket=self.bucket, table=target_table)
            step.manifest.save = Mock(return_value=self.updated_journal)
            step.manifest.journal_exists = Mock(return_value=False)
            step.sql = Mock()
            execute(step)
            self.assert_migration_without_drop(step)
            step.sql.execute.assert_called_once_with(
                (sql, self.schema.table, step.manifest.manifest_url,
                 AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                 step.schema_url, step.max_error_count))
            return target_table.database

    def assert_copy_schema_to_redshift_without_drop_and_create(self, sql,
                                                               execute):
        # Table exists and journal exists: table left untouched.
        with patch.object(TargetTable, 'exists') as exists:
            exists.return_value = True
            target_table = TargetTable(self.schema, Mock())
            target_table.stage_update = Mock()
            target_table.drop = Mock()
            target_table.create = Mock()
            target_table.insert_update = Mock()
            step = ManifestCopyFromS3JsonStep(
                metadata='', source=SOURCE, schema=self.schema,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                bucket=self.bucket, table=target_table)
            step.manifest.save = Mock(return_value=self.updated_journal)
            step.manifest.journal_exists = Mock(return_value=True)
            step.sql = Mock()
            execute(step)
            self.assert_migration_without_drop_and_create(step)
            step.sql.execute.assert_called_once_with(
                (sql, self.schema.table, step.manifest.manifest_url,
                 AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                 step.schema_url, step.max_error_count))
            return target_table.database

    def test_save_schema_to_s3_bucket_on_run(self):
        self.step.run()
        self.bucket.save.assert_called_once_with(
            self.step.schema_key, json.dumps(self.schema.paths()))

    def test_copy_schema_to_redshift_with_drop_when_no_journal_on_run(self):
        database = self.assert_copy_schema_to_redshift_with_drop(
            EXPECTED_COPY_SQL, lambda step: step.run())
        database.commit.assert_called_once_with()

    def test_copy_schema_to_redshift_without_drop_when_no_journal_on_run(self):
        database = self.assert_copy_schema_to_redshift_without_drop(
            EXPECTED_COPY_SQL, lambda step: step.run())
        database.commit.assert_called_once_with()

    def test_copy_schema_to_redshift_without_drop_and_create_when_journal_on_run(
            self):
        database = self.assert_copy_schema_to_redshift_without_drop_and_create(
            EXPECTED_COPY_SQL, lambda step: step.run())
        database.commit.assert_called_once_with()

    def test_copy_schema_to_redshift_with_drop_when_no_journal_on_validate(
            self):
        database = self.assert_copy_schema_to_redshift_with_drop(
            EXPECTED_VALIDATE_SQL, lambda step: step.validate())
        # validate() must roll back, never commit.
        self.assertEqual(False, database.commit.called)
        database.rollback.assert_called_once_with()

    def test_copy_schema_to_redshift_without_drop_when_no_journal_on_validate(
            self):
        database = self.assert_copy_schema_to_redshift_without_drop(
            EXPECTED_VALIDATE_SQL, lambda step: step.validate())
        self.assertEqual(False, database.commit.called)
        database.rollback.assert_called_once_with()

    def test_copy_schema_to_redshift_without_drop_and_create_when_journal_on_validate(
            self):
        database = self.assert_copy_schema_to_redshift_without_drop_and_create(
            EXPECTED_VALIDATE_SQL, lambda step: step.validate())
        self.assertEqual(False, database.commit.called)
        database.rollback.assert_called_once_with()

    def test_schema_url(self):
        self.assertEqual('s3://bucket/event_created_jsonpath.json',
                         self.step.schema_url)

    def test_commit_manifest_on_run(self):
        self.step.run()
        # The journal returned by save() is what gets committed.
        self.step.manifest.commit.assert_called_once_with(
            self.step.manifest.save())

    def test_not_commit_manifest_on_validate(self):
        self.step.validate()
        self.assertEqual(False, self.step.manifest.commit.called)

    def test_delete_schema_from_s3_bucket_on_run(self):
        self.step.run()
        self.bucket.delete.assert_called_once_with(self.step.schema_key)