def test_partition_2(self):
    """Create partitions from every contiguous combination of name
    components and verify each one can be found again by vid."""
    bundle = Bundle()
    bundle.clean()
    bundle.pre_prepare()
    bundle.prepare()
    bundle.post_prepare()

    first_table = self.bundle.schema.tables[0]

    # Doubled so slices of length 1..4 can start at any of the four
    # components and wrap around.
    components = (('time', 'time2'), ('space', 'space3'),
                  ('table', first_table.name), ('grain', 'grain4')) * 2

    pids = {}
    for start in range(4):
        for length in range(1, 5):
            pid = self.bundle.identity.as_partition(
                **dict(components[start:start + length]))
            pids[pid.fqname] = pid

    for pid in pids.values():
        partition = bundle.partitions.new_db_partition(**pid.dict)
        partition.create()

        found = bundle.partitions._find_orm(
            PartitionNameQuery(vid=pid.vid)).all()
        self.assertIn(pid.sname, [orm_p.name for orm_p in found])
def x_test_rewrite(self):
    """Disabled test ('x_' prefix): verify that prepare() rewrites
    bundle.yaml, adding the computed 'names' section.

    Swaps a minimal hand-written config in place of the real
    bundle.yaml, runs the prepare cycle, and checks that the reloaded
    config carries both the original identity values and the derived
    fqname. The original file is restored in the finally block.

    NOTE(review): uses a Python 2 print statement; this module predates
    Python 3.
    """
    from testbundle.bundle import Bundle
    import json
    from ambry.run import get_runconfig

    # Prepare to rewrite the bundle.yaml file.
    bundle = Bundle()

    orig = os.path.join(bundle.bundle_dir, 'bundle.yaml')
    save = os.path.join(bundle.bundle_dir, 'bundle.yaml.save')

    try:
        # Set the real config aside, then write the minimal replacement.
        # (YAML is a superset of JSON, so json.dumps output is valid here.)
        os.rename(orig, save)

        print 'Write to ', orig

        with open(orig, 'w') as f:
            f.write(json.dumps(
                {
                    "identity": {
                        "dataset": "dataset1",
                        "id": "dfoo",
                        "revision": 100,
                        "source": "source1",
                        "subset": "subset1",
                        "variation": "variation1",
                        "version": "1.0.1",
                        "vid": "dfob001",
                    },
                    "about": {
                        "author": "*****@*****.**"
                    }
                }
            ))

        get_runconfig.clear()  # clear config cache.

        bundle = Bundle()
        bundle.clean()
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()  # Does the rewrite, adding the 'names'

        # Need to clear and reload one more time for the 'names' to appear
        get_runconfig.clear()  # clear config cache.

        bundle = Bundle()
        bundle.exit_on_fatal = False

        self.assertEquals('dataset1', bundle.config.identity.dataset)
        self.assertEquals('dfoo', bundle.config.identity.id)
        self.assertEquals(100, bundle.config.identity.revision)
        # The fqname folds revision 100 into the version string.
        self.assertEquals(
            "source1-dataset1-subset1-variation1-1.0.100~dfoo01C",
            bundle.config.names.fqname)
        self.assertEquals("*****@*****.**", bundle.config.about.author)

    finally:
        # Always put the real bundle.yaml back, then drop build state.
        os.rename(save, orig)
        self.delete_bundle()
def test_build_bundle_hdf(self):
    """Run a complete prepare/build cycle using the HDF builder."""
    # Discard any leftover state from a prior run.
    Bundle().clean()

    bundle = Bundle()
    bundle.exit_on_fatal = False

    for phase in ('pre_prepare', 'prepare', 'post_prepare',
                  'pre_build', 'build_hdf', 'post_build'):
        getattr(bundle, phase)()
def test_simple_build(self):
    """Restore the editable schema file, then run a full clean build."""
    import shutil

    setup_bundle = Bundle()
    shutil.copyfile(
        setup_bundle.filesystem.path('meta', 'schema-edit-me.csv'),
        setup_bundle.filesystem.path('meta', 'schema.csv'))
    setup_bundle.clean()

    bundle = Bundle()
    bundle.exit_on_fatal = False
    for phase in ('pre_prepare', 'prepare', 'post_prepare',
                  'pre_build', 'build', 'post_build'):
        getattr(bundle, phase)()
def test_simple_build(self):
    """Reset schema.csv from its editable copy, then do a clean full build."""
    import shutil

    scratch = Bundle()
    src = scratch.filesystem.path('meta', 'schema-edit-me.csv')
    dst = scratch.filesystem.path('meta', 'schema.csv')
    shutil.copyfile(src, dst)
    scratch.clean()

    bundle = Bundle()
    bundle.exit_on_fatal = False

    bundle.pre_prepare()
    bundle.prepare()
    bundle.post_prepare()

    bundle.pre_build()
    bundle.build()
    bundle.post_build()
def test_build_bundle(self):
    """Build twice with the code-generating inserter, then restore the schema.

    The second build pass picks up schema changes made during the first
    (type errors in the 'coding' table force column-type rewrites). The
    finally block restores meta/schema.csv and re-runs a normal build so
    that later tests start from a good state.

    Fix over the original: the original had a leftover ``#try:`` and a
    dead ``try: pass finally:`` around the cleanup, so a failure in
    either build pass skipped the schema restore entirely. The builds
    are now inside the try, guaranteeing cleanup.
    """
    import shutil

    bundle = Bundle()
    shutil.copyfile(
        bundle.filesystem.path('meta', 'schema-edit-me.csv'),
        bundle.filesystem.path('meta', 'schema.csv'))

    try:
        bundle.database.enable_delete = True

        # First pass: build with the inserter that records coding errors.
        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build_db_inserter_codes()
        bundle.post_build()
        bundle.close()

        # The second run will use the changes to the schema made in the
        # first run, due to the types errors in the 'coding' table.
        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build_db_inserter_codes()
        bundle.post_build()
        bundle.close()
    finally:
        # Need to clean up to ensure that we're back to a good state.
        # This runs the normal build, which will be used by the other
        # tests.
        shutil.copyfile(
            bundle.filesystem.path('meta', 'schema-edit-me.csv'),
            bundle.filesystem.path('meta', 'schema.csv'))

        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build()
        bundle.post_build()
def test_build_bundle(self):
    """Run the inserter-codes build twice, then restore the edited schema.

    Pass two depends on schema changes recorded during pass one (type
    errors in the 'coding' table). Afterwards the editable schema file
    is copied back and a normal build is run so later tests see a
    consistent bundle.

    Fix over the original: the cleanup sat under a dead
    ``try: pass finally:`` (with a stray ``#try:`` above the builds),
    so any build failure skipped the restore. The build passes are now
    guarded by the try so the finally always restores state.
    """
    import shutil

    bundle = Bundle()
    shutil.copyfile(bundle.filesystem.path('meta', 'schema-edit-me.csv'),
                    bundle.filesystem.path('meta', 'schema.csv'))

    try:
        bundle.database.enable_delete = True

        # First pass.
        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build_db_inserter_codes()
        bundle.post_build()
        bundle.close()

        # The second run will use the changes to the schema made in the
        # first run, due to the types errors in the 'coding' table.
        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build_db_inserter_codes()
        bundle.post_build()
        bundle.close()
    finally:
        # Need to clean up to ensure that we're back to a good state.
        # This runs the normal build, which will be used by the other
        # tests.
        shutil.copyfile(
            bundle.filesystem.path('meta', 'schema-edit-me.csv'),
            bundle.filesystem.path('meta', 'schema.csv'))

        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build()
        bundle.post_build()
def test_partition_2(self):
    """Exercise partition creation for all contiguous slices of the
    four partition-name components, then look each one up by vid."""
    bundle = Bundle()
    bundle.clean()
    bundle.pre_prepare()
    bundle.prepare()
    bundle.post_prepare()

    base_table = self.bundle.schema.tables[0]
    pairs = (('time', 'time2'), ('space', 'space3'),
             ('table', base_table.name), ('grain', 'grain4'))
    # Concatenate with itself so every slice below stays in bounds.
    pairs = pairs + pairs

    pids = {}
    for offset in range(4):
        for extra in range(4):
            window = dict(pairs[offset:offset + extra + 1])
            pid = self.bundle.identity.as_partition(**window)
            pids[pid.fqname] = pid

    for pid in pids.values():
        created = bundle.partitions.new_db_partition(**pid.dict)
        created.create()

        matches = bundle.partitions._find_orm(
            PartitionNameQuery(vid=pid.vid)).all()
        names = [match.name for match in matches]
        self.assertIn(pid.sname, names)
def test_bundle_build(self):
    """End-to-end partition-creation test at three API levels.

    Section 1 uses the low-level _new_orm_partition (inside an explicit
    session), section 2 the mid-level _new_partition, and section 3 the
    high-level new_db_partition / find_or_new, checking naming, conflict
    detection, and the PartitionNameQuery ANY/NONE wildcards at each
    level. Finishes with a full build and identity checks.

    All partitions here carry table='tone', so vnames include the table.
    """
    from ambry.dbexceptions import ConflictError

    bundle = Bundle()

    # Need to clear the library, or the Bundle's pre_prepare
    # will cancel the build if this version is already installed
    bundle.library.purge()

    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()
    bundle.prepare()

    bp = bundle.partitions

    # Section 1: low-level ORM creation — six partitions, two times x
    # (two spaces + no space).
    with bundle.session:
        bp._new_orm_partition(PartialPartitionName(table = 'tone', time = 't1', space='s1'))
        bp._new_orm_partition(PartialPartitionName(table = 'tone', time = 't1', space='s2'))
        bp._new_orm_partition(PartialPartitionName(table = 'tone', time = 't1', space=None))
        bp._new_orm_partition(PartialPartitionName(table = 'tone', time = 't2', space='s1'))
        bp._new_orm_partition(PartialPartitionName(table = 'tone', time = 't2', space='s2'))
        bp._new_orm_partition(PartialPartitionName(table = 'tone', time = 't2', space=None))

    # Re-creating an existing partition name must raise.
    with self.assertRaises(ConflictError):
        with bundle.session:
            bp._new_orm_partition(PartialPartitionName(table = 'tone', time = 't1', space='s1'))

    # ANY matches only partitions where the component is set.
    pnq = PartitionNameQuery(table = 'tone', time=NameQuery.ANY, space='s1')
    names = [p.vname for p in bp._find_orm(pnq).all()]
    self.assertEqual({u'source-dataset-subset-variation-tone-t1-s1-0.0.1',
                      u'source-dataset-subset-variation-tone-t2-s1-0.0.1'},
                     set(names))

    names = [p.vname for p in bp._find_orm(PartitionNameQuery(space=NameQuery.ANY)).all()]
    self.assertEqual(6, len(names))

    names = [p.vname for p in bp._find_orm(
        PartitionNameQuery(table = 'tone', time='t1', space=NameQuery.ANY)).all()]
    self.assertEqual({'source-dataset-subset-variation-tone-t1-s2-0.0.1',
                      'source-dataset-subset-variation-tone-t1-0.0.1',
                      'source-dataset-subset-variation-tone-t1-s1-0.0.1'},
                     set(names))

    # NONE matches only the partition with no space component.
    names = [p.vname for p in bp._find_orm(
        PartitionNameQuery(table = 'tone', time='t1', space=NameQuery.NONE)).all()]
    self.assertEqual({'source-dataset-subset-variation-tone-t1-0.0.1'}, set(names))

    # Start over, use a higher level function to create the partitions
    bundle.close()  # Or you'll get an OperationalError

    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()
    bundle.prepare()

    bp = bundle.partitions

    # Section 2: same six partitions through _new_partition, checking
    # the count as we go.
    bp._new_partition(PartialPartitionName(table = 'tone', time = 't1', space='s1'))
    self.assertEquals(1, len(bp.all))
    bp._new_partition(PartialPartitionName(table = 'tone', time = 't1', space='s2'))
    self.assertEquals(2, len(bp.all))
    bp._new_partition(PartialPartitionName(table = 'tone', time = 't1', space=None))
    bp._new_partition(PartialPartitionName(table = 'tone', time = 't2', space='s1'))
    bp._new_partition(PartialPartitionName(table = 'tone', time = 't2', space='s2'))
    bp._new_partition(PartialPartitionName(table = 'tone', time = 't2', space=None))
    self.assertEquals(6, len(bp.all))

    names = [p.vname for p in bp._find_orm(
        PartitionNameQuery(time='t1', space=NameQuery.ANY)).all()]
    self.assertEqual({'source-dataset-subset-variation-tone-t1-s2-0.0.1',
                      'source-dataset-subset-variation-tone-t1-0.0.1',
                      'source-dataset-subset-variation-tone-t1-s1-0.0.1'},
                     set(names))

    # Start over, use a higher level function to create the partitions
    bundle.close()

    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()
    bundle.prepare()

    bp = bundle.partitions

    # Section 3: high-level creation; fqnames embed sequential vids.
    p = bp.new_db_partition(table = 'tone', time = 't1', space='s1')
    self.assertEquals('source-dataset-subset-variation-tone-t1-s1-0.0.1~piEGPXmDC8001001',
                      p.identity.fqname)

    p = bp.find_or_new(table = 'tone', time = 't1', space='s2')
    self.assertEquals('source-dataset-subset-variation-tone-t1-s2-0.0.1~piEGPXmDC8002001',
                      p.identity.fqname)

    # Duplicate — find_or_new must return the existing partition, not a new vid.
    p = bp.find_or_new(table = 'tone', time = 't1', space='s2')
    self.assertEquals('source-dataset-subset-variation-tone-t1-s2-0.0.1~piEGPXmDC8002001',
                      p.identity.fqname)

    # Ok! Build!
    bundle.close()

    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.pre_prepare()
    bundle.prepare()
    bundle.post_prepare()
    bundle.pre_build()
    bundle.build_db_inserter_codes()
    bundle.post_build()

    # Identity is stable across the build.
    self.assertEquals('diEGPXmDC8001', bundle.identity.vid)
    self.assertEquals('source-dataset-subset-variation', bundle.identity.sname)
    self.assertEquals('source-dataset-subset-variation-0.0.1', bundle.identity.vname)
    self.assertEquals('source-dataset-subset-variation-0.0.1~diEGPXmDC8001',
                      bundle.identity.fqname)
class Test(TestBase):
    """Bundle tests: build/db bundle equivalence, schema manipulation,
    column transforms, validators, partitions, run config, builds, and
    session handling. Operates on the shared test bundle created in
    setUp."""

    def setUp(self):
        # Build (or copy a cached copy of) the test bundle once per test.
        super(Test, self).setUp()

        self.copy_or_build_bundle()

        self.bundle = Bundle()
        self.bundle_dir = self.bundle.bundle_dir

    def test_db_bundle(self):
        """A DbBundle opened on a BuildBundle's database reports the
        same identity names."""
        from ambry.bundle import BuildBundle, DbBundle

        b = BuildBundle(self.bundle_dir)
        b.clean()

        self.assertTrue(b.identity.id_ is not None)
        self.assertEquals('source-dataset-subset-variation', b.identity.sname)
        self.assertEquals('source-dataset-subset-variation-0.0.1', b.identity.vname)

        b.database.create()

        db_path = b.database.path

        dbb = DbBundle(db_path)

        self.assertEqual("source-dataset-subset-variation", dbb.identity.sname)
        self.assertEqual("source-dataset-subset-variation-0.0.1", dbb.identity.vname)

    def test_paths(self):
        """Test that a build bundle and a db bundle both produce the
        same paths."""
        from ambry.bundle import DbBundle

        b = self.bundle
        db = DbBundle(b.database.path)

        self.assertEqual(b.path, db.path)
        self.assertTrue(os.path.exists(b.path))

        self.assertEqual(b.database.path, db.database.path)
        self.assertTrue(os.path.exists(b.database.path))

        self.assertEqual(b.identity.path, db.identity.path)

        # Compare the partitions pairwise between the two bundle views.
        for p in zip(b.partitions, db.partitions):
            self.assertTrue(bool(p[0].path))
            self.assertEqual(p[0].path, p[1].path)
            # NOTE(review): repeats the first assert — probably meant
            # p[1].path; confirm before changing.
            self.assertTrue(bool(p[0].path))

    def test_schema_direct(self):
        '''Test adding tables directly to the schema'''

        # If we don't explicitly set the id_, it will change for every
        # run.
        self.bundle.metadata.identity.id = 'aTest'

        self.bundle.schema.clean()

        with self.bundle.session:
            s = self.bundle.schema
            s.add_table('table 1', altname='alt name a')
            s.add_table('table 2', altname='alt name b')

            # NOTE(review): this passes the 1-tuple ('table 1',) as the
            # table-name argument — confirm the duplicate-name check is
            # really meant to receive a tuple here.
            self.assertRaises(Exception, s.add_table, ('table 1', ))

            t = s.add_table('table 3', altname='alt name')

            s.add_column(t, 'col 1', altname='altname1')
            s.add_column(t, 'col 2', altname='altname2')
            s.add_column(t, 'col 3', altname='altname3')

        #print self.bundle.schema.as_csv()

        # Table ids derive from the dataset vid and are stable because
        # the identity id was pinned above.
        self.assertIn('tiEGPXmDC801', [t.id_ for t in self.bundle.schema.tables])
        self.assertIn('tiEGPXmDC802', [t.id_ for t in self.bundle.schema.tables])
        self.assertNotIn('cTest03', [t.id_ for t in self.bundle.schema.tables])

        t = self.bundle.schema.table('table_3')

        self.assertIn('ciEGPXmDC803001', [c.id_ for c in t.columns])
        self.assertIn('ciEGPXmDC803002', [c.id_ for c in t.columns])
        self.assertIn('ciEGPXmDC803003', [c.id_ for c in t.columns])

        # Try with a nested session, b/c we need to test it somewhere ...
        with self.bundle.session:
            with self.bundle.session:
                t = s.add_table('table 4', altname='alt name')

                s.add_column(t, 'col 1', altname='altname1')
                s.add_column(t, 'col 2', altname='altname2')
                s.add_column(t, 'col 3', altname='altname3')

    def x_test_generate_schema(self):
        '''Uses the generateSchema method in the bundle'''
        from ambry.orm import Column

        with self.bundle.session:
            s = self.bundle.schema
            s.clean()

            t1 = s.add_table('table1')

            s.add_column(t1, name='col1', datatype=Column.DATATYPE_REAL)
            s.add_column(t1, name='col2', datatype=Column.DATATYPE_INTEGER)
            s.add_column(t1, name='col3', datatype=Column.DATATYPE_TEXT)

            t2 = s.add_table('table2')
            s.add_column(t2, name='col1')
            s.add_column(t2, name='col2')
            s.add_column(t2, name='col3')

            t3 = s.add_table('table3')
            s.add_column(t3, name='col1', datatype=Column.DATATYPE_REAL)
            s.add_column(t3, name='col2', datatype=Column.DATATYPE_INTEGER)
            s.add_column(t3, name='col3', datatype=Column.DATATYPE_TEXT)

    def test_column_processor(self):
        """Check BasicTransform and CensusTransform value coding on an
        integer column with a default and an illegal value."""
        from ambry.orm import Column
        from ambry.transform import BasicTransform, CensusTransform

        self.bundle.schema.clean()

        with self.bundle.session:
            s = self.bundle.schema
            t = s.add_table('table3')

            s.add_column(t, name='col1', datatype=Column.DATATYPE_INTEGER,
                         default=-1, illegal_value='999')
            s.add_column(t, name='col2', datatype=Column.DATATYPE_TEXT)
            s.add_column(t, name='col3', datatype=Column.DATATYPE_REAL)

        c1 = t.column('col1')

        self.assertEquals(1, BasicTransform(c1)({'col1': ' 1 '}))

        # A non-numeric value on an integer column must raise.
        with self.assertRaises(ValueError):
            print "PROCESSOR '{}'".format(
                CensusTransform(c1)({'col1': ' B '}))

        # Census coding: illegal value -> default; '#' and '!' map to
        # the sentinel codes asserted below.
        self.assertEquals(1, CensusTransform(c1)({'col1': ' 1 '}))
        self.assertEquals(-1, CensusTransform(c1)({'col1': ' 999 '}))
        self.assertEquals(-3, CensusTransform(c1)({'col1': ' # '}))
        self.assertEquals(-2, CensusTransform(c1)({'col1': ' ! '}))

    def test_validator(self):
        """Row validators: AND-joined (default) and OR-joined column
        checks, plus row hashing."""
        #
        # Validators
        #

        tests = [
            ('tone', True, (None, 'VALUE', 0, 0)),
            ('tone', True, (None, 'VALUE', -1, 0)),
            ('tone', False, (None, 'DEFAULT', 0, 0)),
            ('tone', False, (None, 'DEFAULT', -1, 0)),
            ('ttwo', True, (None, 'DEFAULT', 0, 0)),
            ('ttwo', True, (None, 'DEFAULT', 0, 3.14)),
            ('ttwo', False, (None, 'DEFAULT', -1, 0)),
            ('tthree', True, (None, 'DEFAULT', 0, 0)),
            ('tthree', True, (None, 'DEFAULT', 0, 3.14)),
            ('all', True, (None, 'text1', 'text2', 1, 2, 3, 3.14)),
            ('all', False, (None, 'text1', 'text2', -1, -1, 3, 3.14)),
            ('all', False, (None, 'text1', 'text2', -1, 2, 3, 3.14)),
            ('all', False, (None, 'text1', 'text2', 1, -1, 3, 3.14)),
        ]

        for i, test in enumerate(tests):
            table_name, truth, row = test
            table = self.bundle.schema.table(table_name)
            vd = table._get_validator()
            if truth:
                self.assertTrue(
                    vd(row),
                    "Test {} not 'true' for table '{}': {}".format(
                        i + 1, table_name, row))
            else:
                self.assertFalse(
                    vd(row),
                    "Test {} not 'false' for table '{}': {}".format(
                        i + 1, table_name, row))

        # Testing the "OR" join of multiple columns.
        tests = [
            ('tone', True, (None, 'VALUE', 0, 0)),  #1
            ('tone', True, (None, 'VALUE', -1, 0)),
            ('tone', False, (None, 'DEFAULT', 0, 0)),
            ('tone', False, (None, 'DEFAULT', -1, 0)),
            ('ttwo', True, (None, 'DEFAULT', 0, 0)),  #5
            ('ttwo', True, (None, 'DEFAULT', 0, 3.14)),
            ('ttwo', False, (None, 'DEFAULT', -1, 0)),
            ('tthree', True, (None, 'DEFAULT', 0, 0)),  #8
            ('tthree', True, (None, 'DEFAULT', 0, 3.14)),
            ('all', True, (None, 'text1', 'text2', 1, 2, 3, 3.14)),  #10
            ('all', False, (None, 'text1', 'text2', -1, -1, 3, 3.14)),  #11
            ('all', True, (None, 'text1', 'text2', -1, 2, 3, 3.14)),  #12
            ('all', True, (None, 'text1', 'text2', 1, -1, 3, 3.14)),  #13
        ]

        for i, test in enumerate(tests):
            table_name, truth, row = test
            table = self.bundle.schema.table(table_name)
            vd = table._get_validator(and_join=False)
            if truth:
                self.assertTrue(
                    vd(row),
                    "Test {} not 'true' for table '{}': {}".format(
                        i + 1, table_name, row))
            else:
                self.assertFalse(
                    vd(row),
                    "Test {} not 'false' for table '{}': {}".format(
                        i + 1, table_name, row))

        # Test the hash functions. This test depends on the d_test
        # values in geoschema.csv
        tests = [('tone', 'A|1|', (None, 'A', 1, 2)),
                 ('ttwo', '1|2|', (None, 'B', 1, 2)),
                 ('tthree', 'C|2|', (None, 'C', 1, 2))]

        import hashlib

        for i, test in enumerate(tests):
            table_name, hashed_str, row = test
            table = self.bundle.schema.table(table_name)

            # row_hash is the first 14 hex digits of the md5 of the
            # hashed columns, as an int.
            m = hashlib.md5()
            m.update(hashed_str)

            self.assertEquals(int(m.hexdigest()[:14], 16), table.row_hash(row))

    def test_partition(self):
        """Partition creation, conflict detection, lookup by query and
        kwargs, data persistence across sessions, then cleanup."""
        from ambry.dbexceptions import ConflictError
        from ambry.identity import PartitionNameQuery
        from ambry.partition.csv import CsvPartition

        self.bundle.clean()
        self.bundle.prepare()

        p = self.bundle.partitions.new_db_partition(
            time=10, space=10, data={'pid': 'pid1'})

        # Creating the same name again must raise.
        with self.assertRaises(ConflictError):
            self.bundle.partitions.new_db_partition(
                time=10, space=10, data={'pid': 'pid1'})

        self.assertEqual(1, len(self.bundle.partitions.all))

        p = self.bundle.partitions.find_or_new(time=10, space=10)
        p.database.create()  # Find will go to the library if the database doesn't exist.

        self.assertEqual(1, len(self.bundle.partitions.all))
        self.assertEquals('pid1', p.data['pid'])

        # Lookup by query object and by kwargs both find the same record.
        p = self.bundle.partitions.find(PartitionNameQuery(time=10, space=10))
        self.assertEquals('pid1', p.data['pid'])

        p = self.bundle.partitions.find(time=10, space=10)
        self.assertEquals('pid1', p.data['pid'])

        # Mutate partition data in a session, then verify through a
        # fresh Bundle that the change was persisted.
        pnq3 = PartitionNameQuery(space=10)

        with self.bundle.session as s:
            p = self.bundle.partitions._find_orm(pnq3).first()
            p.data['foo'] = 'bar'
            s.add(p)

        bundle = Bundle()
        p = bundle.partitions.find(pnq3)
        self.assertEquals('bar', p.data['foo'])

        #p = self.bundle.partitions.find(PartitionNameQuery(name='source-dataset-subset-variation-30-hdf'))
        #self.assertTrue(p is not None)
        #self.assertEquals('source-dataset-subset-variation-30-hdf', p.identity.sname)

        #
        # Create all possible combinations of partition names
        #
        table = self.bundle.schema.tables[0]

        p = (('time', 'time2'), ('space', 'space3'),
             ('table', table.name), ('grain', 'grain4'))
        p += p

        pids = {}
        for i in range(4):
            for j in range(4):
                pid = self.bundle.identity.as_partition(**dict(p[i:i + j + 1]))
                pids[pid.fqname] = pid

        with self.bundle.session as s:
            s.commit()

            # These two deletey bits clear out all of the old
            # partitions, to avoid a conflict with the next section. We also have
            # to delete the files, since create() adds a partition record to the database,
            # and if one already exists, it will throw an Integrity Error.
            for p in self.bundle.partitions:
                if os.path.exists(p.database.path):
                    os.remove(p.database.path)

            for p in self.bundle.dataset.partitions:
                # Using SQL instead of s.delete() because we want to avoid the cascade to stored_partitions, since
                # that table doesn't exist in the bundle, only in the library
                s.execute("DELETE FROM partitions WHERE p_vid = :vid", {'vid': p.vid})
                #s.delete(p)

    def test_partition_2(self):
        """Create partitions for every combination of name components
        and verify each can be found again by vid."""
        bundle = Bundle()
        bundle.clean()
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()

        table = self.bundle.schema.tables[0]

        # Doubled so slices of length 1..4 can start at any component.
        p = (('time', 'time2'), ('space', 'space3'),
             ('table', table.name), ('grain', 'grain4'))
        p += p

        pids = {}
        for i in range(4):
            for j in range(4):
                pid = self.bundle.identity.as_partition(**dict(p[i:i + j + 1]))
                pids[pid.fqname] = pid

        for pid in pids.values():
            part = bundle.partitions.new_db_partition(**pid.dict)
            part.create()

            parts = bundle.partitions._find_orm(
                PartitionNameQuery(vid=pid.vid)).all()
            self.assertIn(pid.sname, [p.name for p in parts])

    def test_runconfig(self):
        """Check that the RunConfig expands the library configuration."""
        from ambry.run import get_runconfig, RunConfig

        rc = get_runconfig((os.path.join(self.bundle_dir, 'test-run-config.yaml'),
                            RunConfig.USER_CONFIG, RunConfig.USER_ACCOUNTS))

        l = rc.library('library1')

        # Each nested '_name' records which config object was expanded
        # at that level of the upstream chain.
        self.assertEquals('database1', l['database']['_name'])
        self.assertEquals('filesystem1', l['filesystem']['_name'])
        self.assertEquals('filesystem2', l['filesystem']['upstream']['_name'])
        self.assertEquals('filesystem3', l['filesystem']['upstream']['upstream']['_name'])
        self.assertEquals(
            'devtest.sandiegodata.org',
            l['filesystem']['upstream']['upstream']['account']['_name'])

    def test_build_bundle(self):
        """Build twice with the code-generating inserter, then restore
        the schema and run a normal build for later tests."""
        import shutil

        bundle = Bundle()
        shutil.copyfile(bundle.filesystem.path('meta', 'schema-edit-me.csv'),
                        bundle.filesystem.path('meta', 'schema.csv'))

        #try:
        bundle.database.enable_delete = True
        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build_db_inserter_codes()
        bundle.post_build()
        bundle.close()

        # The second run will use the changes to the schema made in the
        # first run, due to the types errors in the 'coding' table.
        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()
        bundle.pre_build()
        bundle.build_db_inserter_codes()
        bundle.post_build()
        bundle.close()

        # NOTE(review): this try guards nothing — together with the
        # commented-out '#try:' above it looks like the builds were
        # meant to be inside it, so a build failure would still restore
        # the schema. Confirm and move the builds under the try.
        try:
            pass
        finally:
            # Need to clean up to ensure that we're back to a good state.
            # This runs the normal build, which will be used by the other
            # tests.
            shutil.copyfile(
                bundle.filesystem.path('meta', 'schema-edit-me.csv'),
                bundle.filesystem.path('meta', 'schema.csv'))

            bundle.clean()
            bundle = Bundle()
            bundle.exit_on_fatal = False
            bundle.pre_prepare()
            bundle.prepare()
            bundle.post_prepare()
            bundle.pre_build()
            bundle.build()
            bundle.post_build()

    def test_simple_build(self):
        """Restore the editable schema, then run a full clean build."""
        import shutil

        bundle = Bundle()
        shutil.copyfile(bundle.filesystem.path('meta', 'schema-edit-me.csv'),
                        bundle.filesystem.path('meta', 'schema.csv'))

        bundle.clean()
        bundle = Bundle()
        bundle.exit_on_fatal = False

        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()

        bundle.pre_build()
        bundle.build()
        bundle.post_build()

    def test_config_update(self):
        # Smoke test: update_configuration must run without error.
        bundle = Bundle()
        bundle.update_configuration()

    def test_session(self):
        """Values set inside nested sessions are visible after the
        sessions close, including across a bundle close/reopen."""
        import uuid

        b = self.bundle

        uv = str(uuid.uuid4())

        with b.session as s1:
            with b.session as s2:
                b.set_value('test', 'uuid', uv)

        b.close()

        self.assertEqual(uv, b.get_value('test', 'uuid').value)

        uv2 = str(uuid.uuid4())
        self.assertNotEqual(uv, uv2)

        with b.session as s1:
            with b.session as s2:
                b.set_value('test', 'uuid', uv2)

        self.assertEqual(uv2, b.get_value('test', 'uuid').value)

        b.set_value('test', 'uuid', uv2)
def test_bundle_build(self):
    """Partition creation at three API levels, without a table component.

    Section 1 uses the low-level _new_orm_partition inside a session,
    section 2 the mid-level _new_partition, and section 3 the
    high-level new_db_partition / find_or_new / find_or_new_geo,
    checking naming, conflict detection and the ANY/NONE query
    wildcards. Ends with a full build and identity assertions.
    """
    from ambry.dbexceptions import ConflictError

    bundle = Bundle()

    # Need to clear the library, or the Bundle's pre_prepare
    # will cancel the build if this version is already installed
    bundle.library.purge()

    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()

    bp = bundle.partitions

    # Section 1: six partitions — two times x (two spaces + no space).
    with bundle.session:
        bp._new_orm_partition(PartialPartitionName(time = 't1', space='s1'))
        bp._new_orm_partition(PartialPartitionName(time = 't1', space='s2'))
        bp._new_orm_partition(PartialPartitionName(time = 't1', space=None))
        bp._new_orm_partition(PartialPartitionName(time = 't2', space='s1'))
        bp._new_orm_partition(PartialPartitionName(time = 't2', space='s2'))
        bp._new_orm_partition(PartialPartitionName(time = 't2', space=None))

    # Re-creating an existing partition name must raise.
    with self.assertRaises(ConflictError):
        with bundle.session:
            bp._new_orm_partition(PartialPartitionName(time = 't1', space='s1'))

    # ANY matches only partitions where the component is set.
    pnq = PartitionNameQuery(time=NameQuery.ANY, space='s1')
    names = [p.vname for p in bp._find_orm(pnq).all()]
    self.assertEqual({u'source-dataset-subset-variation-t2-s1-0.0.1',
                      u'source-dataset-subset-variation-t1-s1-0.0.1'},
                     set(names))

    names = [p.vname for p in bp._find_orm(PartitionNameQuery(space=NameQuery.ANY)).all()]
    self.assertEqual(6, len(names))

    names = [p.vname for p in bp._find_orm(
        PartitionNameQuery(time='t1', space=NameQuery.ANY)).all()]
    self.assertEqual({'source-dataset-subset-variation-t1-s2-0.0.1',
                      'source-dataset-subset-variation-t1-0.0.1',
                      'source-dataset-subset-variation-t1-s1-0.0.1'},
                     set(names))

    # NONE matches only the partition with no space component.
    names = [p.vname for p in bp._find_orm(
        PartitionNameQuery(time='t1', space=NameQuery.NONE)).all()]
    self.assertEqual({'source-dataset-subset-variation-t1-0.0.1'}, set(names))

    # Start over, use a higher level function to create the partitions
    bundle.close()  # Or you'll get an OperationalError

    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()

    bp = bundle.partitions

    # Section 2: same six partitions through _new_partition, checking
    # counts as we go.
    bp._new_partition(PartialPartitionName(time = 't1', space='s1'))
    self.assertEquals(1, len(bp.all))
    bp._new_partition(PartialPartitionName(time = 't1', space='s2'))
    self.assertEquals(2, len(bp.all))
    bp._new_partition(PartialPartitionName(time = 't1', space=None))
    bp._new_partition(PartialPartitionName(time = 't2', space='s1'))
    bp._new_partition(PartialPartitionName(time = 't2', space='s2'))
    bp._new_partition(PartialPartitionName(time = 't2', space=None))
    self.assertEquals(6, len(bp.all))

    names = [p.vname for p in bp._find_orm(
        PartitionNameQuery(time='t1', space=NameQuery.ANY)).all()]
    self.assertEqual({'source-dataset-subset-variation-t1-s2-0.0.1',
                      'source-dataset-subset-variation-t1-0.0.1',
                      'source-dataset-subset-variation-t1-s1-0.0.1'},
                     set(names))

    # Start over, use a higher level function to create the partitions
    bundle.close()

    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()

    bp = bundle.partitions

    # Section 3: high-level creation; fqnames embed sequential vids.
    p = bp.new_db_partition(time = 't1', space='s1')
    self.assertEquals('source-dataset-subset-variation-t1-s1-0.0.1~piEGPXmDC8001001',
                      p.identity.fqname)

    p = bp.find_or_new(time = 't1', space='s2')
    self.assertEquals('source-dataset-subset-variation-t1-s2-0.0.1~piEGPXmDC8002001',
                      p.identity.fqname)

    # Duplicate — find_or_new must return the existing partition.
    p = bp.find_or_new(time = 't1', space='s2')
    self.assertEquals('source-dataset-subset-variation-t1-s2-0.0.1~piEGPXmDC8002001',
                      p.identity.fqname)

    p = bp.find_or_new_geo(time = 't2', space='s1')

    # Which it is depends on whether GDAL is installed.
    self.assertIn(p.identity.fqname, [
        'source-dataset-subset-variation-t2-s1-geo-0.0.1~piEGPXmDC8003001',
        'source-dataset-subset-variation-t2-s1-0.0.1~piEGPXmDC8003001'
    ])

    # Ok! Build!
    bundle.close()

    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.pre_prepare()
    bundle.prepare()
    bundle.post_prepare()
    bundle.pre_build()
    bundle.build_db_inserter_codes()
    bundle.post_build()

    # Identity is stable across the build.
    self.assertEquals('diEGPXmDC8001', bundle.identity.vid)
    self.assertEquals('source-dataset-subset-variation', bundle.identity.sname)
    self.assertEquals('source-dataset-subset-variation-0.0.1', bundle.identity.vname)
    self.assertEquals('source-dataset-subset-variation-0.0.1~diEGPXmDC8001',
                      bundle.identity.fqname)
class Test(TestBase): def setUp(self): super(Test, self).setUp() self.copy_or_build_bundle() self.bundle = Bundle() self.bundle_dir = self.bundle.bundle_dir def test_db_bundle(self): from ambry.bundle import BuildBundle, DbBundle b = BuildBundle(self.bundle_dir) b.clean() self.assertTrue(b.identity.id_ is not None) self.assertEquals('source-dataset-subset-variation', b.identity.sname) self.assertEquals('source-dataset-subset-variation-0.0.1', b.identity.vname) b.database.create() db_path = b.database.path dbb = DbBundle(db_path) self.assertEqual("source-dataset-subset-variation", dbb.identity.sname) self.assertEqual("source-dataset-subset-variation-0.0.1", dbb.identity.vname) def test_paths(self): """ Test that a build bundle and a db bundle both produce the same paths. """ from ambry.bundle import DbBundle b = self.bundle db = DbBundle(b.database.path) self.assertEqual(b.path, db.path) self.assertTrue(os.path.exists(b.path)) self.assertEqual(b.database.path, db.database.path) self.assertTrue(os.path.exists(b.database.path)) self.assertEqual(b.identity.path, db.identity.path) for p in zip(b.partitions, db.partitions): self.assertTrue(bool(p[0].path)) self.assertEqual(p[0].path, p[1].path) self.assertTrue(bool(p[0].path)) def test_schema_direct(self): """Test adding tables directly to the schema""" # If we don't explicitly set the id_, it will change for every run. 
self.bundle.metadata.identity.id = 'aTest' self.bundle.schema.clean() with self.bundle.session: s = self.bundle.schema s.add_table('table 1', altname='alt name a') s.add_table('table 2', altname='alt name b') self.assertRaises(Exception, s.add_table, ('table 1', )) t = s.add_table('table 3', altname='alt name') s.add_column(t, 'col 1', altname='altname1') s.add_column(t, 'col 2', altname='altname2') s.add_column(t, 'col 3', altname='altname3') # print self.bundle.schema.as_csv() self.assertIn('tiEGPXmDC801', [t.id_ for t in self.bundle.schema.tables]) self.assertIn('tiEGPXmDC802', [t.id_ for t in self.bundle.schema.tables]) self.assertNotIn('cTest03', [t.id_ for t in self.bundle.schema.tables]) t = self.bundle.schema.table('table_3') self.assertIn('ciEGPXmDC803001', [c.id_ for c in t.columns]) self.assertIn('ciEGPXmDC803002', [c.id_ for c in t.columns]) self.assertIn('ciEGPXmDC803003', [c.id_ for c in t.columns]) # Try with a nested session, b/c we need to test it somewhere ... with self.bundle.session: with self.bundle.session: t = s.add_table('table 4', altname='alt name') s.add_column(t, 'col 1', altname='altname1') s.add_column(t, 'col 2', altname='altname2') s.add_column(t, 'col 3', altname='altname3') def x_test_generate_schema(self): """Uses the generateSchema method in the bundle""" from ambry.orm import Column with self.bundle.session: s = self.bundle.schema s.clean() t1 = s.add_table('table1') s.add_column(t1, name='col1', datatype=Column.DATATYPE_REAL) s.add_column(t1, name='col2', datatype=Column.DATATYPE_INTEGER) s.add_column(t1, name='col3', datatype=Column.DATATYPE_TEXT) t2 = s.add_table('table2') s.add_column(t2, name='col1') s.add_column(t2, name='col2') s.add_column(t2, name='col3') t3 = s.add_table('table3') s.add_column(t3, name='col1', datatype=Column.DATATYPE_REAL) s.add_column(t3, name='col2', datatype=Column.DATATYPE_INTEGER) s.add_column(t3, name='col3', datatype=Column.DATATYPE_TEXT) def test_column_processor(self): from ambry.orm import 
Column from ambry.transform import BasicTransform, CensusTransform self.bundle.schema.clean() with self.bundle.session: s = self.bundle.schema t = s.add_table('table3') s.add_column(t, name='col1', datatype=Column.DATATYPE_INTEGER, default=-1, illegal_value='999') s.add_column(t, name='col2', datatype=Column.DATATYPE_TEXT) s.add_column(t, name='col3', datatype=Column.DATATYPE_REAL) c1 = t.column('col1') self.assertEquals(1, BasicTransform(c1)({'col1': ' 1 '})) with self.assertRaises(ValueError): print "PROCESSOR '{}'".format(CensusTransform(c1)({'col1': ' B '})) self.assertEquals(1, CensusTransform(c1)({'col1': ' 1 '})) self.assertEquals(-1, CensusTransform(c1)({'col1': ' 999 '})) self.assertEquals(-3, CensusTransform(c1)({'col1': ' # '})) self.assertEquals(-2, CensusTransform(c1)({'col1': ' ! '})) def test_validator(self): # # Validators # tests = [ ('tone', True, (None, 'VALUE', 0, 0)), ('tone', True, (None, 'VALUE', -1, 0)), ('tone', False, (None, 'DEFAULT', 0, 0)), ('tone', False, (None, 'DEFAULT', -1, 0)), ('ttwo', True, (None, 'DEFAULT', 0, 0)), ('ttwo', True, (None, 'DEFAULT', 0, 3.14)), ('ttwo', False, (None, 'DEFAULT', -1, 0)), ('tthree', True, (None, 'DEFAULT', 0, 0)), ('tthree', True, (None, 'DEFAULT', 0, 3.14)), ('all', True, (None, 'text1', 'text2', 1, 2, 3, 3.14)), ('all', False, (None, 'text1', 'text2', -1, -1, 3, 3.14)), ('all', False, (None, 'text1', 'text2', -1, 2, 3, 3.14)), ('all', False, (None, 'text1', 'text2', 1, -1, 3, 3.14)), ] for i, test in enumerate(tests): table_name, truth, row = test table = self.bundle.schema.table(table_name) vd = table._get_validator() if truth: self.assertTrue(vd(row), "Test {} not 'true' for table '{}': {}".format(i + 1, table_name, row)) else: self.assertFalse(vd(row), "Test {} not 'false' for table '{}': {}".format(i + 1, table_name, row)) # Testing the "OR" join of multiple columns. 
tests = [ ('tone', True, (None, 'VALUE', 0, 0)), # 1 ('tone', True, (None, 'VALUE', -1, 0)), ('tone', False, (None, 'DEFAULT', 0, 0)), ('tone', False, (None, 'DEFAULT', -1, 0)), ('ttwo', True, (None, 'DEFAULT', 0, 0)), # 5 ('ttwo', True, (None, 'DEFAULT', 0, 3.14)), ('ttwo', False, (None, 'DEFAULT', -1, 0)), ('tthree', True, (None, 'DEFAULT', 0, 0)), # 8 ('tthree', True, (None, 'DEFAULT', 0, 3.14)), ('all', True, (None, 'text1', 'text2', 1, 2, 3, 3.14)), # 10 ('all', False, (None, 'text1', 'text2', -1, -1, 3, 3.14)), # 11 ('all', True, (None, 'text1', 'text2', -1, 2, 3, 3.14)), # 12 ('all', True, (None, 'text1', 'text2', 1, -1, 3, 3.14)), # 13 ] for i, test in enumerate(tests): table_name, truth, row = test table = self.bundle.schema.table(table_name) vd = table._get_validator(and_join=False) if truth: self.assertTrue(vd(row), "Test {} not 'true' for table '{}': {}".format(i + 1, table_name, row)) else: self.assertFalse(vd(row), "Test {} not 'false' for table '{}': {}".format(i + 1, table_name, row)) # Test the hash functions. 
This test depends on the d_test values in geoschema.csv tests = [ ('tone', 'A|1|', (None, 'A', 1, 2)), ('ttwo', '1|2|', (None, 'B', 1, 2)), ('tthree', 'C|2|', (None, 'C', 1, 2))] import hashlib for i, test in enumerate(tests): table_name, hashed_str, row = test table = self.bundle.schema.table(table_name) m = hashlib.md5() m.update(hashed_str) self.assertEquals(int(m.hexdigest()[:14], 16), table.row_hash(row)) def test_partition(self): from ambry.dbexceptions import ConflictError from ambry.identity import PartitionNameQuery self.bundle.clean() self.bundle.prepare() p = self.bundle.partitions.new_db_partition(table='tone', time=10, space=10, data={'pid':'pid1'}) with self.assertRaises(ConflictError): self.bundle.partitions.new_db_partition(table='tone',time=10, space=10, data={'pid':'pid1'}) self.assertEqual(1, len(self.bundle.partitions.all)) p = self.bundle.partitions.find_or_new(table='tone',time=10, space=10) p.database.create() # Find will go to the library if the database doesn't exist. 
self.assertEqual(1, len(self.bundle.partitions.all)) self.assertEquals('pid1', p.data['pid']) p = self.bundle.partitions.find(PartitionNameQuery(table='tone',time=10, space=10)) self.assertEquals('pid1',p.data['pid'] ) p = self.bundle.partitions.find(table='tone',time=10, space=10) self.assertEquals('pid1', p.data['pid']) pnq3 = PartitionNameQuery(space=10) with self.bundle.session as s: p = self.bundle.partitions._find_orm(pnq3).first() p.data['foo'] = 'bar' s.add(p) bundle = Bundle() p = bundle.partitions.find(pnq3) self.assertEquals('bar', p.data['foo']) # p = self.bundle.partitions.find(PartitionNameQuery(name='source-dataset-subset-variation-30-hdf')) # self.assertTrue(p is not None) # self.assertEquals('source-dataset-subset-variation-30-hdf', p.identity.sname) # # Create all possible combinations of partition names # table = self.bundle.schema.tables[0] p = (('time', 'time2'), ('space', 'space3'), ('table', table.name), ('grain', 'grain4')) p += p pids = {} for i in range(4): for j in range(4): pid = self.bundle.identity.as_partition(**dict(p[i:i + j + 1])) pids[pid.fqname] = pid with self.bundle.session as s: s.commit() # These two deletey bits clear out all of the old # partitions, to avoid a conflict with the next section. We also have # to delete the files, since create() adds a partition record to the database, # and if one already exists, it will throw an Integrity Error. 
for p in self.bundle.partitions: if os.path.exists(p.database.path): os.remove(p.database.path) for p in self.bundle.dataset.partitions: # Using SQL instead of s.delete() because we want to avoid the cascade to stored_partitions, since # that table doesn't exist in the bundle, only in the library s.execute("DELETE FROM partitions WHERE p_vid = :vid", {'vid': p.vid}) # s.delete(p) def test_runconfig(self): """Check the the RunConfig expands the library configuration""" from ambry.run import get_runconfig, RunConfig rc = get_runconfig( (os.path.join(self.bundle_dir, 'test-run-config.yaml'), RunConfig.USER_CONFIG, RunConfig.USER_ACCOUNTS)) l = rc.library('library1') self.assertEquals('database1', l['database']['_name']) self.assertEquals('filesystem1', l['filesystem']['_name']) self.assertEquals('filesystem2', l['filesystem']['upstream']['_name']) self.assertEquals('filesystem3', l['filesystem']['upstream']['upstream']['_name']) def test_build_bundle(self): import shutil bundle = Bundle() shutil.copyfile( bundle.filesystem.path('meta', 'schema-edit-me.csv'), bundle.filesystem.path('meta', 'schema.csv')) # try: bundle.database.enable_delete = True bundle.clean() bundle = Bundle() bundle.exit_on_fatal = False bundle.pre_prepare() bundle.prepare() bundle.post_prepare() bundle.pre_build() bundle.build_db_inserter_codes() bundle.post_build() bundle.close() # The second run will use the changes to the schema made in the # first run, due to the types errors in the 'coding' table. bundle.clean() bundle = Bundle() bundle.exit_on_fatal = False bundle.pre_prepare() bundle.prepare() bundle.post_prepare() bundle.pre_build() bundle.build_db_inserter_codes() bundle.post_build() bundle.close() try: pass finally: # Need to clean up to ensure that we're back to a good state. # This runs the normal build, which will be used by the other # tests. 
shutil.copyfile( bundle.filesystem.path('meta', 'schema-edit-me.csv'), bundle.filesystem.path('meta', 'schema.csv')) bundle.clean() bundle = Bundle() bundle.exit_on_fatal = False bundle.pre_prepare() bundle.prepare() bundle.post_prepare() bundle.pre_build() bundle.build() bundle.post_build() def test_simple_build(self): import shutil bundle = Bundle() shutil.copyfile( bundle.filesystem.path('meta', 'schema-edit-me.csv'), bundle.filesystem.path('meta', 'schema.csv')) bundle.clean() bundle = Bundle() bundle.exit_on_fatal = False bundle.pre_prepare() bundle.prepare() bundle.post_prepare() bundle.pre_build() bundle.build() bundle.post_build() def test_config_update(self): bundle = Bundle() bundle.update_configuration() def test_session(self): import uuid b = self.bundle uv = str(uuid.uuid4()) with b.session as s1: with b.session as s2: b.set_value('test', 'uuid', uv) b.close() self.assertEqual(uv, b.get_value('test', 'uuid').value) uv2 = str(uuid.uuid4()) self.assertNotEqual(uv, uv2) with b.session as s1: with b.session as s2: b.set_value('test', 'uuid', uv2) self.assertEqual(uv2, b.get_value('test', 'uuid').value) b.set_value('test', 'uuid', uv2)
def test_partition(self): from ambry.dbexceptions import ConflictError from ambry.identity import PartitionIdentity, PartitionNameQuery from ambry.partition.csv import CsvPartition from ambry.partition.hdf import HdfPartition self.bundle.clean() self.bundle.prepare() p = self.bundle.partitions.new_db_partition(time=10, space=10, data={'pid':'pid1'}) p = self.bundle.partitions.new_csv_partition(time=20, space=20, data={'pid':'pid2'}) self.assertIsInstance(p, CsvPartition ) p = self.bundle.partitions.find_or_new_csv(time=20, space=20) self.assertIsInstance(p, CsvPartition) p = self.bundle.partitions.new_hdf_partition(space=30, data={'pid':'pid3'}) self.assertIsInstance(p, HdfPartition) p = self.bundle.partitions.find_or_new_hdf(space=30) self.assertIsInstance(p, HdfPartition) with self.assertRaises(ConflictError): self.bundle.partitions.new_db_partition(time=10, space=10, data={'pid':'pid1'}) with self.assertRaises(ConflictError): self.bundle.partitions.new_csv_partition(time=20, space=20, data={'pid':'pid21'}) with self.assertRaises(ConflictError): self.bundle.partitions.new_hdf_partition(space=30, data={'pid':'pid31'}) self.assertEqual(3, len(self.bundle.partitions.all)) p = self.bundle.partitions.find_or_new(time=10, space=10) p.database.create() # Find will go to the library if the database doesn't exist. 
self.assertEqual(3, len(self.bundle.partitions.all)) self.assertEquals('pid1',p.data['pid'] ) p = self.bundle.partitions.find_or_new_csv(time=20, space=20) p.database.create() self.assertEquals('pid2',p.data['pid'] ) p = self.bundle.partitions.find_or_new_hdf(space=30) self.assertEquals('pid3',p.data['pid'] ) p = self.bundle.partitions.find(PartitionNameQuery(time=10, space=10)) self.assertEquals('pid1',p.data['pid'] ) p = self.bundle.partitions.find(time=10, space=10) self.assertEquals('pid1', p.data['pid']) p = self.bundle.partitions.find(PartitionNameQuery(time=20, space=20)) self.assertEquals('pid2',p.data['pid'] ) p = self.bundle.partitions.find(time=20, space=20) self.assertEquals('pid2',p.data['pid'] ) pnq3 = PartitionNameQuery(space=30) p = self.bundle.partitions.find(pnq3) self.assertEquals('pid3',p.data['pid'] ) with self.bundle.session as s: p = self.bundle.partitions._find_orm(pnq3).first() p.data['foo'] = 'bar' s.add(p) bundle = Bundle() p = bundle.partitions.find(pnq3) print p.data self.assertEquals('bar',p.data['foo'] ) p = self.bundle.partitions.find(PartitionNameQuery(name='source-dataset-subset-variation-30-hdf')) self.assertTrue(p is not None) self.assertEquals('source-dataset-subset-variation-30-hdf', p.identity.sname) # # Create all possible combinations of partition names # table = self.bundle.schema.tables[0] p = (('time','time2'),('space','space3'),('table',table.name),('grain','grain4')) p += p pids = {} for i in range(4): for j in range(4): pid = self.bundle.identity.as_partition(**dict(p[i:i+j+1])) pids[pid.fqname] = pid with self.bundle.session as s: # These two deletely bits clear out all of the old # partitions, to avoid a conflict with the next section. We also have # to delete the files, since create() adds a partition record to the database, # and if one already exists, it will throw an Integrity Error. 
for p in self.bundle.partitions: if os.path.exists(p.database.path): os.remove(p.database.path) for p in self.bundle.dataset.partitions: s.delete(p) import pprint pprint.pprint(sorted([ pid.fqname for pid in pids.values()])) bundle = Bundle() bundle.clean() bundle.prepare() for pid in pids.values(): part = bundle.partitions.new_db_partition(**pid.dict) part.create() parts = bundle.partitions._find_orm(PartitionNameQuery(vid=pid.vid)).all() self.assertIn(pid.sname, [p.name for p in parts])