def test_make_bundles(self):
    bundle = Bundle()
    bundle.clean()

    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.prepare()
    bundle.build()
def copy_or_build_bundle(self):
    """Set up a clean bundle build, either by re-building the bundle, or by
    copying it from a saved bundle directory."""

    # For most cases, reset the bundle by copying from a saved version. If
    # the bundle doesn't exist and the saved version doesn't exist,
    # build a new one.
    bundle = Bundle()
    marker = bundle.filesystem.build_path('test-marker')
    build_dir = bundle.filesystem.build_path() + '/'  # Slash needed for rsync
    save_dir = bundle.filesystem.build_path() + "-save/"

    if not os.path.exists(marker):
        logger.info("Build dir marker ({}) is missing".format(marker))
        # There is a good reason to create a separate instance,
        # but don't remember what it is ...
        bundle.clean()
        bundle = Bundle()
        # NOTE: 'True or' bypasses the save_dir check, so the bundle is
        # always re-built whenever the marker is missing.
        if True or not os.path.exists(save_dir):
            logger.info("Save dir is missing; re-build bundle. ")
            bundle.prepare()
            bundle.build()

            with open(marker, 'w') as f:
                f.write(str(time.time()))

            # Copy the newly built bundle to the save directory
            os.system("rm -rf {1}; rsync -arv {0} {1} > /dev/null ".format(build_dir, save_dir))

    # Always copy, just to be safe.
    logger.info("Copying bundle from {}".format(save_dir))
    os.system("rm -rf {0}; rsync -arv {1} {0} > /dev/null ".format(build_dir, save_dir))
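# A note on the pattern above: copy_or_build_bundle caches an expensive build
# behind a marker file plus an rsync'd "-save" snapshot: build once, snapshot,
# then restore the snapshot on every later run. A minimal, self-contained
# sketch of the same idea follows; the paths and the build() callable are
# illustrative, not part of the test suite.
import os
import time

def cached_build(build_dir, save_dir, build):
    marker = os.path.join(build_dir, 'test-marker')

    if not os.path.exists(marker):
        build()  # The expensive step; runs only while the marker is missing.
        with open(marker, 'w') as f:
            f.write(str(time.time()))
        # Snapshot the fresh build. Trailing slashes make rsync copy contents.
        os.system("rm -rf {1}; rsync -ar {0}/ {1}/ > /dev/null".format(build_dir, save_dir))

    # Always restore from the snapshot so every run starts from the same state.
    os.system("rm -rf {0}; rsync -ar {1}/ {0}/ > /dev/null".format(build_dir, save_dir))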
def setUp(self):
    import testbundle.bundle

    self.bundle_dir = os.path.dirname(testbundle.bundle.__file__)
    self.rc = get_runconfig((os.path.join(self.bundle_dir, 'client-test-config.yaml'),
                             os.path.join(self.bundle_dir, 'bundle.yaml'),
                             RunConfig.USER_CONFIG))

    self.copy_or_build_bundle()

    self.bundle = Bundle()
def setUp(self):
    import testbundle.bundle

    self.bundle_dir = os.path.dirname(testbundle.bundle.__file__)
    self.rc = get_runconfig((os.path.join(self.bundle_dir, 'warehouse-test-config.yaml'),
                             os.path.join(self.bundle_dir, 'bundle.yaml')))

    self.copy_or_build_bundle()
    self.bundle = Bundle()

    print "Deleting: {}".format(self.rc.group('filesystem').root_dir)
    databundles.util.rm_rf(self.rc.group('filesystem').root_dir)
def setUp(self):
    import testbundle.bundle
    from ambry.run import RunConfig

    self.bundle_dir = os.path.dirname(testbundle.bundle.__file__)
    self.rc = get_runconfig((os.path.join(self.bundle_dir, 'warehouse-test-config.yaml'),
                             os.path.join(self.bundle_dir, 'bundle.yaml'),
                             RunConfig.USER_ACCOUNTS))

    self.copy_or_build_bundle()
    self.bundle = Bundle()

    print "Deleting: {}".format(self.rc.group('filesystem').root_dir)
    ambry.util.rm_rf(self.rc.group('filesystem').root_dir)
def setUp(self):
    import shutil, os

    self.copy_or_build_bundle()

    self.bundle_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../testbundle')

    self.bundle = Bundle()
    self.bundle_dir = self.bundle.bundle_dir

    self.server_rc = RunConfig([os.path.join(self.bundle_dir, 'server-test-config.yaml')])
    self.client_rc = RunConfig([os.path.join(self.bundle_dir, 'client-test-config.yaml')])

    root = os.path.join(self.client_rc.filesystem.root_dir, 'test')
    shutil.rmtree(root)
def setUp(self):
    import testbundle.bundle, shutil, os

    self.bundle_dir = os.path.dirname(testbundle.bundle.__file__)
    self.rc = get_runconfig((os.path.join(self.bundle_dir, 'source-test-config.yaml'),
                             os.path.join(self.bundle_dir, 'bundle.yaml'),
                             RunConfig.USER_ACCOUNTS))

    self.copy_or_build_bundle()
    bundle = Bundle()

    self.source_save_dir = str(self.rc.group('filesystem').root) + '-source'
    self.setup_source_dir()

    print "Deleting: {}".format(self.rc.group('filesystem').root)
    ambry.util.rm_rf(self.rc.group('filesystem').root)

    bdir = os.path.join(self.rc.sourcerepo.dir, 'testbundle')

    pats = shutil.ignore_patterns('build', 'build-save', '*.pyc', '.git', '.gitignore',
                                  '.ignore', '__init__.py')

    print "Copying test dir tree to ", bdir
    shutil.copytree(bundle.bundle_dir, bdir, ignore=pats)

    # Import the bundle file from the directory
    from ambry.run import import_file
    import imp

    rp = os.path.realpath(os.path.join(bdir, 'bundle.py'))
    mod = import_file(rp)

    dir_ = os.path.dirname(rp)
    self.bundle = mod.Bundle(dir_)

    print self.bundle.bundle_dir
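# The setUp above leans on ambry.run.import_file to load bundle.py straight
# from a copied directory. A minimal sketch of what such a helper is assumed
# to do, using Python 2's imp module; the wrapper below is hypothetical, not
# ambry's actual implementation.
import imp
import os

def import_file_sketch(path):
    path = os.path.realpath(path)
    name = os.path.splitext(os.path.basename(path))[0]
    # imp.load_source compiles the file, registers the module, and returns it.
    return imp.load_source(name, path)

# Usage mirrors the setUp: load the module, then instantiate its Bundle class.
# mod = import_file_sketch(os.path.join(bdir, 'bundle.py'))
# bundle = mod.Bundle(os.path.dirname(rp))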
def setUp(self):
    self.copy_or_build_bundle()
    self.bundle = Bundle()
class Test(TestBase):

    def setUp(self):
        self.copy_or_build_bundle()
        self.bundle = Bundle()

    def tearDown(self):
        pass

    def test_basic(self):
        from ambry.geo.geocoder import Geocoder

        g = Geocoder(self.bundle.library)

        filename = "good_segments"
        f_input = os.path.join(os.path.dirname(__file__), '../support', filename + '.txt')
        f_output = os.path.join(os.path.dirname(__file__), '../support', filename + '.out.csv')

        with open(f_input) as f:
            for line in f:
                addr = line.strip()
                r = g.geocode_address(addr)
                print "==", addr
                print "->", r
                if r:
                    print "  ", r['codedaddress']

    def write_error_row(self, code, arg, p, w, address, city):
        try:
            ps = p.parse(address)
        except:
            ps = False

        if not ps:
            row = [code, arg, address, city]
        else:
            row = [code, arg, address, city,
                   ps.number, ps.street_direction, ps.street_name, ps.street_type]

        w.writerow(row)

    def x_test_crime(self):
        from ambry.geo.address import Parser
        from ambry.geo.geocoder import Geocoder
        import csv

        g = Geocoder(self.bundle.library, addresses_ds='geoaddresses')

        _, incidents = self.bundle.library.dep('crime')

        log_rate = self.bundle.init_log_rate(1000)

        p = Parser()

        with open(self.bundle.filesystem.path('errors.csv'), 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(['code', 'arg', 'block_address', 'city', 'number', 'dir', 'street', 'type'])

            multi_cities = 0.0
            multi_addr = 0.0
            no_response = 0.0

            for i, inct in enumerate(incidents.query("SELECT * FROM incidents limit 100000")):
                row = dict(inct)

                candidates = g.geocode_semiblock(row['blockaddress'], row['city'], 'CA')

                if len(candidates) == 0:
                    no_response += 1
                    self.write_error_row('norsp', 0, p, writer, row['blockaddress'], row['city'])
                    continue
                elif len(candidates) != 1:
                    multi_cities += 1
                    self.write_error_row('mcities', len(candidates), p, writer, row['blockaddress'], row['city'])
                    continue

                s = candidates.popitem()[1]

                if len(s) > 3:
                    self.write_error_row('maddr', len(s), p, writer, row['blockaddress'], row['city'])
                    multi_addr += 1

                if i > 0:
                    log_rate("{} cities={}, {}% addr={}, {}% nrp={}, {}%".format(
                        i,
                        multi_cities, int(multi_cities / i * 100),
                        multi_addr, int(multi_addr / i * 100),
                        no_response, int(no_response / i * 100)))

    def test_place_coder(self):
        from ambry.geo.geocoder import PlaceCoder

        pc = PlaceCoder(self.bundle.library)

        places = self.bundle.library.dep('places').partition

        for place in places.rows:
            try:
                in_places = [x['name'] for x in pc.lookup_wgs(place['lat'], place['lon'])]
            except ValueError:
                continue

            # Some of the centroids aren't in the regions, since there are complicated region
            # shapes, and some cities hold parcels in the east county.
            if not place['name'] in in_places:
                print place['type'], place['name'], in_places
def setUp(self):
    self.copy_or_build_bundle()
    self.bundle = Bundle()
    self.bundle_dir = self.bundle.bundle_dir
class Test(TestBase):

    def setUp(self):
        import testbundle.bundle
        from ambry.run import RunConfig

        self.bundle_dir = os.path.dirname(testbundle.bundle.__file__)
        self.rc = get_runconfig((os.path.join(self.bundle_dir, 'warehouse-test-config.yaml'),
                                 os.path.join(self.bundle_dir, 'bundle.yaml'),
                                 RunConfig.USER_ACCOUNTS))

        self.copy_or_build_bundle()
        self.bundle = Bundle()

        print "Deleting: {}".format(self.rc.group('filesystem').root_dir)
        ambry.util.rm_rf(self.rc.group('filesystem').root_dir)

    def tearDown(self):
        pass

    def resolver(self, name):
        if name == self.bundle.identity.name or name == self.bundle.identity.vname:
            return self.bundle
        else:
            return False

    def get_library(self, name='default'):
        """Clear out the database before the test run"""
        from ambry.library import new_library

        config = self.rc.library(name)
        l = new_library(config, reset=True)

        l.database.enable_delete = True
        l.database.drop()
        l.database.create()

        return l

    def get_warehouse(self, l, name):
        from ambry.util import get_logger
        from ambry.warehouse import new_warehouse

        w = new_warehouse(self.rc.warehouse(name), l)
        w.logger = get_logger('unit_test')

        lr = self.bundle.init_log_rate(10000)
        w.logger = TestLogger(lr)

        w.database.enable_delete = True
        w.database.delete()
        w.create()

        return w

    def _test_local_install(self, name):
        l = self.get_library('local')
        l.put_bundle(self.bundle)

        w = self.get_warehouse(l, name)
        print "Warehouse: ", w.database.dsn
        print "Library: ", l.database.dsn

        w.install("source-dataset-subset-variation-tone-0.0.1")
        w.install("source-dataset-subset-variation-tthree-0.0.1")
        w.install("source-dataset-subset-variation-geot1-geo-0.0.1")

        w = self.get_warehouse(l, 'spatialite')
        print "WAREHOUSE: ", w.database.dsn
        w.install("source-dataset-subset-variation-tone-0.0.1")
        w.install("source-dataset-subset-variation-tthree-0.0.1")
        w.install("source-dataset-subset-variation-geot1-geo-0.0.1")

    def test_local_sqlite_install(self):
        self._test_local_install('sqlite')

    def test_local_postgres_install(self):
        self._test_local_install('postgres1')

    def _test_remote_install(self, name):
        self.start_server(self.rc.library('server'))

        l = self.get_library('client')
        l.put_bundle(self.bundle)

        w = self.get_warehouse(l, name)
        print "WAREHOUSE: ", w.database.dsn
        w.install("source-dataset-subset-variation-tone-0.0.1")
        w.install("source-dataset-subset-variation-tthree-0.0.1")
        w.install("source-dataset-subset-variation-geot1-geo-0.0.1")

        w = self.get_warehouse(l, 'spatialite')
        print "WAREHOUSE: ", w.database.dsn
        w.install("source-dataset-subset-variation-tone-0.0.1")
        w.install("source-dataset-subset-variation-tthree-0.0.1")
        w.install("source-dataset-subset-variation-geot1-geo-0.0.1")

    def test_remote_sqlite_install(self):
        self._test_remote_install('sqlite')

    def test_remote_postgres_install(self):
        self._test_remote_install('postgres1')

    def test_manifest(self):
        from ambry.warehouse.manifest import Manifest

        m = Manifest("""
First Line of documentation

partitions:
part1 # Comment
part2 # Comment

views:
create view foobar1 as one two three;

create view foobar2 as one two three;

documentation:
Foo Doc

views:
create view foobar3 as one two three;

doc:
More Documentation

sql:driver1|driver2
one two three

sql:driver1
four five

sql:driver2
seven eight
""")

        for view in m.views:
            print "view", view

        for partition in m.partitions:
            print 'partition', partition

        print 'doc', m.documentation

        print '----'
        print m.sql

    def x_test_install(self):
        def resolver(name):
            if name == self.bundle.identity.name or name == self.bundle.identity.vname:
                return self.bundle
            else:
                return False

        def progress_cb(lr, type, name, n):
            if n:
                lr("{} {}: {}".format(type, name, n))
            else:
                self.bundle.log("{} {}".format(type, name))

        from ambry.warehouse import new_warehouse
        from functools import partial

        print "Getting warehouse"
        w = new_warehouse(self.rc.warehouse('postgres'))

        print "Re-create database"
        w.database.enable_delete = True
        w.resolver = resolver
        w.progress_cb = progress_cb

        try:
            w.drop()
        except:
            pass

        w.create()

        ps = self.bundle.partitions.all
        print "{} partitions".format(len(ps))

        for p in self.bundle.partitions:
            lr = self.bundle.init_log_rate(10000)
            w.install(p, progress_cb=partial(progress_cb, lr))

        self.assertTrue(w.has(self.bundle.identity.vname))

        for p in self.bundle.partitions:
            self.assertTrue(w.has(p.identity.vname))

        for p in self.bundle.partitions:
            w.remove(p.identity.vname)

        print w.get(self.bundle.identity.name)
        print w.get(self.bundle.identity.vname)
        print w.get(self.bundle.identity.id_)

        w.install(self.bundle)

        print w.get(self.bundle.identity.name)
        print w.get(self.bundle.identity.vname)
        print w.get(self.bundle.identity.id_)

        for p in self.bundle.partitions:
            lr = self.bundle.init_log_rate(10000)
            w.install(p, progress_cb=partial(progress_cb, lr))
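# The partial(progress_cb, lr) calls above pre-bind the rate-limited logger
# into the three-argument callback that w.install() expects. A minimal,
# self-contained sketch of the same binding; the log_rate stand-in below is
# illustrative, not the ambry API.
from functools import partial

def progress_cb(lr, type, name, n):
    if n:
        lr("{} {}: {}".format(type, name, n))  # periodic progress with a count
    else:
        print "{} {}".format(type, name)       # one-off status message

def log_rate(message):
    # Stand-in for bundle.init_log_rate(10000), which rate-limits output.
    print message

cb = partial(progress_cb, log_rate)  # now a (type, name, n) callback
cb('install', 'tone', 100)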
class Test(TestBase):

    def setUp(self):
        import testbundle.bundle

        self.bundle_dir = os.path.dirname(testbundle.bundle.__file__)
        self.rc = get_runconfig((os.path.join(self.bundle_dir, 'warehouse-test-config.yaml'),
                                 os.path.join(self.bundle_dir, 'bundle.yaml')))

        self.copy_or_build_bundle()
        self.bundle = Bundle()

        print "Deleting: {}".format(self.rc.group('filesystem').root_dir)
        databundles.util.rm_rf(self.rc.group('filesystem').root_dir)

    def tearDown(self):
        pass

    def resolver(self, name):
        if name == self.bundle.identity.name or name == self.bundle.identity.vname:
            return self.bundle
        else:
            return False

    class Resolver(object):

        def get(self, name):
            if name == self.bundle.identity.name or name == self.bundle.identity.vname:
                return self.bundle
            else:
                return False

        def get_ref(self, name):
            pass

    def progress_cb(self, lr, type_, name, n):
        if n:
            lr("{} {}: {}".format(type_, name, n))  # was format(type, ...), which formatted the builtin
        else:
            self.bundle.log("{} {}".format(type_, name))

    def test_create(self):
        from databundles.warehouse import new_warehouse

        w = new_warehouse(self.rc.warehouse('postgres'))

        print "Re-create database"
        w.database.enable_delete = True
        w.resolver = lambda name: self.resolver(name)

        lr = self.bundle.init_log_rate(10000)
        w.progress_cb = lambda type_, name, n: self.progress_cb(lr, type_, name, n)

        try:
            w.drop()
        except:
            pass

        w.create()
        w.library.create()

        w.install(self.bundle)

        w.create_table(self.bundle.dataset.vid, "ttwo")

    def x_test_install(self):
        def resolver(name):
            if name == self.bundle.identity.name or name == self.bundle.identity.vname:
                return self.bundle
            else:
                return False

        def progress_cb(lr, type, name, n):
            if n:
                lr("{} {}: {}".format(type, name, n))
            else:
                self.bundle.log("{} {}".format(type, name))

        from databundles.warehouse import new_warehouse
        from functools import partial

        print "Getting warehouse"
        w = new_warehouse(self.rc.warehouse('postgres'))

        print "Re-create database"
        w.database.enable_delete = True
        w.resolver = resolver
        w.progress_cb = progress_cb

        try:
            w.drop()
        except:
            pass

        w.create()

        ps = self.bundle.partitions.all
        print "{} partitions".format(len(ps))

        for p in self.bundle.partitions:
            lr = self.bundle.init_log_rate(10000)
            w.install(p, progress_cb=partial(progress_cb, lr))

        self.assertTrue(w.has(self.bundle.identity.vname))

        for p in self.bundle.partitions:
            self.assertTrue(w.has(p.identity.vname))

        for p in self.bundle.partitions:
            w.remove(p.identity.vname)

        print w.get(self.bundle.identity.name)
        print w.get(self.bundle.identity.vname)
        print w.get(self.bundle.identity.id_)

        w.install(self.bundle)

        print w.get(self.bundle.identity.name)
        print w.get(self.bundle.identity.vname)
        print w.get(self.bundle.identity.id_)

        for p in self.bundle.partitions:
            lr = self.bundle.init_log_rate(10000)
            w.install(p, progress_cb=partial(progress_cb, lr))
class Test(TestBase):

    def setUp(self):
        self.copy_or_build_bundle()
        self.bundle = Bundle()

    def tearDown(self):
        pass

    def test_basic(self):
        from pprint import pprint
        from databundles.geo.geocoder import Geocoder

        g = Geocoder(self.bundle.library)

        filename = "good_segments"
        f_input = os.path.join(os.path.dirname(__file__), 'support', filename + '.txt')
        f_output = os.path.join(os.path.dirname(__file__), 'support', filename + '.out.csv')

        with open(f_input) as f:
            for line in f:
                addr = line.strip()
                r = g.geocode_address(addr)
                print "==", addr
                print "->", r
                if r:
                    print "  ", r['coded_address']

    def write_error_row(self, code, arg, p, w, address, city):
        try:
            ps = p.parse(address)
        except:
            ps = False

        if not ps:
            row = [code, arg, address, city]
        else:
            row = [code, arg, address, city,
                   ps.number, ps.street_direction, ps.street_name, ps.street_type]

        w.writerow(row)

    def x_test_crime(self):
        from databundles.geo.address import Parser
        from databundles.geo.geocoder import Geocoder
        import csv

        g = Geocoder(self.bundle.library, addresses_ds='geoaddresses')

        _, incidents = self.bundle.library.dep('crime')

        log_rate = self.bundle.init_log_rate(1000)

        p = Parser()

        with open(self.bundle.filesystem.path('errors.csv'), 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(['code', 'arg', 'block_address', 'city', 'number', 'dir', 'street', 'type'])

            multi_cities = 0.0
            multi_addr = 0.0
            no_response = 0.0

            for i, inct in enumerate(incidents.query("SELECT * FROM incidents limit 100000")):
                row = dict(inct)

                candidates = g.geocode_semiblock(row['blockaddress'], row['city'], 'CA')

                if len(candidates) == 0:
                    no_response += 1
                    self.write_error_row('norsp', 0, p, writer, row['blockaddress'], row['city'])
                    continue
                elif len(candidates) != 1:
                    multi_cities += 1
                    self.write_error_row('mcities', len(candidates), p, writer, row['blockaddress'], row['city'])
                    continue

                s = candidates.popitem()[1]

                if len(s) > 3:
                    self.write_error_row('maddr', len(s), p, writer, row['blockaddress'], row['city'])
                    multi_addr += 1

                if i > 0:
                    log_rate("{} cities={}, {}% addr={}, {}% nrp={}, {}%".format(
                        i,
                        multi_cities, int(multi_cities / i * 100),
                        multi_addr, int(multi_addr / i * 100),
                        no_response, int(no_response / i * 100)))
def x_test_rewrite(self):
    from testbundle.bundle import Bundle
    from sqlalchemy.exc import IntegrityError
    import json
    from ambry.run import get_runconfig

    # Prepare to rewrite the bundle.yaml file.
    bundle = Bundle()

    orig = os.path.join(bundle.bundle_dir, 'bundle.yaml')
    save = os.path.join(bundle.bundle_dir, 'bundle.yaml.save')

    try:
        os.rename(orig, save)

        print 'Write to ', orig

        with open(orig, 'w') as f:
            f.write(json.dumps(
                {
                    "identity": {
                        "dataset": "dataset1",
                        "id": "dfoo",
                        "revision": 100,
                        "source": "source1",
                        "subset": "subset1",
                        "variation": "variation1",
                        "version": "1.0.1",
                        "vid": "dfob001",
                    },
                    "about": {
                        "author": "*****@*****.**"
                    }
                }))

        get_runconfig.clear()  # clear config cache.

        bundle = Bundle()
        bundle.clean()
        bundle.pre_prepare()
        bundle.prepare()
        bundle.post_prepare()  # Does the rewrite, adding the 'names'

        # Need to clear and reload one more time for the 'names' to appear
        get_runconfig.clear()  # clear config cache.
        bundle = Bundle()
        bundle.exit_on_fatal = False

        self.assertEquals('dataset1', bundle.config.identity.dataset)
        self.assertEquals('dfoo', bundle.config.identity.id)
        self.assertEquals(100, bundle.config.identity.revision)

        self.assertEquals("source1-dataset1-subset1-variation1-1.0.100~dfoo01C",
                          bundle.config.names.fqname)
        self.assertEquals("*****@*****.**", bundle.config.about.author)

    finally:
        os.rename(save, orig)
        self.delete_bundle()
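# The rewrite above works because json.dumps emits text that is also valid
# YAML, so a YAML-based config loader reads the replacement bundle.yaml
# unchanged. A quick check of that assumption with PyYAML (assumed here to
# behave like whatever loader ambry's get_runconfig uses):
import json
import yaml

doc = {"identity": {"dataset": "dataset1", "revision": 100}}
assert yaml.safe_load(json.dumps(doc)) == doc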
def test_bundle_build(self):
    from testbundle.bundle import Bundle
    from sqlalchemy.exc import IntegrityError
    from ambry.dbexceptions import ConflictError

    bundle = Bundle()

    # Need to clear the library, or the Bundle's pre_prepare
    # will cancel the build if this version is already installed
    bundle.library.purge()

    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()

    bp = bundle.partitions

    with bundle.session:
        bp._new_orm_partition(PartialPartitionName(time='t1', space='s1'))
        bp._new_orm_partition(PartialPartitionName(time='t1', space='s2'))
        bp._new_orm_partition(PartialPartitionName(time='t1', space=None))
        bp._new_orm_partition(PartialPartitionName(time='t2', space='s1'))
        bp._new_orm_partition(PartialPartitionName(time='t2', space='s2'))
        bp._new_orm_partition(PartialPartitionName(time='t2', space=None))

    with self.assertRaises(ConflictError):
        with bundle.session:
            bp._new_orm_partition(PartialPartitionName(time='t1', space='s1'))

    pnq = PartitionNameQuery(time=NameQuery.ANY, space='s1')
    names = [p.vname for p in bp._find_orm(pnq).all()]

    self.assertEqual({u'source-dataset-subset-variation-t2-s1-0.0.1',
                      u'source-dataset-subset-variation-t1-s1-0.0.1'}, set(names))

    names = [p.vname for p in bp._find_orm(PartitionNameQuery(space=NameQuery.ANY)).all()]
    self.assertEqual(6, len(names))

    names = [p.vname for p in bp._find_orm(PartitionNameQuery(time='t1', space=NameQuery.ANY)).all()]
    self.assertEqual({'source-dataset-subset-variation-t1-s2-0.0.1',
                      'source-dataset-subset-variation-t1-0.0.1',
                      'source-dataset-subset-variation-t1-s1-0.0.1'}, set(names))

    names = [p.vname for p in bp._find_orm(PartitionNameQuery(time='t1', space=NameQuery.NONE)).all()]
    self.assertEqual({'source-dataset-subset-variation-t1-0.0.1'}, set(names))

    # Start over, use a higher level function to create the partitions
    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()

    bp = bundle.partitions

    bp._new_partition(PartialPartitionName(time='t1', space='s1'))
    self.assertEquals(1, len(bp.all))

    bp._new_partition(PartialPartitionName(time='t1', space='s2'))
    self.assertEquals(2, len(bp.all))

    bp._new_partition(PartialPartitionName(time='t1', space=None))
    bp._new_partition(PartialPartitionName(time='t2', space='s1'))
    bp._new_partition(PartialPartitionName(time='t2', space='s2'))
    bp._new_partition(PartialPartitionName(time='t2', space=None))
    self.assertEquals(6, len(bp.all))

    names = [p.vname for p in bp._find_orm(PartitionNameQuery(time='t1', space=NameQuery.ANY)).all()]
    self.assertEqual({'source-dataset-subset-variation-t1-s2-0.0.1',
                      'source-dataset-subset-variation-t1-0.0.1',
                      'source-dataset-subset-variation-t1-s1-0.0.1'}, set(names))

    # Start over, use a higher level function to create the partitions
    bundle = Bundle()
    bundle.exit_on_fatal = False
    bundle.clean()
    bundle.database.create()

    bp = bundle.partitions

    p = bp.new_db_partition(time='t1', space='s1')
    self.assertEquals('source-dataset-subset-variation-t1-s1-0.0.1~piEGPXmDC8001001', p.identity.fqname)

    p = bp.find_or_new(time='t1', space='s2')
    self.assertEquals('source-dataset-subset-variation-t1-s2-0.0.1~piEGPXmDC8002001', p.identity.fqname)

    # Duplicate
    p = bp.find_or_new(time='t1', space='s2')
    self.assertEquals('source-dataset-subset-variation-t1-s2-0.0.1~piEGPXmDC8002001', p.identity.fqname)

    p = bp.find_or_new_hdf(time='t2', space='s1')
    self.assertEquals('source-dataset-subset-variation-t2-s1-hdf-0.0.1~piEGPXmDC8003001', p.identity.fqname)

    p = bp.find_or_new_geo(time='t2', space='s1')
    self.assertEquals('source-dataset-subset-variation-t2-s1-geo-0.0.1~piEGPXmDC8004001', p.identity.fqname)

    p = bp.find_or_new_csv(time='t2', space='s1')
    self.assertEquals('source-dataset-subset-variation-t2-s1-csv-0.0.1~piEGPXmDC8005001', p.identity.fqname)

    # Ok! Build!
    bundle = Bundle()
    bundle.exit_on_fatal = False

    bundle.clean()
    bundle.pre_prepare()
    bundle.prepare()
    bundle.post_prepare()
    bundle.pre_build()
    bundle.build_db_inserter_codes()
    bundle.post_build()

    self.assertEquals('diEGPXmDC8001', bundle.identity.vid)
    self.assertEquals('source-dataset-subset-variation', bundle.identity.sname)
    self.assertEquals('source-dataset-subset-variation-0.0.1', bundle.identity.vname)
    self.assertEquals('source-dataset-subset-variation-0.0.1~diEGPXmDC8001', bundle.identity.fqname)
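# The identity assertions above imply how names compose: vname is the short
# name (sname) plus the version, and fqname appends '~' and the vid. A
# hypothetical helper reconstructing that scheme from the expected strings;
# this is inferred from the assertions, not the ambry API.
def fqname(sname, version, vid):
    vname = "{}-{}".format(sname, version)  # e.g. source-dataset-subset-variation-0.0.1
    return "{}~{}".format(vname, vid)       # e.g. ...-0.0.1~diEGPXmDC8001

assert (fqname('source-dataset-subset-variation', '0.0.1', 'diEGPXmDC8001')
        == 'source-dataset-subset-variation-0.0.1~diEGPXmDC8001')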