def test_creates_virtual_table_for_simple_fixed_mpr(self):
    # Load the 'simple_fixed' source into an MPR file, expose it to an
    # in-memory SQLite (apsw) database as a virtual table, and verify the
    # row count and the first row.

    # build rows reader
    cache_fs = fsopendir(self.setup_temp_dir())
    sources = self.load_sources()
    spec = sources['simple_fixed']
    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
    mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

    # first make sure file not changed.
    expected_names = ['id', 'uuid', 'int', 'float']
    expected_types = ['int', binary_type.__name__, 'int', 'float']
    self.assertEqual([x['name'] for x in mprows.reader.columns], expected_names)
    self.assertEqual([x['type'] for x in mprows.reader.columns], expected_types)

    connection = apsw.Connection(':memory:')
    table = 'table1'

    # Register the MPR partition as virtual table 'table1'.
    add_partition(connection, mprows, table)

    # check all columns and some rows.
    cursor = connection.cursor()
    query = 'SELECT count(*) FROM {};'.format(table)
    result = cursor.execute(query).fetchall()
    self.assertEqual(result, [(10000, )])

    # Reference first row read directly through the MPR reader.
    with mprows.reader as r:
        expected_first_row = next(iter(r)).row

    # query by columns.
    query = 'SELECT id, uuid, int, float FROM {} LIMIT 1;'.format(table)
    result = cursor.execute(query).fetchall()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0], expected_first_row)
def test_row_intuit(self): """Check that the sources can be loaded and analyzed without exceptions and that the guesses for headers and start are as expected""" from ambry_sources.intuit import RowIntuiter cache_fs = fsopendir('temp://') # cache_fs = fsopendir('/tmp/ritest/') sources = self.load_sources('sources-non-std-headers.csv') for source_name, spec in sources.items(): s = get_source(spec, cache_fs, callback=lambda x, y: (x, y)) rows = list(s) l = len(rows) # the files are short, so the head and tail overlap ri = RowIntuiter(debug=False).run(rows[:int(l * .75)], rows[int(l * .25):], len(rows)) print source_name, ri.start_line, ri.header_lines self.assertEqual( spec.expect_headers, ','.join(str(e) for e in ri.header_lines), 'Headers of {} source does not match to row intuiter'.format( spec.name)) self.assertEqual( spec.expect_start, ri.start_line, 'Start line of {} source does not match to row intuiter start line.' .format(spec.name))
def test_created_source_has_zip_filesystem(self):
    """A zip-archive source must be backed by a ZipFS file storage."""
    # FIXME: Optimize to use local file instead of downloading it all the time.
    cache_fs = fsopendir(self.setup_temp_dir())
    spec = self.load_sources(file_name='geo_sources.csv')['community_plan']
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
    self.assertIsInstance(source._fstor._fs, ZipFS)
def test_ctor(self): d = '/tmp/socrata' from os import makedirs from os.path import exists from shutil import rmtree if exists(d): print "Make", d rmtree(d) makedirs(d) cache_fs = fsopendir(d) # fsopendir(self.setup_temp_dir()) sources = self.load_sources(file_name='sources.csv') spec = sources['facilities'] source = get_source(spec, cache_fs) def cb(*args): print args mpr = MPRowsFile(cache_fs, spec.name).load_rows(source, callback = cb, limit = 10)
def __iter__(self):
    # Yield rows of a downloaded zip source, normalizing over-long rows:
    # any row wider than the header has its extra trailing fields joined
    # (comma-separated) into the last column so every row matches the
    # header width.
    import unicodecsv as csv
    from contextlib import closing
    from ambry_sources import get_source
    from ambry.bundle.process import call_interval

    @call_interval(5)
    def progress(read_len, total_len):
        # Rate-limited (every 5s) download progress logging.
        self._bundle.log('Downloading {}: {}'.format(self._source.url, total_len))

    spec = self._source.spec
    spec.urltype = 'zip'  # force the downloader to treat the URL as a zip archive

    s = get_source(spec, self._bundle.library.download_cache,
                   account_accessor=self._bundle.library.account_accessor,
                   callback=progress)

    # NOTE(review): computed but never used below — confirm whether the
    # source handles decoding itself, or this was meant to be passed along.
    encoding = self._source.spec.encoding or 'utf8'

    header = None
    header_len = None

    for i, row in enumerate(s):
        if i == 0:
            # First row is the header; remember its width as the target.
            header = row
            header_len = len(row)

        if len(row) > header_len:
            # Fold every field beyond the header width into the last column.
            head = list(row)[:header_len - 1]
            tail = [str(e) for e in row[header_len - 1:]]
            row = head + [','.join(tail)]

        yield row
def test_all(self):
    """ Test all sources from geo_sources.csv """
    cache_fs = fsopendir(self.setup_temp_dir())

    for name, spec in self.load_sources(file_name='geo_sources.csv').items():
        if name == 'highways':
            # it is already tested. Skip.
            continue

        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # now check its load to MPRows
        mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)
        first_row = next(iter(mpr.reader))

        # Are columns recognized properly?
        NAME_INDEX = 1  # which element of the column description contains name.
        # Collect all names from column descriptors. Skip first elem of the schema because
        # it's descriptor of column descriptor elements.
        column_names = [descr[NAME_INDEX] for descr in mpr.meta['schema'][1:]]
        self.assertIn('id', column_names)
        self.assertIn('geometry', column_names)

        # Is first row valid?
        self.assertEqual(len(column_names), len(first_row))
def test_type_intuit(self):
    from ambry_sources.intuit import TypeIntuiter

    cache_fs = fsopendir(self.setup_temp_dir())
    spec = self.sources['simple_fixed']
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    mpr = MPRowsFile(cache_fs, spec.name)

    # Write rows, intuit types from the stored rows, then persist them.
    with mpr.writer as w:
        w.load_rows(source)
    with mpr.reader as r:
        intuiter = TypeIntuiter().process_header(r.headers).run(r.rows, r.n_rows)
    with mpr.writer as w:
        w.set_types(intuiter)

    with mpr.reader as r:
        columns = [(c.pos, c.name, c.type) for c in r.columns]

    self.assertEqual(
        columns,
        [(1, u'id', u'int'), (2, u'uuid', u'str'),
         (3, u'int', u'int'), (4, u'float', u'float')])
def test_type_intuit(self):
    from ambry_sources.intuit import TypeIntuiter

    cache_fs = fsopendir(self.setup_temp_dir())
    spec = self.sources['simple_fixed']
    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    partition = MPRowsFile(cache_fs, spec.name)

    # Load rows first, then run type intuition over the stored rows and
    # write the resulting types back.
    with partition.writer as writer:
        writer.load_rows(s)
    with partition.reader as reader:
        ti = TypeIntuiter().process_header(reader.headers).run(reader.rows, reader.n_rows)
    with partition.writer as writer:
        writer.set_types(ti)

    expected = [
        (1, u'id', u'int'),
        (2, u'uuid', u'str'),
        (3, u'int', u'int'),
        (4, u'float', u'float'),
    ]
    with partition.reader as reader:
        actual = [(col.pos, col.name, col.type) for col in reader.columns]
    self.assertEqual(actual, expected)
def test_row_intuit(self): """Check that the sources can be loaded and analyzed without exceptions and that the guesses for headers and start are as expected""" from ambry_sources.intuit import RowIntuiter cache_fs = fsopendir('temp://') # cache_fs = fsopendir('/tmp/ritest/') sources = self.load_sources('sources-non-std-headers.csv') for source_name, spec in sources.items(): s = get_source(spec, cache_fs, callback=lambda x, y: (x, y)) rows = list(s) l = len(rows) # the files are short, so the head and tail overlap ri = RowIntuiter(debug=False).run(rows[:int(l*.75)], rows[int(l*.25):], len(rows)) print source_name, ri.start_line, ri.header_lines self.assertEqual( spec.expect_headers, ','.join(str(e) for e in ri.header_lines), 'Headers of {} source does not match to row intuiter'.format(spec.name)) self.assertEqual( spec.expect_start, ri.start_line, 'Start line of {} source does not match to row intuiter start line.'.format(spec.name))
def test_ctor(self): d = '/tmp/socrata' from os import makedirs from os.path import exists from shutil import rmtree if exists(d): print "Make", d rmtree(d) makedirs(d) cache_fs = fsopendir(d) # fsopendir(self.setup_temp_dir()) sources = self.load_sources(file_name='sources.csv') spec = sources['facilities'] source = get_source(spec, cache_fs) def cb(*args): print args mpr = MPRowsFile(cache_fs, spec.name).load_rows(source, callback=cb, limit=10)
def test_creates_foreign_data_table_for_simple_fixed_mpr( self, fake_shares): fake_shares.return_value = True # build rows reader cache_fs = fsopendir(self.setup_temp_dir()) sources = self.load_sources() spec = sources['simple_fixed'] s = get_source(spec, cache_fs, callback=lambda x, y: (x, y)) mprows = MPRowsFile(cache_fs, spec.name).load_rows(s) # first make sure file was not changed. expected_names = ['id', 'uuid', 'int', 'float'] expected_types = ['int', binary_type.__name__, 'int', 'float'] self.assertEqual(sorted([x['name'] for x in mprows.reader.columns]), sorted(expected_names)) self.assertEqual(sorted([x['type'] for x in mprows.reader.columns]), sorted(expected_types)) try: # create foreign data table PostgreSQLTestBase._create_postgres_test_db() conn = psycopg2.connect(**PostgreSQLTestBase.pg_test_db_data) try: with conn.cursor() as cursor: # we have to close opened transaction. cursor.execute('COMMIT;') add_partition(cursor, mprows, 'table1') # try to query just added partition foreign data table. with conn.cursor() as cursor: table = 'table1' # count all rows query = 'SELECT count(*) FROM {}.{};'.format( POSTGRES_PARTITION_SCHEMA_NAME, table) cursor.execute(query) result = cursor.fetchall() self.assertEqual(result, [(10000, )]) # check first row cursor.execute( 'SELECT id, uuid, int, float FROM {}.{} LIMIT 1;'. format(POSTGRES_PARTITION_SCHEMA_NAME, table)) result = cursor.fetchall() self.assertEqual(len(result), 1) expected_first_row = (1, 'eb385c36-9298-4427-8925-fe09294dbd', 30, Decimal('99.734691532')) self.assertEqual(result[0], expected_first_row) finally: conn.close() finally: PostgreSQLTestBase._drop_postgres_test_db()
def test_intuit_headers(self):
    """Each headersN fixture intuits the expected start row and header rows."""
    sources = self.load_sources(file_name='sources.csv')

    for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = sources[source_name]
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)

        expected_header_rows = [int(e) for e in spec.expect_headers.split(',')]
        self.assertEqual(spec.expect_start, mpr.info['data_start_row'])
        self.assertEqual(expected_header_rows, mpr.info['header_rows'])
def test_intuit_headers(self):
    """Header-row and start-row intuition matches the fixture specs."""
    sources = self.load_sources(file_name='sources.csv')

    for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = sources[source_name]
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, spec.name).load_rows(s)

        self.assertEqual(spec.expect_start, f.info['data_start_row'])
        self.assertEqual(
            [int(n) for n in spec.expect_headers.split(',')],
            f.info['header_rows'])
def test_creates_foreign_data_table_for_simple_fixed_mpr(self, fake_shares): fake_shares.return_value = True # build rows reader cache_fs = fsopendir(self.setup_temp_dir()) sources = self.load_sources() spec = sources['simple_fixed'] s = get_source(spec, cache_fs, callback=lambda x, y: (x, y)) mprows = MPRowsFile(cache_fs, spec.name).load_rows(s) # first make sure file was not changed. expected_names = ['id', 'uuid', 'int', 'float'] expected_types = ['int', binary_type.__name__, 'int', 'float'] self.assertEqual(sorted([x['name'] for x in mprows.reader.columns]), sorted(expected_names)) self.assertEqual(sorted([x['type'] for x in mprows.reader.columns]), sorted(expected_types)) try: # create foreign data table PostgreSQLTestBase._create_postgres_test_db() conn = psycopg2.connect(**PostgreSQLTestBase.pg_test_db_data) try: with conn.cursor() as cursor: # we have to close opened transaction. cursor.execute('COMMIT;') add_partition(cursor, mprows, 'table1') # try to query just added partition foreign data table. with conn.cursor() as cursor: table = 'table1' # count all rows query = 'SELECT count(*) FROM {}.{};'.format(POSTGRES_PARTITION_SCHEMA_NAME, table) cursor.execute(query) result = cursor.fetchall() self.assertEqual(result, [(10000,)]) # check first row cursor.execute( 'SELECT id, uuid, int, float FROM {}.{} LIMIT 1;' .format(POSTGRES_PARTITION_SCHEMA_NAME, table)) result = cursor.fetchall() self.assertEqual(len(result), 1) expected_first_row = ( 1, 'eb385c36-9298-4427-8925-fe09294dbd', 30, Decimal('99.734691532')) self.assertEqual(result[0], expected_first_row) finally: conn.close() finally: PostgreSQLTestBase._drop_postgres_test_db()
def test_load_check_headers(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    # Expected headers keyed by source name; sources not listed here are
    # only checked for loading without exceptions.
    headers = {
        'mz_with_zip_xl': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'namesu8': [u('origin_english'), u('name_english'), u('origin_native'),
                    u('name_native')],
        'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'simple': [u('id'), u('uuid'), u('int'), u('float')],
        'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
        'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'rpeople': [u('name'), u('size')],
        'rent07': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'renttab': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'multiexcel': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'rent97': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
    }

    for source_name, spec in self.sources.items():
        print(source_name)
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, spec.name)

        # Start from a clean MPR file for each source.
        if f.exists:
            f.remove()

        f.load_rows(s)

        with f.reader as r:
            if spec.name in headers:
                self.assertEqual(headers[spec.name], r.headers)
def test_intuit_footer(self): sources = self.load_sources(file_name='sources.csv') for source_name in ['headers4', 'headers3', 'headers2', 'headers1']: cache_fs = fsopendir(self.setup_temp_dir()) spec = sources[source_name] f = MPRowsFile(cache_fs, spec.name) \ .load_rows(get_source(spec, cache_fs, callback=lambda x, y: (x, y))) with f.reader as r: last = list(r.rows)[-1] # islice isn't working on the reader. print source_name, last self.assertEqual(11999, int(last[0])) self.assertEqual('2q080z003Cg2', last[1])
def test_just_download(self):
    """Just check that all of the sources can be downloaded without exceptions.

    Only the first few rows of each source are consumed; the goal is to
    catch download/parse failures, not to validate content.
    """
    cache_fs = fsopendir('temp://')
    for source_name, spec in self.sources.items():
        try:
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
            for i, row in enumerate(s):
                if i > 10:
                    break
        except Exception as exc:
            # BUG FIX: use spec.url, not s.url — if get_source itself raised,
            # `s` was never bound and the old message died with a NameError
            # that masked the real failure.
            raise AssertionError('Failed to download {} source because of {} error.'
                                 .format(spec.url, exc))
def test_just_download(self):
    """Just check that all of the sources can be downloaded without exceptions.

    Reads only a handful of rows per source; any exception is converted to
    an AssertionError naming the failing source.
    """
    cache_fs = fsopendir('temp://')
    for source_name, spec in self.sources.items():
        try:
            s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
            for i, row in enumerate(s):
                if i > 10:
                    break
        except Exception as exc:
            # BUG FIX: report spec.url instead of s.url — `s` is unbound when
            # get_source() itself raises, so the old code hid the real error
            # behind a NameError.
            raise AssertionError(
                'Failed to download {} source because of {} error.'.format(
                    spec.url, exc))
def test_highways(self):
    # Verify shapefile conversion of the 'highways' source: raw row shape,
    # headers, and the loaded MPR schema/first row.
    # FIXME: Optimize to use local file instead of downloading it all the time.
    cache_fs = fsopendir(self.setup_temp_dir())
    sources = self.load_sources(file_name='geo_sources.csv')
    spec = sources['highways']
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    # first check is it converted properly.
    row_gen = source._get_row_gen()
    first_row = next(row_gen)

    # generates valid first row
    self.assertEqual(len(first_row), 68)
    self.assertEqual(first_row[0], 0)
    # last element is wkt.
    self.assertIn('LINESTRING', first_row[-1])

    # header is valid
    self.assertEqual(len(source._headers), 68)
    self.assertEqual(source._headers[0], 'id')
    self.assertEqual(source._headers[-1], 'geometry')

    # now check its load to MPRows
    mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)

    # Are columns recognized properly?
    NAME_INDEX = 1  # which element of the column description contains name.
    # Collect all names from column descriptors. Skip first elem of the schema because
    # it's descriptor of column descriptor elements.
    columns = [x[NAME_INDEX] for x in mpr.meta['schema'][1:]]
    self.assertIn('id', columns)
    self.assertIn('geometry', columns)
    self.assertIn('length', columns)  # column from shape file.

    # Is first row valid?
    first_row = next(iter(mpr.reader))
    self.assertEqual(len(first_row), 68)
    self.assertEqual(first_row['id'], 0)
    self.assertIn('LINESTRING', first_row['geometry'])

    # NOTE(review): everything below this return is unreachable — the
    # spec.columns assertions never run. Confirm whether they were disabled
    # deliberately or should be re-enabled/removed.
    return

    # spec columns are properly populated
    self.assertEqual(len(spec.columns), 68)
    self.assertEqual(spec.columns[0]['name'], 'id')
    self.assertEqual(spec.columns[-1]['name'], 'geometry')
def test_row_load_intuit(self):
    """Check that the sources can be loaded and analyzed without exceptions
    and that the guesses for headers and start are as expected"""
    from itertools import islice

    cache_fs = fsopendir('temp://')
    cache_fs.makedir('/mpr')
    # cache_fs = fsopendir('/tmp/ritest/')
    sources = self.load_sources('sources-non-std-headers.csv')

    for source_name, spec in sources.items():
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, '/mpr/'+source_name)

        # Start from a clean MPR file for each source.
        if f.exists:
            f.remove()

        # Load without type intuition or stats to test only row intuition.
        f.load_rows(s, intuit_type=False, run_stats=False, limit=500)

        self.assertEqual(f.info['data_start_row'], spec.expect_start)

        with f.reader as r:
            # First row, marked with metadata, that is marked as a data row
            m1, row1 = next(six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))

        with f.reader as r:
            # First row
            row2 = next(r.rows)

        with f.reader as r:
            # First row proxy
            row3 = next(iter(r)).row

        # All three access paths must agree on the first data row.
        self.assertEqual(row1, row2)
        self.assertEqual(row1, row3)

        with f.reader as r:
            raw_rows = list(islice(r.raw, None, 40))

        # Indexing the raw stream at the intuited start row yields the same row.
        self.assertEqual(row2, raw_rows[f.info['data_start_row']])
def test_row_load_intuit(self):
    """Check that the sources can be loaded and analyzed without exceptions
    and that the guesses for headers and start are as expected"""
    from itertools import islice

    cache_fs = fsopendir('temp://')
    cache_fs.makedir('/mpr')
    # cache_fs = fsopendir('/tmp/ritest/')
    sources = self.load_sources('sources-non-std-headers.csv')

    for source_name, spec in sources.items():
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, '/mpr/' + source_name)

        # Start from a clean MPR file for each source.
        if f.exists:
            f.remove()

        # Load without type intuition or stats to exercise row intuition alone.
        f.load_rows(s, intuit_type=False, run_stats=False, limit=500)

        self.assertEqual(f.info['data_start_row'], spec.expect_start)

        with f.reader as r:
            # First row, marked with metadata, that is marked as a data row
            m1, row1 = next(
                six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))

        with f.reader as r:
            # First row
            row2 = next(r.rows)

        with f.reader as r:
            # First row proxy
            row3 = next(iter(r)).row

        # All three access paths must agree on the first data row.
        self.assertEqual(row1, row2)
        self.assertEqual(row1, row3)

        with f.reader as r:
            raw_rows = list(islice(r.raw, None, 40))

        # Indexing the raw stream at the intuited start row yields the same row.
        self.assertEqual(row2, raw_rows[f.info['data_start_row']])
def test_full_load(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    for source_name, spec in self.sources.items():
        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        mpr = MPRowsFile(cache_fs, spec.name)
        if mpr.exists:
            mpr.remove()
        mpr.load_rows(source)

        # A successful load produces at least one header.
        with mpr.reader as reader:
            self.assertTrue(len(reader.headers) > 0)
def test_fixed(self):
    from ambry_sources import head, tail

    cache_fs = fsopendir(self.setup_temp_dir())
    spec = self.sources['simple_fixed']
    assert spec.has_rowspec is False
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    # prepare HDFPartition.
    partition = HDFPartition(cache_fs, spec.name)

    ri = RowIntuiter().run(head(source, 100), tail(source, 100))
    row_spec = self._row_intuiter_to_dict(ri)
    ti = TypeIntuiter().process_header(ri.headers).run(source)

    with partition.writer as writer:
        writer.set_row_spec(row_spec, ri.headers)
        writer.set_types(ti)
    partition.load_rows(source)

    self.assertEqual(partition.headers, ['id', 'uuid', 'int', 'float'])
def test_bad_row_intuition(self):
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec

    cache_fs = fsopendir('temp://')
    spec = SourceSpec(
        'http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
        name='simple')
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    mpr = MPRowsFile(cache_fs, spec.name)
    if mpr.exists:
        mpr.remove()
    mpr.load_rows(source)

    # The intuiter must report the correct data end row for this source.
    self.assertEqual(10001, mpr.reader.info['data_end_row'])
def test_stats(self): """Check that the sources can be loaded and analyzed without exceptions and that the guesses for headers and start are as expected""" #cache_fs = fsopendir('temp://') from shutil import rmtree from os import makedirs tp = '/tmp/mpr-test' rmtree(tp, ignore_errors=True) makedirs(tp) cache_fs = fsopendir(tp) s = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y)) f = MPRowsFile(cache_fs, s.spec.name).load_rows(s, run_stats=True) stat_names = ('count', 'min', 'mean', 'max', 'nuniques') vals = { u('str_a'): (30, None, None, None, 10), u('str_b'): (30, None, None, None, 10), u('float_a'): (30, 1.0, 5.5, 10.0, 10), u('float_b'): (30, 1.1, 5.5, 9.9, 10), u('float_c'): (30, None, None, None, 10), u('int_b'): (30, None, None, None, 10), u('int_a'): (30, 1.0, 5.5, 10.0, 10) } with f.reader as r: for col in r.columns: stats = (col.stat_count, col.min, round(col.mean, 1) if col.mean else None, col.max, col.nuniques) for a, b, stat_name in zip(vals[col.name], stats, stat_names): self.assertEqual( a, b, "{} failed for stat {}: {} != {}".format( col.name, stat_name, a, b))
def test_stats(self):
    """Check that the sources can be loaded and analyzed without exceptions
    and that the guesses for headers and start are as expected"""
    from ambry_sources import head, tail

    cache_fs = fsopendir('temp://')
    source = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y))
    f = HDFPartition(cache_fs, source.spec.name)

    # Intuit the row spec and column types, and write them, before loading
    # the rows with stats enabled.
    with f.writer as w:
        ri = RowIntuiter().run(head(source, 100), tail(source, 100))
        row_spec = self._row_intuiter_to_dict(ri)
        ti = TypeIntuiter().process_header(ri.headers).run(source)
        w.set_row_spec(row_spec, ri.headers)
        w.set_types(ti)

    f.load_rows(source, run_stats=True)

    # Expected (count, min, mean, max, nuniques) per column.
    expected = {
        u('str_a'): (30, None, None, None, 10),
        u('str_b'): (30, None, None, None, 10),
        u('float_a'): (30, 1.0, 5.5, 10.0, 10),
        u('float_b'): (30, 1.1, 5.5, 9.9, 10),
        u('float_c'): (30, 1.1, 5.5, 9.9, 10),
        u('int_b'): (30, 1.0, 5.0, 9.0, 10),
        u('int_a'): (30, 1.0, 5.5, 10.0, 10)}

    with f.reader as r:
        for col in r.columns:
            stats = (col.stat_count, col.min,
                     round(col.mean, 1) if col.mean else None,
                     col.max, col.nuniques)
            for a, b in zip(expected[col.name], stats):
                self.assertEqual(
                    a, b,
                    'Saved stat ({}) does not match to expected ({}) for {}'.format(a, b, col.name))
def test_stats(self): """Check that the sources can be loaded and analyzed without exceptions and that the guesses for headers and start are as expected""" #cache_fs = fsopendir('temp://') from shutil import rmtree from os import makedirs tp = '/tmp/mpr-test' rmtree(tp, ignore_errors=True) makedirs(tp) cache_fs = fsopendir(tp) s = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y)) f = MPRowsFile(cache_fs, s.spec.name).load_rows(s, run_stats=True) stat_names = ('count', 'min', 'mean', 'max', 'nuniques') vals = {u('str_a'): (30, None, None, None, 10), u('str_b'): (30, None, None, None, 10), u('float_a'): (30, 1.0, 5.5, 10.0, 10), u('float_b'): (30, 1.1, 5.5, 9.9, 10), u('float_c'): (30, None, None, None, 10), u('int_b'): (30, None, None, None, 10), u('int_a'): (30, 1.0, 5.5, 10.0, 10)} with f.reader as r: for col in r.columns: stats = (col.stat_count, col.min, round(col.mean, 1) if col.mean else None, col.max, col.nuniques) for a, b, stat_name in zip(vals[col.name], stats, stat_names): self.assertEqual(a, b, "{} failed for stat {}: {} != {}".format(col.name, stat_name, a, b))
def test_fixed(self):
    """Loading the fixed-width source yields the expected headers."""
    cache_fs = fsopendir(self.setup_temp_dir())
    spec = self.sources['simple_fixed']
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
    mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)
    self.assertEqual(mpr.headers, ['id', 'uuid', 'int', 'float'])
def __iter__(self):
    # Yield a mangled header row, then data rows built by interleaving
    # value/margin columns from paired sources, prefixed by the header
    # (stusab, logrecno, etc.) columns.
    from ambry_sources import get_source
    from itertools import izip, chain
    from ambry.etl import Slice
    from ambry.orm import Column
    from ambry.bundle.process import CallInterval

    table = self.source.dest_table
    if isinstance(table, str):
        # Resolve a table name to the table object.
        table = self.table(table)

    start = int(table.data["start"])
    length = int(table.data["length"])

    slca_str = ",".join(str(e[4]) for e in self.header_cols)
    slcb_str = "{}:{}".format(start - 1, start + length - 1)

    # Slice for the stusab, logrecno, etc.
    slca, slc_code = Slice.make_slicer(slca_str)
    # Slice for the data columns
    slcb, slc_code = Slice.make_slicer(slcb_str)

    columns = [c.name for c in table.columns]

    # Columns before the first data column, by removing the
    # data columns, which are presumed to all be at the end.
    # (2 * len(...) because each data column has a paired margin column.)
    preamble_cols = columns[: -2 * len(slcb(range(1, 300)))]
    data_columns = columns[len(preamble_cols) :]
    header_cols = [e[0] for e in self.header_cols]

    # A few sanity checks
    assert preamble_cols[-1] == "jam_flags"
    assert data_columns[0][-3:] == "001"
    assert data_columns[1][-3:] == "m90"

    # First yielded row is the full header, with names mangled to valid
    # column identifiers.
    all_cols = [Column.mangle_name(c) for c in header_cols + data_columns]
    yield all_cols

    def progress(read_len, total_len, source_name):
        # Rate-limited download progress logging.
        self.bundle.log("Downloading {}; {} bytes".format(source_name, total_len))

    cache = self.library.download_cache

    row_n = 0
    for spec1, spec2 in self.generate_source_specs():
        s1 = get_source(spec1, cache, callback=CallInterval(progress, 10, source_name=spec1.url))
        # NOTE(review): s2's progress callback also reports spec1.url —
        # confirm whether spec2.url was intended here.
        s2 = get_source(spec2, cache, callback=CallInterval(progress, 10, source_name=spec1.url))

        for i, (row1, row2) in enumerate(izip(s1, s2)):
            # Interleave the slices of the of the data rows, prepend
            # the stusab, logrecno, etc.
            row_n += 1
            if self.limited_run and row_n > 10000:
                return

            yield slca(row1) + tuple(chain(*zip(slcb(row1), slcb(row2))))
def test_bad_row_intuition(self):
    # Load a fixed-width census geofile with an explicit column spec and
    # verify the intuited data end row.
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec, ColumnSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec('http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
                      file='g2009.*\.txt',
                      filetype='fixed',
                      name='geofile',
                      encoding='latin1',
                      )

    # Fixed-width layout: (position, width, name, 1-based start column).
    spec.columns = [
        ColumnSpec(position=1, width=6, name='fileid', start=1),
        ColumnSpec(position=2, width=2, name='stusab', start=7),
        ColumnSpec(position=3, width=3, name='sumlevel', start=9),
        ColumnSpec(position=4, width=2, name='component', start=12),
        ColumnSpec(position=5, width=7, name='logrecno', start=14),
        ColumnSpec(position=6, width=1, name='us', start=21),
        ColumnSpec(position=7, width=1, name='region', start=22),
        ColumnSpec(position=8, width=1, name='division', start=23),
        ColumnSpec(position=9, width=2, name='statece', start=24),
        ColumnSpec(position=10, width=2, name='state', start=26),
        ColumnSpec(position=11, width=3, name='county', start=28),
        ColumnSpec(position=12, width=5, name='cousub', start=31),
        ColumnSpec(position=13, width=5, name='place', start=36),
        ColumnSpec(position=14, width=6, name='tract', start=41),
        ColumnSpec(position=15, width=1, name='blkgrp', start=47),
        ColumnSpec(position=16, width=5, name='concit', start=48),
        ColumnSpec(position=17, width=4, name='aianhh', start=53),
        ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
        ColumnSpec(position=19, width=1, name='aihhtli', start=62),
        ColumnSpec(position=20, width=3, name='aitsce', start=63),
        ColumnSpec(position=21, width=5, name='aits', start=66),
        ColumnSpec(position=22, width=5, name='anrc', start=71),
        ColumnSpec(position=23, width=5, name='cbsa', start=76),
        ColumnSpec(position=24, width=3, name='csa', start=81),
        ColumnSpec(position=25, width=5, name='metdiv', start=84),
        ColumnSpec(position=26, width=1, name='macc', start=89),
        ColumnSpec(position=27, width=1, name='memi', start=90),
        ColumnSpec(position=28, width=5, name='necta', start=91),
        ColumnSpec(position=29, width=3, name='cnecta', start=96),
        ColumnSpec(position=30, width=5, name='nectadiv', start=99),
        ColumnSpec(position=31, width=5, name='ua', start=104),
        ColumnSpec(position=33, width=2, name='cdcurr', start=114),
        ColumnSpec(position=34, width=3, name='sldu', start=116),
        ColumnSpec(position=35, width=3, name='sldl', start=119),
        ColumnSpec(position=39, width=5, name='submcd', start=136),
        ColumnSpec(position=40, width=5, name='sdelm', start=141),
        ColumnSpec(position=41, width=5, name='sdsec', start=146),
        ColumnSpec(position=42, width=5, name='sduni', start=151),
        ColumnSpec(position=43, width=1, name='ur', start=156),
        ColumnSpec(position=44, width=1, name='pci', start=157),
        ColumnSpec(position=47, width=5, name='puma5', start=169),
        ColumnSpec(position=49, width=40, name='geoid', start=179),
        ColumnSpec(position=50, width=200, name='name', start=219)]

    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, spec.name)

    # Start from a clean MPR file.
    if f.exists:
        f.remove()

    f.load_rows(s)

    self.assertEqual(119, f.reader.info['data_end_row'])
def test_load_and_headers(self):
    """ Just checks that all of the sources can be loaded without exceptions. """
    from ambry_sources import head, tail

    cache_fs = fsopendir('temp://')

    # Expected headers keyed by source name; sources not listed here are
    # only checked for loading without exceptions.
    source_headers = {
        'mz_with_zip_xl': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'namesu8': [u('origin_english'), u('name_english'), u('origin_native'),
                    u('name_native')],
        'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'simple': [u('id'), u('uuid'), u('int'), u('float')],
        'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
        'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'rpeople': [u('name'), u('size')],
        'rent07': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'renttab': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'multiexcel': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'rent97': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
    }

    for source_name, spec in self.sources.items():
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = HDFPartition(cache_fs, spec.name)

        # Start from a clean partition for each source.
        if f.exists:
            f.remove()

        # FIXME: This is really complicated setup for HDFPartition file. Try to simplify.
        with f.writer as w:
            if spec.has_rowspec:
                # The spec already describes rows; use it directly.
                row_spec = self._spec_to_dict(spec)
                headers = self._get_headers(s, spec)
                ti = TypeIntuiter().process_header(headers).run(s)
                w.set_row_spec(row_spec, headers)
                w.set_types(ti)
            else:
                # No row spec: intuit rows from the head/tail of the source.
                ri = RowIntuiter().run(head(s, 20), tail(s, 20), w.n_rows)
                row_spec = self._row_intuiter_to_dict(ri)
                ti = TypeIntuiter().process_header(ri.headers).run(s)
                w.set_row_spec(row_spec, ri.headers)
                w.set_types(ti)

        f.load_rows(s)

        with f.reader as r:
            if spec.name in source_headers:
                self.assertEqual(source_headers[spec.name], r.headers)
def test_bad_row_intuition(self):
    # Load a fixed-width census geofile with an explicit column spec and
    # verify the intuited data end row.
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec, ColumnSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec(
        'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
        file='g2009.*\.txt',
        filetype='fixed',
        name='geofile',
        encoding='latin1',
    )

    # Fixed-width layout: (position, width, name, 1-based start column).
    spec.columns = [
        ColumnSpec(position=1, width=6, name='fileid', start=1),
        ColumnSpec(position=2, width=2, name='stusab', start=7),
        ColumnSpec(position=3, width=3, name='sumlevel', start=9),
        ColumnSpec(position=4, width=2, name='component', start=12),
        ColumnSpec(position=5, width=7, name='logrecno', start=14),
        ColumnSpec(position=6, width=1, name='us', start=21),
        ColumnSpec(position=7, width=1, name='region', start=22),
        ColumnSpec(position=8, width=1, name='division', start=23),
        ColumnSpec(position=9, width=2, name='statece', start=24),
        ColumnSpec(position=10, width=2, name='state', start=26),
        ColumnSpec(position=11, width=3, name='county', start=28),
        ColumnSpec(position=12, width=5, name='cousub', start=31),
        ColumnSpec(position=13, width=5, name='place', start=36),
        ColumnSpec(position=14, width=6, name='tract', start=41),
        ColumnSpec(position=15, width=1, name='blkgrp', start=47),
        ColumnSpec(position=16, width=5, name='concit', start=48),
        ColumnSpec(position=17, width=4, name='aianhh', start=53),
        ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
        ColumnSpec(position=19, width=1, name='aihhtli', start=62),
        ColumnSpec(position=20, width=3, name='aitsce', start=63),
        ColumnSpec(position=21, width=5, name='aits', start=66),
        ColumnSpec(position=22, width=5, name='anrc', start=71),
        ColumnSpec(position=23, width=5, name='cbsa', start=76),
        ColumnSpec(position=24, width=3, name='csa', start=81),
        ColumnSpec(position=25, width=5, name='metdiv', start=84),
        ColumnSpec(position=26, width=1, name='macc', start=89),
        ColumnSpec(position=27, width=1, name='memi', start=90),
        ColumnSpec(position=28, width=5, name='necta', start=91),
        ColumnSpec(position=29, width=3, name='cnecta', start=96),
        ColumnSpec(position=30, width=5, name='nectadiv', start=99),
        ColumnSpec(position=31, width=5, name='ua', start=104),
        ColumnSpec(position=33, width=2, name='cdcurr', start=114),
        ColumnSpec(position=34, width=3, name='sldu', start=116),
        ColumnSpec(position=35, width=3, name='sldl', start=119),
        ColumnSpec(position=39, width=5, name='submcd', start=136),
        ColumnSpec(position=40, width=5, name='sdelm', start=141),
        ColumnSpec(position=41, width=5, name='sdsec', start=146),
        ColumnSpec(position=42, width=5, name='sduni', start=151),
        ColumnSpec(position=43, width=1, name='ur', start=156),
        ColumnSpec(position=44, width=1, name='pci', start=157),
        ColumnSpec(position=47, width=5, name='puma5', start=169),
        ColumnSpec(position=49, width=40, name='geoid', start=179),
        ColumnSpec(position=50, width=200, name='name', start=219)
    ]

    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, spec.name)

    # Start from a clean MPR file.
    if f.exists:
        f.remove()

    f.load_rows(s)

    self.assertEqual(119, f.reader.info['data_end_row'])
def test_load_check_headers(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    # Expected headers keyed by source name; sources not listed here are
    # only checked for loading without exceptions.
    headers = {
        'mz_with_zip_xl': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')
        ],
        'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'namesu8': [
            u('origin_english'), u('name_english'), u('origin_native'),
            u('name_native')
        ],
        'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'simple': [u('id'), u('uuid'), u('int'), u('float')],
        'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
        'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'rpeople': [u('name'), u('size')],
        'rent07': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')
        ],
        'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')
        ],
        'renttab': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')
        ],
        'multiexcel': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')
        ],
        'rent97': [
            u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
            u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')
        ]
    }

    for source_name, spec in self.sources.items():
        print(source_name)
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
        f = MPRowsFile(cache_fs, spec.name)

        # Start from a clean MPR file for each source.
        if f.exists:
            f.remove()

        f.load_rows(s)

        with f.reader as r:
            if spec.name in headers:
                self.assertEqual(headers[spec.name], r.headers)