def test_generator(self):
    from ambry_sources.sources import GeneratorSource, SourceSpec

    cache_fs = fsopendir(self.setup_temp_dir())

    def gen():
        yield list('abcde')

        for i in range(10):
            yield [i, i + 1, i + 2, i + 3, i + 4]

    f = MPRowsFile(cache_fs, 'foobar').load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

    self.assertEqual(1, f.info['data_start_row'])
    self.assertEqual(11, f.info['data_end_row'])
    self.assertEqual([0], f.info['header_rows'])
    self.assertEqual(f.headers, list('abcde'))

    rows = list(f.select())
    self.assertEqual(len(rows), 10)
    self.assertEqual(sorted(rows[0].keys()), sorted(list('abcde')))

    self.assertTrue(f.is_finalized)
def test_creates_virtual_tables_for_partition_with_segment_without_errors(self):
    fs = fsopendir('temp://')

    def gen():
        # generate header
        yield ['col1', 'col2']

        # generate rows
        yield [0, 0]
        yield [1, 1]

    mprows = MPRowsFile(fs, 'example.com/simple-0.1.3/1.mpr')
    mprows.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

    # create virtual tables. This should not raise an error.
    #
    connection = apsw.Connection(':memory:')
    try:
        add_partition(connection, mprows, 'vid1')
    except Exception as exc:
        raise AssertionError('partition adding unexpectedly failed with {} error.'.format(exc))

    # check selected rows
    #
    cursor = connection.cursor()
    result = cursor.execute('SELECT * FROM {}'.format('vid1')).fetchall()
    self.assertEqual(result, [(0, 0), (1, 1)])
def test_spec_load(self):
    """Test that setting a SourceSpec properly sets the header_lines and data start position"""
    from ambry_sources.sources import SourceSpec
    import string

    rs = string.ascii_letters

    n = 500
    rows, headers = self.generate_rows(n)

    blank = ['' for e in rows[0]]

    # Append a complex header, to give the RowIntuiter something to do.
    rows = [
        ['Dataset Title'] + blank[1:],
        blank,
        blank,
        [rs[i] for i, e in enumerate(rows[0])],
        [rs[i + 1] for i, e in enumerate(rows[0])],
        [rs[i + 2] for i, e in enumerate(rows[0])],
    ] + rows

    f = MPRowsFile('mem://frh').load_rows(rows)

    d = f.info

    self.assertEqual(6, d['data_start_row'])
    self.assertEqual(506, d['data_end_row'])
    self.assertEqual([3, 4, 5], d['header_rows'])
    self.assertEqual(
        [u('a_b_c'), u('b_c_d'), u('c_d_e'), u('d_e_f'), u('e_f_g'), u('f_g_h')],
        d['headers'])

    class Rows(object):
        spec = SourceSpec(None, header_lines=(3, 4), start_line=5)

        def __iter__(self):
            return iter(rows)

    f = MPRowsFile('mem://frh').load_rows(Rows())

    d = f.info

    self.assertEqual(5, d['data_start_row'])
    self.assertEqual(506, d['data_end_row'])
    self.assertEqual([3, 4], d['header_rows'])
    self.assertEqual(
        [u('a_b'), u('b_c'), u('c_d'), u('d_e'), u('e_f'), u('f_g')],
        d['headers'])
def write_large_blocks():
    df = MPRowsFile(fs, 'foobar')

    if df.exists:
        df.remove()

    with Timer() as t, df.writer as w:
        w.headers = headers
        w.insert_rows(rows)

    print('MSGPack write L', float(N) / t.elapsed, w.n_rows)
def write_small_blocks():
    df = MPRowsFile(fs, 'foobar')

    if df.exists:
        df.remove()

    with Timer() as t, df.writer as w:
        for i in range(N):
            w.headers = headers
            w.insert_row(rows[i])

    print('MSGPack write S', float(N) / t.elapsed, w.n_rows)
def test_load_check_headers(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    headers = {
        'mz_with_zip_xl': [u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                           u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'mz_no_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'namesu8': [u('origin_english'), u('name_english'), u('origin_native'), u('name_native')],
        'sf_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'simple': [u('id'), u('uuid'), u('int'), u('float')],
        'csv_no_csv': [u('id'), u('uuid'), u('int'), u('float')],
        'mz_with_zip': [u('id'), u('uuid'), u('int'), u('float')],
        'rpeople': [u('name'), u('size')],
        'rent07': [u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                   u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'simple_fixed': [u('id'), u('uuid'), u('int'), u('float')],
        'altname': [u('id'), u('foo'), u('bar'), u('baz')],
        'rentcsv': [u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                    u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'renttab': [u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                    u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'multiexcel': [u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                       u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')],
        'rent97': [u('id'), u('gvid'), u('renter_cost_gt_30'), u('renter_cost_gt_30_cv'),
                   u('owner_cost_gt_30_pct'), u('owner_cost_gt_30_pct_cv')]
    }

    for source_name, spec in self.sources.items():
        print(source_name)
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        with f.reader as r:
            if spec.name in headers:
                self.assertEqual(headers[spec.name], r.headers)
def test_creates_virtual_table_for_simple_fixed_mpr(self):
    # build rows reader
    cache_fs = fsopendir(self.setup_temp_dir())
    sources = self.load_sources()
    spec = sources['simple_fixed']
    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
    mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

    # first make sure the file has not changed.
    expected_names = ['id', 'uuid', 'int', 'float']
    expected_types = ['int', binary_type.__name__, 'int', 'float']
    self.assertEqual([x['name'] for x in mprows.reader.columns], expected_names)
    self.assertEqual([x['type'] for x in mprows.reader.columns], expected_types)

    connection = apsw.Connection(':memory:')
    table = 'table1'

    add_partition(connection, mprows, table)

    # check all columns and some rows.
    cursor = connection.cursor()
    query = 'SELECT count(*) FROM {};'.format(table)
    result = cursor.execute(query).fetchall()
    self.assertEqual(result, [(10000,)])

    with mprows.reader as r:
        expected_first_row = next(iter(r)).row

    # query by columns.
    query = 'SELECT id, uuid, int, float FROM {} LIMIT 1;'.format(table)
    result = cursor.execute(query).fetchall()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0], expected_first_row)
def Create(self, db, modulename, dbname, tablename,  # These args are required by APSW
           mpr_url, *args):  # These are our args.
    mprows = MPRowsFile(mpr_url)

    columns_types = []
    column_names = []

    for column in sorted(mprows.reader.columns, key=lambda x: x['pos']):
        sqlite_type = TYPE_MAP.get(column['type'])
        if not sqlite_type:
            raise Exception('Do not know how to convert {} to a sql column.'.format(column['type']))

        columns_types.append('"{}" {}'.format(column['name'], sqlite_type))
        column_names.append(column['name'])

    columns_types_str = ',\n'.join(columns_types)
    schema = 'CREATE TABLE {}({});'.format(tablename, columns_types_str)

    return schema, Table(column_names, mprows)
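# A minimal sketch of how a Create method like the one above is exercised through APSW,
# assuming the enclosing module class is instantiated and registered; `module_source` and
# 'mod_partition' are illustrative names, not necessarily the project's actual ones. APSW
# invokes Create when a CREATE VIRTUAL TABLE statement names the registered module, and
# expects a (schema_sql, table_object) pair back.
def _example_register_module(module_source, mpr_url):
    connection = apsw.Connection(':memory:')

    # Register the virtual table module; APSW routes Create/Connect calls to it.
    connection.createmodule('mod_partition', module_source)

    # The module argument is passed through to Create as `mpr_url`.
    cursor = connection.cursor()
    cursor.execute("CREATE VIRTUAL TABLE vid1 USING mod_partition('{}');".format(mpr_url))

    return connection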
def first_row_header(data_start_row=None, data_end_row=None):
    # Normal Headers
    f = MPRowsFile('mem://frh')
    w = f.writer

    w.columns = headers

    for row in rows:
        w.insert_row(row)

    if data_start_row is not None:
        w.data_start_row = data_start_row

    if data_end_row is not None:
        w.data_end_row = data_end_row

    w.close()

    self.assertEqual((u('a'), u('b'), u('c'), u('d'), u('e'), u('f')),
                     tuple(w.parent.reader.headers))

    w.parent.reader.close()

    return f
def test_ctor(self):
    d = '/tmp/socrata'

    from os import makedirs
    from os.path import exists
    from shutil import rmtree

    if exists(d):
        print('Make', d)
        rmtree(d)

    makedirs(d)

    cache_fs = fsopendir(d)  # fsopendir(self.setup_temp_dir())

    sources = self.load_sources(file_name='sources.csv')
    spec = sources['facilities']
    source = get_source(spec, cache_fs)

    def cb(*args):
        print(args)

    mpr = MPRowsFile(cache_fs, spec.name).load_rows(source, callback=cb, limit=10)
def test_type_intuit(self):
    from ambry_sources.intuit import TypeIntuiter

    cache_fs = fsopendir(self.setup_temp_dir())
    spec = self.sources['simple_fixed']
    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, spec.name)

    with f.writer as w:
        w.load_rows(s)

    with f.reader as r:
        ti = TypeIntuiter().process_header(r.headers).run(r.rows, r.n_rows)

    with f.writer as w:
        w.set_types(ti)

    columns = []
    with f.reader as r:
        for col in r.columns:
            columns.append((col.pos, col.name, col.type))

    expected_columns = [
        (1, u'id', u'int'),
        (2, u'uuid', u'str'),
        (3, u'int', u'int'),
        (4, u'float', u'float'),
    ]
    self.assertEqual(columns, expected_columns)
def schema_header(data_start_row=None, data_end_row=None):
    # Set the schema
    f = MPRowsFile('mem://sh')
    w = f.writer

    w.headers = ['x' + str(e) for e in range(len(headers))]

    for row in rows:
        w.insert_row(row)

    if data_start_row is not None:
        w.data_start_row = data_start_row

    if data_end_row is not None:
        w.data_end_row = data_end_row

    w.close()

    self.assertEqual((u('x0'), u('x1'), u('x2'), u('x3'), u('x4'), u('x5')),
                     tuple(w.parent.reader.headers))

    w.parent.reader.close()

    return f
def test_all(self):
    """ Test all sources from geo_sources.csv """
    cache_fs = fsopendir(self.setup_temp_dir())
    sources = self.load_sources(file_name='geo_sources.csv')

    for name, spec in sources.items():
        if name == 'highways':
            # it is already tested. Skip.
            continue

        source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        # now check its load to MPRows
        mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)
        first_row = next(iter(mpr.reader))

        # Are columns recognized properly?
        NAME_INDEX = 1  # which element of the column description contains the name.
        # Collect all names from the column descriptors. Skip the first element of the schema
        # because it is the descriptor of the column descriptor elements.
        columns = [x[NAME_INDEX] for x in mpr.meta['schema'][1:]]
        self.assertIn('id', columns)
        self.assertIn('geometry', columns)

        # Is the first row valid?
        self.assertEqual(len(columns), len(first_row))
def test_row_load_intuit(self):
    """Check that the sources can be loaded and analyzed without exceptions and that the
    guesses for headers and start are as expected"""
    from itertools import islice

    cache_fs = fsopendir('temp://')
    cache_fs.makedir('/mpr')
    # cache_fs = fsopendir('/tmp/ritest/')
    sources = self.load_sources('sources-non-std-headers.csv')

    for source_name, spec in sources.items():
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, '/mpr/' + source_name)

        if f.exists:
            f.remove()

        f.load_rows(s, intuit_type=False, run_stats=False, limit=500)

        self.assertEqual(f.info['data_start_row'], spec.expect_start)

        with f.reader as r:
            # First row that the metadata marks as a data row
            m1, row1 = next(six.moves.filter(lambda e: e[0][2] == 'D', r.meta_raw))

        with f.reader as r:
            # First row
            row2 = next(r.rows)

        with f.reader as r:
            # First row proxy
            row3 = next(iter(r)).row

        self.assertEqual(row1, row2)
        self.assertEqual(row1, row3)

        with f.reader as r:
            raw_rows = list(islice(r.raw, None, 40))

        self.assertEqual(row2, raw_rows[f.info['data_start_row']])
def test_selects_correct_rows_from_many_mprows(self):
    fs = fsopendir('temp://')
    header = ['col1', 'col2']

    # create 3 mprows files.
    #
    rows1 = [(0, 0), (1, 1)]
    mprows1 = MPRowsFile(fs, 'vid1')
    mprows1.load_rows(self._get_generator_source(header, rows1))

    rows2 = [(2, 2), (3, 3)]
    mprows2 = MPRowsFile(fs, 'vid2')
    mprows2.load_rows(self._get_generator_source(header, rows2))

    rows3 = [(4, 4), (5, 5)]
    mprows3 = MPRowsFile(fs, 'vid3')
    mprows3.load_rows(self._get_generator_source(header, rows3))

    # create virtual tables for all mprows
    #
    connection = apsw.Connection(':memory:')
    add_partition(connection, mprows1, 'vid1')
    add_partition(connection, mprows2, 'vid2')
    add_partition(connection, mprows3, 'vid3')

    # check rows of all added mprows.
    #
    cursor = connection.cursor()
    query_tmpl = 'SELECT * FROM {};'

    # check rows of the first mprows file.
    #
    query = query_tmpl.format('vid1')
    result = cursor.execute(query).fetchall()
    self.assertEqual(result, rows1)

    # check rows of the second mprows file.
    #
    query = query_tmpl.format('vid2')
    result = cursor.execute(query).fetchall()
    self.assertEqual(result, rows2)

    # check rows of the third mprows file.
    #
    query = query_tmpl.format('vid3')
    result = cursor.execute(query).fetchall()
    self.assertEqual(result, rows3)
def test_full_load(self):
    """Just check that all of the sources can be loaded without exceptions"""
    cache_fs = fsopendir('temp://')

    for source_name, spec in self.sources.items():
        s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

        f = MPRowsFile(cache_fs, spec.name)

        if f.exists:
            f.remove()

        f.load_rows(s)

        with f.reader as r:
            self.assertTrue(len(r.headers) > 0)
def test_creates_foreign_data_table_for_simple_fixed_mpr(self, fake_shares):
    fake_shares.return_value = True

    # build rows reader
    cache_fs = fsopendir(self.setup_temp_dir())
    sources = self.load_sources()
    spec = sources['simple_fixed']
    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
    mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

    # first make sure the file was not changed.
    expected_names = ['id', 'uuid', 'int', 'float']
    expected_types = ['int', binary_type.__name__, 'int', 'float']
    self.assertEqual(sorted([x['name'] for x in mprows.reader.columns]), sorted(expected_names))
    self.assertEqual(sorted([x['type'] for x in mprows.reader.columns]), sorted(expected_types))

    try:
        # create the foreign data table
        PostgreSQLTestBase._create_postgres_test_db()
        conn = psycopg2.connect(**PostgreSQLTestBase.pg_test_db_data)

        try:
            with conn.cursor() as cursor:
                # we have to close the open transaction.
                cursor.execute('COMMIT;')
                add_partition(cursor, mprows, 'table1')

            # try to query the just-added partition foreign data table.
            with conn.cursor() as cursor:
                table = 'table1'

                # count all rows
                query = 'SELECT count(*) FROM {}.{};'.format(POSTGRES_PARTITION_SCHEMA_NAME, table)
                cursor.execute(query)
                result = cursor.fetchall()
                self.assertEqual(result, [(10000,)])

                # check the first row
                cursor.execute(
                    'SELECT id, uuid, int, float FROM {}.{} LIMIT 1;'
                    .format(POSTGRES_PARTITION_SCHEMA_NAME, table))
                result = cursor.fetchall()
                self.assertEqual(len(result), 1)
                expected_first_row = (1, 'eb385c36-9298-4427-8925-fe09294dbd', 30,
                                      Decimal('99.734691532'))
                self.assertEqual(result[0], expected_first_row)
        finally:
            conn.close()
    finally:
        PostgreSQLTestBase._drop_postgres_test_db()
def test_headers(self):
    fs = fsopendir('mem://')

    df = MPRowsFile(fs, 'foobar')

    with df.writer as w:
        schema = lambda row, col: w.meta['schema'][row][col]

        w.headers = list('abcdefghi')

        self.assertEqual('a', schema(1, 1))
        self.assertEqual('e', schema(5, 1))
        self.assertEqual('i', schema(9, 1))

        for h in w.columns:
            h.description = "{}-{}".format(h.pos, h.name)

        self.assertEqual('1-a', schema(1, 3))
        self.assertEqual('5-e', schema(5, 3))
        self.assertEqual('9-i', schema(9, 3))

        w.column(1).description = 'one'
        w.column(2).description = 'two'
        w.column('c').description = 'C'
        w.column('d')['description'] = 'D'

        self.assertEqual('one', schema(1, 3))
        self.assertEqual('two', schema(2, 3))
        self.assertEqual('C', schema(3, 3))
        self.assertEqual('D', schema(4, 3))

    with df.reader as r:
        schema = lambda row, col: r.meta['schema'][row][col]

        self.assertEqual(
            [u('a'), u('b'), u('c'), u('d'), u('e'), u('f'), u('g'), u('h'), u('i')],
            r.headers)

        self.assertEqual('one', schema(1, 3))
        self.assertEqual('two', schema(2, 3))
        self.assertEqual('C', schema(3, 3))
        self.assertEqual('D', schema(4, 3))
def test_intuit_headers(self):
    sources = self.load_sources(file_name='sources.csv')

    for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = sources[source_name]
        f = MPRowsFile(cache_fs, spec.name) \
            .load_rows(get_source(spec, cache_fs, callback=lambda x, y: (x, y)))

        self.assertEqual(spec.expect_start, f.info['data_start_row'])
        self.assertEqual([int(e) for e in spec.expect_headers.split(',')],
                         f.info['header_rows'])
def test_intuit_footer(self):
    sources = self.load_sources(file_name='sources.csv')

    for source_name in ['headers4', 'headers3', 'headers2', 'headers1']:
        cache_fs = fsopendir(self.setup_temp_dir())
        spec = sources[source_name]
        f = MPRowsFile(cache_fs, spec.name) \
            .load_rows(get_source(spec, cache_fs, callback=lambda x, y: (x, y)))

        with f.reader as r:
            last = list(r.rows)[-1]  # islice isn't working on the reader.
            print(source_name, last)
            self.assertEqual(11999, int(last[0]))
            self.assertEqual('2q080z003Cg2', last[1])
def test_highways(self):
    # FIXME: Optimize to use a local file instead of downloading it every time.
    cache_fs = fsopendir(self.setup_temp_dir())
    sources = self.load_sources(file_name='geo_sources.csv')
    spec = sources['highways']
    source = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    # first check it is converted properly.
    row_gen = source._get_row_gen()

    # generates a valid first row
    first_row = next(row_gen)
    self.assertEqual(len(first_row), 68)
    self.assertEqual(first_row[0], 0)
    # the last element is wkt.
    self.assertIn('LINESTRING', first_row[-1])

    # the header is valid
    self.assertEqual(len(source._headers), 68)
    self.assertEqual(source._headers[0], 'id')
    self.assertEqual(source._headers[-1], 'geometry')

    # now check its load to MPRows
    mpr = MPRowsFile(cache_fs, spec.name).load_rows(source)

    # Are columns recognized properly?
    NAME_INDEX = 1  # which element of the column description contains the name.
    # Collect all names from the column descriptors. Skip the first element of the schema
    # because it is the descriptor of the column descriptor elements.
    columns = [x[NAME_INDEX] for x in mpr.meta['schema'][1:]]
    self.assertIn('id', columns)
    self.assertIn('geometry', columns)
    self.assertIn('length', columns)  # column from the shape file.

    # Is the first row valid?
    first_row = next(iter(mpr.reader))
    self.assertEqual(len(first_row), 68)
    self.assertEqual(first_row['id'], 0)
    self.assertIn('LINESTRING', first_row['geometry'])
    return

    # The checks below are disabled by the early return above.
    # spec columns are properly populated
    self.assertEqual(len(spec.columns), 68)
    self.assertEqual(spec.columns[0]['name'], 'id')
    self.assertEqual(spec.columns[-1]['name'], 'geometry')
def __init__(self, options, columns):
    """
    Args:
        options (dict): filesystem and path; filesystem is the root directory str,
            path is the relative name of the file.
            Example: {
                'filesystem': '/tmp/my-root',
                'path': '/dir1/file1.mpr'
            }
    """
    super(MPRForeignDataWrapper, self).__init__(options, columns)
    self.columns = columns

    if 'path' not in options:
        log_to_postgres(
            'The `path` option is required by the partition msgpack fdw.',
            ERROR,
            hint='Try to add the `path` option to the table creation statement')
        raise RuntimeError('`path` is a required option of the MPR (Message Pack Rows) fdw.')

    if 'filesystem' not in options:
        log_to_postgres(
            'The `filesystem` option is required by the partition msgpack fdw.',
            ERROR,
            hint='Try to add the `filesystem` option to the table creation statement')
        raise RuntimeError('`filesystem` is a required option of the MPR (Message Pack Rows) fdw.')

    self.filesystem = fsopendir(options['filesystem'])
    self.path = options['path']

    if logger.level == logging.DEBUG:
        current_user = getpass.getuser()
        log_to_postgres(
            'Initializing Foreign Data Wrapper: user: {}, filesystem: {}, path: {}'
            .format(current_user, options['filesystem'], options['path']),
            DEBUG)

    self._mp_rows = MPRowsFile(self.filesystem, self.path)
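# A sketch of the PostgreSQL-side wiring for this wrapper via the multicorn extension,
# which delivers the OPTIONS below as the `options` dict received by __init__ above. The
# dotted wrapper path, server name, and column list are placeholders, not the project's
# actual values:
#
#   CREATE EXTENSION multicorn;
#   CREATE SERVER mpr_server FOREIGN DATA WRAPPER multicorn
#       OPTIONS (wrapper 'ambry_sources.med.postgresql.MPRForeignDataWrapper');
#   CREATE FOREIGN TABLE table1 (id integer, uuid text)
#       SERVER mpr_server
#       OPTIONS (filesystem '/tmp/my-root', path '/dir1/file1.mpr');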
def import_source(spec, cache_fs, file_path=None, account_accessor=None):
    """Download a source and load it into an MPR file."""
    s = get_source(spec, cache_fs, account_accessor)

    if not file_path:
        file_path = spec.name

    f = MPRowsFile(cache_fs, file_path)
    w = f.writer

    w.set_spec(spec)

    for row in s:
        w.insert_row(row)

    w.close()

    return f
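# A minimal usage sketch for import_source, reusing the CSV source exercised by
# test_bad_row_intuition elsewhere in this collection; the wrapper function name here is
# illustrative only.
def _example_import_source():
    from fs.opener import fsopendir
    from ambry_sources.sources import SourceSpec

    cache_fs = fsopendir('temp://')
    spec = SourceSpec(
        'http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
        name='simple')

    # Download the source and write it into an MPR file named after the spec.
    f = import_source(spec, cache_fs)

    with f.reader as r:
        print(r.headers)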
def test_stats(self):
    """Check that a source can be loaded with run_stats=True and that the computed
    column statistics are as expected"""
    # cache_fs = fsopendir('temp://')
    from shutil import rmtree
    from os import makedirs

    tp = '/tmp/mpr-test'
    rmtree(tp, ignore_errors=True)
    makedirs(tp)

    cache_fs = fsopendir(tp)

    s = get_source(self.sources['simple_stats'], cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, s.spec.name).load_rows(s, run_stats=True)

    stat_names = ('count', 'min', 'mean', 'max', 'nuniques')

    vals = {
        u('str_a'): (30, None, None, None, 10),
        u('str_b'): (30, None, None, None, 10),
        u('float_a'): (30, 1.0, 5.5, 10.0, 10),
        u('float_b'): (30, 1.1, 5.5, 9.9, 10),
        u('float_c'): (30, None, None, None, 10),
        u('int_b'): (30, None, None, None, 10),
        u('int_a'): (30, 1.0, 5.5, 10.0, 10),
    }

    with f.reader as r:
        for col in r.columns:
            stats = (col.stat_count, col.min,
                     round(col.mean, 1) if col.mean else None,
                     col.max, col.nuniques)

            for a, b, stat_name in zip(vals[col.name], stats, stat_names):
                self.assertEqual(a, b, '{} failed for stat {}: {} != {}'.format(
                    col.name, stat_name, a, b))
def test_creates_virtual_table_for_source_with_header_containing_sql_reserved_words(self):
    # build rows reader
    cache_fs = fsopendir(self.setup_temp_dir())
    spec = SourceSpec('foobar')

    def gen():
        # yield header
        yield ['create', 'index', 'where', 'select', 'distinct']

        # yield rows
        for i in range(10):
            yield [i, i + 1, i + 2, i + 3, i + 4]

    s = GeneratorSource(spec, gen())
    mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

    connection = apsw.Connection(':memory:')
    table = 'table1'

    add_partition(connection, mprows, table)

    # check all columns and some rows.
    cursor = connection.cursor()
    query = 'SELECT count(*) FROM {};'.format(table)
    result = cursor.execute(query).fetchall()
    self.assertEqual(result, [(10,)])

    with mprows.reader as r:
        expected_first_row = next(iter(r)).row

    # query by columns.
    query = 'SELECT "create", "index", "where", "select", "distinct" FROM {} LIMIT 1;'.format(table)
    result = cursor.execute(query).fetchall()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0], expected_first_row)
def no_header(data_start_row=None, data_end_row=None):
    # No header, column labels.
    f = MPRowsFile('mem://nh')
    w = f.writer

    for row in rows:
        w.insert_row(row)

    if data_start_row is not None:
        w.data_start_row = data_start_row

    if data_end_row is not None:
        w.data_end_row = data_end_row

    w.close()

    self.assertEqual(['col1', 'col2', 'col3', 'col4', 'col5', 'col6'],
                     w.parent.reader.headers)

    w.parent.reader.close()

    return f
def test_bad_row_intuition(self):
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec(
        'http://public.source.civicknowledge.com/example.com/sources/simple-example.csv',
        name='simple')

    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, spec.name)

    if f.exists:
        f.remove()

    f.load_rows(s)

    self.assertEqual(10001, f.reader.info['data_end_row'])
def test_bad_row_intuition(self):
    from ambry_sources.mpf import MPRowsFile
    from ambry_sources.sources.spec import SourceSpec, ColumnSpec

    cache_fs = fsopendir('temp://')

    spec = SourceSpec(
        'http://www2.census.gov/acs2009_1yr/summaryfile/Entire_States/Arizona.zip',
        file=r'g2009.*\.txt',
        filetype='fixed',
        name='geofile',
        encoding='latin1',
    )

    spec.columns = [
        ColumnSpec(position=1, width=6, name='fileid', start=1),
        ColumnSpec(position=2, width=2, name='stusab', start=7),
        ColumnSpec(position=3, width=3, name='sumlevel', start=9),
        ColumnSpec(position=4, width=2, name='component', start=12),
        ColumnSpec(position=5, width=7, name='logrecno', start=14),
        ColumnSpec(position=6, width=1, name='us', start=21),
        ColumnSpec(position=7, width=1, name='region', start=22),
        ColumnSpec(position=8, width=1, name='division', start=23),
        ColumnSpec(position=9, width=2, name='statece', start=24),
        ColumnSpec(position=10, width=2, name='state', start=26),
        ColumnSpec(position=11, width=3, name='county', start=28),
        ColumnSpec(position=12, width=5, name='cousub', start=31),
        ColumnSpec(position=13, width=5, name='place', start=36),
        ColumnSpec(position=14, width=6, name='tract', start=41),
        ColumnSpec(position=15, width=1, name='blkgrp', start=47),
        ColumnSpec(position=16, width=5, name='concit', start=48),
        ColumnSpec(position=17, width=4, name='aianhh', start=53),
        ColumnSpec(position=18, width=5, name='aianhhfp', start=57),
        ColumnSpec(position=19, width=1, name='aihhtli', start=62),
        ColumnSpec(position=20, width=3, name='aitsce', start=63),
        ColumnSpec(position=21, width=5, name='aits', start=66),
        ColumnSpec(position=22, width=5, name='anrc', start=71),
        ColumnSpec(position=23, width=5, name='cbsa', start=76),
        ColumnSpec(position=24, width=3, name='csa', start=81),
        ColumnSpec(position=25, width=5, name='metdiv', start=84),
        ColumnSpec(position=26, width=1, name='macc', start=89),
        ColumnSpec(position=27, width=1, name='memi', start=90),
        ColumnSpec(position=28, width=5, name='necta', start=91),
        ColumnSpec(position=29, width=3, name='cnecta', start=96),
        ColumnSpec(position=30, width=5, name='nectadiv', start=99),
        ColumnSpec(position=31, width=5, name='ua', start=104),
        ColumnSpec(position=33, width=2, name='cdcurr', start=114),
        ColumnSpec(position=34, width=3, name='sldu', start=116),
        ColumnSpec(position=35, width=3, name='sldl', start=119),
        ColumnSpec(position=39, width=5, name='submcd', start=136),
        ColumnSpec(position=40, width=5, name='sdelm', start=141),
        ColumnSpec(position=41, width=5, name='sdsec', start=146),
        ColumnSpec(position=42, width=5, name='sduni', start=151),
        ColumnSpec(position=43, width=1, name='ur', start=156),
        ColumnSpec(position=44, width=1, name='pci', start=157),
        ColumnSpec(position=47, width=5, name='puma5', start=169),
        ColumnSpec(position=49, width=40, name='geoid', start=179),
        ColumnSpec(position=50, width=200, name='name', start=219),
    ]

    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))

    f = MPRowsFile(cache_fs, spec.name)

    if f.exists:
        f.remove()

    f.load_rows(s)

    self.assertEqual(119, f.reader.info['data_end_row'])
def test_fixed(self):
    cache_fs = fsopendir(self.setup_temp_dir())
    spec = self.sources['simple_fixed']
    s = get_source(spec, cache_fs, callback=lambda x, y: (x, y))
    f = MPRowsFile(cache_fs, spec.name).load_rows(s)

    self.assertEqual(f.headers, ['id', 'uuid', 'int', 'float'])
def test_datafile_read_write(self):
    from fs.opener import fsopendir
    import datetime
    from random import randint, random
    from contexttimer import Timer
    from uuid import uuid4

    fs = fsopendir('mem://')
    # fs = fsopendir('/tmp/pmpf')

    N = 50000

    # Basic read/write tests.

    def rand_date():
        return datetime.date(randint(2000, 2015), randint(1, 12), 10)

    def rand_datetime():
        return datetime.datetime(randint(2000, 2015), randint(1, 12), 10)

    def rand_time():
        return datetime.time(randint(0, 23), randint(0, 59), 10)

    row = lambda: (None, 1, random(), str(uuid4()), rand_date(), rand_datetime(), rand_time())
    headers = list('abcdefghi')[:len(row())]

    rows = [row() for i in range(N)]

    def write_large_blocks():
        df = MPRowsFile(fs, 'foobar')

        if df.exists:
            df.remove()

        with Timer() as t, df.writer as w:
            w.headers = headers
            w.insert_rows(rows)

        print('MSGPack write L', float(N) / t.elapsed, w.n_rows)

    def write_small_blocks():
        df = MPRowsFile(fs, 'foobar')

        if df.exists:
            df.remove()

        with Timer() as t, df.writer as w:
            for i in range(N):
                w.headers = headers
                w.insert_row(rows[i])

        print('MSGPack write S', float(N) / t.elapsed, w.n_rows)

    print()

    # Write the whole file with insert_rows(), which writes all of the rows at once.
    write_large_blocks()

    # Write the file in blocks, with insert_row() collecting rows into a cache, then writing the
    # cached blocks.
    write_small_blocks()

    df = MPRowsFile(fs, 'foobar')

    with Timer() as t:
        count = 0
        i = 0
        s = 0

        r = df.reader

        for i, row in enumerate(r):
            count += 1

        r.close()

    print('MSGPack read ', float(N) / t.elapsed, i, count, s)

    with Timer() as t:
        count = 0

        r = df.reader

        for row in r.rows:
            count += 1

        r.close()

    print('MSGPack rows ', float(N) / t.elapsed)

    with Timer() as t:
        count = 0

        r = df.reader

        for row in r.raw:
            count += 1

        r.close()

    print('MSGPack raw ', float(N) / t.elapsed)