def test_generator(self):
    """Load generator rows into an HDFPartition and read them back as dicts."""
    from ambry_sources.sources import GeneratorSource, SourceSpec
    from ambry_sources import head, tail

    cache_fs = fsopendir(self.setup_temp_dir())

    def gen():
        # Header row first, then ten rows of five consecutive integers.
        yield list('abcde')
        for i in range(10):
            yield [i + offset for offset in range(5)]

    f = HDFPartition(cache_fs, 'foobar')

    # Row intuition reads the head and the tail of one and the same source.
    s = GeneratorSource(SourceSpec('foobar'), gen())
    ri = RowIntuiter().run(head(s, 100), tail(s, 100))
    row_spec = self._row_intuiter_to_dict(ri)

    # Type intuition is handed a source built on a fresh generator.
    type_intuiter = TypeIntuiter().process_header(ri.headers)
    ti = type_intuiter.run(GeneratorSource(SourceSpec('foobar'), gen()))

    with f.writer as w:
        w.set_row_spec(row_spec, ri.headers)
        w.set_types(ti)

    # Loading likewise gets its own freshly built generator.
    f.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

    self.assertEqual(f.headers, list('abcde'))

    rows = [row.dict for row in f.select()]

    self.assertEqual(len(rows), 10)
    self.assertEqual(rows[0], {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4})
    self.assertEqual(rows[-1], {'a': 9, 'b': 10, 'c': 11, 'd': 12, 'e': 13})
def test_creates_virtual_tables_for_partition_with_segment_without_errors(self):
    """Add a partition whose name carries a segment and select its rows."""
    fs = fsopendir('temp://')

    def gen():
        yield ['col1', 'col2']  # header
        yield [0, 0]  # rows
        yield [1, 1]

    mprows = MPRowsFile(fs, 'example.com/simple-0.1.3/1.mpr')
    mprows.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

    # Creating the virtual table must not raise; any exception is turned
    # into an assertion failure carrying the original error text.
    connection = apsw.Connection(':memory:')
    try:
        add_partition(connection, mprows, 'vid1')
    except Exception as exc:
        message = 'partition adding unexpectadly failed with {} error.'.format(exc)
        raise AssertionError(message)

    # The virtual table must return exactly the rows that were loaded.
    cursor = connection.cursor()
    query = 'SELECT * FROM {}'.format('vid1')
    result = cursor.execute(query).fetchall()
    self.assertEqual(result, [(0, 0), (1, 1)])
def test_executes_select_query_without_any_error(self, fake_shares):
    """Select from a foreign table created for an MPRowsFile on the test database."""
    fake_shares.return_value = True

    def gen():
        yield ['col1', 'col2']  # header
        yield [0, 0]  # first row

    temp_fs = fsopendir('temp://')
    datafile = MPRowsFile(temp_fs, 'vid1')
    datafile.load_rows(GeneratorSource(SourceSpec('foobar'), gen()))

    connection = None
    try:
        PostgreSQLTestBase._create_postgres_test_db()
        connection = psycopg2.connect(**PostgreSQLTestBase.pg_test_db_data)

        # Create the foreign table for the partition.
        with connection.cursor() as cursor:
            # The opened transaction has to be closed first.
            cursor.execute('COMMIT;')
            add_partition(cursor, datafile, 'vid1')

        # Query the foreign table that was just created.
        with connection.cursor() as cursor:
            cursor.execute('SELECT * FROM partitions.vid1;')
    finally:
        if connection:
            connection.close()
        PostgreSQLTestBase._drop_postgres_test_db()
def test_generator(self):
    """Load generator rows into an MPRowsFile and check its metadata and rows."""
    from ambry_sources.sources import GeneratorSource, SourceSpec

    cache_fs = fsopendir(self.setup_temp_dir())

    def gen():
        # Header row first, then ten rows of five consecutive integers.
        yield list('abcde')
        for i in range(10):
            yield [i + offset for offset in range(5)]

    datafile = MPRowsFile(cache_fs, 'foobar')
    source = GeneratorSource(SourceSpec('foobar'), gen())
    f = datafile.load_rows(source)

    # One header row at index 0, followed by data rows 1 through 11.
    expected_info = (
        ('data_start_row', 1),
        ('data_end_row', 11),
        ('header_rows', [0]),
    )
    for key, expected in expected_info:
        self.assertEqual(expected, f.info[key])

    self.assertEqual(f.headers, list('abcde'))

    rows = list(f.select())
    self.assertEqual(len(rows), 10)
    self.assertEqual(sorted(rows[0].keys()), sorted('abcde'))

    self.assertTrue(f.is_finalized)
def test_uses_url_as_table(self):
    """Iterating the relation source runs a SELECT against the spec's url."""
    table_name = 'table1'
    fake_execute = Mock(return_value=iter([[1], [2]]))
    connection = AttrDict({'execute': fake_execute})

    relation_source = DatabaseRelationSource(
        SourceSpec(table_name), 'sqlite', connection)
    rows = [row for row in relation_source]

    self.assertEqual(rows, [[1], [2]])
    expected_query = 'SELECT * FROM {};'.format(table_name)
    fake_execute.assert_called_once_with(expected_query)
def spec(self):
    """Build a SourceSpec describing this source from ``self.dict`` and ``self.url``."""
    from ambry_sources.sources import SourceSpec

    spec_args = self.dict
    # The URL ends up being passed twice, once as ref and once as url;
    # the ref is ignored.
    spec_args['url'] = self.url

    return SourceSpec(**spec_args)
def _get_generator_source(self, header, rows):
    """Wrap ``header`` followed by every item of ``rows`` in a GeneratorSource."""

    def gen():
        # The header comes first, the data rows follow in order.
        yield header
        for data_row in rows:
            yield data_row

    spec = SourceSpec('foobar')
    return GeneratorSource(spec, gen())
def load_sources(cls, file_name='sources.csv'):
    """Load the source specifications listed in a test-data CSV file.

    Every CSV row is passed as keyword arguments to ``SourceSpec``. The row
    named ``simple_fixed`` additionally receives the fixed-width column
    specifications built here.

    Args:
        file_name: Name of the CSV file inside the ``test_data`` directory
            located next to the ``tests`` package.

    Returns:
        dict mapping each spec's ``name`` to the ``SourceSpec`` built for it.
    """
    import tests
    import csv
    from os.path import join, dirname
    from ambry_sources.sources import ColumnSpec, SourceSpec

    test_data = fsopendir(join(dirname(tests.__file__), 'test_data'))

    sources = {}

    # (name, start, width) for every column of the fixed-width sample.
    fixed_widths = (
        ('id', 1, 6),
        ('uuid', 7, 34),
        ('int', 41, 3),
        ('float', 44, 14),
    )
    column_keys = 'name start width'.split()
    fw_columns = [
        ColumnSpec(**dict(zip(column_keys, e)))
        for e in fixed_widths
    ]

    with test_data.open(file_name) as f:
        r = csv.DictReader(f)

        for row in r:
            if row['name'] == 'simple_fixed':
                row['columns'] = fw_columns

            ss = SourceSpec(**row)

            if 'expect_headers' in row or 'expect_start' in row:
                ss.expect_headers = row.get('expect_headers')
                try:
                    ss.expect_start = int(row.get('expect_start'))
                except (TypeError, ValueError):
                    # ValueError: blank or non-numeric cell.
                    # TypeError: value is None (column absent, or a short
                    # row padded by DictReader) -- int(None) raises it.
                    ss.expect_start = None

            sources[ss.name] = ss

    return sources
def test_reads_layer_specified_by_segment(self, fake_open, fake_shape, fake_dumps):
    """The spec's segment is passed to the open call as the ``layer`` keyword."""
    fake_open.return_value = self._get_fake_collection()

    fstor = Mock(spec=DelayedOpen)
    fstor._fs = Mock()
    source = ShapefileSource(SourceSpec('http://example.com', segment=5), fstor)

    # Pull a single row so that the row generator actually runs.
    row_gen = source._get_row_gen()
    next(row_gen)

    self.assertEqual(len(fake_open.mock_calls), 1)
    first_call = fake_open.call_args_list[0]
    self.assertEqual(
        first_call[1]['layer'],
        5,
        'open function was called with wrong layer.')
def _get_generator_source(rows=None):
    """Return a GeneratorSource yielding a ['col1', 'col2'] header and ``rows``.

    When ``rows`` is None or empty, three sample rows are used instead.
    """
    data_rows = rows or [[0, 0], [1, 1], [2, 2]]

    def gen(rows=data_rows):
        yield ['col1', 'col2']  # header
        for data_row in rows:  # some rows
            yield data_row

    spec = SourceSpec('foobar')
    return GeneratorSource(spec, gen())
def load_sources(cls, file_name='sources.csv'):
    """Read a test-data CSV file and build one ``SourceSpec`` per row.

    Each row's fields become the keyword arguments of ``SourceSpec``; the
    ``simple_fixed`` row also gets the fixed-width ``ColumnSpec`` list
    defined in this function.

    Args:
        file_name: CSV file to read from the ``test_data`` directory that
            sits beside the ``tests`` package.

    Returns:
        dict keyed by spec name, holding the ``SourceSpec`` objects.
    """
    import tests
    import csv
    from os.path import join, dirname
    from ambry_sources.sources import ColumnSpec, SourceSpec

    test_data = fsopendir(join(dirname(tests.__file__), 'test_data'))

    sources = {}

    # Each entry is (name, start, width) of one fixed-width column.
    fixed_widths = (('id', 1, 6),
                    ('uuid', 7, 34),
                    ('int', 41, 3),
                    ('float', 44, 14),
                    )

    fw_columns = [ColumnSpec(**dict(zip('name start width'.split(), e)))
                  for e in fixed_widths]

    with test_data.open(file_name) as f:
        r = csv.DictReader(f)

        for row in r:

            if row['name'] == 'simple_fixed':
                row['columns'] = fw_columns

            ss = SourceSpec(**row)

            if 'expect_headers' in row or 'expect_start' in row:
                ss.expect_headers = row.get('expect_headers')
                try:
                    ss.expect_start = int(row.get('expect_start'))
                except (TypeError, ValueError):
                    # int() raises ValueError for a blank/non-numeric cell
                    # and TypeError for None, which is what row.get()
                    # yields when the column or the cell is missing.
                    ss.expect_start = None

            sources[ss.name] = ss

    return sources
def test_populates_columns_of_the_spec(self, fake_open, fake_get, fake_shape, fake_dumps):
    """Reading a row fills ``source.spec.columns`` from what ``fake_get`` returns."""
    fake_open.return_value = self._get_fake_collection()
    fake_get.return_value = [{'name': 'col1', 'type': 'int'}]

    fstor = Mock(spec=DelayedOpen)
    fstor._fs = Mock()
    source = ShapefileSource(SourceSpec('http://example.com'), fstor)

    # Pull a single row so that the row generator actually runs.
    row_gen = source._get_row_gen()
    next(row_gen)

    columns = source.spec.columns
    self.assertEqual(len(columns), 1)
    self.assertEqual(source.spec.columns[0].name, 'col1')

    # Expected call counts on the patched functions.
    self.assertEqual(len(fake_open.mock_calls), 1)
    self.assertEqual(len(fake_get.mock_calls), 2)
def test_converts_row_id_to_integer(self, fake_open, fake_get, fake_shape, fake_dumps):
    """The first element of the first generated row equals the integer 0."""
    fake_open.return_value = self._get_fake_collection()
    fake_shape.expects_call().is_a_stub()
    fake_dumps.expects_call().is_a_stub()
    fake_get.return_value = [{'name': 'col1', 'type': 'int'}]

    fstor = Mock(spec=DelayedOpen)
    fstor._fs = Mock()
    source = ShapefileSource(SourceSpec('http://example.com'), fstor)

    first_row = next(source._get_row_gen())
    self.assertEqual(first_row[0], 0)

    # Expected call counts on the patched functions.
    self.assertEqual(len(fake_open.mock_calls), 1)
    self.assertEqual(len(fake_get.mock_calls), 2)
def test_last_element_in_the_row_is_wkt(self, fake_open, fake_get, fake_shape, fake_dumps):
    """The last element of the first row is the value returned by ``fake_dumps``."""
    fake_wkt = 'I AM FAKE WKT'

    fake_open.return_value = self._get_fake_collection()
    fake_shape.expects_call().is_a_stub()
    fake_dumps.return_value = fake_wkt
    fake_get.return_value = [{'name': 'col1', 'type': 'int'}]

    fstor = Mock(spec=DelayedOpen)
    fstor._fs = Mock()
    source = ShapefileSource(SourceSpec('http://example.com'), fstor)

    first_row = next(source._get_row_gen())
    self.assertEqual(first_row[-1], fake_wkt)

    # Expected call counts on the patched functions.
    self.assertEqual(len(fake_open.mock_calls), 1)
    self.assertEqual(len(fake_get.mock_calls), 2)
def test_saves_header(self, fake_open, fake_get, fake_shape, fake_dumps):
    """After reading a row, ``source._headers`` lists the names returned by ``fake_get``."""
    fake_open.return_value = self._get_fake_collection()

    column_defs = (
        ('id', 'int'),
        ('col1', 'int'),
        ('geometry', 'geometry_type'),
    )
    fake_get.return_value = [
        {'name': name, 'type': type_name} for name, type_name in column_defs
    ]

    fstor = Mock(spec=DelayedOpen)
    fstor._fs = Mock()
    source = ShapefileSource(SourceSpec('http://example.com'), fstor)

    # Pull a single row so that the row generator actually runs.
    row_gen = source._get_row_gen()
    next(row_gen)

    self.assertEqual(source._headers, ['id', 'col1', 'geometry'])

    # Expected call counts on the patched functions.
    self.assertEqual(len(fake_open.mock_calls), 1)
    self.assertEqual(len(fake_get.mock_calls), 2)
def test_allow_source_data_to_start_from_0_row(self):
    """A FixedSource with ``start_line=0`` and no header lines yields every line."""
    columns = [
        ColumnSpec(name='col1', position=1, start=1, width=1),
        ColumnSpec(name='col2', position=2, start=3, width=1),
    ]
    spec = SourceSpec(
        'http://example.com',
        header_lines=[],
        start_line=0,
        columns=columns)
    assert spec.start_line == 0

    # Three fixed-width lines: one character, a space, one character.
    fs = fsopendir('temp://')
    with fs.open('temp.txt', 'w') as f:
        for line in (u'1 1\n', u'2 2\n', u'3 3\n'):
            f.write(line)

    fixed_source = FixedSource(spec, DelayedOpen(fs, 'temp.txt'))
    source_data = [row for row in fixed_source]

    self.assertEqual(len(source_data), 3)
    self.assertEqual(source_data, [['1', '1'], ['2', '2'], ['3', '3']])
def test_creates_virtual_table_for_source_with_header_containing_sql_reserved_words(self):
    """Column names that are SQL reserved words can be selected when quoted."""
    # Build the rows reader.
    cache_fs = fsopendir(self.setup_temp_dir())
    spec = SourceSpec('foobar')

    def gen():
        # Header made of SQL reserved words, then ten integer rows.
        yield ['create', 'index', 'where', 'select', 'distinct']
        for i in range(10):
            yield [i + offset for offset in range(5)]

    s = GeneratorSource(spec, gen())
    mprows = MPRowsFile(cache_fs, spec.name).load_rows(s)

    connection = apsw.Connection(':memory:')
    table = 'table1'
    add_partition(connection, mprows, table)

    # Every loaded row must be visible through the virtual table.
    cursor = connection.cursor()
    count_query = 'SELECT count(*) FROM {};'.format(table)
    result = cursor.execute(count_query).fetchall()
    self.assertEqual(result, [(10, )])

    with mprows.reader as r:
        expected_first_row = next(iter(r)).row

    # Query by the quoted column names.
    columns_query = 'SELECT "create", "index", "where", "select", "distinct" FROM {} LIMIT 1;'.format(
        table)
    result = cursor.execute(columns_query).fetchall()
    self.assertEqual(len(result), 1)
    self.assertEqual(result[0], expected_first_row)
class Rows(object):
    """Iterable exposing a ``spec`` attribute and iterating over ``rows``."""

    # Spec built with header_lines (3, 4) and start_line 5.
    spec = SourceSpec(None, header_lines=(3, 4), start_line=5)

    def __iter__(self):
        # ``rows`` is not defined in this class; it is resolved from the
        # surrounding scope each time iteration starts.
        return iter(rows)