def test_select():
    """select() returns a fresh builder; frm() resolves schema and ops."""
    dataset = mock_data_set()

    builder = QueryBuilder(dataset)
    eq_(builder.column_exps, '*')

    selected = builder.select('x,y')
    # select() must not mutate the original builder in place.
    assert_is_not(builder, selected)
    eq_(selected.column_exps, 'x,y')

    query = selected.frm('bogus').query
    assert_is_instance(query, Query)

    expected_fields = [
        Field(name="x", type="INTEGER", schema_name="bogus"),
        Field(name="y", type="INTEGER", schema_name="bogus"),
    ]
    assert_sequence_equal(query.schema.fields, expected_fields)
    compare(
        query.operations,
        ProjectionOp(LoadOp('bogus'), Var('x'), Var('y'))
    )

    # A narrower projection from the same base builder.
    y_only = builder.select('y').frm('bogus')
    eq_(y_only.column_exps, 'y')
    assert_sequence_equal(
        y_only.query.schema.fields,
        [Field(name="y", type="INTEGER", schema_name="bogus")]
    )
    compare(
        y_only.query.operations,
        ProjectionOp(LoadOp('bogus'), Var('y'))
    )
def test_dict_adapter_with_schemas():
    """DictAdapter exposes the declared schema and yields the given rows."""
    column_specs = [
        ("employee_id", "INTEGER"),
        ("full_name", "STRING"),
        ("employment_date", "DATE"),
        ("manager_id", "INTEGER"),
    ]

    adapter = DictAdapter(
        employees=dict(
            schema=dict(
                fields=[dict(name=n, type=t) for n, t in column_specs]
            ),
            rows=employee_records,
        )
    )

    relation = adapter.get_relation('employees')

    # Schema fields should be qualified with the relation name.
    assert_sequence_equal(
        relation.schema.fields,
        [
            Field(name=n, type=t, schema_name="employees")
            for n, t in column_specs
        ]
    )

    assert_sequence_equal(
        list(relation),
        [
            (1234, 'Tom Tompson', date(2009, 1, 17), None),
            (4567, 'Sally Sanders', date(2010, 2, 24), 1234),
            (8901, 'Mark Markty', date(2010, 3, 1), 1234),
        ]
    )
def test_decode_csv():
    """schema_from() infers an all-STRING schema from a CSV header row.

    NOTE(review): another function named test_decode_csv appears later in
    this source; if both live in the same module the later definition
    shadows this one and this test never runs — confirm and dedupe.
    """
    stream = StringIO(u"field1,field2,field3\nfoo,1,0\nbaz,2,0")
    inferred = codecs.schema_from(stream, mime_type='text/csv')

    expected = Schema([
        Field(name=column, type='STRING')
        for column in ('field1', 'field2', 'field3')
    ])
    eq_(inferred, expected)
def __init__(self, name, root_dir, **options):
    """Configure the adapter.

    Args:
      name: relation name this adapter answers to.
      root_dir: base directory; a trailing '/' is appended if missing.
      **options: optional keywords --
        pattern: path template; its tokens become STRING fields in
          self.path_schema.
        content_column: column to receive file contents.
        filename_column: column to receive the file path.
        decode: content decoding mode (default "none").
        schema: a Schema instance, or a dict of Schema kwargs, or falsy.

    Raises:
      ValueError: if any unrecognized option keywords remain.
    """
    self.name = name

    # Normalize so later path joins can simply concatenate.
    if not root_dir.endswith('/'):
        root_dir += '/'
    self.root_dir = root_dir

    self.pattern = options.pop('pattern', None)
    if self.pattern:
        tokens = tokenize_pattern(self.pattern)
        self.path_schema = Schema([
            Field(name=c, type="STRING")
            for c in columns(tokens)
        ])
    else:
        # Always define the attribute so consumers can test it safely
        # instead of hitting AttributeError when no pattern was given.
        self.path_schema = None

    self.content_column = options.pop('content_column', None)
    self.filename_column = options.pop('filename_column', None)
    self.decode = options.pop('decode', "none")

    schema = options.pop('schema', None)
    if isinstance(schema, Schema):
        self.schema = schema
    else:
        # A dict of Schema kwargs, or falsy (no schema).
        self.schema = schema and Schema(**schema)

    if options:
        # Anything left over was not consumed above -> caller typo.
        raise ValueError(
            "Unrecognized options {}".format(sorted(options))
        )
def test_index_repeating_scalar(self):
    """A REPEATED scalar field yields one (name, value) pair per element."""
    field = Field(name="count", type="INTEGER", mode="REPEATED")
    pairs = list(features.index_repeating_scalar(field, [1, 2, 3], ''))
    assert_sequence_equal(
        pairs,
        [('count', 1), ('count', 2), ('count', 3)]
    )
def test_index_record(self):
    """A RECORD field yields dotted (path, value) pairs for its subfields."""
    field = Field(
        name="point",
        type="RECORD",
        fields=[
            dict(name="x", type="INTEGER"),
            dict(name="y", type="INTEGER"),
        ],
    )
    pairs = list(features.index_record(field, dict(x=10, y=1), ''))
    assert_sequence_equal(pairs, [('point.x', 10), ('point.y', 1)])
def test_decode_csv():
    """CSV decoding: header-derived schema plus string-typed data rows."""
    stream = StringIO("field1,field2,field3\nfoo,1,0\nbaz,2,0")

    inferred = codecs.schema_from(stream, mime_type='text/csv')
    eq_(
        inferred,
        Schema([
            Field(name='field1', type='STRING'),
            Field(name='field2', type='STRING'),
            Field(name='field3', type='STRING'),
        ])
    )

    # relation_from() receives the same stream after schema_from() has
    # read it — presumably it rewinds or re-reads the data rows; confirm
    # against the codec implementation.
    relation = codecs.relation_from(stream, mime_type='text/csv')
    assert_sequence_equal(
        list(relation),
        [
            ['foo', '1', '0'],
            ['baz', '2', '0'],
        ]
    )
def test_repeating_index_record(self):
    """A REPEATED RECORD yields dotted pairs for every record, in order."""
    field = Field(
        name="point",
        type="RECORD",
        mode="REPEATED",
        fields=[
            dict(name="x", type="INTEGER"),
            dict(name="y", type="INTEGER"),
        ],
    )
    records = [dict(x=1, y=1), dict(x=2, y=2)]
    pairs = list(features.index_repeating_record(field, records, ''))
    assert_sequence_equal(
        pairs,
        [('point.x', 1), ('point.y', 1), ('point.x', 2), ('point.y', 2)]
    )
def test_projection_and_selection():
    """where() inserts a SelectionOp beneath the ProjectionOp."""
    dataset = mock_data_set()
    query = (
        QueryBuilder(dataset)
        .select('full_name')
        .frm('employees')
        .where('employee_id = 123')
        .query
    )

    expected_ops = ProjectionOp(
        SelectionOp(
            LoadOp('employees'),
            EqOp(Var('employee_id'), NumberConst(123))
        ),
        Var('full_name')
    )
    assert_equal(query.operations, expected_ops)

    # Only the projected column survives in the result schema.
    assert_sequence_equal(
        query.schema.fields,
        [Field(name="full_name", type="STRING", schema_name="employees")]
    )
def dataset(source_control, docker_client, containers):
    """Build a DataSet over branch/image snapshots with a latest_commit view."""
    ds = splicer.DataSet()

    # Custom aggregate: carry (value, weight) state and return the value
    # that accompanied the maximum weight.
    ds.add_aggregate(
        "maxwhen",
        func=maxwhen,
        returns=Field(name="min", type="STRING"),
        initial=(None, None),
        finalize=lambda state: state[0],
    )

    # data collected at the start of the program
    snapshot = DictAdapter(
        branch=dict(
            schema=dict(fields=COMMIT_SCHEMA),
            rows=branches(source_control),
        ),
        image=dict(
            schema=dict(fields=IMAGE_SCHEMA),
            rows=images(docker_client, containers),
        ),
    )
    ds.add_adapter(snapshot)

    # splicer doesn't have select distinct yet.. use group by instead
    latest_commit_sql = """
    select
      branch.branch,
      image,
      maxwhen(branch.commit, branch.rel_commit) as commit,
      max(branch.rel_commit) as rel_commit
    from image
    join branch on image.tag = branch.commit
    group by branch, image

    union all

    select branch, null as image, branch as commit, -1
    from branch
    group by branch
    """
    ds.create_view('latest_commit', latest_commit_sql)

    return ds
def test_index_scalar(self):
    """A plain scalar field yields a single (name, value) pair."""
    field = Field(name="count", type="INTEGER")
    assert_sequence_equal(
        list(features.index_scalar(field, 2, '')),
        [('count', 2)]
    )