def get_range(self,
              base_key: Key,
              start_time: datetime,
              end_time: datetime = None,
              count: int = 0) -> List[Tuple[Key, Any]]:
    """
    Returns the list of items from the store based on the given time range or count.

    Exactly one of `end_time` and `count` has to be supplied.

    :param base_key: Items which don't start with the base_key are filtered out.
    :param start_time: Start time for the range query.
    :param end_time: End time of the range query. If None, `count` is used.
    :param count: The number of items to be returned. Used if `end_time` is not
        specified. A negative count bounds the range by `datetime.min` (UTC), a
        positive count bounds it by `datetime.max` (UTC).
    :return: Whatever `_get_range_timestamp_key` (for TIMESTAMP base keys) or
        `_get_range_dimension_key` (for every other key type) returns.
    :raises ValueError: If both `end_time` and `count` are set, or if neither is.
    """
    if end_time and count:
        raise ValueError('Only one of `end` or `count` can be set')
    if not end_time and not count:
        # Without this guard a None end_time reaches the timezone helper and
        # the comparison below, and fails with an unrelated, confusing error.
        raise ValueError('One of `end_time` or `count` must be set')

    if count:
        # Count based queries are open ended: the sign of the count decides
        # whether the other bound is the smallest or the largest datetime.
        open_bound = datetime.min if count < 0 else datetime.max
        end_time = open_bound.replace(tzinfo=timezone.utc)

    end_time = self._add_timezone_if_required(end_time)
    start_time = self._add_timezone_if_required(start_time)

    # Normalise so that start_time is never later than end_time.
    if end_time < start_time:
        start_time, end_time = end_time, start_time

    if base_key.key_type == KeyType.TIMESTAMP:
        start_key = Key(KeyType.TIMESTAMP, base_key.identity, base_key.group,
                        [], start_time)
        end_key = Key(KeyType.TIMESTAMP, base_key.identity, base_key.group,
                      [], end_time)
        return self._get_range_timestamp_key(start_key, end_key, count)

    return self._get_range_dimension_key(base_key, start_time, end_time, count)
def _prepare_window(self, start_time: datetime) -> None:
    """
    Loads the blocks that make up the window and installs them as the window source.

    For DAY and HOUR window types the blocks are fetched by time range, from
    `start_time` to `self._get_end_time(start_time)`. For any other window type
    they are fetched by count, using the schema's `window_value`.

    :param start_time: The anchor block start_time from where the window
        should be generated.
    """
    schema = self._schema
    # The window is evaluated first; per the original note this sets the
    # correct window in the store.
    store = schema.schema_loader.get_store(
        schema.source.store_schema.fully_qualified_name)

    is_time_window = (Type.is_type_equal(schema.window_type, Type.DAY)
                      or Type.is_type_equal(schema.window_type, Type.HOUR))
    source_key = Key(schema.source.key_type, self._identity,
                     schema.source.name)

    if is_time_window:
        items = store.get_range(source_key, start_time,
                                self._get_end_time(start_time))
    else:
        items = store.get_range(source_key, start_time, None,
                                schema.window_value)

    self._window_source = _WindowSource(self._load_blocks(items))
    self._validate_view()
def test_save_simple(store: DynamoStore) -> None:
    """A record saved under a dimension key is returned by `get` for an equal key."""
    saved_record = {'string_field': 'string', 'int_field': 1}
    store.save(Key(KeyType.DIMENSION, 'test_user', 'test_group'), saved_record)

    # Look up with a freshly built key and compare against a fresh literal.
    fetched = store.get(Key(KeyType.DIMENSION, 'test_user', 'test_group'))
    assert fetched == {'string_field': 'string', 'int_field': 1}
def test_set_simple(empty_memory_store) -> None:
    """
    Checks that a value saved in the memory store can be read back
    through a separately constructed but equal key.
    """
    store = empty_memory_store
    write_key = Key(KeyType.DIMENSION, 'test_user', 'test_group')
    store.save(write_key, 1)

    read_key = Key(KeyType.DIMENSION, 'test_user', 'test_group')
    assert store.get(read_key) == 1
def test_equals_dimension_key():
    """Equality of dimension keys takes both the group and the dimensions into account."""
    no_dimensions = Key(KeyType.DIMENSION, 'a', 'b')
    dimension_c = Key(KeyType.DIMENSION, 'a', 'b', ['c'])

    # Same identity, group and (absent) dimensions.
    assert no_dimensions == Key(KeyType.DIMENSION, 'a', 'b')
    # Different group.
    assert no_dimensions != Key(KeyType.DIMENSION, 'a', 'c')
    # Dimensions present on one side only.
    assert no_dimensions != Key(KeyType.DIMENSION, 'a', 'b', ['c'])
    # Same dimensions.
    assert dimension_c == Key(KeyType.DIMENSION, 'a', 'b', ['c'])
    # Different dimensions.
    assert dimension_c != Key(KeyType.DIMENSION, 'a', 'b', ['d'])
def test_get_range_no_items_on_boundary(loaded_store: DynamoStore) -> None:
    """
    A range query between two keys returns five items, the first with
    `int_field` 2 and the last with `int_field` 6.
    """
    range_start = Key(
        'test_user', 'test_group',
        datetime(2018, 1, 1, 2, 1, 1, 0, tzinfo=timezone.utc))
    range_end = Key(
        'test_user', 'test_group',
        datetime(2018, 1, 1, 6, 1, 1, 2, tzinfo=timezone.utc))

    items = loaded_store.get_range(range_start, range_end)

    assert len(items) == 5
    first_value = items[0][1]
    assert first_value['int_field'] == 2
    last_value = items[-1][1]
    assert last_value['int_field'] == 6
def test_save_time(store: DynamoStore) -> None:
    """A record saved under a key carrying a timestamp can be read back with an equal key."""
    start_time = datetime(2018, 1, 1, 1, 1, 1, 1, tzinfo=timezone.utc)
    saved_record = {'string_field': 'string2', 'int_field': 2}
    store.save(Key('test_user', 'test_group', start_time), saved_record)

    fetched = store.get(Key('test_user', 'test_group', start_time))
    assert fetched == {'string_field': 'string2', 'int_field': 2}
def _prepare_key(self, timestamp: datetime = None):
    """
    Builds the Key for this object.

    Without dimension fields the group is just `self._name`. With dimension
    fields the group is `self._name`, a '.', and the string form of every
    dimension field value joined with ':'.

    :param timestamp: Passed through unchanged as the third Key argument.
    """
    if not self._dimension_fields:
        return Key(self._identity, self._name, timestamp)

    dimension_suffix = ':'.join(
        str(field.value) for field in self._dimension_fields.values())
    return Key(self._identity, self._name + '.' + dimension_suffix, timestamp)
def test_set_get_date(empty_memory_store) -> None:
    """
    Checks that a value stored under a key that includes a timestamp is
    found again through an equal key built from the same timestamp.
    """
    store = empty_memory_store
    now = datetime.utcnow()
    store.save(Key('user1', 'session', now), 'test')

    lookup_key = Key('user1', 'session', now)
    assert store.get(lookup_key) == 'test'
def test_sort_prefix_key():
    """Checks `sort_prefix_key` for dimension keys and timestamp keys."""
    bare_dimension = Key(KeyType.DIMENSION, 'user1', 'group1')
    assert bare_dimension.sort_prefix_key == 'group1/'

    two_dimensions = Key(KeyType.DIMENSION, 'user1', 'group1', ['a', 'b'])
    assert two_dimensions.sort_prefix_key == 'group1/a:b'

    bare_timestamp = Key(KeyType.TIMESTAMP, 'user1', 'group1')
    assert bare_timestamp.sort_prefix_key == 'group1//'

    # The timestamp is passed in naive; the expected prefix carries +00:00.
    stamped = Key(KeyType.TIMESTAMP, 'user1', 'group1', [],
                  datetime(2018, 3, 7, 22, 35, 31))
    assert stamped.sort_prefix_key == 'group1//2018-03-07T22:35:31+00:00'
def test_key_type_and_args_error():
    """Key construction rejects arguments that do not belong to the chosen key type."""
    timestamp_on_dimension = '`timestamp` should not be set for KeyType.DIMENSION.'
    dimensions_on_timestamp = '`dimensions` should not be set for KeyType.TIMESTAMP.'

    # A DIMENSION key must not be given a timestamp.
    with pytest.raises(ValueError, match=timestamp_on_dimension):
        Key(KeyType.DIMENSION, 'id', 'group', [],
            datetime(2018, 3, 7, 22, 35, 31))

    # A TIMESTAMP key must not be given dimensions.
    with pytest.raises(ValueError, match=dimensions_on_timestamp):
        Key(KeyType.TIMESTAMP, 'id', 'group', ['dim1'], None)
def test_less_than_dimension_key():
    """Checks the `<` operator between dimension keys; results must be real booleans."""
    no_dimensions = Key(KeyType.DIMENSION, 'a', 'b')
    dimension_c = Key(KeyType.DIMENSION, 'a', 'b', ['c'])

    equal_keys = no_dimensions < Key(KeyType.DIMENSION, 'a', 'b')
    assert equal_keys is False

    other_group = no_dimensions < Key(KeyType.DIMENSION, 'a', 'c')
    assert other_group is False

    against_dimensioned = no_dimensions < Key(KeyType.DIMENSION, 'a', 'b', ['c'])
    assert against_dimensioned is True

    same_dimension = dimension_c < Key(KeyType.DIMENSION, 'a', 'b', ['c'])
    assert same_dimension is False

    later_dimension = dimension_c < Key(KeyType.DIMENSION, 'a', 'b', ['d'])
    assert later_dimension is True
def test_aggregate_final_state(
        activity_aggregate_schema: ActivityAggregateSchema,
        activity_events: List[Record]) -> None:
    """
    Evaluates every activity event, finalizes the aggregate and checks the
    three snapshots that end up in the store.
    """
    # Initialize the starting state
    identity = 'user1'
    evaluation_context = EvaluationContext()
    evaluation_context.global_add('identity', identity)
    activity_aggregate = ActivityAggregate(activity_aggregate_schema, identity,
                                           evaluation_context)
    evaluation_context.global_add(activity_aggregate._schema.name,
                                  activity_aggregate)

    for record in activity_events:
        evaluate_event(record, activity_aggregate)

    activity_aggregate.run_finalize()

    store_state = activity_aggregate._store.get_all(identity)
    assert len(store_state) == 3

    # (start time, end time, sum, count) expected for each stored snapshot.
    expected_snapshots = [
        (datetime(2018, 1, 1, 1, 1, 1, 0, timezone.utc),
         datetime(2018, 1, 1, 1, 2, 1, 0, timezone.utc), 111, 3),
        (datetime(2018, 1, 1, 3, 1, 1, 0, timezone.utc),
         datetime(2018, 1, 1, 3, 1, 1, 0, timezone.utc), 1000, 1),
        (datetime(2018, 1, 2, 1, 1, 1, 0, timezone.utc),
         datetime(2018, 1, 2, 1, 1, 1, 0, timezone.utc), 10000, 1),
    ]
    for start, end, total, occurrences in expected_snapshots:
        snapshot = store_state.get(Key('user1', 'activity_aggr', start))
        assert snapshot == {
            '_identity': 'user1',
            '_start_time': start.isoformat(),
            '_end_time': end.isoformat(),
            'sum': total,
            'count': occurrences
        }
def test_greater_than_dimension_key():
    """Checks the `>` operator between dimension keys; results must be real booleans."""
    no_dimensions = Key(KeyType.DIMENSION, 'a', 'b')

    equal_keys = no_dimensions > Key(KeyType.DIMENSION, 'a', 'b')
    assert equal_keys is False

    other_group = no_dimensions > Key(KeyType.DIMENSION, 'a', 'c')
    assert other_group is False

    against_dimensioned = no_dimensions > Key(KeyType.DIMENSION, 'a', 'b', ['c'])
    assert against_dimensioned is False

    same_dimension = (Key(KeyType.DIMENSION, 'a', 'b', ['c']) >
                      Key(KeyType.DIMENSION, 'a', 'b', ['c']))
    assert same_dimension is False

    earlier_dimension = (Key(KeyType.DIMENSION, 'a', 'b', ['d']) >
                         Key(KeyType.DIMENSION, 'a', 'b', ['c']))
    assert earlier_dimension is True
def test_get_range_start_end(memory_store: MemoryStore) -> None:
    """
    Checks that a start/end range query leaves out the blocks that lie on
    the boundary of the range.
    """
    lower_bound = datetime(2018, 3, 7, 19, 35, 31, 0, timezone.utc)
    upper_bound = datetime(2018, 3, 7, 22, 38, 31, 0, timezone.utc)

    blocks = memory_store.get_range(Key('user1', 'session', lower_bound),
                                    Key('user1', 'session', upper_bound))

    assert len(blocks) == 2
    expected_first_start = datetime(2018, 3, 7, 20, 35, 35, 0, timezone.utc)
    assert blocks[0][1]['_start_time'] == expected_first_start.isoformat()
def test_get(memory_store: MemoryStore) -> None:
    """`get` returns the stored value for a dimension key and for a timestamp key."""
    state_value = memory_store.get(Key(KeyType.DIMENSION, 'user1', 'state'))
    assert state_value == {
        'variable_1': 1,
        'variable_a': 'a',
        'variable_true': True
    }

    date = datetime(2018, 3, 7, 19, 35, 31, 0, timezone.utc)
    session_value = memory_store.get(
        Key(KeyType.TIMESTAMP, 'user1', 'session', [], date))
    assert session_value == {'events': 1, '_start_time': date.isoformat()}
def test_two_key_fields_in_aggregate(
        identity_aggregate_schema_spec_with_two_key_fields: Dict[str, Any],
        store_spec: Dict[str, Any], records: List[Record]):
    """
    Runs all records through an aggregate with two dimension fields and
    checks that one snapshot per (label, label_ascii) pair is stored.
    """
    schema = identity_aggregate_schema(
        identity_aggregate_schema_spec_with_two_key_fields, store_spec)

    # Initialize the starting state
    identity = 'user1'
    evaluation_context = EvaluationContext()
    evaluation_context.global_add('identity', identity)
    identity_aggregate = IdentityAggregate(schema, identity,
                                           evaluation_context)
    evaluation_context.global_add(identity_aggregate._schema.name,
                                  identity_aggregate)

    # Evaluate all the events
    for event in records:
        evaluate_event(event, identity_aggregate)
    identity_aggregate.run_finalize()

    store_state = identity_aggregate._store.get_all('user1')
    assert len(store_state) == 3

    # (label, label_ascii, sum, count) expected for each stored snapshot.
    expected_snapshots = [
        ('a', 97, 110, 2),
        ('b', 98, 1, 1),
        ('c', 99, 11000, 2),
    ]
    for label, label_ascii, total, occurrences in expected_snapshots:
        # Key dimensions are strings, so the ascii code is stringified.
        snapshot_key = Key(KeyType.DIMENSION, 'user1', 'label_aggr',
                           [label, str(label_ascii)])
        assert store_state.get(snapshot_key) == {
            '_identity': 'user1',
            'label': label,
            'label_ascii': label_ascii,
            'sum': total,
            'count': occurrences
        }
def test_split_when_label_evaluates_to_none(
        identity_aggregate_schema_spec: Dict[str, Any],
        store_spec: Dict[str, Any], records: List[Record]):
    """
    Uses a dimension expression that divides by zero for label 'a' and
    checks that, after the first three records, only the 'b' snapshot is
    stored.
    """
    first_dimension = identity_aggregate_schema_spec['Dimensions'][0]
    first_dimension['Value'] = '1/0 if source.label == \'a\' else source.label'
    schema = identity_aggregate_schema(identity_aggregate_schema_spec,
                                       store_spec)

    # Initialize the starting state
    identity = 'user1'
    evaluation_context = EvaluationContext()
    evaluation_context.global_add('identity', identity)
    identity_aggregate = IdentityAggregate(schema, identity,
                                           evaluation_context)
    evaluation_context.global_add(identity_aggregate._schema.name,
                                  identity_aggregate)

    # Check for error states
    for index in range(3):
        evaluate_event(records[index], identity_aggregate)
    assert identity_aggregate._dimension_fields['label'].value == 'b'

    identity_aggregate.run_finalize()

    store_state = identity_aggregate._store.get_all(identity)
    assert len(store_state) == 1

    stored_b = store_state.get(Key('user1', 'label_aggr.b'))
    assert stored_b == {
        '_identity': 'user1',
        'label': 'b',
        'sum': 1,
        'count': 1
    }
def test_get_range_start_end_time_no_dimensions_match(
        memory_store: MemoryStore) -> None:
    """A time range query for the dimension 'dimC' returns no blocks."""
    key = Key(KeyType.DIMENSION, 'user1', 'session_dim', ['dimC'])
    range_start = datetime(2018, 3, 7, 19, 35, 31, 0, timezone.utc)
    range_end = datetime(2018, 3, 7, 22, 38, 31, 0, timezone.utc)

    blocks = memory_store.get_range(key, range_start, range_end)

    assert len(blocks) == 0
def test_no_variable_aggreate_data_stored():
    """Runs the stream config over the raw data and checks that no 'vars' key is emitted."""
    _, collected = execute_runner('tests/data/stream.yml', None,
                                  ['tests/data/raw.json'])
    block_data = {
        stored_key: stored_value
        for stored_key, stored_value in collected.collect()
    }

    # Variables should not be stored
    variables_key = Key('userA', 'vars')
    assert variables_key not in block_data
def test_block_aggregate_schema_evaluate_without_split(
        block_aggregate_schema_spec, schema_loader):
    """
    Evaluates a block aggregate once and checks its four nested fields,
    and that nothing has been written to the store for it.
    """
    schema_name = schema_loader.add_schema_spec(block_aggregate_schema_spec)
    block_aggregate_schema = BlockAggregateSchema(schema_name, schema_loader)

    identity = 'userA'
    event_time = datetime(2018, 3, 7, 19, 35, 31, 0, timezone.utc)
    block_aggregate = create_block_aggregate(block_aggregate_schema,
                                             event_time, identity)
    block_aggregate.run_evaluate()

    # Check eval results of various fields
    expected_fields = {
        '_identity': identity,
        'event_count': 1,
        '_start_time': event_time,
        '_end_time': event_time
    }
    assert len(block_aggregate._nested_items) == 4
    assert check_fields(block_aggregate._nested_items, expected_fields)

    # aggregate snapshot should not exist in store
    snapshot_key = Key(identity=block_aggregate._identity,
                       group=block_aggregate._name,
                       timestamp=block_aggregate._start_time)
    assert block_aggregate._store.get(snapshot_key) is None
def _persist(self, timestamp=None) -> None:
    """
    Saves the current snapshot of this data group to the store.

    Does nothing when no store is configured.

    :param timestamp: Optional timestamp to include in the Key construction
    """
    if not self._store:
        return

    snapshot_key = Key(self._identity, self._name, timestamp)
    self._store.save(snapshot_key, self._snapshot)
def test_streaming_transformer_finalize(schema_loader: SchemaLoader,
                                        schema_spec: Dict[str, Any]) -> None:
    """
    Finalizing before any record is evaluated stores nothing; finalizing
    after one record stores the aggregate with a single event.
    """
    streaming_bts = schema_loader.add_schema_spec(schema_spec)
    transformer_schema = StreamingTransformerSchema(streaming_bts,
                                                    schema_loader)
    transformer = StreamingTransformer(transformer_schema, 'user1')
    store = schema_loader.get_store('test.memstore')

    # Nothing evaluated yet.
    transformer.run_finalize()
    before = store.get(Key(KeyType.DIMENSION, 'user1', 'test_group'))
    assert before is None

    # One record evaluated.
    transformer.run_evaluate(Record())
    transformer.run_finalize()
    after = store.get(Key(KeyType.DIMENSION, 'user1', 'test_group'))
    assert after == {'_identity': 'user1', 'events': 1}
def test_get_range_count_backward(loaded_store: DynamoStore) -> None:
    """
    A count of -3 from the anchor key returns three items, the first with
    `int_field` 5 and the last with `int_field` 3.
    """
    anchor = Key('test_user', 'test_group',
                 datetime(2018, 1, 1, 6, 1, 1, 1, tzinfo=timezone.utc))

    items = loaded_store.get_range(anchor, None, -3)

    assert len(items) == 3
    first_value = items[0][1]
    assert first_value['int_field'] == 5
    last_value = items[-1][1]
    assert last_value['int_field'] == 3
def test_get_range_count_positive_partial_dimensions_match(
        loaded_store: DynamoStore) -> None:
    """
    A count of 2 for the dimension 'dimA' returns two blocks, the first
    starting at 2018-03-07 21:36:31 UTC.
    """
    key = Key(KeyType.DIMENSION, 'user1', 'session_dim', ['dimA'])
    anchor_time = datetime(2018, 3, 7, 19, 35, 31, 0, timezone.utc)

    blocks = loaded_store.get_range(key, anchor_time, None, 2)

    assert len(blocks) == 2
    expected_first_start = datetime(2018, 3, 7, 21, 36, 31, 0, timezone.utc)
    assert blocks[0][1]['_start_time'] == expected_first_start.isoformat()
def test_get_range_count_negative_partial_dimensions_match(
        memory_store: MemoryStore) -> None:
    """
    A count of -2 for the dimension 'dimA' returns two blocks, the first
    starting at 2018-03-07 19:35:31 UTC.
    """
    key = Key(KeyType.DIMENSION, 'user1', 'session_dim', ['dimA'])
    anchor_time = datetime(2018, 3, 7, 22, 38, 31, 0, timezone.utc)

    blocks = memory_store.get_range(key, anchor_time, None, -2)

    assert len(blocks) == 2
    expected_first_start = datetime(2018, 3, 7, 19, 35, 31, 0, timezone.utc)
    assert blocks[0][1]['_start_time'] == expected_first_start.isoformat()
def stream_transformer(schema_loader, stream_schema_spec):
    """
    Builds a StreamingTransformer for 'user1' from the given spec and
    restores it with a 'state' entry whose country is 'US'.
    """
    stream_bts_name = schema_loader.add_schema_spec(stream_schema_spec)
    schema = schema_loader.get_schema_object(stream_bts_name)
    transformer = StreamingTransformer(schema, 'user1')

    restored_state = {
        Key(KeyType.DIMENSION, 'user1', 'state'): {
            'country': 'US'
        }
    }
    transformer.run_restore(restored_state)
    return transformer
def test_aggregate_finalize(aggregate_schema_with_store):
    """Finalizing an aggregate writes its snapshot to the store under its key."""
    aggregate = MockAggregate(
        schema=aggregate_schema_with_store,
        identity="12345",
        evaluation_context=EvaluationContext())
    aggregate.run_finalize()

    stored_key = Key(identity="12345", group="user")
    snapshot_aggregate = aggregate._store.get(stored_key)

    assert snapshot_aggregate is not None
    assert snapshot_aggregate == aggregate._snapshot
def test_get_range_count_negative_from_first_element(
        memory_store: MemoryStore, key_type_and_group) -> None:
    """
    A count of -2 anchored at 2018-03-07 19:35:31 UTC returns no blocks.

    NOTE(review): going by the test name, the anchor is presumably the first
    stored element, so nothing lies before it; confirm against the fixture.
    """
    key_type, group = key_type_and_group[0], key_type_and_group[1]
    key = Key(key_type, 'user1', group)
    anchor_time = datetime(2018, 3, 7, 19, 35, 31, 0, timezone.utc)

    blocks = memory_store.get_range(key, anchor_time, None, -2)

    assert len(blocks) == 0
def test_no_variable_aggreate_data_stored():
    """
    Runs the stream config over the raw data and checks that the block
    data collected for 'userA' holds no 'vars' key.
    """
    _, collected = execute_runner('tests/data/stream.yml', None,
                                  ['tests/data/raw.json'])

    # Each collected element is (identity, (block data, <ignored>)).
    block_data = {
        identity: per_identity_blocks
        for identity, (per_identity_blocks, _) in collected.collect()
    }

    # Variables should not be stored
    variables_key = Key(KeyType.DIMENSION, 'userA', 'vars')
    assert variables_key not in block_data['userA']